# Ноутбук с моделью

## Импортируем необходимые пакеты

In [None]:
import pandas as pd

from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler

from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
import os
import sys

## Считываем данные в dataframe

In [None]:
# Resolve the training CSV under the current user's home directory, then load it
train_csv_path = os.path.expanduser('~/cat_price_predict/data/train/train_set.csv')
df = pd.read_csv(train_csv_path)
df.head()

Unnamed: 0,id,url,region,region_url,price,year,manufacturer,model,fuel,odometer,title_status,transmission,image_url,description,state,lat,long,posting_date,price_category
0,7308295377,https://chattanooga.craigslist.org/ctd/d/chatt...,chattanooga,https://chattanooga.craigslist.org,54990,2020.0,ram,2500 crew cab big horn,diesel,27442.0,clean,other,https://images.craigslist.org/00N0N_1xMPvfxRAI...,Carvana is the safer way to buy a car During t...,tn,35.06,-85.25,2021-04-17T12:30:50-0400,high
1,7316380095,https://newjersey.craigslist.org/ctd/d/carlsta...,north jersey,https://newjersey.craigslist.org,16942,2016.0,ford,explorer 4wd 4dr xlt,,60023.0,clean,automatic,https://images.craigslist.org/00x0x_26jl9F0cnL...,***Call Us for more information at: 201-635-14...,nj,40.821805,-74.061962,2021-05-03T15:40:21-0400,medium
2,7313733749,https://reno.craigslist.org/ctd/d/atlanta-2017...,reno / tahoe,https://reno.craigslist.org,35590,2017.0,volkswagen,golf r hatchback,gas,14048.0,clean,other,https://images.craigslist.org/00y0y_eeZjWeiSfb...,Carvana is the safer way to buy a car During t...,ca,33.779214,-84.411811,2021-04-28T03:52:20-0700,high
3,7308210929,https://fayetteville.craigslist.org/ctd/d/rale...,fayetteville,https://fayetteville.craigslist.org,14500,2013.0,toyota,rav4,gas,117291.0,clean,automatic,https://images.craigslist.org/00606_iGe5iXidib...,2013 Toyota RAV4 XLE 4dr SUV Offered by: R...,nc,35.715954,-78.655304,2021-04-17T10:08:57-0400,medium
4,7316474668,https://newyork.craigslist.org/lgi/cto/d/baldw...,new york city,https://newyork.craigslist.org,21800,2021.0,nissan,altima,gas,8000.0,clean,automatic,https://images.craigslist.org/00V0V_3pSOiPZ3Sd...,2021 Nissan Altima Sv with Only 8 K Miles Titl...,ny,40.6548,-73.6097,2021-05-03T18:32:06-0400,medium


## Подготовка данных

### Обработка пропусков в данных

In [None]:
# Remove columns that are not needed as model inputs
# NOTE(review): 'price' is dropped too — presumably because the target
# price_category is derived from it; confirm.

columns_to_drop = [
    'id', 'url', 'region', 'region_url',
    'price', 'manufacturer',
    'image_url', 'description', 'posting_date',
    'lat', 'long',
]

df.drop(columns=columns_to_drop, inplace=True)

In [None]:
# Are there missing values? Count them per column, smallest counts first

missing_counts = df.isna().sum()
missing_counts.sort_values()

state               0
price_category      0
year               36
odometer           42
transmission       45
fuel               63
model             128
title_status      166
dtype: int64

In [None]:
# Names of all numerical features in the dataset
numerical = df.select_dtypes(include=['int64', 'float64']).columns

# Names of all categorical (object-typed) features in the dataset
# NOTE(review): this selection also picks up the target column price_category
# if it is object-typed; it has no gaps in this dataset, so it is unaffected.
categorical = df.select_dtypes(include=['object']).columns

# Fill gaps in categorical features with the most frequent value.
# The result is assigned back to the column: calling fillna(inplace=True) on
# df[feat] is chained assignment, which newer pandas warns about and which
# does not update df when Copy-on-Write is enabled.
for feat in categorical:
    modes = df[feat].mode()
    if not modes.empty:  # an all-NaN column has no mode; leave it untouched
        df[feat] = df[feat].fillna(modes.iloc[0])

# Fill gaps in numerical features with the median
for feat in numerical:
    df[feat] = df[feat].fillna(df[feat].median())

# Verify that no missing values remain
df.isnull().sum()

year              0
model             0
fuel              0
odometer          0
title_status      0
transmission      0
state             0
price_category    0
dtype: int64

### Удаление выбросов

In [None]:
# Summary statistics of the year column before outlier handling
df['year'].describe()

count    10000.00000
mean      2010.93540
std          9.65541
min       1915.00000
25%       2008.00000
50%       2013.00000
75%       2017.00000
max       2022.00000
Name: year, dtype: float64

In [None]:
def calculate_outliers(data, whisker=1.5):
    """Return the (lower, upper) IQR fences for ``data``.

    The fences are ``Q1 - whisker * IQR`` and ``Q3 + whisker * IQR``, where
    Q1/Q3 are the 25th/75th percentiles and ``IQR = Q3 - Q1``. Values outside
    this interval are the candidates for outlier treatment.

    Args:
        data: Object exposing ``quantile(q)``, e.g. a pandas Series.
        whisker: Multiplier applied to the IQR. Defaults to 1.5, which
            reproduces the previously hard-coded behaviour.

    Returns:
        A ``(lower, upper)`` tuple of boundary values.
    """
    q25, q75 = data.quantile(0.25), data.quantile(0.75)
    iqr = q75 - q25

    return (q25 - whisker * iqr, q75 + whisker * iqr)

# Compute the IQR-based bounds for the year column and display them
boundaries = calculate_outliers(df['year'])
boundaries

(1994.5, 2030.5)

In [None]:
# Cap out-of-range years at the (rounded) boundary values.
# Python's round() rounds halves to the nearest even integer, so a bound of
# 1994.5 becomes 1994 and 2030.5 becomes 2030.
lower_bound, upper_bound = boundaries

year_values = df['year']
df['year'] = year_values.mask(year_values < lower_bound, round(lower_bound))

# The upper check runs on the already lower-capped column, as before
year_values = df['year']
df['year'] = year_values.mask(year_values > upper_bound, round(upper_bound))

df['year'].describe()

count    10000.000000
mean      2011.674200
std          6.505727
min       1994.000000
25%       2008.000000
50%       2013.000000
75%       2017.000000
max       2022.000000
Name: year, dtype: float64

## Feature engineering


In [None]:
def short_model(x):
    """Return the lower-cased first word of a model name.

    Missing values (``None``/NaN) are returned unchanged. Splitting is done on
    any run of whitespace, so leading or repeated spaces do not produce an
    empty first token. A value with no words at all yields ``''``.

    Args:
        x: Model name; non-string values are converted with ``str()``.

    Returns:
        The first word in lower case, ``''`` if there is none, or ``x`` itself
        when it is missing.
    """
    if pd.isna(x):
        return x

    words = str(x).lower().split()
    return words[0] if words else ''

def _age_category(year):
    """Bucket a year: above 2013 is 'new', below 2006 is 'old', otherwise 'average'."""
    if year > 2013:
        return 'new'
    if year < 2006:
        return 'old'
    return 'average'


# Add the "short_model" feature: the first word of the model column
df['short_model'] = df['model'].apply(short_model)

# Add the "age_category" feature (age bucket derived from year)
df['age_category'] = df['year'].apply(_age_category)

In [None]:
# Dataset dimensions (rows, columns) after adding the engineered features
df.shape

(10000, 10)

In [None]:
# Encode the categorical features with OneHotEncoder

columns_to_encode = [
    'fuel',
    'title_status',
    'transmission',
    'state',
    'short_model',
    'age_category'
]

# The encoder keeps its default sparse output and the result is densified with
# .toarray(): the `sparse=` keyword was renamed to `sparse_output=` in
# scikit-learn 1.2 and later removed, so not passing it works on both sides
# of that change.
encoder = OneHotEncoder(handle_unknown='ignore')

# Fit on a plain array (no column names) so the generated feature names keep
# the positional "x<i>_<category>" form.
categories_array = df[columns_to_encode].to_numpy()
encoded = encoder.fit_transform(categories_array).toarray()

# get_feature_names() was removed in scikit-learn 1.2;
# get_feature_names_out() is its replacement.
encoded_df = pd.DataFrame(
    encoded,
    columns=encoder.get_feature_names_out(),
    index=df.index,
)

# Attach all indicator columns with a single concat instead of inserting them
# one by one. Previously added indicator columns are dropped first so that
# re-running this cell does not duplicate them.
df = pd.concat(
    [df.drop(columns=encoded_df.columns, errors='ignore'), encoded_df],
    axis=1,
)
df.head()

Unnamed: 0,year,model,fuel,odometer,title_status,transmission,state,price_category,short_model,age_category,...,x4_yukon,x4_z,x4_z3,x4_z4,x4_zephyr,x4_zx2,x4_♿,x5_average,x5_new,x5_old
0,2020.0,2500 crew cab big horn,diesel,27442.0,clean,other,tn,high,2500,new,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1,2016.0,explorer 4wd 4dr xlt,gas,60023.0,clean,automatic,nj,medium,explorer,new,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2,2017.0,golf r hatchback,gas,14048.0,clean,other,ca,high,golf,new,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
3,2013.0,rav4,gas,117291.0,clean,automatic,nc,medium,rav4,average,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
4,2021.0,altima,gas,8000.0,clean,automatic,ny,medium,altima,new,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [None]:
# Scale the numerical feature: standardise odometer into odometer_std

scaler = StandardScaler()

odometer_frame = df[['odometer']]
scaler.fit(odometer_frame)
df['odometer_std'] = scaler.transform(odometer_frame)

In [None]:
# Preview the frame with the odometer_std column added
df.head()

Unnamed: 0,year,model,fuel,odometer,title_status,transmission,state,price_category,short_model,age_category,...,x4_z,x4_z3,x4_z4,x4_zephyr,x4_zx2,x4_♿,x5_average,x5_new,x5_old,odometer_std
0,2020.0,2500 crew cab big horn,diesel,27442.0,clean,other,tn,high,2500,new,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,-1.044178
1,2016.0,explorer 4wd 4dr xlt,gas,60023.0,clean,automatic,nj,medium,explorer,new,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,-0.527649
2,2017.0,golf r hatchback,gas,14048.0,clean,other,ca,high,golf,new,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,-1.256522
3,2013.0,rav4,gas,117291.0,clean,automatic,nc,medium,rav4,average,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.380259
4,2021.0,altima,gas,8000.0,clean,automatic,ny,medium,altima,new,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,-1.352405


In [None]:
# Drop the raw source columns now that the derived features are in place
columns_to_drop = [
    'year', 'model',
    'fuel', 'odometer', 'title_status', 'transmission', 'state',
    'short_model', 'age_category',
]

df.drop(columns=columns_to_drop, inplace=True)

In [None]:
# Preview the frame after dropping the raw feature columns
df.head()

Unnamed: 0,price_category,x0_diesel,x0_electric,x0_gas,x0_hybrid,x0_other,x1_clean,x1_lien,x1_missing,x1_parts only,...,x4_z,x4_z3,x4_z4,x4_zephyr,x4_zx2,x4_♿,x5_average,x5_new,x5_old,odometer_std
0,high,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,-1.044178
1,medium,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,-0.527649
2,high,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,-1.256522
3,medium,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.380259
4,medium,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,-1.352405


## Моделирование

In [None]:
# Fixed seed so the randomised estimators give repeatable scores between runs
RANDOM_STATE = 42

# Feature matrix and target
X = df.drop(['price_category'], axis=1)
y = df['price_category']

# NOTE(review): the imputation, encoder and scaler in the earlier cells were
# fitted on the full dataset before cross-validation, so fold scores may be
# slightly optimistic; wrapping preprocessing in a Pipeline would avoid that.
models = [
    LogisticRegression(solver='liblinear', random_state=RANDOM_STATE),
    RandomForestClassifier(random_state=RANDOM_STATE),
    SVC()
]

# 4-fold cross-validated accuracy for each candidate model
for m in models:
    score = cross_val_score(m, X, y, cv=4, scoring='accuracy')
    print(f'model: {type(m).__name__}, acc_mean: {score.mean():.4f}, acc_std: {score.std():.4f}')

model: LogisticRegression, acc_mean: 0.7401, acc_std: 0.0076
model: RandomForestClassifier, acc_mean: 0.7559, acc_std: 0.0068
model: SVC, acc_mean: 0.7611, acc_std: 0.0073
