In [1]:
%matplotlib inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import seaborn as sns
import datetime
import re

from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder
from sklearn.linear_model import LogisticRegression, LinearRegression, Ridge
from sklearn.model_selection import train_test_split, cross_val_score, cross_val_predict, KFold
from sklearn.grid_search import GridSearchCV
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor



# Задание

С помощью генерации признаков получить rmse меньше 5000 на тестовой части выборки.

Постарайтесь получить такой результат сначала с помощью **RandomForestRegressor**, а затем и с помощью **Ridge**.

Если с помощью признаков меньше 5000 не получается, можно попробовать улучшить результат с помощью обучения других моделей, подбора параметров и усреднения предсказаний разных моделей.

Для генерации признаков вам сможет помочь код из последнего семинара.

# предобработаем данные

In [2]:
data = pd.read_csv('MotorcycleData.csv', encoding='cp1252', sep=',', )
data.drop('Unnamed: 0', axis=1, inplace=True)

f = lambda x: re.sub('[,$]', '', x)
data.Price = data.Price.map(f).astype(int)
data.Mileage = data.Mileage.astype('str').map(f)
data[['Mileage', 'Feedback_Perc', 'Watch_Count']] = data[['Mileage', 'Feedback_Perc', 'Watch_Count']].apply(pd.to_numeric, errors = 'coerce')
data.Make = data.Make.astype('str').map(lambda x: re.sub('®', '', x)).str.upper()
data.Vehicle_Tile = data.Vehicle_Tile.fillna('NAN')

data.head(5)

Unnamed: 0,Condition,Condition_Desc,Price,Location,Model_Year,Mileage,Exterior_Color,Make,Warranty,Model,...,Vehicle_Title,OBO,Feedback_Perc,Watch_Count,N_Reviews,Seller_Status,Vehicle_Tile,Auction,Buy_Now,Bid_Count
0,Used,mint!!! very low miles,11412,"McHenry, Illinois, United States",2013.0,16000.0,Black,HARLEY-DAVIDSON,Unspecified,Touring,...,,False,8.1,,2427,Private Seller,Clear,True,False,28.0
1,Used,Perfect condition,17200,"Fort Recovery, Ohio, United States",2016.0,60.0,Black,HARLEY-DAVIDSON,Vehicle has an existing warranty,Touring,...,,False,100.0,17.0,657,Private Seller,Clear,True,True,0.0
2,Used,,3872,"Chicago, Illinois, United States",1970.0,25763.0,Silver/Blue,BMW,Vehicle does NOT have an existing warranty,R-Series,...,,False,100.0,,136,,Clear,True,False,26.0
3,Used,CLEAN TITLE READY TO RIDE HOME,6575,"Green Bay, Wisconsin, United States",2009.0,33142.0,Red,HARLEY-DAVIDSON,,Touring,...,,False,100.0,,2920,Dealer,Clear,True,False,11.0
4,Used,,10000,"West Bend, Wisconsin, United States",2012.0,17800.0,Blue,HARLEY-DAVIDSON,NO WARRANTY,Touring,...,,False,100.0,13.0,271,OWNER,Clear,True,True,0.0


# разобьем их на трейн и тест

часть выборки мы сохраним и будем использовать как тестовую выборку

In [11]:
target = 'Price'

skf = KFold(shuffle=True, random_state=0)
for itr, ite in skf.split(data):
    break

y = data[target].values
y_train = data.loc[itr, target].values
y_test = data.loc[ite, target].values
data.loc[ite, target] = np.nan

print(len(itr), len(ite))

4995 2498


# генерация признаков

Основные изменения в коде должны произойти в этом блоке.

**Краткое напоминание** :

0. Числовые признаки
    - для линейных моделей признаки стоит масштабировать
1. Категориальные признаки
    - для линейных моделей лучше сделать one-hot-encoding
    - для деревьев лучше сделать LabelEncoding
    - для категориальных признаков высокой размерности (>10) можно сделать кодирование средним значением таргета. Это облегчит работу и для деревьев, и для линейных моделей
2. Время и координаты
    - оба типа признаков обладают некоторым внутренним порядком (два момента времени можно сравнить, широту/долготу тоже)
    - оба типа признаков позволяют посчитать расстояния между объектами

In [4]:
xcols = [c for c in data if c != target]
print(xcols)

['Condition', 'Condition_Desc', 'Location', 'Model_Year', 'Mileage', 'Exterior_Color', 'Make', 'Warranty', 'Model', 'Sub_Model', 'Type', 'Vehicle_Title', 'OBO', 'Feedback_Perc', 'Watch_Count', 'N_Reviews', 'Seller_Status', 'Vehicle_Tile', 'Auction', 'Buy_Now', 'Bid_Count']


In [5]:
X = data[xcols] #.select_dtypes(['int', 'float']).fillna(0)
for c in xcols: 
    if X[c].dtype == "int" or X[c].dtype == "float": X[c] = X[c].fillna(0)
X[:5]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until


Unnamed: 0,Condition,Condition_Desc,Location,Model_Year,Mileage,Exterior_Color,Make,Warranty,Model,Sub_Model,...,Vehicle_Title,OBO,Feedback_Perc,Watch_Count,N_Reviews,Seller_Status,Vehicle_Tile,Auction,Buy_Now,Bid_Count
0,Used,mint!!! very low miles,"McHenry, Illinois, United States",2013.0,16000.0,Black,HARLEY-DAVIDSON,Unspecified,Touring,street glide custom,...,0.0,False,8.1,0.0,2427,Private Seller,Clear,True,False,28.0
1,Used,Perfect condition,"Fort Recovery, Ohio, United States",2016.0,60.0,Black,HARLEY-DAVIDSON,Vehicle has an existing warranty,Touring,Street Glide,...,0.0,False,100.0,17.0,657,Private Seller,Clear,True,True,0.0
2,Used,,"Chicago, Illinois, United States",1970.0,25763.0,Silver/Blue,BMW,Vehicle does NOT have an existing warranty,R-Series,R75/5 SWB,...,0.0,False,100.0,0.0,136,,Clear,True,False,26.0
3,Used,CLEAN TITLE READY TO RIDE HOME,"Green Bay, Wisconsin, United States",2009.0,33142.0,Red,HARLEY-DAVIDSON,,Touring,ultra classic,...,0.0,False,100.0,0.0,2920,Dealer,Clear,True,False,11.0
4,Used,,"West Bend, Wisconsin, United States",2012.0,17800.0,Blue,HARLEY-DAVIDSON,NO WARRANTY,Touring,STREET GLIDE,...,0.0,False,100.0,13.0,271,OWNER,Clear,True,True,0.0


In [6]:
X['Model_Year'] = X['Model_Year'].where(X['Model_Year']!=0, np.nan)
X['Vehicle_Life'] = datetime.datetime.now().year - X['Model_Year']
X['Vehicle_Life'] = X['Vehicle_Life'].fillna(0)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [7]:
X.select_dtypes(['O']).apply(pd.Series.nunique)

Condition            2
Condition_Desc    1184
Location          2026
Exterior_Color     869
Make                54
Warranty            70
Model             1750
Sub_Model         1207
Type               230
Seller_Status       29
Vehicle_Tile         5
dtype: int64

In [19]:
feature_to_encode = 'Make'
mean_values = pd.DataFrame(pd.DataFrame({'F' : X['Make'], 'y' : y}).groupby('F').y.mean())

X.merge(mean_values, left_on='Make', right_on='F', how='inner')

KeyError: 'F'

In [8]:
le1 = LabelEncoder().fit(X['Condition'])
X['Condition_enc'] = le1.transform(X['Condition'])

le2 = LabelEncoder().fit(X['Make'])
X['Make_enc'] = le2.transform(X['Make'])

le3 = LabelEncoder().fit(X['Vehicle_Tile'])
X['Vehicle_Tile_enc'] = le3.transform(X['Vehicle_Tile'])

In [9]:
X.Seller_Status.unique()

array(['Private Seller', nan, 'Dealer', 'OWNER',
       'Belvidere Police Department', 'owner', 'Owner', 'Peter Root',
       'Carrigan Motor Group', 'By owner', 'SHORELINE HARLEY -  DAVIDSON',
       'JOHNNY RAY RICHLAND    TRIKE ON AMERICA', 'onwer', 'private',
       'original owners son', 'BikesforDutch', 'Pawnbroker', 'Ricky',
       'factory', 'manufacturer', 'First Owner', 'Private Owner',
       'Original Owner', 'Customs by Barry, 40+ years successful building',
       'Private Party Seller', 'wholesale', 'First owner', 'By Owner',
       '220000', 'Private owner'], dtype=object)

In [10]:
mask1 = X.Seller_Status.astype('str').isin(['Private Seller', 'OWNER', 'owner', 'Owner', 'By owner', 'onwer', 'private',
                                            'Private Owner', 'Original Owner', 'First owner', 'By Owner', 'Private owner',
                                            'Private Party Seller'])
mask2 = X.Seller_Status.astype('str').isin(['Dealer', 'Peter Root', 'Carrigan Motor Group', 'SHORELINE HARLEY -  DAVIDSON',
                                            'JOHNNY RAY RICHLAND    TRIKE ON AMERICA', 'BikesforDutch', 'Pawnbroker', 'Ricky', 
                                            'Customs by Barry, 40+ years successful building', 'wholesale'])
mask3 = X.Seller_Status.astype('str').isin(['factory', 'manufacturer'])
mask4 = pd.isnull(X['Seller_Status'])

X['Seller_Status_adj'] = np.nan
X['Seller_Status_adj'][mask1] = 'Owner'
X['Seller_Status_adj'][mask2] = 'Dealer'
X['Seller_Status_adj'][mask3] = 'Manufacturer'
X['Seller_Status_adj'][mask4] = 'NAN'
X['Seller_Status_adj'][~mask1 & ~mask2 & ~mask3 & ~mask4] = 'Other'

le4 = LabelEncoder().fit(X['Seller_Status_adj'])
X['Seller_Status_enc'] = le4.transform(X['Seller_Status_adj'])

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  # This is added back by InteractiveShellApp.init_path()
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if sys.path[0] == '':
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  del sys.path[0]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs

# валидация

In [61]:
X_train = X.drop(['Model_Year', 'Feedback_Perc'], axis=1).select_dtypes(['int', 'float', 'int64']).loc[itr]
X_test  = X.drop(['Model_Year', 'Feedback_Perc'], axis=1).select_dtypes(['int', 'float', 'int64']).loc[ite]

In [62]:
X_train[:5]

Unnamed: 0,Mileage,Vehicle_Title,Watch_Count,N_Reviews,Bid_Count,Condition_enc,Make_enc,Vehicle_Tile_enc,Seller_Status_enc,Vehicle_Life
0,16000.0,0.0,0.0,2427,28.0,1,26,0,4,5.0
2,25763.0,0.0,0.0,136,26.0,1,10,0,2,48.0
3,33142.0,0.0,0.0,2920,11.0,1,26,0,0,9.0
5,0.0,0.0,0.0,412,1.0,1,26,0,4,46.0
7,17868.0,0.0,0.0,1159,6.0,1,26,0,4,43.0


In [63]:
model = RandomForestRegressor()
# model = Ridge()

score_mse = -cross_val_score(model, X_train, y_train, scoring='neg_mean_squared_error')
score_rmse = score_mse ** 0.5
print(np.mean(score_rmse))

7619.99638795


# проверка на тестовых данных
rmse на тестовой выборке должно получиться меньше 5000

In [64]:
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

def rmse(y_true, y_pred):
    return np.mean((y_true - y_pred) ** 2) ** 0.5

rmse(y_test, y_pred)

6290.2875192969086