In [56]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import xgboost
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error

In [46]:
df = pd.read_csv("../datasets/cars.csv")
df.head()

Unnamed: 0,manufacturer_name,model_name,transmission,color,odometer_value,year_produced,engine_fuel,engine_has_gas,engine_type,engine_capacity,...,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,feature_8,feature_9,duration_listed
0,Subaru,Outback,automatic,silver,190000,2010,gasoline,False,gasoline,2.5,...,True,True,True,False,True,False,True,True,True,16
1,Subaru,Outback,automatic,blue,290000,2002,gasoline,False,gasoline,3.0,...,True,False,False,True,True,False,False,False,True,83
2,Subaru,Forester,automatic,red,402000,2001,gasoline,False,gasoline,2.5,...,True,False,False,False,False,False,False,True,True,151
3,Subaru,Impreza,mechanical,blue,10000,1999,gasoline,False,gasoline,3.0,...,False,False,False,False,False,False,False,False,False,86
4,Subaru,Legacy,automatic,black,280000,2001,gasoline,False,gasoline,2.5,...,True,False,True,True,False,False,False,False,True,7


In [47]:
print(df.columns)

Index(['manufacturer_name', 'model_name', 'transmission', 'color',
       'odometer_value', 'year_produced', 'engine_fuel', 'engine_has_gas',
       'engine_type', 'engine_capacity', 'body_type', 'has_warranty', 'state',
       'drivetrain', 'price_usd', 'is_exchangeable', 'location_region',
       'number_of_photos', 'up_counter', 'feature_0', 'feature_1', 'feature_2',
       'feature_3', 'feature_4', 'feature_5', 'feature_6', 'feature_7',
       'feature_8', 'feature_9', 'duration_listed'],
      dtype='object')


In [24]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 38531 entries, 0 to 38530
Data columns (total 30 columns):
 #   Column             Non-Null Count  Dtype   
---  ------             --------------  -----   
 0   manufacturer_name  38531 non-null  category
 1   model_name         38531 non-null  category
 2   transmission       38531 non-null  category
 3   color              38531 non-null  category
 4   odometer_value     38531 non-null  int64   
 5   year_produced      38531 non-null  int64   
 6   engine_fuel        38531 non-null  category
 7   engine_has_gas     38531 non-null  bool    
 8   engine_type        38531 non-null  category
 9   engine_capacity    38521 non-null  float64 
 10  body_type          38531 non-null  category
 11  has_warranty       38531 non-null  bool    
 12  state              38531 non-null  category
 13  drivetrain         38531 non-null  category
 14  price_usd          38531 non-null  float64 
 15  is_exchangeable    38531 non-null  bool    
 16  loca

In [48]:
df = df.dropna(subset = 'engine_capacity')

In [43]:
print(list(df.select_dtypes(['object']).columns))

['manufacturer_name', 'model_name', 'transmission', 'color', 'engine_fuel', 'engine_type', 'body_type', 'state', 'drivetrain', 'location_region']


In [44]:
pd.get_dummies(data = df, prefix ='_is', columns = list(df.select_dtypes(['object']).columns))

Unnamed: 0,odometer_value,year_produced,engine_has_gas,engine_capacity,has_warranty,price_usd,is_exchangeable,number_of_photos,up_counter,feature_0,...,_is_owned,_is_all,_is_front,_is_rear,_is_Брестская обл.,_is_Витебская обл.,_is_Гомельская обл.,_is_Гродненская обл.,_is_Минская обл.,_is_Могилевская обл.
0,190000,2010,False,2.5,False,10900.00,False,9,13,False,...,True,True,False,False,False,False,False,False,True,False
1,290000,2002,False,3.0,False,5000.00,True,12,54,False,...,True,True,False,False,False,False,False,False,True,False
2,402000,2001,False,2.5,False,2800.00,True,4,72,False,...,True,True,False,False,False,False,False,False,True,False
3,10000,1999,False,3.0,False,9999.00,True,9,42,True,...,True,True,False,False,False,False,False,False,True,False
4,280000,2001,False,2.5,False,2134.11,True,14,7,False,...,True,True,False,False,False,False,True,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
38526,290000,2000,False,3.5,False,2750.00,True,5,85,False,...,True,False,True,False,False,False,False,False,True,False
38527,321000,2004,False,2.2,False,4800.00,True,4,20,False,...,True,False,True,False,True,False,False,False,False,False
38528,777957,2000,False,3.5,False,4300.00,False,3,63,False,...,True,False,True,False,False,False,False,False,True,False
38529,20000,2001,False,2.0,False,4000.00,True,7,156,False,...,True,False,True,False,True,False,False,False,False,False


In [49]:
for col in list(df.select_dtypes(['object']).columns):
    df[col] = df[col].astype('category')

In [50]:
X = df.drop('price_usd', axis = 1)
y = pd.DataFrame(df['price_usd'])

In [51]:
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size = 0.2, random_state=11)

In [52]:
XGBreg = xgboost.XGBRegressor(n_estimators=300, max_depth= 6, learning_rate=0.3, enable_categorical = True)
XGBreg.fit(X_train, y_train)

In [53]:
y_pred = XGBreg.predict(X_test)

In [54]:
MAE = mean_absolute_error(y_test, y_pred)
print(MAE)

976.8468166442119


In [67]:
np.linspace(0.1, 0.5, 5)

array([0.1, 0.2, 0.3, 0.4, 0.5])

In [71]:
xgboost.set_config(verbosity=3)

In [74]:
params_xgb = {"max_depth": np.arange(2,12),
            "learning_rate": np.linspace(0.1, 0.5, 5),
            "n_estimators": [200, 300, 400, 500, 600]}

xgb = xgboost.XGBRegressor(random_state = 11, enable_categorical = True)
grid_xgb = GridSearchCV(estimator = xgb, param_grid = params_xgb, scoring = 'neg_mean_squared_error', cv = 10, verbose = 10, n_jobs = -1)

In [75]:
grid_xgb.fit(X_train, y_train)
print(grid_xgb.best_params_, grid_xgb.best_score_)

Fitting 10 folds for each of 250 candidates, totalling 2500 fits
[CV 5/10; 1/250] START learning_rate=0.1, max_depth=2, n_estimators=200.........
[CV 3/10; 1/250] START learning_rate=0.1, max_depth=2, n_estimators=200.........
[CV 4/10; 1/250] START learning_rate=0.1, max_depth=2, n_estimators=200.........
[CV 1/10; 1/250] START learning_rate=0.1, max_depth=2, n_estimators=200.........
[CV 6/10; 1/250] START learning_rate=0.1, max_depth=2, n_estimators=200.........
[CV 2/10; 1/250] START learning_rate=0.1, max_depth=2, n_estimators=200.........
[CV 7/10; 1/250] START learning_rate=0.1, max_depth=2, n_estimators=200.........
[CV 8/10; 1/250] START learning_rate=0.1, max_depth=2, n_estimators=200.........
[CV 5/10; 1/250] END learning_rate=0.1, max_depth=2, n_estimators=200;, score=-3256771.839 total time=   0.5s
[CV 1/10; 1/250] END learning_rate=0.1, max_depth=2, n_estimators=200;, score=-3602156.426 total time=   0.5s
[CV 3/10; 1/250] END learning_rate=0.1, max_depth=2, n_estimators=2

KeyboardInterrupt: 

In [26]:
df2 = pd.get_dummies(data = df, prefix ='_is', columns = list(df.select_dtypes(['object']).columns))

In [36]:
df2 = df2.dropna(subset = 'engine_capacity')

In [38]:
X2 = df2.drop('price_usd', axis = 1)
y2 = pd.DataFrame(df2['price_usd'])

In [39]:
X_train2, X_test2, y_train2, y_test2 = train_test_split(X2,y2, test_size = 0.2, random_state=11)

In [29]:
X_train2.head()

Unnamed: 0,odometer_value,year_produced,engine_has_gas,engine_capacity,has_warranty,is_exchangeable,number_of_photos,up_counter,feature_0,feature_1,...,_is_owned,_is_all,_is_front,_is_rear,_is_Брестская обл.,_is_Витебская обл.,_is_Гомельская обл.,_is_Гродненская обл.,_is_Минская обл.,_is_Могилевская обл.
33810,612345,1993,False,1.3,False,False,10,19,True,False,...,True,False,True,False,False,False,False,False,False,True
16086,250000,2000,False,1.8,False,False,11,1,False,True,...,True,False,True,False,False,True,False,False,False,False
28711,168000,2010,False,2.0,False,True,6,2,False,True,...,True,False,False,True,False,False,False,False,True,False
21695,285000,2006,False,2.0,False,False,6,1,False,False,...,True,False,True,False,True,False,False,False,False,False
27988,250000,1992,False,1.6,False,False,6,108,False,False,...,True,False,False,True,False,True,False,False,False,False


In [40]:
linreg = LinearRegression()
linreg.fit(X_train2, y_train2)

In [41]:
y_lin_pred = linreg.predict(X_test2)
MAE_lin = mean_absolute_error(y_test2, y_lin_pred)
print(MAE_lin)

1599.402020873425
