In [35]:
# pandas
import pandas as pd
# numpy
import numpy as np
# Scikit-learn 
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn import ensemble
from sklearn.metrics import mean_absolute_error
from sklearn.preprocessing import Normalizer
from sklearn.metrics import confusion_matrix
# matplotlib
import matplotlib.pyplot as plt
# models
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn import neighbors

In [15]:
# Read file data
cardata = pd.read_csv("c:\\Users\\Sasha\\Desktop\\Py\\Git\\CarPrice\\CarPrice\\DataSet\\cars.csv")

In [16]:
# Check data composition
cardata.head()

Unnamed: 0,manufacturer_name,model_name,transmission,color,odometer_value,year_produced,engine_fuel,engine_has_gas,engine_type,engine_capacity,...,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,feature_8,feature_9,duration_listed
0,Subaru,Outback,automatic,silver,190000,2010,gasoline,False,gasoline,2.5,...,True,True,True,False,True,False,True,True,True,16
1,Subaru,Outback,automatic,blue,290000,2002,gasoline,False,gasoline,3.0,...,True,False,False,True,True,False,False,False,True,83
2,Subaru,Forester,automatic,red,402000,2001,gasoline,False,gasoline,2.5,...,True,False,False,False,False,False,False,True,True,151
3,Subaru,Impreza,mechanical,blue,10000,1999,gasoline,False,gasoline,3.0,...,False,False,False,False,False,False,False,False,False,86
4,Subaru,Legacy,automatic,black,280000,2001,gasoline,False,gasoline,2.5,...,True,False,True,True,False,False,False,False,True,7


In [4]:
# check availabe columns, data types, null values
cardata.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 38531 entries, 0 to 38530
Data columns (total 30 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   manufacturer_name  38531 non-null  object 
 1   model_name         38531 non-null  object 
 2   transmission       38531 non-null  object 
 3   color              38531 non-null  object 
 4   odometer_value     38531 non-null  int64  
 5   year_produced      38531 non-null  int64  
 6   engine_fuel        38531 non-null  object 
 7   engine_has_gas     38531 non-null  bool   
 8   engine_type        38531 non-null  object 
 9   engine_capacity    38521 non-null  float64
 10  body_type          38531 non-null  object 
 11  has_warranty       38531 non-null  bool   
 12  state              38531 non-null  object 
 13  drivetrain         38531 non-null  object 
 14  price_usd          38531 non-null  float64
 15  is_exchangeable    38531 non-null  bool   
 16  location_region    385

In [20]:
corr_matrix = cardata.corr()

In [21]:
# display of correlation between price and features
corr_matrix["price_usd"].sort_values(ascending=False)

price_usd           1.000000
year_produced       0.705511
feature_7           0.498547
feature_3           0.470929
feature_6           0.451714
feature_8           0.449131
feature_5           0.434471
feature_2           0.338166
feature_4           0.336143
number_of_photos    0.316859
engine_capacity     0.296597
has_warranty        0.285532
feature_9           0.266156
feature_1           0.255806
up_counter          0.057382
duration_listed     0.033524
is_exchangeable    -0.000503
engine_has_gas     -0.062528
feature_0          -0.223896
odometer_value     -0.421204
Name: price_usd, dtype: float64

In [14]:
cardata.drop(["feature_5", "feature_2", "feature_4", "number_of_photos", "engine_capacity", "has_warranty", "feature_9", "feature_1", "up_counter", "duration_listed", "is_exchangeable", "engine_has_gas", "feature_0", "odometer_value"], axis=1, inplace=True)


KeyError: "['feature_5' 'feature_2' 'feature_4' 'number_of_photos' 'engine_capacity'\n 'has_warranty' 'feature_9' 'feature_1' 'up_counter' 'duration_listed'\n 'is_exchangeable' 'engine_has_gas' 'feature_0' 'odometer_value'] not found in axis"

In [11]:
cardata.head()

Unnamed: 0,manufacturer_name,model_name,transmission,color,year_produced,engine_fuel,engine_type,body_type,state,drivetrain,price_usd,location_region,feature_3,feature_6,feature_7,feature_8
0,Subaru,Outback,automatic,silver,2010,gasoline,gasoline,universal,owned,all,10900.0,Минская обл.,True,False,True,True
1,Subaru,Outback,automatic,blue,2002,gasoline,gasoline,universal,owned,all,5000.0,Минская обл.,False,False,False,False
2,Subaru,Forester,automatic,red,2001,gasoline,gasoline,suv,owned,all,2800.0,Минская обл.,False,False,False,True
3,Subaru,Impreza,mechanical,blue,1999,gasoline,gasoline,sedan,owned,all,9999.0,Минская обл.,False,False,False,False
4,Subaru,Legacy,automatic,black,2001,gasoline,gasoline,universal,owned,all,2134.11,Гомельская обл.,True,False,False,False


In [22]:
X = cardata[["year_produced", "feature_7", "feature_3", "feature_6"]]
Y = cardata.price_usd
X = pd.get_dummies(X)

In [23]:
X[["feature_7one", "feature_3one", "feature_6one"]] = X[["feature_7", "feature_3", "feature_6"]].astype(int)

In [24]:
X.head()

Unnamed: 0,year_produced,feature_7,feature_3,feature_6,feature_7one,feature_3one,feature_6one
0,2010,True,True,False,1,1,0
1,2002,False,False,False,0,0,0
2,2001,False,False,False,0,0,0
3,1999,False,False,False,0,0,0
4,2001,False,True,False,0,1,0


In [25]:
Y.head()

0    10900.00
1     5000.00
2     2800.00
3     9999.00
4     2134.11
Name: price_usd, dtype: float64

In [26]:
X.drop(["feature_7", "feature_3", "feature_6"], axis=1, inplace=True)

In [27]:
X.head()

Unnamed: 0,year_produced,feature_7one,feature_3one,feature_6one
0,2010,1,1,0
1,2002,0,0,0
2,2001,0,0,0
3,1999,0,0,0
4,2001,0,1,0


In [28]:
X_train_set, X_test_set, Y_train_set, Y_test_set = train_test_split(X,Y, test_size=0.2, random_state=42)

In [29]:
X_train_set.head()

Unnamed: 0,year_produced,feature_7one,feature_3one,feature_6one
37935,2003,0,0,0
28568,1995,0,0,0
24311,2013,1,1,0
19393,2002,0,0,0
2540,2005,0,0,1


In [30]:
gbr = ensemble.GradientBoostingRegressor(n_estimators=250,learning_rate=0.1,max_depth=20,min_samples_split=4,min_samples_leaf=6,max_features=0.6,loss='huber')

In [31]:
gbr.fit(X_train_set,Y_train_set)

GradientBoostingRegressor(loss='huber', max_depth=20, max_features=0.6,
                          min_samples_leaf=6, min_samples_split=4,
                          n_estimators=250)

In [33]:
y_predict = gbr.predict(X_train_set)

In [34]:
mae_train = mean_absolute_error(Y_train_set, y_predict)
print("Training Set Mean Absolute Error : %2f" %mae_train)

Training Set Mean Absolute Error : 2159.297508


In [70]:
round(mean_absolute_error(Y_train_set, y_predict),1)

2159.3

In [42]:
from sklearn.metrics import mean_squared_error
mean_squared_error(Y_train_set, y_predict)

14165652.483384391

In [44]:
from sklearn.metrics import r2_score
r2_score(Y_train_set, y_predict)

0.658049917154192

In [47]:
from sklearn.linear_model import LinearRegression
lr = LinearRegression(normalize=True)

In [48]:
lr.fit(X_train_set,Y_train_set)

LinearRegression(normalize=True)

In [51]:
y_pred = lr.predict(X_train_set)
y_pred

array([ 5453.57624932,  1856.15896531, 12889.91281189, ...,
       12206.50886119, 13339.58997239,  5903.25340982])

In [52]:
mean_absolute_error(Y_train_set, y_pred)

2661.493594737426

In [60]:
from sklearn.model_selection import cross_val_score
print(cross_val_score(gbr, X_train_set, Y_train_set, cv=3, scoring="accuracy"))

[nan nan nan]
