In [1]:
#Importing the libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.formula.api as sm
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn import neighbors
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error 

In [2]:
#Read csv file from German to English Language
cars=pd.read_csv('autos.csv',encoding = "ISO-8859-1" )

In [3]:
#Shape of the data
cars.shape

(371528, 20)

In [4]:
#Renaming the column names

cars_col = ['Date_Crawled','Name','Seller','Offer_Type','Price','Ab_test','Vehicle_Type',
           'Year_of_Registration','Gear_box','Power_ps','Model','Kilometer','Month_of_Registration',
           'Fuel_Type','Brand','Not_repaired_damage','Date_Created','No_of_pictures','Postal_Code',
           'Last_Seen']
cars.columns = cars_col

In [5]:
#Drop a column after observing the record difference
#As we can see its a biased data and model will only learn which has the largest value
cars.Seller.value_counts()

privat        371525
gewerblich         3
Name: Seller, dtype: int64

In [6]:
#Now try for offer Type and observe the record difference
cars.Offer_Type.value_counts()

Angebot    371516
Gesuch         12
Name: Offer_Type, dtype: int64

In [7]:
#Now try for Ab_test and observe the record difference
#Here the values are not biased so this will defnitely play an important role in analysis
cars.Ab_test.value_counts()

test       192585
control    178943
Name: Ab_test, dtype: int64

In [8]:
#Now try for Vehicle_Type and observe the record difference
#Here the values are not biased so this is also an important variable for the prediction
cars.Vehicle_Type.value_counts()

limousine     95894
kleinwagen    80023
kombi         67564
bus           30201
cabrio        22898
coupe         19015
suv           14707
andere         3357
Name: Vehicle_Type, dtype: int64

In [9]:
#dropping the following columns
cars.drop(['Seller','Offer_Type','No_of_pictures','Name','Date_Crawled','Date_Created','Last_Seen',
           'Postal_Code','Kilometer'],axis=1, inplace=True)

In [10]:
cars.shape

(371528, 11)

In [11]:
#To check what all the variables have null values
cars.isnull().sum()

Price                        0
Ab_test                      0
Vehicle_Type             37869
Year_of_Registration         0
Gear_box                 20209
Power_ps                     0
Model                    20484
Month_of_Registration        0
Fuel_Type                33386
Brand                        0
Not_repaired_damage      72060
dtype: int64

In [12]:
#Now try for Gear_box and observe the record difference
#Here the values are not biased so this can be an important variable
cars.Gear_box.value_counts()

manuell      274214
automatik     77105
Name: Gear_box, dtype: int64

In [13]:
#Grouping the Brand and based on the gear count will check the gear type with max values
cars.groupby("Brand")["Gear_box"].value_counts().head()

Brand       Gear_box 
alfa_romeo  manuell       2064
            automatik      146
audi        manuell      20841
            automatik    10698
bmw         manuell      25323
Name: Gear_box, dtype: int64

In [14]:
#To fill the missing data of gear box by most occuring geartype with respect to its brand
groups_max=dict()
Brand_names= cars.Brand.unique()
Brand_names

#Through for loop fill the values of the gear box w.r.t to the Brand names
for i in Brand_names:
    gear_values=cars[cars.Brand==i]['Gear_box'].value_counts()
    groups_max[i]=gear_values.index[0]

In [15]:
#Now check if the location of the brand name and gearbox value is null then fill the 
#value with the max count
for i in Brand_names:
    cars.loc[(cars.Brand == i) & (cars.Gear_box.isnull()) ,"Gear_box" ] = groups_max[i]

In [16]:
#check if gear box has any missing values
cars.Gear_box.isnull().sum()

0

In [17]:
#Fill the NA values for the Fuel_type variable with most frequent values
cars.Fuel_Type.value_counts()

#As benzin is most frequently occured filling the NA values with 'benzin'
cars['Fuel_Type'].fillna('benzin',inplace=True)

#Check if any null value in present in Fuel type
cars.Fuel_Type.isnull().sum()

0

In [18]:
#Fill the NA values for the Not_repaired_damage variables with most frquent values
cars.Not_repaired_damage.value_counts()

#As nein is most frequently occured filling the NA values with 'nein'
cars['Not_repaired_damage'].fillna('nein',inplace=True)

#Check if any null value in present in Fuel type
cars.Not_repaired_damage .isnull().sum()

0

In [19]:
#Fill the NA values for the Model variable with most frquent values
cars.Model.value_counts()

#As golf is most frequently occured filling the NA values with 'golf'
cars['Model'].fillna('golf',inplace=True)

#Check if any null value in present in Fuel type
cars.Model.isnull().sum()

0

In [20]:
#Its time to fill the vehicle type before that lets check the unique values 
cars.Vehicle_Type.unique()

array([nan, 'coupe', 'suv', 'kleinwagen', 'limousine', 'cabrio', 'bus',
       'kombi', 'andere'], dtype=object)

In [21]:
#Fill the values with the same logic implemented for the gear type
#Based on the Fuel Type we will fill the vehicle type with the max value counts
group_max = dict()
fuel_names = cars.Fuel_Type.unique()
fuel_names

for i in fuel_names:
    vehicle_val = cars[cars.Fuel_Type==i]['Vehicle_Type'].value_counts()
    group_max[i]=vehicle_val.index[0]

In [22]:
for i in fuel_names:
    cars.loc[(cars.Fuel_Type==i) & (cars.Vehicle_Type.isnull()),"Vehicle_Type"] = group_max[i]

In [23]:
#Check for all the variables if it is null
cars.isnull().sum()

Price                    0
Ab_test                  0
Vehicle_Type             0
Year_of_Registration     0
Gear_box                 0
Power_ps                 0
Model                    0
Month_of_Registration    0
Fuel_Type                0
Brand                    0
Not_repaired_damage      0
dtype: int64

In [24]:
#Dimensions of the dataframe
cars.shape

(371528, 11)

In [25]:
#First few records of the dataframe
cars.head()

Unnamed: 0,Price,Ab_test,Vehicle_Type,Year_of_Registration,Gear_box,Power_ps,Model,Month_of_Registration,Fuel_Type,Brand,Not_repaired_damage
0,480,test,kleinwagen,1993,manuell,0,golf,0,benzin,volkswagen,nein
1,18300,test,coupe,2011,manuell,190,golf,5,diesel,audi,ja
2,9800,test,suv,2004,automatik,163,grand,8,diesel,jeep,nein
3,1500,test,kleinwagen,2001,manuell,75,golf,6,benzin,volkswagen,nein
4,3600,test,kleinwagen,2008,manuell,69,fabia,7,diesel,skoda,nein


In [26]:
#Filling the categorical variable through get_dummies
cars_dummies = pd.get_dummies(cars, columns=['Ab_test','Vehicle_Type','Gear_box','Model',
                                             'Fuel_Type','Brand','Not_repaired_damage'],drop_first=True)
cars_dummies.head()

Unnamed: 0,Price,Year_of_Registration,Power_ps,Month_of_Registration,Ab_test_test,Vehicle_Type_bus,Vehicle_Type_cabrio,Vehicle_Type_coupe,Vehicle_Type_kleinwagen,Vehicle_Type_kombi,...,Brand_skoda,Brand_smart,Brand_sonstige_autos,Brand_subaru,Brand_suzuki,Brand_toyota,Brand_trabant,Brand_volkswagen,Brand_volvo,Not_repaired_damage_nein
0,480,1993,0,0,1,0,0,0,1,0,...,0,0,0,0,0,0,0,1,0,1
1,18300,2011,190,5,1,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
2,9800,2004,163,8,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
3,1500,2001,75,6,1,0,0,0,1,0,...,0,0,0,0,0,0,0,1,0,1
4,3600,2008,69,7,1,0,0,0,1,0,...,1,0,0,0,0,0,0,0,0,1


In [27]:
#Filtering the outliers from the files before that first check the year of registration and Prices values
cars_final = cars_dummies[(cars_dummies.Year_of_Registration < 2020)  & (cars_dummies.Year_of_Registration > 1947)]
cars_final = cars_dummies[(cars_dummies.Price > 500) & (cars_dummies.Price < 150000)]

In [28]:
cars_final.head()

Unnamed: 0,Price,Year_of_Registration,Power_ps,Month_of_Registration,Ab_test_test,Vehicle_Type_bus,Vehicle_Type_cabrio,Vehicle_Type_coupe,Vehicle_Type_kleinwagen,Vehicle_Type_kombi,...,Brand_skoda,Brand_smart,Brand_sonstige_autos,Brand_subaru,Brand_suzuki,Brand_toyota,Brand_trabant,Brand_volkswagen,Brand_volvo,Not_repaired_damage_nein
1,18300,2011,190,5,1,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
2,9800,2004,163,8,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
3,1500,2001,75,6,1,0,0,0,1,0,...,0,0,0,0,0,0,0,1,0,1
4,3600,2008,69,7,1,0,0,0,1,0,...,1,0,0,0,0,0,0,0,0,1
5,650,1995,102,10,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [29]:
#Splitting the data into train and test
cars_final_X = cars_final.iloc[:,1:].values
cars_final_y = cars_final.iloc[:,0].values

X_train, X_test_val, y_train, y_test_val = train_test_split(cars_final_X, cars_final_y, test_size=0.2,
                                                           random_state = 0)

In [42]:
#Feature scaling for Linear regression, Decision Tree
#though feature scaling is not required if you are running XGBoost algorithm
sc=StandardScaler()
X_train= sc.fit_transform(X_train)
X_test_val = sc.transform(X_test_val)

In [43]:
#Implementing the linear regression
lr=LinearRegression()
lr.fit(X_train,y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [44]:
#Prediction of the values
y_pred=lr.predict(X_test_val)
y_pred[0:5]

array([3366.53520977, 2391.77117611, 1228.86026112, 3572.47521958,
       1660.6760413 ])

In [46]:
#Checking with the actual results
y_test_val[0:5]

array([2000, 2100, 3750, 2555, 1800], dtype=int64)

In [47]:
#Implementing the same with the decision tree algorithm
from sklearn.tree import DecisionTreeRegressor
regressor = DecisionTreeRegressor(random_state=0)

#Fit the regressor object to the dataset
regressor.fit(X_train,y_train)

DecisionTreeRegressor(ccp_alpha=0.0, criterion='mse', max_depth=None,
                      max_features=None, max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, presort='deprecated',
                      random_state=0, splitter='best')

In [48]:
#Predicting a new result
y_pred = regressor.predict(X_test_val)

In [49]:
y_pred[0:5]

array([ 999.        , 1697.66666667, 3495.        , 2000.        ,
       1156.66666667])

In [50]:
#Checking with the actual results
y_test_val[0:5]

array([2000, 2100, 3750, 2555, 1800], dtype=int64)

In [52]:
#Applying k fold cross validation - to evaluate the model performance
from sklearn.model_selection import cross_val_score
r2_score = cross_val_score(estimator=regressor,X = X_train, y= y_train, cv=10)
r2_score

array([0.74675698, 0.75827967, 0.73029558, 0.72081465, 0.73099927,
       0.7108417 , 0.73705418, 0.72672834, 0.74337847, 0.68528574])

In [53]:
r2_score.mean() #r2_score mean

0.7290434591586735

In [54]:
r2_score.std() #low variance and low bias

0.019397672792966336

In [33]:
#Fitting XG Boost to the training set
from xgboost import XGBRegressor
regressor1 =  XGBRegressor()
regressor1.fit(X_train,y_train)

XGBRegressor(base_score=0.5, booster=None, colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
             importance_type='gain', interaction_constraints=None,
             learning_rate=0.300000012, max_delta_step=0, max_depth=6,
             min_child_weight=1, missing=nan, monotone_constraints=None,
             n_estimators=100, n_jobs=0, num_parallel_tree=1,
             objective='reg:squarederror', random_state=0, reg_alpha=0,
             reg_lambda=1, scale_pos_weight=1, subsample=1, tree_method=None,
             validate_parameters=False, verbosity=None)

In [37]:
#Prediction on the test set
y_pred = regressor1.predict(X_test_val)
y_pred[0:5]

array([2237.9785, 1685.9021, 2450.9836, 2455.692 , 1198.1559],
      dtype=float32)

In [38]:
y_test_val[0:5]

array([2000, 2100, 3750, 2555, 1800], dtype=int64)

In [39]:
#Applying k fold cross validation - to evaluate the model performance
from sklearn.model_selection import cross_val_score
accuracies = cross_val_score(estimator=regressor1,X = X_train, y= y_train, cv=10)
accuracies

array([0.82669956, 0.84859458, 0.8376044 , 0.8225659 , 0.83465801,
       0.82419618, 0.82901845, 0.82992874, 0.83150153, 0.8259521 ])

In [40]:
accuracies.mean()

0.8310719456005028

In [41]:
accuracies.std()

0.007302350478875902