In [2]:
import time

import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from tqdm import tqdm

#libraries for preprocessing
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler

#libraries for evaluation
from sklearn.metrics import mean_squared_log_error,r2_score
from sklearn.model_selection import train_test_split

#libraries for models
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import Lasso
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor

In [3]:
# Read the listings table from the CSV next to the notebook and show it.
data = pd.read_csv(filepath_or_buffer='FinalData.csv')
data

Unnamed: 0,City,Year,Shell,Volume,Mileage,Transmission,Rudder,Gear,CustomsCleared,Type Engine,Company,Model,Price
0,Уральск,2014,внедорожник,1.7,31000,механика,слева,полный привод,Да,бензин,Chevrolet,Niva,2900000
1,Нур-Султан (Астана),2014,седан,3.5,59800,автомат,слева,передний привод,Да,бензин,Kia,Cadenza,8600000
2,Тараз,1995,фургон,2.3,450650,механика,слева,задний привод,Да,дизель,Volkswagen,LT,1100000
3,Алматы,2018,седан,2.5,48000,типтроник,слева,передний привод,Да,бензин,Toyota,Camry,11300000
4,Караганда,1997,внедорожник,3.5,153000,автомат,справа,полный привод,Нет,бензин,Mitsubishi,Pajero,3350000
...,...,...,...,...,...,...,...,...,...,...,...,...,...
16030,Костанай,2014,седан,1.6,160000,механика,слева,передний привод,Нет,бензин,Datsun,on-DO,1550000
16031,Сайхин,2008,внедорожник,1.8,175000,механика,слева,полный привод,Да,бензин,Chevrolet,Niva,1500000
16032,Нур-Султан (Астана),2014,седан,1.5,103000,механика,слева,передний привод,Да,бензин,Daewoo,Gentra,3300000
16033,Алматы,2014,внедорожник,4.0,87000,автомат,слева,полный привод,Да,бензин,Toyota,Land Cruiser Prado,18500000


In [4]:
# Numeric columns (Price is the regression target).
num_col = [
    'Year',
    'Volume',
    'Mileage',
    'Price',
]
# String-valued columns that get label-encoded before modelling.
cat_cols = [
    'City',
    'Shell',
    'Transmission',
    'Rudder',
    'Gear',
    'CustomsCleared',
    'Type Engine',
    'Company',
    'Model',
]

In [5]:
# Print column dtypes and non-null counts to check for missing values.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16035 entries, 0 to 16034
Data columns (total 13 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   City            16035 non-null  object 
 1   Year            16035 non-null  int64  
 2   Shell           16035 non-null  object 
 3   Volume          16035 non-null  float64
 4   Mileage         16035 non-null  int64  
 5   Transmission    16035 non-null  object 
 6   Rudder          16035 non-null  object 
 7   Gear            16035 non-null  object 
 8   CustomsCleared  16035 non-null  object 
 9   Type Engine     16035 non-null  object 
 10  Company         16035 non-null  object 
 11  Model           16035 non-null  object 
 12  Price           16035 non-null  int64  
dtypes: float64(1), int64(3), object(9)
memory usage: 1.6+ MB


In [6]:
# Label-encode every categorical column with its OWN encoder.
# A single shared LabelEncoder is refit on each column, so only the last
# column's mapping would survive; keeping one fitted encoder per column lets
# later code encode new rows or decode predictions (inverse_transform).
label_encoders = {}
for col in cat_cols:
    encoder = preprocessing.LabelEncoder()
    data[col] = encoder.fit_transform(data[col])
    label_encoders[col] = encoder

# Backward compatibility: `le` previously ended up fitted on the last
# categorical column, so keep it bound to that encoder.
le = label_encoders[cat_cols[-1]]

In [7]:
# Show the frame after the categorical columns were replaced by integer codes.
data

Unnamed: 0,City,Year,Shell,Volume,Mileage,Transmission,Rudder,Gear,CustomsCleared,Type Engine,Company,Model,Price
0,167,2014,0,1.7,31000,2,0,2,0,0,12,617,2900000
1,118,2014,11,3.5,59800,0,0,1,0,0,37,232,8600000
2,152,1995,14,2.3,450650,2,0,0,0,4,67,517,1100000
3,15,2018,11,2.5,48000,4,0,1,0,0,66,237,11300000
4,80,1997,0,3.5,153000,0,1,2,1,0,49,635,3350000
...,...,...,...,...,...,...,...,...,...,...,...,...,...
16030,93,2014,11,1.6,160000,2,0,1,1,0,17,933,1550000
16031,134,2008,0,1.8,175000,2,0,2,0,0,12,617,1500000
16032,118,2014,11,1.5,103000,2,0,1,0,0,15,442,3300000
16033,15,2014,0,4.0,87000,0,0,2,0,0,66,526,18500000


In [8]:
# Numeric preprocessing, then outlier removal on the target.
# "2014 1.7 31000" -- presumably a sample raw (Year, Volume, Mileage) input;
# it matches row 0 of the loaded data.
#
# NOTE: this cell overwrites `data` in place (log of Price, scaled columns),
# so re-running it without reloading the CSV transforms the values twice.
# NOTE(review): the scalers are fit on the full data set, before the
# train/test split done later in the notebook -- confirm this is acceptable.

# Model the price on a log scale.
data['Price'] = np.log(data['Price'])

# Standardise each column with its own scaler. A single shared StandardScaler
# is refit on every column and would only remember the last one, which makes
# it impossible to transform new inputs consistently.
scaled_cols = ['Mileage', 'Year', 'Company', 'Model', 'City']
scalers = {}
for col in scaled_cols:
    scaler = StandardScaler()
    data[col] = scaler.fit_transform(data[col].to_numpy().reshape(-1, 1)).ravel()
    scalers[col] = scaler

# Backward compatibility: `norm` previously ended up fitted on 'City'.
norm = scalers[scaled_cols[-1]]

# Remove target outliers: keep rows whose log-price lies within
# 1.5 * IQR of the first and third quartiles (bounds inclusive).
q1,q3=(data['Price'].quantile([0.25,0.75]))
iqr = q3 - q1
o1 = q1 - 1.5 * iqr
o2 = q3 + 1.5 * iqr
# .copy() detaches the filtered rows from the original frame so later column
# assignments do not trigger SettingWithCopyWarning.
data = data[data['Price'].between(o1, o2)].copy()

In [12]:
# Show the frame after log-transforming Price, scaling and outlier filtering.
data

Unnamed: 0,City,Year,Shell,Volume,Mileage,Transmission,Rudder,Gear,CustomsCleared,Type Engine,Company,Model,Price
0,1.251467,0.797898,0,1.7,-0.322426,2,0,2,0,0,-1.772740,0.751854,14.880221
1,0.502110,0.797898,11,3.5,-0.273051,0,0,1,0,0,-0.585937,-0.674261,15.967273
2,1.022072,-1.523983,14,2.3,0.397024,2,0,0,0,4,0.838227,0.381435,13.910821
3,-1.073068,1.286715,11,2.5,-0.293281,4,0,1,0,0,0.790755,-0.655740,16.240313
4,-0.079024,-1.279574,0,3.5,-0.113268,0,1,2,1,0,-0.016271,0.818530,15.024471
...,...,...,...,...,...,...,...,...,...,...,...,...,...
16030,0.119785,0.797898,11,1.6,-0.101267,2,0,1,1,0,-1.535380,1.922380,14.253765
16031,0.746798,0.064673,0,1.8,-0.075551,2,0,2,0,0,-1.772740,0.751854,14.220976
16032,0.502110,0.797898,11,1.5,-0.198989,2,0,1,0,0,-1.630324,0.103620,15.009433
16033,-1.073068,0.797898,0,4.0,-0.226419,0,0,2,0,0,0.790755,0.414772,16.733281


In [104]:
def trainingData(df,n):
    """Split a frame into training and test sets.

    Parameters
    ----------
    df : DataFrame whose last column is the target.
    n : column position(s) passed to ``df.iloc[:, n]`` to select the features.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)`` from a 90% / 10% split with
        ``random_state=0``.
    """
    features = df.iloc[:, n]
    # Last column as a flat 1-D array.
    target = df.iloc[:, -1:].values[:, 0]
    parts = train_test_split(
        features,
        target,
        train_size=0.9,
        test_size=0.1,
        random_state=0,
    )
    return tuple(parts)

# Features are every column except the last one (the target).
X_train, X_test, y_train, y_test = trainingData(data, list(range(data.shape[1] - 1)))

In [105]:
def remove_neg(y_test,y_pred):
    """Drop every sample whose prediction is not strictly positive.

    Some models can predict negative values, which the log-based error
    metric cannot handle, so those samples are removed from both arrays.

    Parameters
    ----------
    y_test : array-like of true values.
    y_pred : array-like of predictions, same length as ``y_test``.

    Returns
    -------
    tuple of numpy.ndarray
        ``(y_test, y_pred)`` restricted to positions where ``y_pred > 0``.

    Raises
    ------
    ValueError
        If the two inputs do not have the same shape.
    """
    # asarray makes the mask positional, so lists and pandas objects work too.
    y_test = np.asarray(y_test)
    y_pred = np.asarray(y_pred)
    if y_test.shape != y_pred.shape:
        raise ValueError("y_test and y_pred must have the same shape")
    keep = y_pred > 0
    return (y_test[keep], y_pred[keep])

def result(y_test,y_pred):
    """Score predictions against the true values.

    Returns a list ``[MSLE, sqrt(MSLE), R2, R2 * 100 rounded to 4 decimals]``.

    NOTE(review): in this notebook Price is already log-transformed before
    modelling, so the MSLE here is taken on log-prices -- confirm intended.
    """
    msle = mean_squared_log_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    return [msle, np.sqrt(msle), r2, round(r2 * 100, 4)]

# Score table: one row per metric; each model adds its own column.
# The 'Accuracy(%)' row holds the R2 score expressed as a percentage.
accu = pd.DataFrame(
    index=[
        'MSLE',
        'Root MSLE',
        'R2 Score',
        'Accuracy(%)',
    ]
)

# Linear Regression

In [106]:
# Ordinary least squares on the training split, then predict the held-out rows.
LR = LinearRegression().fit(X_train, y_train)
y_pred = LR.predict(X_test)

In [107]:
# Keep only strictly positive predictions, then score and record the model.
y_test_1, y_pred_1 = remove_neg(y_test, y_pred)
r1_lr = result(y_test_1, y_pred_1)
lr_msle, lr_rmsle, lr_r2, lr_r2_pct = r1_lr
print('Coefficients: \n', LR.coef_)
print("MSLE : {}".format(lr_msle))
print("Root MSLE : {}".format(lr_rmsle))
print("R2 Score : {} or {}%".format(lr_r2, lr_r2_pct))
accu['Linear Regression'] = r1_lr

Coefficients: 
 [-0.02282903  0.57015953 -0.02138235  0.4000288  -0.01388968 -0.07905333
  0.18207203  0.01003327 -0.62603203  0.01252148 -0.04060911  0.07054514]
MSLE : 0.0008224426139895425
Root MSLE : 0.02867826030270216
R2 Score : 0.7816710622226075 or 78.1671%


# KNN

In [108]:
# k-nearest-neighbours regressor (k = 5) fitted on the training split.
KNN = KNeighborsRegressor(n_neighbors=5).fit(X_train, y_train)
y_pred = KNN.predict(X_test)

In [109]:
# Score the KNN predictions on the full test split and record them.
r4_knn = result(y_test, y_pred)
knn_msle, knn_rmsle, knn_r2, knn_r2_pct = r4_knn
print("MSLE : {}".format(knn_msle))
print("Root MSLE : {}".format(knn_rmsle))
print("R2 Score : {} or {}%".format(knn_r2, knn_r2_pct))
accu['KNN'] = r4_knn

MSLE : 0.0005246158322848827
Root MSLE : 0.022904493713786445
R2 Score : 0.8623020561315176 or 86.2302%


# Random Forest

In [110]:
# Random forest: 180 trees, half of the features tried per split, all cores.
# oob_score=True also computes the out-of-bag score during fitting.
RFR = RandomForestRegressor(
    n_estimators=180,
    random_state=0,
    min_samples_leaf=1,
    max_features=0.5,
    n_jobs=-1,
    oob_score=True,
).fit(X_train, y_train)
y_pred = RFR.predict(X_test)

In [111]:
# Score the random-forest predictions and record them.
r5_rf = result(y_test, y_pred)
rf_msle, rf_rmsle, rf_r2, rf_r2_pct = r5_rf
print("MSLE : {}".format(rf_msle))
print("Root MSLE : {}".format(rf_rmsle))
print("R2 Score : {} or {}%".format(rf_r2, rf_r2_pct))
accu['RandomForest Regressor'] = r5_rf

MSLE : 0.0002409961918974826
Root MSLE : 0.015524052045051981
R2 Score : 0.9393248659999041 or 93.9325%


# Decision Tree Regression

In [122]:
# Single decision-tree regressor fitted on the training split.
# random_state=0 makes the fit reproducible on re-run and matches the seed
# already used for the train/test split and the random forest.
# (DecisionTreeRegressor is imported with the other models at the top.)
DTR = DecisionTreeRegressor(random_state=0)
DTR.fit(X_train,y_train)
y_pred = DTR.predict(X_test)

In [123]:
# Score the decision-tree predictions and record them.
# (The variable keeps its historical `_rf` suffix although it holds the
# decision-tree scores.)
r6_rf = result(y_test, y_pred)
dt_msle, dt_rmsle, dt_r2, dt_r2_pct = r6_rf
print("MSLE : {}".format(dt_msle))
print("Root MSLE : {}".format(dt_rmsle))
print("R2 Score : {} or {}%".format(dt_r2, dt_r2_pct))
accu['Decision Tree Regression'] = r6_rf

MSLE : 0.0005009085756749923
Root MSLE : 0.022380986923614253
R2 Score : 0.8726128624769408 or 87.2613%


# Lasso

In [141]:
# Lasso with a very small L1 penalty (alpha = 1e-6), fitted on the training split.
lasso = Lasso(alpha=1e-6).fit(X_train, y_train)
y_pred = lasso.predict(X_test)

In [140]:
# Keep only strictly positive predictions, then score and record the model.
y_test_3, y_pred_3 = remove_neg(y_test, y_pred)
r3_lasso = result(y_test_3, y_pred_3)
lasso_msle, lasso_rmsle, lasso_r2, lasso_r2_pct = r3_lasso
print("MSLE : {}".format(lasso_msle))
print("Root MSLE : {}".format(lasso_rmsle))
print("R2 Score : {} or {}%".format(lasso_r2, lasso_r2_pct))
accu['Lasso Regression'] = r3_lasso

MSLE : 0.0008224426817044308
Root MSLE : 0.0286782614832983
R2 Score : 0.7816710499523704 or 78.1671%
