In [76]:
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import xgboost as xgb
from sklearn import metrics

%matplotlib inline
warnings.filterwarnings('ignore')

In [77]:
# Load the raw listings table; the path is relative to the notebook's working directory.
data = pd.read_csv(filepath_or_buffer='true_car_listings.csv')

# Dealing with outliers

In [78]:
# Outlier handling for Price.
# NOTE: despite their names, Q1/Q3 hold the 10th and 90th percentiles (not the
# quartiles), so "IQR" here is the 10-90 percentile spread.
sort_price = sorted(data["Price"])
Q1 = np.percentile(sort_price, 10)
Q3 = np.percentile(sort_price, 90)
print(Q1, Q3)
IQR = Q3 - Q1
IQR

9000.0 35998.0


26998.0

In [79]:
# Fences sit 1.5 spreads outside the two percentile cut points computed above.
spread_margin = 1.5 * IQR
lower_limit = Q1 - spread_margin
upper_limit = Q3 + spread_margin
(lower_limit, upper_limit)

(-31497.0, 76495.0)

In [80]:
# Keep only rows whose Price lies inside the fences (both ends inclusive).
data = data[data["Price"].between(lower_limit, upper_limit)]
data.shape

(848250, 8)

In [81]:
# Outlier handling for Mileage, same recipe as for Price.
# NOTE: Q1/Q3 are the 10th and 90th percentiles, so IQR is the 10-90 spread.
sort_mileage = sorted(data["Mileage"])
Q1 = np.percentile(sort_mileage, 10)
Q3 = np.percentile(sort_mileage, 90)
print(Q1, Q3)
IQR = Q3 - Q1
print(IQR)
spread_margin = 1.5 * IQR
lower_limit = Q1 - spread_margin
upper_limit = Q3 + spread_margin
(lower_limit, upper_limit)


12327.0 109776.09999999998
97449.09999999998


(-133846.64999999997, 255949.74999999994)

In [82]:
# Keep only rows whose Mileage lies inside the fences (both ends inclusive).
data = data[data["Mileage"].between(lower_limit, upper_limit)]
data.shape

(847166, 8)

# Dealing with categorical values

In [83]:
# Holds the codes produced by the most recent mapping() call. Kept at module
# level because later cells read it directly via `Series.map(dicta)`.
dicta = dict()


def mapping(x):
  """Build a frequency-rank encoding for a categorical Series.

  The most frequent value gets code 1, the next most frequent 2, and so on.

  Parameters
  ----------
  x : pandas.Series
      Column whose distinct values should be encoded.

  Returns
  -------
  dict
      A new ``{value: rank}`` dict covering only the values of ``x``.

  Side effects
  ------------
  The module-level ``dicta`` is reset to the same contents, so existing
  callers that do ``mapping(col); col.map(dicta)`` keep working. Previously
  the shared dict was never cleared, so every call also carried the codes of
  all columns encoded before it.
  """
  ordered_values = x.value_counts().sort_values(ascending=False).index
  ranks = {value: rank for rank, value in enumerate(ordered_values, start=1)}
  # Mirror into the shared dict, dropping entries left over from earlier columns.
  dicta.clear()
  dicta.update(ranks)
  return ranks

In [None]:
# Replace each city name with its frequency rank (1 = most common).
data['City'] = data['City'].map(mapping(data['City']))

In [None]:
# Replace each state with its frequency rank (1 = most common).
data['State'] = data['State'].map(mapping(data['State']))

In [None]:
# Replace each make with its frequency rank (1 = most common).
data['Make'] = data['Make'].map(mapping(data['Make']))

In [None]:
# Replace each model name with its frequency rank (1 = most common).
data['Model'] = data['Model'].map(mapping(data['Model']))

In [None]:
# Drop the Vin column, then remove duplicated rows.
# keep=False discards every member of a duplicate group, not just the repeats.
data = data.drop(columns=["Vin"]).drop_duplicates(keep=False)

# Building Model


In [None]:
# Target is Price; every remaining column is a feature.
Y = data['Price']
X = data.drop(columns=['Price'])

In [None]:
from sklearn.model_selection import train_test_split

# 70/30 split on the pandas objects. The following cell repeats the split on
# the underlying arrays with the same seed and rebinds these four names.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    train_size=0.7,
    random_state=42,
)

In [None]:
# 70/30 split on the raw arrays (column labels are dropped by .values).
X_train, X_test, Y_train, Y_test = train_test_split(
    X.values,
    Y.values,
    train_size=0.7,
    random_state=42,
)

# Gradient Boosting Regression

In [65]:
from sklearn.ensemble import GradientBoostingRegressor

# Baseline fit with a single boosting stage (n_estimators=1).
gb = GradientBoostingRegressor(
    learning_rate=0.16,
    max_depth=8,
    min_samples_leaf=10,
    n_estimators=1,
    random_state=42,
    verbose=True,
)
gb.fit(X_train, Y_train)

      Iter       Train Loss   Remaining Time 
         1   102926667.9305            0.00s


GradientBoostingRegressor(learning_rate=0.16, max_depth=8, min_samples_leaf=10,
                          n_estimators=1, random_state=42, verbose=True)

In [66]:
# Predictions of the single-stage baseline on the held-out rows.
predict = gb.predict(X=X_test)

In [None]:
# Error report for the single-stage baseline `gb` on the held-out split.
# The summary line previously named the wrong estimator ("XGBRegressor").
mse = metrics.mean_squared_error(Y_test, predict)  # reused for RMSE below
print('MAE:', metrics.mean_absolute_error(Y_test, predict))
print('MSE:', mse)
print('RMSE:', np.sqrt(mse))
print(f'Train Score : {gb.score(X_train, Y_train) * 100:.2f}% and Test Score : {gb.score(X_test, Y_test) * 100:.2f}% using GradientBoostingRegressor.')
print('r2 score:', metrics.r2_score(Y_test, predict))

In [None]:
# Second gradient-boosting model: same tree constraints as the baseline, but
# n_estimators and learning_rate are left at the library defaults.
gb2 = GradientBoostingRegressor(
    random_state=42,
    verbose=True,
    min_samples_leaf=10,
    max_depth=8,
)
gb2.fit(X_train, Y_train)
predict = gb2.predict(X_test)

# Error report; the summary line previously named the wrong estimator ("XGBRegressor").
mse = metrics.mean_squared_error(Y_test, predict)  # reused for RMSE below
print('MAE:', metrics.mean_absolute_error(Y_test, predict))
print('MSE:', mse)
print('RMSE:', np.sqrt(mse))
print(f'Train Score : {gb2.score(X_train, Y_train) * 100:.2f}% and Test Score : {gb2.score(X_test, Y_test) * 100:.2f}% using GradientBoostingRegressor.')
print('r2 score:', metrics.r2_score(Y_test, predict))

In [71]:
import pickle

# Write inside a context manager so the pickle is flushed and the handle is
# closed before the file is reopened below; the handle was previously left
# open, so the reader could see a partially written file.
with open('gb.pkl', 'wb') as file:
    # dump information to that file
    pickle.dump(gb2, file)

# Read handle consumed by the next cell (`pickle.load(model)`).
model = open('gb.pkl','rb')

In [72]:
# `model` is the read handle on 'gb.pkl' opened in the previous cell; the
# `with` block closes it once the estimator is loaded (it was never closed before).
# Only unpickle files you produced yourself: pickle.load can execute arbitrary code.
with model:
    m = pickle.load(model)
y_prediction = m.predict(X_test)
metrics.r2_score(Y_test, y_prediction)

0.8532152006208569

# XGBoost Regression

In [None]:
# Wrap both splits in DMatrix, the data container that xgb.train consumes.
train = xgb.DMatrix(X_train, Y_train)
test = xgb.DMatrix(X_test, Y_test)

# We need to define parameters as dict
# "n_estimators" is deliberately not listed: it is an argument of the
# scikit-learn wrapper, not a native training parameter. With xgb.train the
# number of boosting rounds comes from num_boost_round below, so the old
# "n_estimators": 1000 entry had no effect on training.
params = {
    "colsample_bylevel":0.7,
    "colsample_bynode":1,
    "colsample_bytree":0.4,
    "gamma":0.4,
    "learning_rate":0.3,
    "max_depth":10,
    "min_child_weight":5,
    "n_jobs":8,
    "num_parallel_tree":1,
    "random_state":30,
    "reg_alpha":0.005,
    "reg_lambda":1,
    "subsample":0.8999999999999999
}
# training, we set the early stopping rounds parameter
# Both splits are reported each round via `evals`.
model_xgb = xgb.train(
    params,
    train,
    num_boost_round=100,
    evals=[(train, "train"), (test, "validation")],
    early_stopping_rounds=20,
)

model_xgb.predict(xgb.DMatrix(X_test))


In [None]:
# save to JSON (this is the file that can be loaded back with load_model)
model_xgb.save_model("xgb-dm.json")
# save to text format
# save_model() never writes plain text, whatever the extension; dump_model()
# writes the human-readable tree dump. The dump is for inspection only and
# cannot be loaded back into a Booster.
model_xgb.dump_model("xgb-dm.txt")

In [None]:
# Rebuild a Booster from the JSON file written by the previous cell.
model_xgb_2 = xgb.Booster(model_file="xgb-dm.json")

In [None]:
# Score the reloaded Booster on the held-out rows (Booster.predict takes a DMatrix).
Y_prediction = model_xgb_2.predict(xgb.DMatrix(data=X_test))
print(Y_prediction)
metrics.r2_score(y_true=Y_test, y_pred=Y_prediction)

In [None]:
from xgboost import XGBRegressor

# scikit-learn style wrapper; here n_estimators sets the number of boosting rounds.
xgb_model = XGBRegressor(
    colsample_bylevel=0.7,
    colsample_bynode=1,
    colsample_bytree=0.4,
    gamma=0.4,
    learning_rate=0.3,
    max_depth=10,
    min_child_weight=5,
    n_estimators=1000,
    n_jobs=8,
    num_parallel_tree=1,
    random_state=30,
    reg_alpha=0.005,
    reg_lambda=1,
    subsample=0.8999999999999999,
)
xgb_model.fit(X_train, Y_train)
predict = xgb_model.predict(X_test)

In [None]:
from sklearn import metrics

# Error report for the XGBRegressor predictions on the held-out split.
mae = metrics.mean_absolute_error(Y_test, predict)
mse = metrics.mean_squared_error(Y_test, predict)
print('MAE:', mae)
print('MSE:', mse)
print('RMSE:', np.sqrt(mse))
print(f'Train Score : {xgb_model.score(X_train, Y_train) * 100:.2f}% and Test Score : {xgb_model.score(X_test, Y_test) * 100:.2f}% using XGBRegressor.')
print('r2 score:', metrics.r2_score(Y_test, predict))

In [None]:
# save in JSON format
xgb_model.save_model("xgb.json")
# save in text format
xgb_model.save_model("xgb.txt")

In [None]:
# Fresh wrapper instance, then restore the trained state from the JSON file.
model = XGBRegressor()
model.load_model(fname="xgb.json")

In [None]:
# R^2 of the reloaded XGBRegressor on the held-out rows.
Y_prediction = model.predict(X_test)
metrics.r2_score(y_true=Y_test, y_pred=Y_prediction)

In [None]:
from xgboost import Booster

# Open the file saved through the scikit-learn wrapper as a native Booster.
model_xgb_2 = Booster(model_file="xgb.json")

In [None]:
# A native Booster only predicts on a DMatrix; passing the raw array, as this
# cell did before, raises a TypeError, so wrap X_test first.
Y_prediction = model_xgb_2.predict(xgb.DMatrix(X_test))
metrics.r2_score(Y_test, Y_prediction)