# Indian Car Data Model Development

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import skops.io as sio
from sklearn.compose import make_column_transformer
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import OneHotEncoder
from xgboost import XGBRegressor

from modelprocessing.functions import getBestModel
from modelprocessing.functions import getBestState

In [2]:
# Load the car listings; index_col=0 makes the first CSV column the row index.
csv_path = '/home/anuraaga/Documents/Projects/Project-PredthePrice/docker/data/indian_car.csv'
df = pd.read_csv(csv_path, index_col=0)
df

Unnamed: 0,name,company,year,Price,kms_driven,fuel_type
0,Hyundai Santro Xing,Hyundai,2007,80000,45000,Petrol
1,Mahindra Jeep CL550,Mahindra,2006,425000,40,Diesel
2,Hyundai Grand i10,Hyundai,2014,325000,28000,Petrol
3,Ford EcoSport Titanium,Ford,2014,575000,36000,Diesel
4,Ford Figo,Ford,2012,175000,41000,Diesel
...,...,...,...,...,...,...
811,Maruti Suzuki Ritz,Maruti,2011,270000,50000,Petrol
812,Tata Indica V2,Tata,2009,110000,30000,Diesel
813,Toyota Corolla Altis,Toyota,2009,300000,132000,Petrol
814,Tata Zest XM,Tata,2018,260000,27000,Diesel


In [3]:
# Count missing values per column.
df.isnull().sum()

name          0
company       0
year          0
Price         0
kms_driven    0
fuel_type     0
dtype: int64

In [4]:
# Summary statistics of the numeric columns, one row per column.
df.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
year,815.0,2012.442945,4.005079,1995.0,2010.0,2013.0,2015.0,2019.0
Price,815.0,401793.33865,381588.817401,30000.0,175000.0,299999.0,490000.0,3100000.0
kms_driven,815.0,46277.096933,34318.459638,0.0,27000.0,41000.0,56879.0,400000.0


In [5]:
# List the column labels of the loaded frame.
df.columns

Index(['name', 'company', 'year', 'Price', 'kms_driven', 'fuel_type'], dtype='object')

In [6]:
# Distinct manufacturer names, in order of first appearance.
pd.unique(df['company'])

array(['Hyundai', 'Mahindra', 'Ford', 'Maruti', 'Skoda', 'Audi', 'Toyota',
       'Renault', 'Honda', 'Datsun', 'Mitsubishi', 'Tata', 'Volkswagen',
       'Chevrolet', 'Mini', 'BMW', 'Nissan', 'Hindustan', 'Fiat', 'Force',
       'Mercedes', 'Land', 'Jaguar', 'Jeep', 'Volvo'], dtype=object)

In [7]:
# Keep only listings priced strictly below 6,000,000 (presumably an outlier cap — confirm).
# NOTE(review): the describe() output earlier in this notebook shows a max Price of
# 3,100,000, so on that data this filter removes no rows.
df = df.loc[df['Price'].lt(6000000)]

In [8]:
# Split the frame into the regression target (Price) and the remaining feature columns.
X = df.drop(columns=['Price'])
y = df.loc[:, 'Price']

In [9]:
# Hold out 20% of the rows for testing; random_state is fixed so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=4,
)

In [10]:
# Helper encoder fitted on the full feature frame X (train and test rows together) so that
# ohe.categories_ covers every category present in the data.
# Only ohe.categories_ is consumed by the column transformer built in the next cell; the
# drop / sparse_output / dtype options set here are not passed on to that encoder.
ohe = OneHotEncoder(
    drop='first',
    sparse_output=False,
    dtype=np.int32,
)
ohe.fit(X.loc[:, ['name', 'company', 'fuel_type']])

In [11]:
# One-hot encode the three categorical columns using the category lists collected by `ohe`;
# every other column is passed through unchanged.
categorical_encoder = OneHotEncoder(categories=ohe.categories_)
column_transformer = make_column_transformer(
    (categorical_encoder, ['name', 'company', 'fuel_type']),
    remainder='passthrough',
)

In [None]:
# Project helper from modelprocessing.functions; its internals are not visible in this notebook.
# Presumably it evaluates candidate models on this train/test split using the shared
# column_transformer — TODO confirm against the helper's source.
getBestModel(X_train, X_test, y_train, y_test, column_transformer)

In [None]:
# getBestState is imported with the other dependencies in the notebook's first cell.
# Project helper whose internals are not visible here; presumably it searches for the
# train/test split seed that scores best, with 0.2 as the test fraction — TODO confirm.
getBestState(X, y, 0.2, column_transformer)

In [17]:
# Write the current frame (after the price filter) back to disk as CSV.
# NOTE(review): this target directory differs from the one the data was loaded from —
# confirm that writing to a second location is intended.
df.to_csv(
    path_or_buf='/home/anuraaga/Documents/DataCodes/Projects/pricePredictor/dataset/dataset_car/indian_car.csv'
)

In [None]:
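# Run this cell in Google Colab (it needs a CUDA GPU). It grid-searches XGBRegressor
# hyperparameters, then fits a pipeline with fixed values (presumably the best ones found).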
# xg = XGBRegressor(tree_method='gpu_hist', predictor='gpu_predictor', device='cuda', verbosity=0)
# param_grid = {
#     'booster': ['gbtree', 'gblinear', 'dart'],
#     'max_depth': [3, 5, 7],
#     'learning_rate': [0.01, 0.1, 0.2]
# }
# grid = GridSearchCV(xg, param_grid=param_grid, scoring='neg_mean_squared_error', cv=10)
# pipetest = make_pipeline(column_transformer,  grid)
# pipetest.fit(X_train, y_train)
# print(pipetest.named_steps['gridsearchcv'].best_params_)
# xg = XGBRegressor(booster= 'dart',learning_rate= 0.2,max_depth= 7,verbosity=0, tree_method='gpu_hist', predictor='gpu_predictor', device='cuda')
# pipe = make_pipeline(column_transformer, xg)
# pipe.fit(X_train, y_train)

In [None]:
# Load the persisted pipeline from its skops file.
# `trusted=True` blanket-trusts every type stored in the file and is deprecated/removed in
# newer skops releases, so the file's untrusted types are listed and passed explicitly.
# SECURITY: review the printed types before relying on the model, and only load skops
# files that come from a source you trust.
model_path = '/home/anuraaga/Documents/Projects/Project-PredthePrice/docker/models/indiancarmodel_pipeline.skops'
untrusted_types = sio.get_untrusted_types(file=model_path)
print('Types trusted while loading:', untrusted_types)
pipe = sio.load(model_path, trusted=untrusted_types)

In [14]:
# Score of the loaded pipeline on the training split
# (R^2 for scikit-learn regressors — assumes `pipe` is one; confirm).
pipe.score(X_train, y_train)

0.9692940120070288

In [15]:
# Score of the loaded pipeline on the held-out test split
# (R^2 for scikit-learn regressors — assumes `pipe` is one; confirm).
pipe.score(X_test, y_test)

0.890513537146146

In [16]:
# r2_score is imported with the other dependencies in the notebook's first cell.
# Cross-check the test-split score by computing R^2 directly from the predictions.
y_pred = pipe.predict(X_test)
print('R2 score %.2f' % r2_score(y_test, y_pred))

R2 score 0.89
