In [1]:
#import required libraries
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import make_column_transformer
from sklearn.pipeline import make_pipeline
from sklearn.metrics import r2_score

In [2]:
car_df = pd.read_csv('quikr_car.csv')

In [3]:
#data cleaning
car_df['name']=car_df['name'].str.split().str.slice(start=0,stop=3).str.join(' ')

car_df=car_df[car_df['year'].str.isnumeric()]
car_df['year']=car_df['year'].astype(int)


car_df=car_df[car_df['Price']!='Ask For Price']
car_df['Price']=car_df['Price'].str.replace(',','').astype(int)

car_df['kms_driven']=car_df['kms_driven'].str.split().str.get(0).str.replace(',','')
car_df=car_df[car_df['kms_driven'].str.isnumeric()]
car_df['kms_driven']=car_df['kms_driven'].astype(int)

car_df=car_df[~car_df['fuel_type'].isna()]

In [4]:
#extracting x and y
x = car_df.drop(['Price'], axis = 1)
y = car_df['Price']

In [5]:
#creating one hot encoder object
one_hot_encoder=OneHotEncoder()
one_hot_encoder.fit(x[['name','company','fuel_type']])

column_trans=make_column_transformer((OneHotEncoder(categories=one_hot_encoder.categories_),
                                    ['name','company','fuel_type']),
                                    remainder='passthrough')

In [6]:
#to get random_state value where r2 score is maximum
scores=[]
for i in range(1000):
    x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.1,random_state=i)
    model=LinearRegression()
    pipe=make_pipeline(column_trans,model)
    pipe.fit(x_train,y_train)
    y_pred=pipe.predict(x_test)
    scores.append(r2_score(y_test,y_pred))

max_random_state = np.argmax(scores)

In [7]:

#splitting the training and test data
x_train, x_test, y_train, y_test = train_test_split(x,y, test_size = 0.2,random_state=max_random_state)

In [8]:
#Train the model using Linear Regression
model=LinearRegression()

pipe=make_pipeline(column_trans,model)

pipe.fit(x_train,y_train)

In [9]:
y_pred=pipe.predict(x_test)
r2_score_val = r2_score(y_test,y_pred)
r2_score_val

0.720430951273447

In [10]:
import pickle
model_data = {'model': pipe, 'r2_score_val': r2_score_val,'df': car_df}
pickle.dump(model_data,open('model/model.pkl','wb'))