In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [None]:
df = pd.read_csv('/kaggle/input/d/faseehurrehman/quikr-cars/quikr.csv',encoding='cp1252')
df.head(2)

In [None]:
df.shape

In [None]:
df.info()

## Steps to Take
- name values are very inconsistent
- name values have the company name attached to them
- some names are spam, like 'Maruti Ertiga showroom condition with' and 'Well mentained Tata Sumo'
- company: many values are not company names at all, like 'Used', 'URJENT', and so on
- year has many non-year values
- year is stored as object; convert it to integer
- price contains the non-numeric value 'Ask For Price'
- price has commas in its values and is stored as object
- kms_covered has object values with 'kms' at the end
- kms_covered has NaN values, and two rows have 'Petrol' in them
- fuel_type has NaN values

In [None]:
# Keep an independent snapshot of the raw frame before the cleaning cells start reassigning `df`.
backup = df.copy(deep=True)

## Cleaning Data

#### 'year' column

In [None]:
df.year.unique()

In [None]:
df.year.str.isnumeric() # to print only numerical values

In [None]:
df[df.year.str.isnumeric()]

In [None]:
# Keep only the rows whose 'year' is a numeric string.
# .copy() makes the filtered result an independent frame, so assigning to its columns
# afterwards does not trigger pandas' SettingWithCopyWarning.
df = df[df['year'].str.isnumeric()].copy()
df.shape

In [None]:
# now convert object data type into integer data type
df.year.dtype 

In [None]:
df['year'] = df.year.astype(int)

In [None]:
df.year.dtype

In [None]:
df['year'].head(10)

#### 'price' column

In [None]:
df.head(2)

In [None]:
df.price.unique()

In [None]:
# Removing 'Ask for price'

df[df['price']=='Ask For Price'] # dataframe which contains 'Ask For Price' in price column

In [None]:
df = df[df['price']!='Ask For Price'] # keeping the dataframe which has no 'Ask For price' in price column

In [None]:
df.shape

In [None]:
df.price.head(10)

In [None]:
# Now remove commas and convert into integer

df['price'] = df.price.str.replace(',','').astype(int)

In [None]:
df.price.head(10)

In [None]:
df.price.dtype

In [None]:
df.shape

#### 'kms_covered' column

In [None]:
df.kms_covered.unique()

In [None]:
# Splitting strings, removing kms and commas

df['kms_covered'] = df.kms_covered.str.split().str.get(0).str.replace(',','')

In [None]:
df['kms_covered'].head(10)

In [None]:
# Keep only the rows whose kms_covered is a numeric string.
# .str.isnumeric() yields NaN for missing values, and a mask containing NaN cannot be used
# for boolean indexing; .eq(True) turns those entries into False so the rows are dropped.
# .copy() detaches the result so the integer cast that follows does not raise
# SettingWithCopyWarning.
df = df[df['kms_covered'].str.isnumeric().eq(True)].copy()

In [None]:
# Converting into integer
df['kms_covered'] = df['kms_covered'].astype(int)

In [None]:
df.kms_covered.dtype

In [None]:
df.kms_covered.head(5)

In [None]:
df.describe()

In [None]:
df.shape

#### 'fuel_type' column

In [None]:
df.fuel_type.unique()

In [None]:
df.isna().sum()

In [None]:
df = df[~df['fuel_type'].isna()] # removing nan values from fuel type

#### 'company' column

In [None]:
# Distinct 'company' values; no cleaning is applied to this column.
df['company'].unique()

#### 'name' column

In [None]:
df.name.unique()

In [None]:
df['name'] = df.name.str.split(' ').str.slice(0,3).str.join(' ') # Changing car names. Keeping only the first three words

In [None]:
df.name.head(10)

In [None]:
df.head()

In [None]:
df.describe()

In [None]:
## Reset Index

df = df.reset_index(drop = True)
df.head()

In [None]:
df.to_csv('Cleaned_cars.csv')

In [None]:
df.info()

## Visualization of Data

In [None]:
sns.pairplot(df)

In [None]:
sns.barplot(x = 'year', y ='price', data = df)
plt.show()

In [None]:
df.columns

In [None]:
# Price distribution per company, one box per company.

fig, ax = plt.subplots(figsize=(15, 7))
ax = sns.boxplot(data=df, x='company', y='price', ax=ax)
# Tilt the company labels so they do not overlap.
ax.set_xticklabels(ax.get_xticklabels(), rotation=40, ha='right')
plt.show()

In [None]:
# Price of every listing, grouped by model year (swarm plot).

fig, ax = plt.subplots(figsize=(20, 10))
ax = sns.swarmplot(data=df, x='year', y='price', ax=ax)
# Tilt the year labels so they do not overlap.
ax.set_xticklabels(ax.get_xticklabels(), rotation=40, ha='right')
plt.show()

In [None]:
# Comparing relationship between kms driven and price

sns.relplot(x='kms_covered',y='price',data=df,height=7,aspect=1.5)
plt.show()

In [None]:
#Checking relationship of Fuel Type with Price

plt.subplots(figsize=(14,7))
sns.boxplot(x='fuel_type',y='price',data=df)
plt.show()

In [None]:
# Price per company, with fuel type encoded as colour and model year as marker size.

# relplot is figure-level: despite its name, `ax` here holds the grid object it returns.
ax = sns.relplot(
    data=df,
    x='company',
    y='price',
    hue='fuel_type',
    size='year',
    height=7,
    aspect=2,
)
ax.set_xticklabels(rotation=40, ha='right')
plt.show()

## Outliers Removal

In [None]:
# 'price' column

df['price'].plot.density(color='green')

In [None]:
df = df[df['price']<6000000] # Removing outlier having price above 6000000

## Extracting Training Data

In [None]:
# Feature columns fed to the model.
X = df[[
    'name',
    'company',
    'year',
    'kms_covered',
    'fuel_type',
]]
# Target: the listing price.
y = df['price']

## Applying Train Test Split

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for testing. random_state is fixed so the split, and the
# R2 score computed from it, is the same on every run of the notebook.
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2,random_state=42)

In [None]:
from sklearn.linear_model import LinearRegression

from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import make_column_transformer
from sklearn.pipeline import make_pipeline
from sklearn.metrics import r2_score

In [None]:
# Fit a one-hot encoder on the categorical columns of the full feature frame `X`
# (not only the training split), so its categories_ list every value present in the data.

ohe = OneHotEncoder()
ohe.fit(X.loc[:, ['name', 'company', 'fuel_type']])

## Creating a column transformer to transform categorical columns

In [None]:
# One-hot encode the three categorical columns using the categories learned by `ohe`;
# all other columns are passed through unchanged.
column_trans = make_column_transformer(
    (
        OneHotEncoder(categories=ohe.categories_),
        ['name', 'company', 'fuel_type'],
    ),
    remainder='passthrough',
)

In [None]:
# Linear Regression Model
lr=LinearRegression()

In [None]:
# Making a pipeline
pipe=make_pipeline(column_trans,lr)

In [None]:
# Fitting the model
pipe.fit(X_train,y_train)

In [None]:
y_pred=pipe.predict(X_test)

In [None]:
# Checking R2 Score

r2_score(y_test,y_pred)

### Searching the random states of train_test_split for the split where the model gives the highest r2_score (almost 0.92)

In [None]:
# Refit the linear-regression pipeline on 1000 different 90/10 splits and record the
# test R2 of each one; scores[k] belongs to random_state=k.
# NOTE(review): choosing a seed by its test score gives an optimistic estimate.
# The names assigned inside the loop are left holding the values of the last iteration.
scores = []
for seed in range(1000):
    split = train_test_split(X, y, test_size=0.1, random_state=seed)
    X_train, X_test, y_train, y_test = split
    lr = LinearRegression()
    pipe = make_pipeline(column_trans, lr)
    pipe.fit(X_train, y_train)
    y_pred = pipe.predict(X_test)
    scores.append(r2_score(y_test, y_pred))

In [None]:
np.argmax(scores)

In [None]:
scores[np.argmax(scores)]

In [None]:
# Predict the price of a single hand-written listing.
# The row is passed as a list of Python values rather than through np.array, which would
# coerce the year and kms integers to strings; this way the numeric columns stay numeric.
# NOTE(review): `pipe` at this point is whichever pipeline was fitted most recently.
pipe.predict(pd.DataFrame(columns=X_test.columns,data=[['Maruti Suzuki Swift','Maruti',2019,100,'Petrol']]))

In [None]:
# Rebuild the split that scored highest in `scores`, refit linear regression on it
# and report the R2 on its test part.

best_seed = np.argmax(scores)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.1, random_state=best_seed
)
lr = LinearRegression()
pipe = make_pipeline(column_trans, lr)
pipe.fit(X_train, y_train)
y_pred = pipe.predict(X_test)
r2_score(y_test, y_pred)

In [None]:
from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge

In [None]:
# Same seed search with a Lasso regressor (default settings), over 100 different 90/10 splits;
# scores[k] belongs to random_state=k. This replaces the earlier contents of `scores`.
# NOTE(review): choosing a seed by its test score gives an optimistic estimate.
scores = []
for seed in range(100):
    split = train_test_split(X, y, test_size=0.1, random_state=seed)
    X_train, X_test, y_train, y_test = split
    lr_lasso = Lasso()
    pipe = make_pipeline(column_trans, lr_lasso)
    pipe.fit(X_train, y_train)
    y_pred = pipe.predict(X_test)
    scores.append(r2_score(y_test, y_pred))

In [None]:
# Rebuild the split that scored highest in `scores`, refit Lasso on it
# and report the R2 on its test part.

best_seed = np.argmax(scores)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.1, random_state=best_seed
)
lr_lasso = Lasso()
pipe = make_pipeline(column_trans, lr_lasso)
pipe.fit(X_train, y_train)
y_pred = pipe.predict(X_test)
r2_score(y_test, y_pred)