In [None]:
import pickle

import pandas as pd
from sklearn.compose import make_column_transformer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import OneHotEncoder

In [None]:
# Load the raw data from quikr_car.csv (path is relative to the notebook's working directory)
car = pd.read_csv('quikr_car.csv')

In [None]:
# Peek at the first five raw rows (head() defaults to n=5)
car.head()

In [None]:
# Print column dtypes and non-null counts to spot columns that need cleaning
car.info()

# Defects
### Object columns should be converted to their respective datatypes
### Data in kms_driven, year and Price should be cleaned into appropriate numeric values
### fuel_type has NaN values
### Car names are not in a consistent format

# Cleaning

In [None]:
# Keep an independent deep copy of the raw frame before the cleaning cells reassign `car`
Backup = car.copy(deep=True)

In [None]:
# Clean the `year` column: keep only rows whose value is purely numeric,
# then store it as an integer.
# - astype(str) keeps the cell re-runnable once `year` is already int, and
#   turns missing values into non-numeric text so they are dropped instead
#   of breaking the boolean mask.
# - .copy() makes the filtered frame independent, so the column assignment
#   below is not a chained assignment on a slice (no SettingWithCopyWarning).
car = car[car['year'].astype(str).str.isnumeric()].copy()
car['year'] = car['year'].astype(int)

In [None]:
# Clean the `Price` column.
# Rows whose price is the placeholder text "Ask For Price" carry no target value and are removed.
# .copy() detaches the filtered frame so the assignment below does not write into a slice.
car = car[car['Price'] != "Ask For Price"].copy()
# Strip thousands separators literally (regex=False) and cast to int.
# astype(str) keeps the cell re-runnable once `Price` is already numeric.
car['Price'] = car['Price'].astype(str).str.replace(',', '', regex=False).astype(int)

In [None]:
# Clean the `kms_driven` column.
# Take the first space-separated token of each value and strip its commas.
# Rows whose token is not purely numeric (for example the stray "Petrol"
# entries, or missing values) are dropped rather than matched against one
# hard-coded junk string, so the int cast cannot fail on unexpected text.
kms_text = (
    car['kms_driven']
    .astype(str)  # also keeps the cell re-runnable once the column is int
    .str.split(" ")
    .str.get(0)
    .str.replace(',', '', regex=False)
)
has_numeric_kms = kms_text.str.isnumeric()
car = car[has_numeric_kms].copy()  # independent frame: safe to assign a column below
car['kms_driven'] = kms_text[has_numeric_kms].astype(int)

In [None]:
# Keep only the rows that have a fuel_type value
car = car[car['fuel_type'].notna()]

In [None]:
# Normalise `name` to its first three space-separated words.
# assign() returns a new frame instead of writing a column into the filtered
# slice produced by the earlier row filters (avoids SettingWithCopyWarning).
car = car.assign(name=car['name'].str.split(" ").str.slice(0, 3).str.join(" "))

In [None]:
# Summary statistics of the numeric columns, used to look for implausible values
car.describe()

In [None]:
# Listings priced above 6e6 are outliers; inspect them before they are removed
car.loc[car['Price'] > 6e6]

In [None]:
# Remove the price outliers: only listings priced strictly above 6e6 are
# treated as outliers, so a price of exactly 6e6 is kept (`<=`, the exact
# complement of the `> 6e6` inspection filter).
car = car[car['Price'] <= 6e6]

In [None]:
# Renumber the rows after the filtering above; drop=True discards the old index
# instead of keeping it as a column
car = car.reset_index(drop=True)

In [None]:
# Persist the cleaned frame to disk.
# NOTE(review): to_csv writes the row index as an unnamed first column by default;
# pass index=False if consumers of this file should not see it — confirm with downstream readers.
car.to_csv("Cleaned Data.csv")

# Model

In [None]:
# Separate the regression target (Price) from the predictor columns
y = car['Price']
X = car.drop(columns=['Price'])

In [None]:
# Hold out 20% of the rows for evaluation; the fixed seed makes the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=661,
)

In [None]:
# Fit an encoder on the categorical columns of the full feature frame so that
# its categories_ list covers every value present in the data (fit returns the encoder)
ohe = OneHotEncoder().fit(X[['name', 'company', 'fuel_type']])

In [None]:
# One-hot encode the categorical columns using the categories collected by `ohe`;
# every other column is passed through unchanged
column_trans = make_column_transformer(
    (
        OneHotEncoder(categories=ohe.categories_),
        ['name', 'company', 'fuel_type'],
    ),
    remainder='passthrough',
)

In [None]:
# Preview the cleaned frame that feeds the model; a bounded head() keeps the
# rendered output small instead of displaying the whole DataFrame
car.head()

In [None]:
# Linear regression estimator with default settings
lr = LinearRegression()

In [None]:
# Chain the column transformer and the regressor into a single estimator
pipe = make_pipeline(column_trans, lr)

In [None]:
# Fit the whole pipeline on the training split
pipe.fit(X_train, y_train)

In [None]:
# Predict prices for the held-out test split
y_pred = pipe.predict(X_test)

In [None]:
# Coefficient of determination of the predictions on the held-out split
r2_score(y_true=y_test, y_pred=y_pred)

In [None]:
# Serialize the fitted pipeline for later reuse.
# The `with` block guarantees the file handle is flushed and closed, even if
# pickling raises; a bare open() passed straight to pickle.dump is never closed.
# `pickle` is imported in the notebook's top import cell.
with open('LinearRegressionModel.pkl', 'wb') as model_file:
    pickle.dump(pipe, model_file)

In [None]:
# Sanity-check the fitted pipeline on a single hand-written listing
pipe.predict(
    pd.DataFrame(
        {
            'name': ["Maruti Suzuki Swift"],
            'company': ["Maruti"],
            'year': [2019],
            'kms_driven': [100],
            'fuel_type': ["Petrol"],
        }
    )
)