In [1]:
# importing the libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import warnings
# Suppress every warning category for the rest of the session. This hides all
# warnings, including any deprecation notices emitted by the libraries used below.
warnings.filterwarnings("ignore")

In [2]:
# Load the dataset from Dataset.csv (path is relative to the current working directory)
df=pd.read_csv('Dataset.csv')
# Preview the first five rows
df.head()

Unnamed: 0,CAR No,mileage,engine,max_power,seats,year,km_driven,fuel,seller_type,transmission,owner,Loan_Amount
0,160001,23.4 kmpl,1248 CC,74 bhp,5.0,2014,145500,Diesel,Individual,Manual,First Owner,450000
1,160002,21.14 kmpl,1498 CC,103.52 bhp,5.0,2014,120000,Diesel,Individual,Manual,Second Owner,370000
2,160003,17.7 kmpl,1497 CC,78 bhp,5.0,2006,140000,Petrol,Individual,Manual,Third Owner,158000
3,160004,23.0 kmpl,1396 CC,90 bhp,5.0,2010,127000,Diesel,Individual,Manual,First Owner,225000
4,160005,16.1 kmpl,1298 CC,88.2 bhp,5.0,2007,120000,Petrol,Individual,Manual,First Owner,130000


# Drop unwanted Columns

In [3]:
# 'CAR No' is dropped in place so it is not carried along as a feature.
# Running this cell a second time raises KeyError because the column is already gone.
df.drop(columns=['CAR No'], inplace=True)

In [4]:
# Number of missing entries in each column
df.isna().sum()

mileage         221
engine          221
max_power       215
seats           221
year              0
km_driven         0
fuel              0
seller_type       0
transmission      0
owner             0
Loan_Amount       0
dtype: int64

# Filling Null Values with "Mode"

In [5]:
# Fill missing entries in the first four columns with each column's most frequent value.
# At this point those columns are mileage, engine, max_power and seats (see the null counts above).
from sklearn.impute import SimpleImputer

imputer = SimpleImputer(strategy='most_frequent', missing_values=np.nan)
cols_with_nulls = df.iloc[:, :4]
# NOTE(review): the imputer is fitted on every row, before the train/test split further down.
imputer.fit(cols_with_nulls)
df.iloc[:, :4] = imputer.transform(cols_with_nulls)

# Removing the units for Engine, Max_power, Mileage

In [6]:
# Each of these columns holds text such as "23.4 kmpl", "1248 CC" or "74 bhp".
# Keep only the leading whitespace-separated token and convert it to a number.

# mileage: drop the trailing unit (e.g. "kmpl") -> float
df['mileage'] = [float(text.split()[0]) for text in df['mileage']]

# engine: drop the trailing "CC" -> int
df['engine'] = [int(text.split()[0]) for text in df['engine']]

# max_power: drop the trailing "bhp" -> float
df['max_power'] = [float(text.split()[0]) for text in df['max_power']]

# Encoding

In [7]:
# Count rows per seller_type label; these are the labels encoded in the next cell
df['seller_type'].value_counts()

Individual          6765
Dealer              1126
Trustmark Dealer     236
Name: seller_type, dtype: int64

In [8]:
# Replace the seller_type labels with integer codes.
# A label missing from the mapping becomes NaN, which makes astype(int) raise.
df['seller_type'] = (
    df['seller_type']
    .map({
        'Individual': 0,
        'Dealer': 1,
        'Trustmark Dealer': 2,
    })
    .astype(int)
)

In [9]:
# Count rows per fuel label; these are the labels encoded in the next cell
df['fuel'].value_counts()

Diesel    4402
Petrol    3631
CNG         56
LPG         38
Name: fuel, dtype: int64

In [10]:
# Replace the fuel labels with integer codes.
# A label missing from the mapping becomes NaN, which makes astype(int) raise.
df['fuel'] = (
    df['fuel']
    .map({
        'Diesel': 0,
        'Petrol': 1,
        'CNG': 2,
        'LPG': 3,
    })
    .astype(int)
)

In [11]:
# Count rows per transmission label; these are the labels encoded in the next cell
df['transmission'].value_counts()

Manual       7077
Automatic    1050
Name: transmission, dtype: int64

In [12]:
# Binary-encode transmission: Manual -> 0, Automatic -> 1.
# A label missing from the mapping becomes NaN, which makes astype(int) raise.
df['transmission'] = (
    df['transmission']
    .map({
        'Manual': 0,
        'Automatic': 1,
    })
    .astype(int)
)

In [13]:
# Count rows per owner label; these are the labels encoded in the next cell
df['owner'].value_counts()

First Owner             5289
Second Owner            2104
Third Owner              555
Fourth & Above Owner     174
Test Drive Car             5
Name: owner, dtype: int64

In [14]:
# Replace the owner labels with integer codes (0 for test-drive cars, then 1-4 by ownership count).
# A label missing from the mapping becomes NaN, which makes astype(int) raise.
df['owner'] = (
    df['owner']
    .map({
        'Test Drive Car': 0,
        'First Owner': 1,
        'Second Owner': 2,
        'Third Owner': 3,
        'Fourth & Above Owner': 4,
    })
    .astype(int)
)

In [15]:
# Separate the target from the features.
# The target is the last column (Loan_Amount); the features are every column before it.
y = df.iloc[:, -1].to_numpy()
X = df.iloc[:, :-1].to_numpy()

# Splitting the Dataset

In [16]:
# Hold out 20% of the rows for testing; the fixed random_state keeps the split reproducible.
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

# RandomForestRegressor

In [17]:
from sklearn.ensemble import RandomForestRegressor

# 25-tree random forest; random_state fixes the seed so repeated fits give the same model.
rf = RandomForestRegressor(
    n_estimators=25,
    random_state=2,
)
# fit() returns the estimator itself, so leaving it as the last expression displays its repr.
rf.fit(X_train, y_train)

RandomForestRegressor(n_estimators=25, random_state=2)

# Saving the Model

In [18]:
import pickle

# Serialise the fitted forest to model.pkl.
# The context manager guarantees the file is flushed and closed even if pickling fails;
# the previous bare open() call left the handle open until garbage collection.
with open("model.pkl", "wb") as model_file:
    pickle.dump(rf, model_file)

# Loading the Model

In [19]:
# Reload the pickled model from disk and predict on the held-out test features.
# The context manager closes the file handle as soon as the model has been read.
# NOTE: pickle.load can execute arbitrary code while unpickling, so only load
# model files that come from a trusted source.
with open("model.pkl", "rb") as model_file:
    model = pickle.load(model_file)
y_pred = model.predict(X_test)

In [20]:
from sklearn.metrics import r2_score

# Coefficient of determination of the test-set predictions against the true targets
r2 = r2_score(y_true=y_test, y_pred=y_pred)
print("R2 Score for RandomForestRegressor is:", r2)

R2 Score for RandomForestRegressor is: 0.9457421020697617


In [21]:
from sklearn.metrics import mean_squared_error

# Root-mean-squared error on the test split, in the same units as the target
mse = mean_squared_error(y_test, y_pred)
print("RMSE", np.sqrt(mse))

RMSE 191020.73432632213


In [22]:
# Show the predicted values for the test split (long arrays are abbreviated with "..." in the output)
y_pred

array([ 279000.  , 1280200.  ,  528279.96, ..., 3800000.  ,  176719.84,
        374639.92])

In [23]:
# First true target value in the test split, for comparison against the first prediction above
y_test[0]

290000