In [1]:
# data analysis and wrangling
import pandas as pd
import numpy as np

In [2]:
# visualization
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

In [3]:
# scaling and train test split
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

In [4]:
# creating a model
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Activation
from tensorflow.keras.optimizers import Adam

In [5]:
# evaluation on test data
from sklearn.metrics import mean_squared_error,mean_absolute_error,explained_variance_score
from sklearn.metrics import classification_report,confusion_matrix

In [6]:
# reading data from csv file using pandas
# reading data which will be used for training
df = pd.read_csv("D:\\Manipal 19-23\\All Sems\\5th sem\\Data Science P.E\\train-data.csv",skiprows=0)
df = df.drop("sno",axis=1)
df

Unnamed: 0,Name,Location,Year,Kilometers_Driven,Fuel_Type,Transmission,Owner_Type,Mileage,Engine,Power,Seats,New_Price,Price
0,Maruti Wagon R LXI CNG,Mumbai,2010,72000,CNG,Manual,First,26.6 km/kg,998 CC,58.16 bhp,5.0,,1.75
1,Hyundai Creta 1.6 CRDi SX Option,Pune,2015,41000,Diesel,Manual,First,19.67 kmpl,1582 CC,126.2 bhp,5.0,,12.50
2,Honda Jazz V,Chennai,2011,46000,Petrol,Manual,First,18.2 kmpl,1199 CC,88.7 bhp,5.0,8.61 Lakh,4.50
3,Maruti Ertiga VDI,Chennai,2012,87000,Diesel,Manual,First,20.77 kmpl,1248 CC,88.76 bhp,7.0,,6.00
4,Audi A4 New 2.0 TDI Multitronic,Coimbatore,2013,40670,Diesel,Automatic,Second,15.2 kmpl,1968 CC,140.8 bhp,5.0,,17.74
...,...,...,...,...,...,...,...,...,...,...,...,...,...
6014,Maruti Swift VDI,Delhi,2014,27365,Diesel,Manual,First,28.4 kmpl,1248 CC,74 bhp,5.0,7.88 Lakh,4.75
6015,Hyundai Xcent 1.1 CRDi S,Jaipur,2015,100000,Diesel,Manual,First,24.4 kmpl,1120 CC,71 bhp,5.0,,4.00
6016,Mahindra Xylo D4 BSIV,Jaipur,2012,55000,Diesel,Manual,Second,14.0 kmpl,2498 CC,112 bhp,8.0,,2.90
6017,Maruti Wagon R VXI,Kolkata,2013,46000,Petrol,Manual,First,18.9 kmpl,998 CC,67.1 bhp,5.0,,2.65


In [7]:
df.head()

Unnamed: 0,Name,Location,Year,Kilometers_Driven,Fuel_Type,Transmission,Owner_Type,Mileage,Engine,Power,Seats,New_Price,Price
0,Maruti Wagon R LXI CNG,Mumbai,2010,72000,CNG,Manual,First,26.6 km/kg,998 CC,58.16 bhp,5.0,,1.75
1,Hyundai Creta 1.6 CRDi SX Option,Pune,2015,41000,Diesel,Manual,First,19.67 kmpl,1582 CC,126.2 bhp,5.0,,12.5
2,Honda Jazz V,Chennai,2011,46000,Petrol,Manual,First,18.2 kmpl,1199 CC,88.7 bhp,5.0,8.61 Lakh,4.5
3,Maruti Ertiga VDI,Chennai,2012,87000,Diesel,Manual,First,20.77 kmpl,1248 CC,88.76 bhp,7.0,,6.0
4,Audi A4 New 2.0 TDI Multitronic,Coimbatore,2013,40670,Diesel,Automatic,Second,15.2 kmpl,1968 CC,140.8 bhp,5.0,,17.74


In [8]:
df.isnull().sum()

Name                    0
Location                0
Year                    0
Kilometers_Driven       0
Fuel_Type               0
Transmission            0
Owner_Type              0
Mileage                 2
Engine                 36
Power                  36
Seats                  42
New_Price            5195
Price                   0
dtype: int64

In [9]:
# data cleaning
df = df.drop(['New_Price'],axis=1)
mean = np.floor(df["Seats"].mean())
df = df.fillna({"Seats":mean})
df = df.dropna(how='any')

In [10]:
dff = df
df.isnull().sum()

Name                 0
Location             0
Year                 0
Kilometers_Driven    0
Fuel_Type            0
Transmission         0
Owner_Type           0
Mileage              0
Engine               0
Power                0
Seats                0
Price                0
dtype: int64

In [11]:
# after removing some rows there is some duplicate data 
# we are removing that
df = df.reset_index(drop=True)

In [12]:
# converting string data to float
for i in range(df.shape[0]):
    df.at[i, 'Company'] = df['Name'][i].split()[0]
    df.at[i, 'Mileage(km/kg)'] = df['Mileage'][i].split()[0]
    df.at[i, 'Engine(CC)'] = df['Engine'][i].split()[0]
    df.at[i, 'Power(bhp)'] = df['Power'][i].split()[0]

# converting to float :
df['Mileage(km/kg)'] = df['Mileage(km/kg)'].astype(float)
df['Engine(CC)'] = df['Engine(CC)'].astype(float)

In [13]:
# counting and locating the positions of power(bhp) which have null values.
count = 0
position = []
for i in range(0,df.shape[0]):
    if (df['Power(bhp)'][i] == 'null'):
        count = count +1
        position.append(i)
print(count)
print(position)

107
[76, 79, 89, 120, 143, 225, 243, 260, 305, 306, 384, 422, 426, 441, 470, 573, 629, 645, 646, 736, 745, 825, 911, 922, 930, 1064, 1139, 1149, 1267, 1314, 1339, 1381, 1412, 1547, 1570, 1641, 1664, 1849, 1991, 2045, 2120, 2154, 2252, 2256, 2294, 2330, 2356, 2380, 2428, 2437, 2484, 2488, 2514, 2564, 2582, 2619, 2624, 2871, 3013, 3041, 3084, 3169, 3227, 3269, 3417, 3494, 3509, 3565, 3604, 3614, 3621, 3645, 3709, 3776, 3857, 3873, 3905, 3974, 4051, 4054, 4323, 4326, 4598, 4677, 4681, 4711, 4797, 4853, 4867, 4919, 5029, 5083, 5191, 5388, 5400, 5420, 5491, 5495, 5609, 5717, 5721, 5823, 5835, 5855, 5887, 5905, 5947]


In [14]:
df = df.drop(df.index[position])
df = df.reset_index(drop=True)


In [15]:
df

Unnamed: 0,Name,Location,Year,Kilometers_Driven,Fuel_Type,Transmission,Owner_Type,Mileage,Engine,Power,Seats,Price,Company,Mileage(km/kg),Engine(CC),Power(bhp)
0,Maruti Wagon R LXI CNG,Mumbai,2010,72000,CNG,Manual,First,26.6 km/kg,998 CC,58.16 bhp,5.0,1.75,Maruti,26.60,998.0,58.16
1,Hyundai Creta 1.6 CRDi SX Option,Pune,2015,41000,Diesel,Manual,First,19.67 kmpl,1582 CC,126.2 bhp,5.0,12.50,Hyundai,19.67,1582.0,126.2
2,Honda Jazz V,Chennai,2011,46000,Petrol,Manual,First,18.2 kmpl,1199 CC,88.7 bhp,5.0,4.50,Honda,18.20,1199.0,88.7
3,Maruti Ertiga VDI,Chennai,2012,87000,Diesel,Manual,First,20.77 kmpl,1248 CC,88.76 bhp,7.0,6.00,Maruti,20.77,1248.0,88.76
4,Audi A4 New 2.0 TDI Multitronic,Coimbatore,2013,40670,Diesel,Automatic,Second,15.2 kmpl,1968 CC,140.8 bhp,5.0,17.74,Audi,15.20,1968.0,140.8
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5869,Maruti Swift VDI,Delhi,2014,27365,Diesel,Manual,First,28.4 kmpl,1248 CC,74 bhp,5.0,4.75,Maruti,28.40,1248.0,74
5870,Hyundai Xcent 1.1 CRDi S,Jaipur,2015,100000,Diesel,Manual,First,24.4 kmpl,1120 CC,71 bhp,5.0,4.00,Hyundai,24.40,1120.0,71
5871,Mahindra Xylo D4 BSIV,Jaipur,2012,55000,Diesel,Manual,Second,14.0 kmpl,2498 CC,112 bhp,8.0,2.90,Mahindra,14.00,2498.0,112
5872,Maruti Wagon R VXI,Kolkata,2013,46000,Petrol,Manual,First,18.9 kmpl,998 CC,67.1 bhp,5.0,2.65,Maruti,18.90,998.0,67.1


In [16]:
df.drop(["Name"],axis=1,inplace=True)
df.drop(["Mileage"],axis=1,inplace=True)
df.drop(["Engine"],axis=1,inplace=True)
df.drop(["Power"],axis=1,inplace=True)
df

Unnamed: 0,Location,Year,Kilometers_Driven,Fuel_Type,Transmission,Owner_Type,Seats,Price,Company,Mileage(km/kg),Engine(CC),Power(bhp)
0,Mumbai,2010,72000,CNG,Manual,First,5.0,1.75,Maruti,26.60,998.0,58.16
1,Pune,2015,41000,Diesel,Manual,First,5.0,12.50,Hyundai,19.67,1582.0,126.2
2,Chennai,2011,46000,Petrol,Manual,First,5.0,4.50,Honda,18.20,1199.0,88.7
3,Chennai,2012,87000,Diesel,Manual,First,7.0,6.00,Maruti,20.77,1248.0,88.76
4,Coimbatore,2013,40670,Diesel,Automatic,Second,5.0,17.74,Audi,15.20,1968.0,140.8
...,...,...,...,...,...,...,...,...,...,...,...,...
5869,Delhi,2014,27365,Diesel,Manual,First,5.0,4.75,Maruti,28.40,1248.0,74
5870,Jaipur,2015,100000,Diesel,Manual,First,5.0,4.00,Hyundai,24.40,1120.0,71
5871,Jaipur,2012,55000,Diesel,Manual,Second,8.0,2.90,Mahindra,14.00,2498.0,112
5872,Kolkata,2013,46000,Petrol,Manual,First,5.0,2.65,Maruti,18.90,998.0,67.1


In [17]:
# data training
X = df.drop(['Price'],axis=1)
y = df.Price
len(X)

5874

In [18]:
df

Unnamed: 0,Location,Year,Kilometers_Driven,Fuel_Type,Transmission,Owner_Type,Seats,Price,Company,Mileage(km/kg),Engine(CC),Power(bhp)
0,Mumbai,2010,72000,CNG,Manual,First,5.0,1.75,Maruti,26.60,998.0,58.16
1,Pune,2015,41000,Diesel,Manual,First,5.0,12.50,Hyundai,19.67,1582.0,126.2
2,Chennai,2011,46000,Petrol,Manual,First,5.0,4.50,Honda,18.20,1199.0,88.7
3,Chennai,2012,87000,Diesel,Manual,First,7.0,6.00,Maruti,20.77,1248.0,88.76
4,Coimbatore,2013,40670,Diesel,Automatic,Second,5.0,17.74,Audi,15.20,1968.0,140.8
...,...,...,...,...,...,...,...,...,...,...,...,...
5869,Delhi,2014,27365,Diesel,Manual,First,5.0,4.75,Maruti,28.40,1248.0,74
5870,Jaipur,2015,100000,Diesel,Manual,First,5.0,4.00,Hyundai,24.40,1120.0,71
5871,Jaipur,2012,55000,Diesel,Manual,Second,8.0,2.90,Mahindra,14.00,2498.0,112
5872,Kolkata,2013,46000,Petrol,Manual,First,5.0,2.65,Maruti,18.90,998.0,67.1


In [19]:
X = pd.get_dummies(X,columns=['Location','Year','Fuel_Type','Transmission','Owner_Type','Company'])
X.head()

Unnamed: 0,Kilometers_Driven,Seats,Mileage(km/kg),Engine(CC),Power(bhp),Location_Ahmedabad,Location_Bangalore,Location_Chennai,Location_Coimbatore,Location_Delhi,...,Company_Mini,Company_Mitsubishi,Company_Nissan,Company_Porsche,Company_Renault,Company_Skoda,Company_Tata,Company_Toyota,Company_Volkswagen,Company_Volvo
0,72000,5.0,26.6,998.0,58.16,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,41000,5.0,19.67,1582.0,126.2,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,46000,5.0,18.2,1199.0,88.7,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
3,87000,7.0,20.77,1248.0,88.76,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
4,40670,5.0,15.2,1968.0,140.8,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0


In [20]:
X.columns

Index(['Kilometers_Driven', 'Seats', 'Mileage(km/kg)', 'Engine(CC)',
       'Power(bhp)', 'Location_Ahmedabad', 'Location_Bangalore',
       'Location_Chennai', 'Location_Coimbatore', 'Location_Delhi',
       'Location_Hyderabad', 'Location_Jaipur', 'Location_Kochi',
       'Location_Kolkata', 'Location_Mumbai', 'Location_Pune', 'Year_1998',
       'Year_1999', 'Year_2000', 'Year_2001', 'Year_2002', 'Year_2003',
       'Year_2004', 'Year_2005', 'Year_2006', 'Year_2007', 'Year_2008',
       'Year_2009', 'Year_2010', 'Year_2011', 'Year_2012', 'Year_2013',
       'Year_2014', 'Year_2015', 'Year_2016', 'Year_2017', 'Year_2018',
       'Year_2019', 'Fuel_Type_CNG', 'Fuel_Type_Diesel', 'Fuel_Type_LPG',
       'Fuel_Type_Petrol', 'Transmission_Automatic', 'Transmission_Manual',
       'Owner_Type_First', 'Owner_Type_Fourth & Above', 'Owner_Type_Second',
       'Owner_Type_Third', 'Company_Ambassador', 'Company_Audi', 'Company_BMW',
       'Company_Bentley', 'Company_Chevrolet', 'Company_Datsu

In [21]:
# checking shape of    x and y
print(X.shape)
print(y.shape)

(5874, 78)
(5874,)


In [22]:
# training data by using sklearn train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2)
len(X_train)

4699

In [23]:
# data scaling 
# everything is scaled b/w 0 to 1 . to prevent data leakage
scaler = MinMaxScaler()

# fit and transfrom
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# everything has been scaled between 1 and 0
print('Max: ',X_train.max())
print('Min: ', X_train.min())

Max:  1.0
Min:  0.0


In [24]:
model = Sequential()

# input layer
model.add(Dense(19,activation='relu'))

# hidden layers
model.add(Dense(19,activation='relu'))
model.add(Dense(19,activation='relu'))
model.add(Dense(19,activation='relu'))

# output layer
model.add(Dense(1))

model.compile(optimizer='adam',loss='mse')

In [25]:
model.fit(x=X_train,y=y_train.values,
          validation_data=(X_test,y_test.values),
          batch_size=128,epochs=40)

Epoch 1/40
Epoch 2/40
Epoch 3/40
Epoch 4/40
Epoch 5/40
Epoch 6/40
Epoch 7/40
Epoch 8/40
Epoch 9/40
Epoch 10/40
Epoch 11/40
Epoch 12/40
Epoch 13/40
Epoch 14/40
Epoch 15/40
Epoch 16/40
Epoch 17/40
Epoch 18/40
Epoch 19/40
Epoch 20/40
Epoch 21/40
Epoch 22/40
Epoch 23/40
Epoch 24/40
Epoch 25/40
Epoch 26/40
Epoch 27/40
Epoch 28/40
Epoch 29/40
Epoch 30/40
Epoch 31/40
Epoch 32/40
Epoch 33/40
Epoch 34/40
Epoch 35/40
Epoch 36/40
Epoch 37/40
Epoch 38/40
Epoch 39/40
Epoch 40/40


<keras.callbacks.History at 0x1fea3a45fd0>

In [26]:
# predictions on the test set
predictions = model.predict(X_test)

print('MAE: ',mean_absolute_error(y_test,predictions))
print('MSE: ',mean_squared_error(y_test,predictions))
print('RMSE: ',np.sqrt(mean_squared_error(y_test,predictions)))
print('Variance Regression Score: ',explained_variance_score(y_test,predictions))

print('\n\nDescriptive Statistics:\n',df['Price'].describe())

MAE:  1.9153337222119595
MSE:  23.236144135450566
RMSE:  4.820388380146413
Variance Regression Score:  0.8456207471322168


Descriptive Statistics:
 count    5874.000000
mean        9.601551
std        11.248283
min         0.440000
25%         3.512500
50%         5.750000
75%        10.000000
max       160.000000
Name: Price, dtype: float64


In [27]:
predictions

array([[24.173319 ],
       [19.03646  ],
       [45.437195 ],
       ...,
       [ 7.540623 ],
       [ 7.052656 ],
       [ 1.6346818]], dtype=float32)

In [28]:
print("Features of car :")
print(Test_data)
Test_data = scaler.transform(Test_data.values.reshape(-1, 78))
print('\nPrediction Price:',model.predict(Test_data)[0,0])


Features of car :


NameError: name 'Test_data' is not defined