In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

Importing the dataset

In [2]:
df = pd.read_csv("merged_car_data.csv")

In [3]:
df.shape

(539126, 8)

In [4]:
df.head()

Unnamed: 0,Price,Year,Mileage,City,State,Vin,Make,Model
0,15777,2015,25195,New Orleans,LA,KL4CJASB4FB217542,Buick,EncoreFWD
1,16895,2015,21026,Souderton,PA,KL4CJASB9FB190502,Buick,EncoreFWD
2,15995,2015,20496,McDonough,GA,KL4CJBSB2FB101567,Buick,EncoreConvenience
3,17991,2016,14806,Tucson,AZ,KL4CJASB7GB620898,Buick,EncoreFWD
4,3950,2003,176935,Elkhart,IN,3G5DA03E93S607122,Buick,RendezvousCXL


Feature Engineering for model prediction
>We are frequency encoding the columns and converting the categorical columns into numerical encoding for model prediction

In [5]:
cat_cols = ['Mileage', 'City', 'State','Make','Model']

# perform frequency encoding on each column
for col in cat_cols:
    freq_encoding = df[col].value_counts(normalize=True)
    df[f'{col}_freq_encoded'] = df[col].map(freq_encoding)

In [6]:
df.head()

Unnamed: 0,Price,Year,Mileage,City,State,Vin,Make,Model,Mileage_freq_encoded,City_freq_encoded,State_freq_encoded,Make_freq_encoded,Model_freq_encoded
0,15777,2015,25195,New Orleans,LA,KL4CJASB4FB217542,Buick,EncoreFWD,1.7e-05,0.000479,0.010339,0.014197,0.000692
1,16895,2015,21026,Souderton,PA,KL4CJASB9FB190502,Buick,EncoreFWD,1.7e-05,0.000252,0.032072,0.014197,0.000692
2,15995,2015,20496,McDonough,GA,KL4CJBSB2FB101567,Buick,EncoreConvenience,2e-05,0.000658,0.044923,0.014197,0.000614
3,17991,2016,14806,Tucson,AZ,KL4CJASB7GB620898,Buick,EncoreFWD,7e-06,0.005112,0.027509,0.014197,0.000692
4,3950,2003,176935,Elkhart,IN,3G5DA03E93S607122,Buick,RendezvousCXL,4e-06,0.000746,0.017738,0.014197,9e-06


Droping all the columns that are not necessary for the model prediction

In [7]:
df.drop(columns=['City', 'State','Make','Model','Vin'],inplace=True)

Creating new features and columns 
>Creating year column : how much is the car driven
>avg_mileage_per_year : mileage of the car with respect to year

In [5]:
import datetime

current_year = datetime.datetime.now().year
df['Years_Ago'] = current_year - df['Year']

In [6]:
df['Avg_Mileage_Per_Year'] = df['Mileage'] / df['Years_Ago']

In [10]:
df.head()

Unnamed: 0,Price,Year,Mileage,Mileage_freq_encoded,City_freq_encoded,State_freq_encoded,Make_freq_encoded,Model_freq_encoded,Years_Ago,Avg_Mileage_Per_Year
0,15777,2015,25195,1.7e-05,0.000479,0.010339,0.014197,0.000692,8,3149.375
1,16895,2015,21026,1.7e-05,0.000252,0.032072,0.014197,0.000692,8,2628.25
2,15995,2015,20496,2e-05,0.000658,0.044923,0.014197,0.000614,8,2562.0
3,17991,2016,14806,7e-06,0.005112,0.027509,0.014197,0.000692,7,2115.142857
4,3950,2003,176935,4e-06,0.000746,0.017738,0.014197,9e-06,20,8846.75


Checkinh the correlation matrix of the new features

In [11]:
df.corr()

Unnamed: 0,Price,Year,Mileage,Mileage_freq_encoded,City_freq_encoded,State_freq_encoded,Make_freq_encoded,Model_freq_encoded,Years_Ago,Avg_Mileage_Per_Year
Price,1.0,0.415153,-0.435329,0.094281,0.007772,0.038477,-0.075001,0.065202,-0.415153,-0.379339
Year,0.415153,1.0,-0.766522,0.143218,0.010739,0.053831,0.028228,0.010462,-1.0,-0.450445
Mileage,-0.435329,-0.766522,1.0,-0.174414,-0.00541,-0.049662,0.041204,0.019039,0.766522,0.879893
Mileage_freq_encoded,0.094281,0.143218,-0.174414,1.0,-0.000645,0.009725,-0.009937,-0.008617,-0.143218,-0.179513
City_freq_encoded,0.007772,0.010739,-0.00541,-0.000645,1.0,0.304702,-0.031554,-0.023084,-0.010739,-0.000779
State_freq_encoded,0.038477,0.053831,-0.049662,0.009725,0.304702,1.0,-0.029272,-0.020618,-0.053831,-0.036278
Make_freq_encoded,-0.075001,0.028228,0.041204,-0.009937,-0.031554,-0.029272,1.0,0.235334,-0.028228,0.081
Model_freq_encoded,0.065202,0.010462,0.019039,-0.008617,-0.023084,-0.020618,0.235334,1.0,-0.010462,0.036305
Years_Ago,-0.415153,-1.0,0.766522,-0.143218,-0.010739,-0.053831,-0.028228,-0.010462,1.0,0.450445
Avg_Mileage_Per_Year,-0.379339,-0.450445,0.879893,-0.179513,-0.000779,-0.036278,0.081,0.036305,0.450445,1.0


Spliting the dataset into x_train,x_test in a ratio of 80% - 20%

In [12]:
from sklearn.model_selection import train_test_split
x = df[['Year','Mileage','Mileage_freq_encoded','City_freq_encoded','State_freq_encoded','Make_freq_encoded','Model_freq_encoded','Years_Ago','Avg_Mileage_Per_Year']]
y = df['Price']
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.2,random_state=42)

In [13]:
x_train

Unnamed: 0,Year,Mileage,Mileage_freq_encoded,City_freq_encoded,State_freq_encoded,Make_freq_encoded,Model_freq_encoded,Years_Ago,Avg_Mileage_Per_Year
534355,2014,16523,0.000011,0.000527,0.011446,0.027290,0.003658,9,1835.888889
245288,2014,21005,0.000026,0.001475,0.097747,0.016108,0.003016,9,2333.888889
30990,2014,19005,0.000017,0.008147,0.017055,0.116168,0.013648,9,2111.666667
72078,2005,125280,0.000009,0.000089,0.026573,0.041881,0.000386,18,6960.000000
465651,2013,31340,0.000028,0.002782,0.003981,0.057130,0.012923,10,3134.000000
...,...,...,...,...,...,...,...,...,...
110268,2012,90104,0.000006,0.000137,0.019943,0.120764,0.029325,11,8191.272727
259178,2017,8432,0.000015,0.000939,0.097747,0.031137,0.000288,6,1405.333333
365838,2014,36603,0.000017,0.000108,0.092157,0.057130,0.002645,9,4067.000000
131932,2015,72141,0.000002,0.003125,0.027179,0.120764,0.002281,8,9017.625000


In [14]:
y_train

534355    15991
245288    13499
30990     32500
72078      8995
465651    13995
          ...  
110268    19944
259178    36525
365838    15988
131932    11980
121958    20300
Name: Price, Length: 431300, dtype: int64

Using minmaxscaler to convert all the larger values and bringing them into a ascale of 0-1 for model prediction

In [15]:
from sklearn.preprocessing import MinMaxScaler
# Create an instance of MinMaxScaler
scaler = MinMaxScaler()

# Scale the features using fit_transform() method
X_train_scaled = scaler.fit_transform(x_train)
X_test_scaled = scaler.transform(x_test)

In [16]:
X_train_scaled[0]

array([0.80952381, 0.00578323, 0.01096491, 0.03213353, 0.09949033,
       0.22596996, 0.12467582, 0.19047619, 0.00746799])

Linear Regression

In [17]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error,mean_absolute_error,r2_score
from sklearn.model_selection import cross_val_score

In [18]:
# Create linear regression model
lr_model = LinearRegression()

# Train model on training set
lr_model.fit(X_train_scaled, y_train)

# Make predictions on test set
y_pred = lr_model.predict(X_test_scaled)

# Calculate root mean squared error on test set
mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)
print("R2:",r2)
print("Mae:",mae)
print("Mse:",mse)
print("RMSE:", rmse)

R2: 0.234116704066608
Mae: 7454.011453638269
Mse: 144711770.31146476
RMSE: 12029.620538964009


Regid Model

In [19]:
from sklearn.linear_model import Ridge

# Create ridge regression model
ridge_model = Ridge(alpha=1.0)

# Train model on training set
ridge_model.fit(X_train_scaled, y_train)

# Make predictions on test set
y_pred = ridge_model.predict(X_test_scaled)

# Calculate R2 and MSE on test set
r2 = r2_score(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)

print("R2:", r2)
print("MAE:", mae)
print("MSE:", mse)

R2: 0.23352679302390833
MAE: 7454.011453638269
MSE: 144823232.5561287


Random Forest Regression model With PCA 

In [23]:
from sklearn.decomposition import PCA

# Apply PCA to reduce dimensionality
pca = PCA(n_components=4)
X_pca = pca.fit_transform(X_train_scaled)

# Train random forest model
rf_model = RandomForestRegressor(n_estimators=40, random_state=20)
rf_model.fit(X_train_scaled, y_train)

# Make predictions on test set
y_pred = rf_model.predict(X_test_scaled)

# Evaluate model performance
r2 = r2_score(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
medae = median_absolute_error(y_test, y_pred)

print("R2:", r2)
print("MSE:", mse)
print("MAE:", mae)
print("MedAE:", medae)


NameError: name 'RandomForestRegressor' is not defined

A deep learning ANN model 

In [20]:
import tensorflow as tf
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense
tf.random.set_seed(42)

model_3 = tf.keras.Sequential([
    tf.keras.layers.Dense(100,activation="relu"),
    tf.keras.layers.Dense(10,activation="relu"),
    tf.keras.layers.Dense(1)
])

model_3.compile(loss = tf.keras.losses.MAE,optimizer=tf.keras.optimizers.Adam(lr=0.1),metrics=['mae'])

history = model_3.fit(X_train_scaled,y_train,epochs=15)



Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


In [21]:
# Create a DataFrame to store the accuracy scores
df_scores = pd.DataFrame({'Model': ['Linear Regression', 'Ridge Regression', 'Random Forest'],
                          'R2':[0.29913302194353664, 0.12978522204812015, 0.29911531692534155],
                         "MAE":[ 4131.940411928735,2420.1356765134137, 2420.1356765134137],
                         "MSE":[0.882059874974617,160617726.43756232, 22284497.355930045],
                   })

In [22]:
df_scores

Unnamed: 0,Model,R2,MAE,MSE
0,Linear Regression,0.299133,4131.940412,0.8820599
1,Ridge Regression,0.129785,2420.135677,160617700.0
2,Random Forest,0.299115,2420.135677,22284500.0


In [28]:
df.head()

Unnamed: 0,Price,Year,Mileage,City,State,Vin,Make,Model
0,15777,2015,25195,New Orleans,LA,KL4CJASB4FB217542,Buick,EncoreFWD
1,16895,2015,21026,Souderton,PA,KL4CJASB9FB190502,Buick,EncoreFWD
2,15995,2015,20496,McDonough,GA,KL4CJBSB2FB101567,Buick,EncoreConvenience
3,17991,2016,14806,Tucson,AZ,KL4CJASB7GB620898,Buick,EncoreFWD
4,3950,2003,176935,Elkhart,IN,3G5DA03E93S607122,Buick,RendezvousCXL


In [7]:
df.drop(columns=['Vin','Make'],inplace=True)

In [8]:
df.to_csv('modeling.csv', index=False)

In [9]:
df1 = pd.read_csv("modeling.csv")
df1.head()

Unnamed: 0,Price,Year,Mileage,City,State,Model,Years_Ago,Avg_Mileage_Per_Year
0,15777,2015,25195,New Orleans,LA,EncoreFWD,8,3149.375
1,16895,2015,21026,Souderton,PA,EncoreFWD,8,2628.25
2,15995,2015,20496,McDonough,GA,EncoreConvenience,8,2562.0
3,17991,2016,14806,Tucson,AZ,EncoreFWD,7,2115.142857
4,3950,2003,176935,Elkhart,IN,RendezvousCXL,20,8846.75


In [10]:
df1.shape

(539126, 8)