In [None]:
pip install category_encoders

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
!pip install scikit-optimize

# Data Preprocessing

In [None]:
import pandas as pd

# Read the source CSV into a DataFrame. The string below is a placeholder and
# must be replaced with the actual location of the file.
df = pd.read_csv(filepath_or_buffer="Your folder path")

In [None]:
# Per-column count of missing values (summing the boolean mask down the rows).
df.isnull().sum(axis=0)

In [None]:
# Replace missing transmission entries with 'automatic'.
# The result is assigned back to the column instead of calling
# fillna(inplace=True) on the selection: the in-place form is chained
# assignment, which pandas 2.x warns about and which leaves `df` untouched
# once Copy-on-Write is active.
df['transmission'] = df['transmission'].fillna('automatic')

In [None]:
# Discard every row that has a missing value in at least one of these columns.
df = df.dropna(
    how='any',
    subset=[
        'vin', 'state', 'sellingprice', 'saledate',
        'make', 'model', 'trim', 'body',
        'condition', 'odometer', 'color', 'interior',
        'mmr',
    ],
)

In [None]:
# Columns treated as categorical; these are the ones passed to the encoder.
categorical_columns = [
    'make',
    'model',
    'trim',
    'body',
    'transmission',
    'vin',
    'state',
    'color',
    'interior',
    'seller',
    'saledate',
]

In [None]:
from category_encoders import TargetEncoder

# Work on a copy so the original frame keeps its raw categorical values.
df_encoded = df.copy()

# Fit a target encoder on the categorical columns, using 'mmr' as the target,
# and overwrite those columns in the copy with their encoded values.
encoder = TargetEncoder(cols=categorical_columns)
df_encoded[categorical_columns] = encoder.fit_transform(
    df[categorical_columns],
    df['mmr'],
)


In [None]:
# Pairwise Pearson correlations between the columns of the encoded frame
# (Pearson is the default method of DataFrame.corr).
correlation_matrix = df_encoded.corr()

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Draw the correlation matrix as an annotated heatmap on a 12x8 figure.
# sns.heatmap returns the Axes it drew on, so the title is set on it directly.
plt.figure(figsize=(12, 8))
sns.heatmap(
    correlation_matrix,
    annot=True,
    fmt=".2f",
    cmap='coolwarm',
).set_title("Correlation Heatmap with Categorical Variables")
plt.show()


In [None]:
# Columns judged (by the author) to be less related to the target; they are
# removed from the working frame.
cols_to_drop = ['transmission','color','vin','interior','state','sellingprice','saledate']

# Drop those columns and give the remaining ones more descriptive names.
# 'vin' has no entry in the rename map because it is dropped in this same
# statement, so a mapping for it would never apply.
df = df.drop(columns=cols_to_drop).rename(
    columns={
        'year': 'manufacture_year',
        'make': 'brand',
        'model': 'specific_model',
        'trim': 'additional_designation',
        'mmr': 'estimated_value',
    }
)



In [None]:
# Show the first five rows of the trimmed, renamed frame.
df.head(n=5)

In [None]:
# Count rows that repeat an earlier row exactly; the first occurrence of each
# group is not flagged.
num_duplicates = df.duplicated(keep='first').sum()
print(f'Number of duplicate rows: {num_duplicates}')

In [None]:
# Keep only the first occurrence of each fully duplicated row.
df = df.drop_duplicates()

In [None]:
# First and third quartiles of the target column and the spread between them.
Q1, Q3 = (df['estimated_value'].quantile(q) for q in (0.25, 0.75))
IQR = Q3 - Q1

# Fences placed 1.5 * IQR beyond each quartile.
lower_bound, upper_bound = Q1 - 1.5 * IQR, Q3 + 1.5 * IQR

# Rows whose target value falls strictly outside the fences.
outliers = df.loc[
    (df['estimated_value'] < lower_bound) | (df['estimated_value'] > upper_bound)
]

In [None]:
# Box plot of the target column before any rows are filtered out.
# DataFrame.boxplot returns the Axes it drew on, which is titled directly.
df[['estimated_value']].boxplot().set_title('Boxplot of estimated value')
plt.show()

In [None]:
# Keep only the rows whose target value lies within the fences (inclusive).
df = df.loc[
    (df['estimated_value'] >= lower_bound) & (df['estimated_value'] <= upper_bound)
]

# Re-draw the box plot on the filtered data.
df[['estimated_value']].boxplot().set_title('Boxplot of MMR After Outlier Handling')
plt.show()

In [None]:
# Split the data into training and testing sets.

from sklearn.model_selection import train_test_split

# Target is 'estimated_value'; the features are every other column.
y = df['estimated_value']
x = df.drop(columns=['estimated_value'])

# Hold out 20% of the rows for testing; the fixed random_state makes the
# split repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=8,
)