In [1]:
from pathlib import Path

import numpy as np
import pandas as pd

# Dataset

In [2]:
# !wget https://raw.githubusercontent.com/alexeygrigorev/mlbookcamp-code/master/chapter-02-car-price/data.csv

In [34]:
# Where the car-price dataset comes from. An absolute path into one user's
# Downloads folder cannot be resolved on any other machine, so a copy named
# 'data.csv' next to the notebook is used when present and the CSV is read
# directly from the course repository otherwise.
# NOTE(review): assumes the file at DATA_URL has the same contents as the
# local 'car-price_data.csv' that was read before - confirm.
DATA_URL = 'https://raw.githubusercontent.com/alexeygrigorev/mlbookcamp-code/master/chapter-02-car-price/data.csv'
DATA_PATH = Path('data.csv')

df = pd.read_csv(DATA_PATH if DATA_PATH.exists() else DATA_URL)
df.head()

Unnamed: 0,Make,Model,Year,Engine Fuel Type,Engine HP,Engine Cylinders,Transmission Type,Driven_Wheels,Number of Doors,Market Category,Vehicle Size,Vehicle Style,highway MPG,city mpg,Popularity,MSRP
0,BMW,1 Series M,2011,premium unleaded (required),335.0,6.0,MANUAL,rear wheel drive,2.0,"Factory Tuner,Luxury,High-Performance",Compact,Coupe,26,19,3916,46135
1,BMW,1 Series,2011,premium unleaded (required),300.0,6.0,MANUAL,rear wheel drive,2.0,"Luxury,Performance",Compact,Convertible,28,19,3916,40650
2,BMW,1 Series,2011,premium unleaded (required),300.0,6.0,MANUAL,rear wheel drive,2.0,"Luxury,High-Performance",Compact,Coupe,28,20,3916,36350
3,BMW,1 Series,2011,premium unleaded (required),230.0,6.0,MANUAL,rear wheel drive,2.0,"Luxury,Performance",Compact,Coupe,28,18,3916,29450
4,BMW,1 Series,2011,premium unleaded (required),230.0,6.0,MANUAL,rear wheel drive,2.0,Luxury,Compact,Convertible,28,18,3916,34500


# Data Preparation

In [4]:
# Columns of `df` kept for the rest of the notebook ('MSRP' is the price column)
features = (
    ['Make', 'Model', 'Year']
    + ['Engine HP', 'Engine Cylinders', 'Transmission Type', 'Vehicle Style']
    + ['highway MPG', 'city mpg']
    + ['MSRP']
)

# Take an independent copy so that later changes to `data` do not touch `df`
data = df.loc[:, features].copy()

In [5]:
# Normalise the column names: spaces become underscores, then everything is lower-cased
data.columns = [name.replace(' ', '_').lower() for name in data.columns]

In [6]:
# Number of missing values in each column
data.isnull().sum()

make                  0
model                 0
year                  0
engine_hp            69
engine_cylinders     30
transmission_type     0
vehicle_style         0
highway_mpg           0
city_mpg              0
msrp                  0
dtype: int64

In [7]:
# Replace every missing value in `data` with 0
data = data.fillna(0)

In [8]:
# Re-count missing values per column after the fill
data.isnull().sum()

make                 0
model                0
year                 0
engine_hp            0
engine_cylinders     0
transmission_type    0
vehicle_style        0
highway_mpg          0
city_mpg             0
msrp                 0
dtype: int64

In [9]:
# The 'msrp' column is called 'price' from here on
data = data.rename({'msrp': 'price'}, axis='columns')

In [10]:
# Show the first five rows of the prepared frame
data.head(5)

Unnamed: 0,make,model,year,engine_hp,engine_cylinders,transmission_type,vehicle_style,highway_mpg,city_mpg,price
0,BMW,1 Series M,2011,335.0,6.0,MANUAL,Coupe,26,19,46135
1,BMW,1 Series,2011,300.0,6.0,MANUAL,Convertible,28,19,40650
2,BMW,1 Series,2011,300.0,6.0,MANUAL,Coupe,28,20,36350
3,BMW,1 Series,2011,230.0,6.0,MANUAL,Coupe,28,18,29450
4,BMW,1 Series,2011,230.0,6.0,MANUAL,Convertible,28,18,34500


# Q1

In [11]:
# Most frequent value(s) of transmission_type, returned as a Series
data.loc[:, 'transmission_type'].mode()

0    AUTOMATIC
Name: transmission_type, dtype: object

In [12]:
# First entry of the mode Series as a plain value
data.loc[:, 'transmission_type'].mode().iloc[0]

'AUTOMATIC'

# Q2

### Q2.1 Features with the highest correlation

In [13]:
# Select columns by numeric dtype instead of by "dtype is not object": text
# columns are not guaranteed to carry the `object` dtype in every pandas
# version, and a text column that slips through cannot be correlated.
numeric_cols = data.select_dtypes(include='number').columns

# Pairwise correlation between the numeric columns, rounded to 4 decimals
correlation_matrix = data[numeric_cols].corr().round(4)

In [14]:
# Display the rounded correlation matrix of the numeric columns
correlation_matrix

Unnamed: 0,year,engine_hp,engine_cylinders,highway_mpg,city_mpg,price
year,1.0,0.3387,-0.0407,0.2582,0.1982,0.2276
engine_hp,0.3387,1.0,0.7749,-0.4157,-0.4249,0.6501
engine_cylinders,-0.0407,0.7749,1.0,-0.6145,-0.5873,0.5263
highway_mpg,0.2582,-0.4157,-0.6145,1.0,0.8868,-0.16
city_mpg,0.1982,-0.4249,-0.5873,0.8868,1.0,-0.1577
price,0.2276,0.6501,0.5263,-0.16,-0.1577,1.0


In [15]:
# Magnitude of each correlation, ignoring its sign
abs_correlation_matrix = np.abs(correlation_matrix)

In [16]:
# Boolean array that is True on the main diagonal and everywhere above it
mask = np.triu(np.ones(abs_correlation_matrix.shape, dtype=bool))

# Blank out (NaN) the masked cells: self-correlations are ignored and each
# pair of features is kept only once, in the lower triangle
abs_correlation_matrix = abs_correlation_matrix.where(~mask)

# (row label, column label) of the largest remaining absolute correlation
max_correlation_indices = abs_correlation_matrix.stack().idxmax()

max_correlation_indices

('city_mpg', 'highway_mpg')

In [17]:
# Unpack the (row label, column label) pair into two feature names
feature1, feature2 = max_correlation_indices
feature1, feature2

('city_mpg', 'highway_mpg')

### Make 'price' binary

In [18]:
# Binary target: 1 when the price is strictly greater than the mean price, otherwise 0
mean_price = data['price'].mean()
data['above_average'] = data['price'].gt(mean_price).astype(int)
data.head()

Unnamed: 0,make,model,year,engine_hp,engine_cylinders,transmission_type,vehicle_style,highway_mpg,city_mpg,price,above_average
0,BMW,1 Series M,2011,335.0,6.0,MANUAL,Coupe,26,19,46135,1
1,BMW,1 Series,2011,300.0,6.0,MANUAL,Convertible,28,19,40650,1
2,BMW,1 Series,2011,300.0,6.0,MANUAL,Coupe,28,20,36350,0
3,BMW,1 Series,2011,230.0,6.0,MANUAL,Coupe,28,18,29450,0
4,BMW,1 Series,2011,230.0,6.0,MANUAL,Convertible,28,18,34500,0


### Split the data

In [19]:
from sklearn.model_selection import train_test_split

# 60% / 20% / 20% train / validation / test split, seeded with 42 at both stages.
# Stage 1 holds out 20% of all rows as the test set.
df_full_train, df_test = train_test_split(data, test_size=0.2, random_state=42)
# Stage 2 takes 25% of the remaining 80% (i.e. 20% of all rows) for validation.
df_train, df_val = train_test_split(df_full_train, test_size=0.25, random_state=42)

tuple(len(part) for part in (df_train, df_val, df_test))

(7148, 2383, 2383)

In [20]:
# Move the target column out of each split: `pop` returns the 'above_average'
# column and removes it from the frame, so the target is no longer a feature
y_train = df_train.pop('above_average')
y_val = df_val.pop('above_average')
y_test = df_test.pop('above_average')

# Q3

In [21]:
# Mutual information between above_average (y_train) and every categorical
# column, computed on the training split only.
from sklearn.metrics import mutual_info_score

# Categorical columns are taken to be the non-numeric ones; selecting by
# "dtype == object" would miss text columns stored under another dtype.
categorical_cols = df_train.select_dtypes(exclude='number').columns

# The built-in round() on a plain float is used so the cell does not depend on
# the score being returned as a NumPy scalar with a `.round` method.
mutual_info_scores = {
    col: round(float(mutual_info_score(df_train[col], y_train)), 2)
    for col in categorical_cols
}

mutual_info_scores

{'make': 0.24, 'model': 0.46, 'transmission_type': 0.02, 'vehicle_style': 0.08}

In [22]:
# Feature with the lowest mutual information score (the first one wins on ties).
# Note: despite its name, `max_score` holds that lowest score.
feature, max_score = min(
    mutual_info_scores.items(),
    key=lambda item: item[1],
    default=('', np.inf),
)
feature, max_score

('transmission_type', 0.02)

# Q4

In [23]:
# One-hot encode the categorical features of the training, validation and test sets
from sklearn.preprocessing import OneHotEncoder


def _one_hot_frame(frame, encoder, columns):
    """Return `frame` with `columns` replaced by their one-hot encoding.

    The remaining columns keep their order and the encoded columns produced by
    the already fitted `encoder` are appended after them.
    """
    encoded = encoder.transform(frame[columns])
    return pd.concat([frame.drop(columns=columns), encoded], axis=1)


categorical_features = df_train.select_dtypes(exclude='number').columns

# The encoder learns its categories from the training split only, so nothing
# from the validation or test rows feeds into preprocessing. Categories that
# occur only outside the training split are encoded as all zeros
# (handle_unknown='ignore') instead of raising an error.
one_hot_encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore').set_output(transform='pandas')
one_hot_encoder_fit = one_hot_encoder.fit(df_train[categorical_features])

# Apply the same fitted encoder to all three splits
df_train_onehot = _one_hot_frame(df_train, one_hot_encoder_fit, categorical_features)
df_val_onehot = _one_hot_frame(df_val, one_hot_encoder_fit, categorical_features)
df_test_onehot = _one_hot_frame(df_test, one_hot_encoder_fit, categorical_features)

df_train_onehot.head()

Unnamed: 0,year,engine_hp,engine_cylinders,highway_mpg,city_mpg,price,make_Acura,make_Alfa Romeo,make_Aston Martin,make_Audi,...,vehicle_style_Convertible,vehicle_style_Convertible SUV,vehicle_style_Coupe,vehicle_style_Crew Cab Pickup,vehicle_style_Extended Cab Pickup,vehicle_style_Passenger Minivan,vehicle_style_Passenger Van,vehicle_style_Regular Cab Pickup,vehicle_style_Sedan,vehicle_style_Wagon
3972,2011,225.0,6.0,19,15,33599,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1997,2009,276.0,6.0,21,17,26245,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5216,2012,570.0,10.0,20,12,248000,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2805,2016,200.0,4.0,27,20,24990,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
11369,2009,158.0,4.0,26,20,20475,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [24]:
# Keep snapshots that still contain 'price'; they are needed again for question 6
df_train_onehot_with_price = df_train_onehot.copy()
df_val_onehot_with_price = df_val_onehot.copy()
df_test_onehot_with_price = df_test_onehot.copy()

# 'above_average' is derived directly from 'price', so leaving 'price' among the
# inputs of the classifier would leak the target (trivially perfect accuracy)
df_train_onehot = df_train_onehot.drop(columns='price')
df_val_onehot = df_val_onehot.drop(columns='price')
df_test_onehot = df_test_onehot.drop(columns='price')

In [25]:
from sklearn.linear_model import LogisticRegression

# Fit the classifier on the training split only; the validation split is kept
# aside for scoring
model = LogisticRegression(
    C=10,
    solver='liblinear',
    max_iter=1000,
    random_state=42,
).fit(df_train_onehot, y_train)
model

In [26]:
from sklearn.metrics import accuracy_score

# Accuracy of the fitted model on the validation split, shown rounded to 2 decimals;
# the unrounded value stays in `acc_orig` for later comparisons
y_pred = model.predict(df_val_onehot)
acc_orig = accuracy_score(y_true=y_val, y_pred=y_pred)
round(acc_orig, 2)

0.95

# Q5

In [27]:
# Features that are left out one at a time in the feature-elimination experiment
features_to_drop = ['year', 'engine_hp', 'transmission_type', 'city_mpg']

In [28]:
# Refit the classifier once per entry of `features_to_drop`, each time without
# that feature, and record the validation accuracy of every reduced model
acc_drop = {}

for feature_to_drop in features_to_drop:
    # A numeric feature is a single column with exactly this name; a one-hot
    # encoded feature is spread over columns named '<feature>_<category>'.
    # Matching on the bare prefix alone would also remove unrelated columns
    # whose names merely begin with the same text.
    all_columns = df_train_onehot.columns
    belongs_to_feature = (all_columns == feature_to_drop) | all_columns.str.startswith(feature_to_drop + '_')
    feature_to_drop_list = all_columns[belongs_to_feature].tolist()

    # `drop` already returns new frames, so no extra copy is needed
    df_train_onehot_drop = df_train_onehot.drop(columns=feature_to_drop_list)
    df_val_onehot_drop = df_val_onehot.drop(columns=feature_to_drop_list)

    # Same hyper-parameters as the full model so only the feature set differs
    model = LogisticRegression(solver='liblinear', C=10, max_iter=1000, random_state=42)
    model.fit(df_train_onehot_drop, y_train)
    y_pred = model.predict(df_val_onehot_drop)
    acc_drop[feature_to_drop] = accuracy_score(y_val, y_pred)

acc_drop

{'year': 0.9483843894250944,
 'engine_hp': 0.9185900125891733,
 'transmission_type': 0.9458665547629039,
 'city_mpg': 0.9458665547629039}

In [29]:
# Absolute change in validation accuracy caused by leaving out each feature
acc_drop_diff = {
    name: np.abs(accuracy - acc_orig)
    for name, accuracy in acc_drop.items()
}

# Feature whose removal changes the accuracy the least (the first one wins on ties)
feature_with_min_difference, min_difference = min(
    acc_drop_diff.items(),
    key=lambda item: item[1],
    default=('', np.inf),
)

display(acc_drop_diff)
display(feature_with_min_difference, min_difference)

{'year': 0.003357112882920621,
 'engine_hp': 0.0264372639530005,
 'transmission_type': 0.0008392782207301552,
 'city_mpg': 0.0008392782207301552}

'transmission_type'

0.0008392782207301552

# Q6

In [30]:
# Start again from the snapshots that still contain the 'price' column
df_train_onehot, df_val_onehot, df_test_onehot = (
    snapshot.copy()
    for snapshot in (
        df_train_onehot_with_price,
        df_val_onehot_with_price,
        df_test_onehot_with_price,
    )
)

In [31]:
# Regression targets: the natural log of 'price'. `pop` returns the column and
# removes it from the frame in the same step, so 'price' is no longer a feature
y_train = np.log(df_train_onehot.pop('price'))
y_val = np.log(df_val_onehot.pop('price'))
y_test = np.log(df_test_onehot.pop('price'))

In [32]:
# Ridge regression on the log-price target for several regularisation strengths
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error

alphas = [0, 0.01, 0.1, 1, 10]
rmses = {}  # alpha -> validation RMSE rounded to 3 decimals

for alpha in alphas:
    print(f'alpha = {alpha} ...')

    # Fit a ridge model with this alpha on the training split
    model = Ridge(alpha=alpha, solver='sag', random_state=42)
    model.fit(df_train_onehot, y_train)

    # RMSE on the validation split, computed as sqrt(MSE): the `squared=False`
    # keyword of mean_squared_error is deprecated and no longer accepted by
    # recent scikit-learn releases, while this form works on every version.
    y_pred = model.predict(df_val_onehot)
    rmse = np.sqrt(mean_squared_error(y_val, y_pred))
    rmses[alpha] = round(float(rmse), 3)

# NOTE(review): the recorded run shows the same RMSE for every alpha - worth
# checking whether the 'sag' solver converges on these unscaled features.
rmses


alpha = 0 ...




alpha = 0.01 ...




alpha = 0.1 ...




alpha = 1 ...




alpha = 10 ...




{0: 0.487, 0.01: 0.487, 0.1: 0.487, 1: 0.487, 10: 0.487}

In [33]:
# Alpha with the smallest validation RMSE; on ties the alpha tried first is kept
best_alpha, min_rmse = min(
    rmses.items(),
    key=lambda item: item[1],
    default=(0, np.inf),
)

best_alpha

0