**Setup**

In [161]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mutual_info_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import mean_squared_error
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import Ridge
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import OneHotEncoder

In [84]:
print('hello')

hello


In [85]:
# copy data into local data driectory
# !wget -O ./data/car_data.csv https://raw.githubusercontent.com/alexeygrigorev/mlbookcamp-code/master/chapter-02-car-price/data.csv

In [86]:
# load data into pandas dataframe
df = pd.read_csv ('data/car_data.csv')
df.shape

(11914, 16)

In [87]:
# list original columns
df.columns

Index(['Make', 'Model', 'Year', 'Engine Fuel Type', 'Engine HP',
       'Engine Cylinders', 'Transmission Type', 'Driven_Wheels',
       'Number of Doors', 'Market Category', 'Vehicle Size', 'Vehicle Style',
       'highway MPG', 'city mpg', 'Popularity', 'MSRP'],
      dtype='object')

In [88]:
# select only key columns
cols=['Make', 'Model', 'Year', 'Engine HP', 'Engine Cylinders', 
      'Transmission Type', 'Vehicle Style','highway MPG', 'city mpg', 'MSRP']

df = df[cols]
df.shape

(11914, 10)

In [89]:
# format column names
df.columns = df.columns.str.replace(' ', '_').str.lower()
df.columns

Index(['make', 'model', 'year', 'engine_hp', 'engine_cylinders',
       'transmission_type', 'vehicle_style', 'highway_mpg', 'city_mpg',
       'msrp'],
      dtype='object')

In [90]:
# rename msrp to price
df.rename(columns={"msrp": "price"}, inplace=True)
df.columns

Index(['make', 'model', 'year', 'engine_hp', 'engine_cylinders',
       'transmission_type', 'vehicle_style', 'highway_mpg', 'city_mpg',
       'price'],
      dtype='object')

In [91]:
df.shape

(11914, 10)

In [92]:
# fill missing values with 0
print('values before:\n', df.isnull().sum()) 
df = df.fillna(0)

values before:
 make                  0
model                 0
year                  0
engine_hp            69
engine_cylinders     30
transmission_type     0
vehicle_style         0
highway_mpg           0
city_mpg              0
price                 0
dtype: int64


In [93]:
# make a copy of data in case required to be restored later
df_copy = df.copy()
df_copy.shape

(11914, 10)

**1: Most frequent value for `transmission_type`**

In [94]:
df['transmission_type'].value_counts().max()

8266

**2: Features with biggest correlation**

In [95]:
print('correlation engine_hp and year:', df.engine_hp.corr(df.year).round(4))
print('correlation engine_hp and engine_cylinders:', df.engine_hp.corr(df.engine_cylinders).round(4))
print('correlation highway_mpg and engine_cylinders:', df.highway_mpg.corr(df.engine_cylinders).round(4))
print('correlation highway_mpg and city_mpg:', df.highway_mpg.corr(df.city_mpg).round(4))

correlation engine_hp and year: 0.3387
correlation engine_hp and engine_cylinders: 0.7749
correlation highway_mpg and engine_cylinders: -0.6145
correlation highway_mpg and city_mpg: 0.8868


*Make price binary*

In [96]:
mean = df.price.mean()
mean

40594.737032063116

In [97]:
df['price'] = (df['price'] > mean).astype(int)

In [98]:
df.rename(columns={"price": "above_average"}, inplace=True)

*Split the data*

In [99]:
# setup train/validation/test split with Scikit-Learn
df_full_train, df_test = train_test_split(df, test_size=0.2, random_state=42)
df_train, df_val = train_test_split(df_full_train, test_size=0.25, random_state=42)

In [100]:
len(df_train), len(df_val), len(df_test)

(7148, 2383, 2383)

In [101]:
df_train = df_train.reset_index(drop=True)
df_val = df_val.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)

In [102]:
y_train = df_train.above_average.values
y_val = df_val.above_average.values
y_test = df_test.above_average.values

del df_train['above_average']
del df_val['above_average']
del df_test['above_average']

**3: Feature with the lowest mutual information score**

In [103]:
df.dtypes

make                  object
model                 object
year                   int64
engine_hp            float64
engine_cylinders     float64
transmission_type     object
vehicle_style         object
highway_mpg            int64
city_mpg               int64
above_average          int64
dtype: object

In [104]:
cat = list(df.dtypes[df.dtypes == 'object'].index)
cat

['make', 'model', 'transmission_type', 'vehicle_style']

In [105]:
num = list(df.dtypes[(df.dtypes == 'int64') | (df.dtypes == 'float64')].index)
num

['year',
 'engine_hp',
 'engine_cylinders',
 'highway_mpg',
 'city_mpg',
 'above_average']

In [106]:
num.remove('above_average')
num

['year', 'engine_hp', 'engine_cylinders', 'highway_mpg', 'city_mpg']

In [107]:
def mutual_info_price(series):
    return mutual_info_score(series, df.above_average)

In [108]:
mi = df[cat].apply(mutual_info_price)
mi.sort_values(ascending=False)

model                0.457469
make                 0.237731
vehicle_style        0.082633
transmission_type    0.019954
dtype: float64

**4: Accuracy of the model**

In [109]:
# standardise numerical values
X_train_num = df_train[num].values

#scaler = StandardScaler()
#scaler = MinMaxScaler()

# X_train_num = scaler.fit_transform(X_train_num)

In [110]:
df[cat].head()

Unnamed: 0,make,model,transmission_type,vehicle_style
0,BMW,1 Series M,MANUAL,Coupe
1,BMW,1 Series,MANUAL,Convertible
2,BMW,1 Series,MANUAL,Coupe
3,BMW,1 Series,MANUAL,Coupe
4,BMW,1 Series,MANUAL,Convertible


In [111]:
# perform one hot engoding on categorical values
ohe = OneHotEncoder(sparse=False, handle_unknown='ignore')

In [112]:
X_train_cat = ohe.fit_transform(df_train[cat].values)



In [113]:
ohe.get_feature_names_out()

array(['x0_Acura', 'x0_Alfa Romeo', 'x0_Aston Martin', 'x0_Audi',
       'x0_BMW', 'x0_Bentley', 'x0_Bugatti', 'x0_Buick', 'x0_Cadillac',
       'x0_Chevrolet', 'x0_Chrysler', 'x0_Dodge', 'x0_FIAT', 'x0_Ferrari',
       'x0_Ford', 'x0_GMC', 'x0_Genesis', 'x0_HUMMER', 'x0_Honda',
       'x0_Hyundai', 'x0_Infiniti', 'x0_Kia', 'x0_Lamborghini',
       'x0_Land Rover', 'x0_Lexus', 'x0_Lincoln', 'x0_Lotus',
       'x0_Maserati', 'x0_Maybach', 'x0_Mazda', 'x0_McLaren',
       'x0_Mercedes-Benz', 'x0_Mitsubishi', 'x0_Nissan', 'x0_Oldsmobile',
       'x0_Plymouth', 'x0_Pontiac', 'x0_Porsche', 'x0_Rolls-Royce',
       'x0_Saab', 'x0_Scion', 'x0_Spyker', 'x0_Subaru', 'x0_Suzuki',
       'x0_Tesla', 'x0_Toyota', 'x0_Volkswagen', 'x0_Volvo',
       'x1_1 Series', 'x1_100', 'x1_124 Spider', 'x1_190-Class', 'x1_2',
       'x1_2 Series', 'x1_200', 'x1_200SX', 'x1_240', 'x1_240SX', 'x1_3',
       'x1_3 Series', 'x1_3 Series Gran Turismo', 'x1_300',
       'x1_300-Class', 'x1_3000GT', 'x1_300M', 'x1_30

In [114]:
# concatenate X_train_num and X_train_cat]
X_train = np.column_stack([X_train_num, X_train_cat])

In [115]:
model = LogisticRegression(solver='liblinear', C=10, max_iter=1000, random_state=42)
model.fit(X_train, y_train)

In [116]:
#set up validation data
X_val_num = df_val[num].values
# X_val_num = scaler.transform(X_val_num)

X_val_cat = ohe.transform(df_val[cat].values)

X_val = np.column_stack([X_val_num, X_val_cat])

In [117]:
# calculate model accuracy
y_pred = model.predict_proba(X_val)[:, 1]
print('Accuracy: ', accuracy_score(y_val, y_pred >= 0.5))
print('Accuracy: ', accuracy_score(y_val, y_pred >= 0.5).round(2))


Accuracy:  0.9454469156525388
Accuracy:  0.95


**5: Feature selection - the smallest difference in accuracy**

Remove features and caclulate accuracy:
year,
engine_hp,
transmission_type,
city_mpg

In [118]:
X_train_full = X_train.copy()
X_val_full = X_val.copy()

*Remove year*

In [119]:
X_train = X_train_full
X_val = X_val_full

X_train = X_train[:,1:]
X_val = X_val[:,1:]

In [120]:
model = LogisticRegression(solver='liblinear', C=10, max_iter=1000, random_state=42)
model.fit(X_train, y_train)

In [121]:
# calculate model accuracy
y_pred = model.predict_proba(X_val)[:, 1]
print('Accuracy: ', accuracy_score(y_val, y_pred >= 0.5))

Accuracy:  0.9479647503147294


*Remove engine_hp*

In [122]:
X_train = X_train_full
X_val = X_val_full

#delete second column
X_train = np.delete(X_train,1,1)
X_val = np.delete(X_val,1,1)

In [123]:
model = LogisticRegression(solver='liblinear', C=10, max_iter=1000, random_state=42)
model.fit(X_train, y_train)

In [124]:
# calculate model accuracy
y_pred = model.predict_proba(X_val)[:, 1]
print('Accuracy: ', accuracy_score(y_val, y_pred >= 0.5))

Accuracy:  0.9232060428031893


*Remove transmission_type*

In [125]:
X_train = X_train_full
X_val = X_val_full

#delete second column
X_train = np.delete(X_train,slice(922,927),1)
X_val = np.delete(X_val,slice(922,927),1)

In [126]:
model = LogisticRegression(solver='liblinear', C=10, max_iter=1000, random_state=42)
model.fit(X_train, y_train)

In [127]:
# calculate model accuracy
y_pred = model.predict_proba(X_val)[:, 1]
print('Accuracy: ', accuracy_score(y_val, y_pred >= 0.5))

Accuracy:  0.9450272765421738


*Remove cty_mpg*

In [128]:
X_train = X_train_full
X_val = X_val_full

#delete second column
X_train = np.delete(X_train,4,1)
X_val = np.delete(X_val,4,1)

In [129]:
model = LogisticRegression(solver='liblinear', C=10, max_iter=1000, random_state=42)
model.fit(X_train, y_train)

In [130]:
# calculate model accuracy
y_pred = model.predict_proba(X_val)[:, 1]
print('Accuracy: ', accuracy_score(y_val, y_pred >= 0.5))

Accuracy:  0.9324381032312211


**6: Regression with Scikit-Learn. What's the best alpha?**

In [136]:
df = df_copy

In [138]:
df.price = np.log(df.price)
df

Unnamed: 0,make,model,year,engine_hp,engine_cylinders,transmission_type,vehicle_style,highway_mpg,city_mpg,price
0,BMW,1 Series M,2011,335.0,6.0,MANUAL,Coupe,26,19,10.739327
1,BMW,1 Series,2011,300.0,6.0,MANUAL,Convertible,28,19,10.612754
2,BMW,1 Series,2011,300.0,6.0,MANUAL,Coupe,28,20,10.500949
3,BMW,1 Series,2011,230.0,6.0,MANUAL,Coupe,28,18,10.290449
4,BMW,1 Series,2011,230.0,6.0,MANUAL,Convertible,28,18,10.448715
...,...,...,...,...,...,...,...,...,...,...
11909,Acura,ZDX,2012,300.0,6.0,AUTOMATIC,4dr Hatchback,23,16,10.739002
11910,Acura,ZDX,2012,300.0,6.0,AUTOMATIC,4dr Hatchback,23,16,10.945000
11911,Acura,ZDX,2012,300.0,6.0,AUTOMATIC,4dr Hatchback,23,16,10.832102
11912,Acura,ZDX,2013,300.0,6.0,AUTOMATIC,4dr Hatchback,23,16,10.838011


*Split the data*

In [139]:
# setup train/validation/test split with Scikit-Learn
df_full_train, df_test = train_test_split(df, test_size=0.2, random_state=42)
df_train, df_val = train_test_split(df_full_train, test_size=0.25, random_state=42)

In [140]:
len(df_train), len(df_val), len(df_test)

(7148, 2383, 2383)

In [141]:
df_train = df_train.reset_index(drop=True)
df_val = df_val.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)

In [142]:
y_train = df_train.price.values
y_val = df_val.price.values
y_test = df_test.price.values

del df_train['price']
del df_val['price']
del df_test['price']

In [145]:
df.head()

Unnamed: 0,make,model,year,engine_hp,engine_cylinders,transmission_type,vehicle_style,highway_mpg,city_mpg,price
0,BMW,1 Series M,2011,335.0,6.0,MANUAL,Coupe,26,19,10.739327
1,BMW,1 Series,2011,300.0,6.0,MANUAL,Convertible,28,19,10.612754
2,BMW,1 Series,2011,300.0,6.0,MANUAL,Coupe,28,20,10.500949
3,BMW,1 Series,2011,230.0,6.0,MANUAL,Coupe,28,18,10.290449
4,BMW,1 Series,2011,230.0,6.0,MANUAL,Convertible,28,18,10.448715


In [151]:
y_val.shape

(2383,)

*Setup numerical and catagorical features*

In [164]:
# standardise numerical values
X_train_num = df_train[num].values

scaler = StandardScaler()
#scaler = MinMaxScaler()

X_train_num = scaler.fit_transform(X_train_num)

In [153]:
df[cat].head()

Unnamed: 0,make,model,transmission_type,vehicle_style
0,BMW,1 Series M,MANUAL,Coupe
1,BMW,1 Series,MANUAL,Convertible
2,BMW,1 Series,MANUAL,Coupe
3,BMW,1 Series,MANUAL,Coupe
4,BMW,1 Series,MANUAL,Convertible


In [154]:
# perform one hot engoding on categorical values
ohe = OneHotEncoder(sparse=False, handle_unknown='ignore')

In [155]:
X_train_cat = ohe.fit_transform(df_train[cat].values)



In [156]:
ohe.get_feature_names_out()

array(['x0_Acura', 'x0_Alfa Romeo', 'x0_Aston Martin', 'x0_Audi',
       'x0_BMW', 'x0_Bentley', 'x0_Bugatti', 'x0_Buick', 'x0_Cadillac',
       'x0_Chevrolet', 'x0_Chrysler', 'x0_Dodge', 'x0_FIAT', 'x0_Ferrari',
       'x0_Ford', 'x0_GMC', 'x0_Genesis', 'x0_HUMMER', 'x0_Honda',
       'x0_Hyundai', 'x0_Infiniti', 'x0_Kia', 'x0_Lamborghini',
       'x0_Land Rover', 'x0_Lexus', 'x0_Lincoln', 'x0_Lotus',
       'x0_Maserati', 'x0_Maybach', 'x0_Mazda', 'x0_McLaren',
       'x0_Mercedes-Benz', 'x0_Mitsubishi', 'x0_Nissan', 'x0_Oldsmobile',
       'x0_Plymouth', 'x0_Pontiac', 'x0_Porsche', 'x0_Rolls-Royce',
       'x0_Saab', 'x0_Scion', 'x0_Spyker', 'x0_Subaru', 'x0_Suzuki',
       'x0_Tesla', 'x0_Toyota', 'x0_Volkswagen', 'x0_Volvo',
       'x1_1 Series', 'x1_100', 'x1_124 Spider', 'x1_190-Class', 'x1_2',
       'x1_2 Series', 'x1_200', 'x1_200SX', 'x1_240', 'x1_240SX', 'x1_3',
       'x1_3 Series', 'x1_3 Series Gran Turismo', 'x1_300',
       'x1_300-Class', 'x1_3000GT', 'x1_300M', 'x1_30

In [166]:
X_train

array([[2.011e+03, 2.250e+02, 6.000e+00, ..., 0.000e+00, 0.000e+00,
        0.000e+00],
       [2.009e+03, 2.760e+02, 6.000e+00, ..., 0.000e+00, 0.000e+00,
        0.000e+00],
       [2.012e+03, 5.700e+02, 1.000e+01, ..., 0.000e+00, 0.000e+00,
        0.000e+00],
       ...,
       [2.012e+03, 2.600e+02, 6.000e+00, ..., 0.000e+00, 0.000e+00,
        0.000e+00],
       [1.993e+03, 1.360e+02, 4.000e+00, ..., 0.000e+00, 0.000e+00,
        0.000e+00],
       [2.015e+03, 3.650e+02, 6.000e+00, ..., 0.000e+00, 1.000e+00,
        0.000e+00]])

In [168]:
# train model
model = Ridge(alpha=1.0, solver = 'sag', random_state = 42, max_iter=1000)
model.fit(X_train, y_train)



In [159]:
#set up validation data
X_val_num = df_val[num].values
# X_val_num = scaler.transform(X_val_num)

X_val_cat = ohe.transform(df_val[cat].values)

X_val = np.column_stack([X_val_num, X_val_cat])

In [162]:
# Make predictions on the test data
y_pred = model.predict(X_val)

# Calculate the Mean Squared Error (MSE) to evaluate the model's performance
mse = mean_squared_error(y_val, y_pred)

# Print the MSE and the coefficients of the model
print("Mean Squared Error:", mse)
# print("Coefficients:", model.coef_)

Mean Squared Error: 0.2370558279070917
Coefficients: [ 9.11414255e-02  3.35818115e-03  8.42715539e-02 -4.19502863e-03
  1.08177513e-02  1.30422042e-02  4.03621784e-03  4.80542276e-02
  1.65442186e-02  1.87266582e-02  5.88640943e-02  3.38951093e-03
 -8.75134502e-03  2.60904882e-02 -9.92389449e-02  1.41583381e-02
 -1.04672372e-01 -9.70569500e-03  6.34421985e-02 -6.67487175e-02
 -4.30535528e-02 -2.11395063e-03  7.47510290e-03 -3.38933848e-02
 -3.60602891e-02  1.97292626e-02 -3.94168384e-02  3.17953902e-02
  3.35209218e-02  2.70868306e-02  1.88462805e-02  3.04369807e-02
  2.72365282e-02  1.50003258e-02  1.60876113e-02  1.22291915e-03
 -1.90466277e-02  1.56649568e-02 -1.18092995e-01 -1.21610454e-02
 -3.75249440e-02  4.40944739e-02  3.37118521e-02  3.81878998e-02
  1.80917815e-02 -4.53479896e-03  8.38838371e-03 -2.72769613e-03
  7.70943679e-02  2.97808243e-02 -1.19089700e-01  4.06461242e-02
 -4.36140771e-02  2.34291895e-03 -1.01031536e-02 -4.39730389e-04
 -1.20816784e-03 -4.46081560e-04 -2.8

In [163]:
# This model also has a parameter alpha. Let's try the following values: [0, 0.01, 0.1, 1, 10]

for i in [0, 0.01, 0.1, 1, 10]:
    model = Ridge(alpha = i, solver = 'sag', random_state = 42, max_iter=1000)
    model.fit(X_train, y_train)
    
    y_pred = model.predict(X_val)
    print("Mean Squared Error for alpha=",i, " : ",mean_squared_error(y_val, y_pred))



Mean Squared Error for alpha= 0  :  0.23703259164685897




Mean Squared Error for alpha= 0.01  :  0.23703282407406479




Mean Squared Error for alpha= 0.1  :  0.23703491585123188




Mean Squared Error for alpha= 1  :  0.2370558279070917
Mean Squared Error for alpha= 10  :  0.23726437907108822


