In [408]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt

In [409]:
df = pd.read_csv('data.csv')

In [410]:
features = ['Make',
'Model',
'Year',
'Engine HP',
'Engine Cylinders',
'Transmission Type',
'Vehicle Style',
'highway MPG',
'city mpg',
'MSRP']

In [411]:
df = df[features]

In [412]:
df.columns = df.columns.str.replace(' ', '_').str.lower()

In [413]:
df.engine_hp = df.engine_hp.fillna(0)
df.engine_cylinders = df.engine_cylinders.fillna(0)

In [414]:
df.isna().any()

make                 False
model                False
year                 False
engine_hp            False
engine_cylinders     False
transmission_type    False
vehicle_style        False
highway_mpg          False
city_mpg             False
msrp                 False
dtype: bool

In [415]:
df = df.rename(columns={'msrp': 'price'})

In [416]:
categorical_columns = list(df.dtypes[df.dtypes == 'object'].index)

In [417]:
for c in categorical_columns:
    df[c] = df[c].str.lower().str.replace(' ', '_')

In [418]:
# Q1
df.transmission_type.value_counts()

automatic           8266
manual              2935
automated_manual     626
direct_drive          68
unknown               19
Name: transmission_type, dtype: int64

In [419]:
df.dtypes

make                  object
model                 object
year                   int64
engine_hp            float64
engine_cylinders     float64
transmission_type     object
vehicle_style         object
highway_mpg            int64
city_mpg               int64
price                  int64
dtype: object

In [420]:
numerical = ['year', 'engine_hp', 'engine_cylinders', 'highway_mpg', 'city_mpg']
categorical = ['make', 'model', 'transmission_type', 'vehicle_style']

In [421]:
df[numerical].head()

Unnamed: 0,year,engine_hp,engine_cylinders,highway_mpg,city_mpg
0,2011,335.0,6.0,26,19
1,2011,300.0,6.0,28,19
2,2011,300.0,6.0,28,20
3,2011,230.0,6.0,28,18
4,2011,230.0,6.0,28,18


In [422]:
# Q2
df[numerical].corr()

Unnamed: 0,year,engine_hp,engine_cylinders,highway_mpg,city_mpg
year,1.0,0.338714,-0.040708,0.25824,0.198171
engine_hp,0.338714,1.0,0.774851,-0.415707,-0.424918
engine_cylinders,-0.040708,0.774851,1.0,-0.614541,-0.587306
highway_mpg,0.25824,-0.415707,-0.614541,1.0,0.886829
city_mpg,0.198171,-0.424918,-0.587306,0.886829,1.0


In [423]:
df.price.mean()

40594.737032063116

In [424]:
df['above_average'] = (df.price > df.price.mean()).astype(int)

In [425]:
from sklearn.model_selection import train_test_split

In [426]:
df_full_train, df_test = train_test_split(df, test_size=0.2, random_state=42)

In [427]:
df_train, df_val = train_test_split(df_full_train, test_size=0.25, random_state=42)

In [428]:
len(df_train), len(df_test), len(df_val)

(7148, 2383, 2383)

In [429]:
y_train_price = df_train.price.values
y_val_price = df_val.price.values
y_test_price = df_test.price.values

y_full_train = df_full_train.above_average.values
y_train_above_average = df_train.above_average.values
y_val_above_average = df_val.above_average.values
y_test_above_average = df_test.above_average.values

In [430]:
del df_full_train['price']
del df_train['price']
del df_val['price']
del df_test['price']

del df_full_train['above_average']
del df_train['above_average']
del df_val['above_average']
del df_test['above_average']

In [431]:
df_full_train.head()

Unnamed: 0,make,model,year,engine_hp,engine_cylinders,transmission_type,vehicle_style,highway_mpg,city_mpg
3181,cadillac,ct6,2016,265.0,4.0,automatic,sedan,31,22
5357,mercedes-benz,gls-class,2017,449.0,8.0,automatic,4dr_suv,18,14
4874,kia,forte,2016,173.0,4.0,automatic,coupe,34,25
8102,dodge,ram_250,1993,180.0,6.0,manual,regular_cab_pickup,16,11
10400,hyundai,tiburon,2008,172.0,6.0,automatic,2dr_hatchback,24,17


In [432]:
from sklearn.metrics import mutual_info_score

In [433]:
def mutual_info_above_avg_score(series):
    score = mutual_info_score(series, y_train_above_average)
    return round(score, 2)

In [434]:
# Q3
df_train[categorical].apply(mutual_info_above_avg_score)

make                 0.24
model                0.46
transmission_type    0.02
vehicle_style        0.08
dtype: float64

In [435]:
from sklearn.feature_extraction import DictVectorizer

In [436]:
dv = DictVectorizer(sparse=False)

In [437]:
full_train_dict = df_full_train[categorical].to_dict(orient='records')
X_full_train = dv.fit_transform(full_train_dict)

val_dict = df_val[categorical].to_dict(orient='records')
X_val = dv.transform(val_dict)

In [438]:
from sklearn.linear_model import LogisticRegression

In [439]:
model = LogisticRegression(solver='liblinear', C=10, max_iter=1000, random_state=42)
model.fit(X_full_train, y_full_train)

In [440]:
y_pred = model.predict_proba(X_val)[:,1]

In [441]:
price_above_avg = (y_pred >= 0.5)

In [442]:
price_above_avg.astype(int)

array([0, 1, 0, ..., 0, 1, 1])

In [443]:
# Q4
accuracy = (price_above_avg == y_val_above_average).mean()
round(accuracy, 3)

0.944

In [444]:
# Q5
features = list(df_full_train.columns)
feature_impact = []

dv = DictVectorizer(sparse=False)
    
full_train_dict = df_full_train.to_dict(orient='records')
X_full_train = dv.fit_transform(full_train_dict)

val_dict = df_val.to_dict(orient='records')
X_val = dv.transform(val_dict)
    
model = LogisticRegression(solver='liblinear', C=10, max_iter=1000, random_state=42)
model.fit(X_full_train, y_full_train)
    
y_pred = model.predict_proba(X_val)[:,1]
price_above_avg = (y_pred >= 0.5)
    
global_accuracy = (price_above_avg == y_val_above_average).mean()

for removed_feature in features:
    target_cats = [f for f in features if f != removed_feature]
    
    dv = DictVectorizer(sparse=False)
    
    full_train_dict = df_full_train[target_cats].to_dict(orient='records')
    X_full_train = dv.fit_transform(full_train_dict)

    val_dict = df_val[target_cats].to_dict(orient='records')
    X_val = dv.transform(val_dict)
    
    model = LogisticRegression(solver='liblinear', C=10, max_iter=1000, random_state=42)
    model.fit(X_full_train, y_full_train)
    
    y_pred = model.predict_proba(X_val)[:,1]
    price_above_avg = (y_pred >= 0.5)
    
    accuracy = (price_above_avg == y_val_above_average).mean()
    feature_impact.append((removed_feature, accuracy, accuracy - global_accuracy))
    
ranked_feature_impact = sorted(feature_impact, key=lambda x: abs(x[2]))

print(f"Global accuracy: {round(global_accuracy, 4)}")
print('Least impactful feature by rank:')
for (removed_feature, accuracy, diff) in ranked_feature_impact:
    print(f"Removing feature '{removed_feature}': accuracy={round(accuracy, 4)} | diff={round(diff, 4)}")

Global accuracy: 0.9543
Least impactful feature by rank:
Removing feature 'transmission_type': accuracy=0.9543 | diff=0.0
Removing feature 'engine_cylinders': accuracy=0.9522 | diff=-0.0021
Removing feature 'year': accuracy=0.9589 | diff=0.0046
Removing feature 'highway_mpg': accuracy=0.9417 | diff=-0.0126
Removing feature 'engine_hp': accuracy=0.9387 | diff=-0.0155
Removing feature 'vehicle_style': accuracy=0.9375 | diff=-0.0168
Removing feature 'make': accuracy=0.9324 | diff=-0.0218
Removing feature 'city_mpg': accuracy=0.9324 | diff=-0.0218
Removing feature 'model': accuracy=0.9236 | diff=-0.0306


In [445]:
from sklearn.linear_model import Ridge

In [446]:
df['price'] = np.log1p(df['price'])

In [447]:
df_full_train, df_test = train_test_split(df, test_size=0.2, random_state=42)
df_train, df_val = train_test_split(df_full_train, test_size=0.25, random_state=42)

y_test, y_train, y_val = df_test.price.values, df_train.price.values, df_val.price.values

In [448]:
del df_test['price']
del df_train['price']
del df_val['price']

In [453]:
dv = DictVectorizer(sparse=True)
    
train_dict = df_train.to_dict(orient='records')
X_train = dv.fit_transform(train_dict)

val_dict = df_val.to_dict(orient='records')
X_val = dv.transform(val_dict)

In [455]:
# Q6
from sklearn.metrics import mean_squared_error

for a in [0, 0.01, 0.1, 1, 10]:
    model = Ridge(alpha=a, solver='sag', random_state=42)
    model.fit(X_train, y_train)
    
    y_pred = model.predict(X_val)
    rmse = np.sqrt(mean_squared_error(y_val, y_pred))
    
    print(f"a: {a} rmse: {round(rmse, 3)}")

a: 0 rmse: 0.248
a: 0.01 rmse: 0.248
a: 0.1 rmse: 0.248
a: 1 rmse: 0.252
a: 10 rmse: 0.33
