In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import mutual_info_score
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import Ridge

In [2]:
#!wget https://raw.githubusercontent.com/alexeygrigorev/mlbookcamp-code/master/chapter-02-car-price/data.csv

In [3]:
df = pd.read_csv("data.csv")

In [4]:
df.head()

Unnamed: 0,Make,Model,Year,Engine Fuel Type,Engine HP,Engine Cylinders,Transmission Type,Driven_Wheels,Number of Doors,Market Category,Vehicle Size,Vehicle Style,highway MPG,city mpg,Popularity,MSRP
0,BMW,1 Series M,2011,premium unleaded (required),335.0,6.0,MANUAL,rear wheel drive,2.0,"Factory Tuner,Luxury,High-Performance",Compact,Coupe,26,19,3916,46135
1,BMW,1 Series,2011,premium unleaded (required),300.0,6.0,MANUAL,rear wheel drive,2.0,"Luxury,Performance",Compact,Convertible,28,19,3916,40650
2,BMW,1 Series,2011,premium unleaded (required),300.0,6.0,MANUAL,rear wheel drive,2.0,"Luxury,High-Performance",Compact,Coupe,28,20,3916,36350
3,BMW,1 Series,2011,premium unleaded (required),230.0,6.0,MANUAL,rear wheel drive,2.0,"Luxury,Performance",Compact,Coupe,28,18,3916,29450
4,BMW,1 Series,2011,premium unleaded (required),230.0,6.0,MANUAL,rear wheel drive,2.0,Luxury,Compact,Convertible,28,18,3916,34500


In [5]:
columns_to_use = ["Make", "Model", "Year", "Engine HP", "Engine Cylinders", "Transmission Type", "Vehicle Style", "highway MPG", "city mpg", "MSRP"]
df_sorted = df[columns_to_use]

In [6]:
 df_sorted.columns = df_sorted.columns.str.lower().str.replace(" ", "_")

In [7]:
df_sorted = df_sorted.rename({"msrp" : "price"}, axis="columns")

In [8]:
df_sorted.isnull().sum()

make                  0
model                 0
year                  0
engine_hp            69
engine_cylinders     30
transmission_type     0
vehicle_style         0
highway_mpg           0
city_mpg              0
price                 0
dtype: int64

In [9]:
df_sorted = df_sorted.fillna(0)

In [10]:
df_sorted.columns

Index(['make', 'model', 'year', 'engine_hp', 'engine_cylinders',
       'transmission_type', 'vehicle_style', 'highway_mpg', 'city_mpg',
       'price'],
      dtype='object')

In [11]:
df_sorted["transmission_type"].value_counts()

AUTOMATIC           8266
MANUAL              2935
AUTOMATED_MANUAL     626
DIRECT_DRIVE          68
UNKNOWN               19
Name: transmission_type, dtype: int64

In [12]:
above_average = df_sorted["price"] > df_sorted["price"].mean()

In [13]:
df_sorted.head()

Unnamed: 0,make,model,year,engine_hp,engine_cylinders,transmission_type,vehicle_style,highway_mpg,city_mpg,price
0,BMW,1 Series M,2011,335.0,6.0,MANUAL,Coupe,26,19,46135
1,BMW,1 Series,2011,300.0,6.0,MANUAL,Convertible,28,19,40650
2,BMW,1 Series,2011,300.0,6.0,MANUAL,Coupe,28,20,36350
3,BMW,1 Series,2011,230.0,6.0,MANUAL,Coupe,28,18,29450
4,BMW,1 Series,2011,230.0,6.0,MANUAL,Convertible,28,18,34500


In [14]:
df_with_price = df_sorted.copy()

In [15]:
df_sorted["price"] = above_average.astype(int)
df_sorted = df_sorted.rename({"price" : "above_average"}, axis="columns")

In [16]:
df_full_train, df_test = train_test_split(df_sorted, test_size=0.2, random_state=42)
df_train, df_val = train_test_split(df_sorted, test_size=0.25, random_state=42)

In [17]:
df_train = df_train.reset_index(drop=True)
df_val = df_val.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)

In [18]:
y_train = df_train.above_average.values
y_val = df_val.above_average.values
y_test = df_test.above_average.values

del df_train['above_average']
del df_val['above_average']
del df_test['above_average']

In [19]:
df_train.corr()

  df_train.corr()


Unnamed: 0,year,engine_hp,engine_cylinders,highway_mpg,city_mpg
year,1.0,0.33867,-0.038898,0.257188,0.202336
engine_hp,0.33867,1.0,0.776203,-0.410014,-0.433273
engine_cylinders,-0.038898,0.776203,1.0,-0.604883,-0.594358
highway_mpg,0.257188,-0.410014,-0.604883,1.0,0.867513
city_mpg,0.202336,-0.433273,-0.594358,0.867513,1.0


In [20]:
categorical_columns = df_train.select_dtypes(include=['object']).columns
def mutual_info_churn_score(series):
    return mutual_info_score(series, y_train)

In [21]:
categorical_columns

Index(['make', 'model', 'transmission_type', 'vehicle_style'], dtype='object')

In [22]:
mi = df_train[categorical_columns].apply(mutual_info_churn_score)
round(mi.sort_values(ascending=False), 2)

model                0.46
make                 0.24
vehicle_style        0.08
transmission_type    0.02
dtype: float64

In [23]:
dv = DictVectorizer(sparse=False)

train_dict = df_train.to_dict(orient='records')
X_train = dv.fit_transform(train_dict)

val_dict = df_val.to_dict(orient='records')
X_val = dv.transform(val_dict)

In [24]:
model = LogisticRegression(solver='liblinear', C=10, max_iter=1000, random_state=42)

In [25]:
model.fit(X_train, y_train)

In [26]:
y_pred = model.predict(X_val)

In [27]:
original_accuracy = round((y_pred == y_val).mean(), 2)

In [28]:
original_accuracy

0.93

In [29]:
columns_to_remove = list(df_train.columns)

for item in columns_to_remove:
  print("****************************")
  print(item)
  new_columns = columns_to_remove.copy()
  new_columns.remove(item)

  train_dict = df_train[new_columns].to_dict(orient='records')
  X_train = dv.transform(train_dict)

  val_dict = df_val[new_columns].to_dict(orient='records')
  X_val = dv.transform(val_dict)

  model = LogisticRegression(solver='liblinear', C=10, max_iter=1000, random_state=42)
  model.fit(X_train, y_train)

  y_pred = model.predict(X_val)
  accuracy = round((y_pred == y_val).mean(), 2)
  print(accuracy)


****************************
make
0.95
****************************
model
0.92
****************************
year
0.95
****************************
engine_hp
0.94
****************************
engine_cylinders
0.95
****************************
transmission_type
0.95
****************************
vehicle_style
0.94
****************************
highway_mpg
0.94
****************************
city_mpg
0.95


In [30]:
df_with_price.head()

Unnamed: 0,make,model,year,engine_hp,engine_cylinders,transmission_type,vehicle_style,highway_mpg,city_mpg,price
0,BMW,1 Series M,2011,335.0,6.0,MANUAL,Coupe,26,19,46135
1,BMW,1 Series,2011,300.0,6.0,MANUAL,Convertible,28,19,40650
2,BMW,1 Series,2011,300.0,6.0,MANUAL,Coupe,28,20,36350
3,BMW,1 Series,2011,230.0,6.0,MANUAL,Coupe,28,18,29450
4,BMW,1 Series,2011,230.0,6.0,MANUAL,Convertible,28,18,34500


In [31]:
df_with_price.price = np.log1p(df_with_price.price)

In [32]:
df_with_price.head()

Unnamed: 0,make,model,year,engine_hp,engine_cylinders,transmission_type,vehicle_style,highway_mpg,city_mpg,price
0,BMW,1 Series M,2011,335.0,6.0,MANUAL,Coupe,26,19,10.739349
1,BMW,1 Series,2011,300.0,6.0,MANUAL,Convertible,28,19,10.612779
2,BMW,1 Series,2011,300.0,6.0,MANUAL,Coupe,28,20,10.500977
3,BMW,1 Series,2011,230.0,6.0,MANUAL,Coupe,28,18,10.290483
4,BMW,1 Series,2011,230.0,6.0,MANUAL,Convertible,28,18,10.448744


In [33]:
df_full_train_price, df_test_price = train_test_split(df_with_price, test_size=0.2, random_state=42)
df_train_price, df_val_price = train_test_split(df_with_price, test_size=0.25, random_state=42)

In [34]:
y_train_price = df_train_price.price.values
y_val_price = df_val_price.price.values
y_test_price = df_test_price.price.values

del df_train_price["price"]
del df_val_price["price"]
del df_test_price["price"]


In [35]:
def rmse(y, y_pred):
    se = (y - y_pred) ** 2
    mse = se.mean()
    return np.sqrt(mse)

In [None]:
alpha = [0, 0.01, 0.1, 1, 10]
dv = DictVectorizer(sparse=False)
train_dict = df_train_price.to_dict(orient='records')
X_train_price = dv.fit_transform(train_dict)

val_dict = df_val_price.to_dict(orient='records')
X_val_price = dv.transform(val_dict)
for a in alpha:
  print("Debug")
  ridge_model = Ridge(solver="sag", alpha=a)
  ridge_model.fit(X_train_price, y_train_price)
  y_val_pred_price = ridge_model.predict(X_val_price)
  accuracy = round(rmse(y_val_price, y_val_pred_price), 3)
  print(f"Alpha {a} : rmse {accuracy}")

Debug




Alpha 0 : rmse 0.481
Debug




Alpha 0.01 : rmse 0.481
Debug




Alpha 0.1 : rmse 0.481
Debug




Alpha 1 : rmse 0.481
Debug
