In [1]:
import numpy as np
import pandas as pd

### Data prep

In [8]:
df = pd.read_csv('https://raw.githubusercontent.com/alexeygrigorev/mlbookcamp-code/master/chapter-02-car-price/data.csv')

In [9]:
# Keep only the columns used in this homework; MSRP is the target (y).
# Missing values in the retained columns are replaced with 0.
df = df[['Make', 'Model', 'Year', 'Engine HP',
         'Engine Cylinders', 'Transmission Type',
         'Vehicle Style', 'highway MPG', 'city mpg',
         'MSRP']].fillna(0)
df

Unnamed: 0,Make,Model,Year,Engine HP,Engine Cylinders,Transmission Type,Vehicle Style,highway MPG,city mpg,MSRP
0,BMW,1 Series M,2011,335.0,6.0,MANUAL,Coupe,26,19,46135
1,BMW,1 Series,2011,300.0,6.0,MANUAL,Convertible,28,19,40650
2,BMW,1 Series,2011,300.0,6.0,MANUAL,Coupe,28,20,36350
3,BMW,1 Series,2011,230.0,6.0,MANUAL,Coupe,28,18,29450
4,BMW,1 Series,2011,230.0,6.0,MANUAL,Convertible,28,18,34500
...,...,...,...,...,...,...,...,...,...,...
11909,Acura,ZDX,2012,300.0,6.0,AUTOMATIC,4dr Hatchback,23,16,46120
11910,Acura,ZDX,2012,300.0,6.0,AUTOMATIC,4dr Hatchback,23,16,56670
11911,Acura,ZDX,2012,300.0,6.0,AUTOMATIC,4dr Hatchback,23,16,50620
11912,Acura,ZDX,2013,300.0,6.0,AUTOMATIC,4dr Hatchback,23,16,50920


In [10]:
df.columns = df.columns.str.replace(' ', '_').str.lower()

In [13]:
df.isna().sum()

make                 0
model                0
year                 0
engine_hp            0
engine_cylinders     0
transmission_type    0
vehicle_style        0
highway_mpg          0
city_mpg             0
msrp                 0
dtype: int64

In [27]:
df = df.rename(columns={'msrp': 'price'})

In [28]:
df

Unnamed: 0,make,model,year,engine_hp,engine_cylinders,transmission_type,vehicle_style,highway_mpg,city_mpg,price
0,BMW,1 Series M,2011,335.0,6.0,MANUAL,Coupe,26,19,46135
1,BMW,1 Series,2011,300.0,6.0,MANUAL,Convertible,28,19,40650
2,BMW,1 Series,2011,300.0,6.0,MANUAL,Coupe,28,20,36350
3,BMW,1 Series,2011,230.0,6.0,MANUAL,Coupe,28,18,29450
4,BMW,1 Series,2011,230.0,6.0,MANUAL,Convertible,28,18,34500
...,...,...,...,...,...,...,...,...,...,...
11909,Acura,ZDX,2012,300.0,6.0,AUTOMATIC,4dr Hatchback,23,16,46120
11910,Acura,ZDX,2012,300.0,6.0,AUTOMATIC,4dr Hatchback,23,16,56670
11911,Acura,ZDX,2012,300.0,6.0,AUTOMATIC,4dr Hatchback,23,16,50620
11912,Acura,ZDX,2013,300.0,6.0,AUTOMATIC,4dr Hatchback,23,16,50920


### Question 1

What is the most frequent observation (mode) for the column transmission_type?

In [29]:
df.transmission_type.value_counts()

transmission_type
AUTOMATIC           8266
MANUAL              2935
AUTOMATED_MANUAL     626
DIRECT_DRIVE          68
UNKNOWN               19
Name: count, dtype: int64

**ANSWER: AUTOMATIC**

### Question 2
Create the correlation matrix for the numerical features of your dataset. In a correlation matrix, you compute the correlation coefficient between every pair of features in the dataset.

What are the two features that have the biggest correlation in this dataset?

In [30]:
df.select_dtypes(include='number').corr()

Unnamed: 0,year,engine_hp,engine_cylinders,highway_mpg,city_mpg,price
year,1.0,0.338714,-0.040708,0.25824,0.198171,0.22759
engine_hp,0.338714,1.0,0.774851,-0.415707,-0.424918,0.650095
engine_cylinders,-0.040708,0.774851,1.0,-0.614541,-0.587306,0.526274
highway_mpg,0.25824,-0.415707,-0.614541,1.0,0.886829,-0.160043
city_mpg,0.198171,-0.424918,-0.587306,0.886829,1.0,-0.157676
price,0.22759,0.650095,0.526274,-0.160043,-0.157676,1.0


**ANSWER**: highway_mpg and city_mpg

### Make price binary
- Now we need to turn the price variable from numeric into a binary format.
- Let's create a variable above_average which is 1 if the price is above its mean value and 0 otherwise.

In [40]:
def make_price_binary(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``df`` with a binary ``above_average`` column added.

    ``above_average`` is 1 where ``price`` is strictly greater than the mean
    of the ``price`` column and 0 otherwise (including rows equal to the mean).

    Parameters
    ----------
    df : pd.DataFrame
        Frame containing a numeric ``price`` column.

    Returns
    -------
    pd.DataFrame
        A new frame with every original column plus the integer
        ``above_average`` column. The input frame is left unmodified, so
        re-running the calling cell always starts from the same data.
    """
    # Compute the threshold once; it is the same for every row.
    mean_price = df['price'].mean()
    # A single boolean comparison covers both the 1 and the 0 case.
    return df.assign(above_average=(df['price'] > mean_price).astype(int))
    
df = make_price_binary(df=df)

In [41]:
from sklearn.model_selection import train_test_split

In [42]:
df_full_train, df_test = train_test_split(df, test_size=0.2, random_state=42)

In [43]:
df_train, df_val = train_test_split(df_full_train, test_size = 0.25, random_state=42)

In [44]:
len(df_train), len(df_val), len(df_test)

(7148, 2383, 2383)

In [38]:
df_train = df_train.reset_index(drop=True)
df_val = df_val.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)

In [45]:
y_train = df_train.above_average.values
y_val = df_val.above_average.values
y_test = df_test.above_average.values

In [46]:
del df_train['above_average']
del df_val['above_average']
del df_test['above_average']

### Question 3

- Calculate the mutual information score between above_average and other categorical variables in our dataset. Use the training set only.
- Round the scores to 2 decimals using round(score, 2).

Which of these variables has the lowest mutual information score?

In [47]:
from sklearn.metrics import mutual_info_score

In [59]:
categorical = df_train.select_dtypes(include=['object'])

In [64]:
def mutual_info_price_score(series):
    """Mutual information between one feature column and the training target.

    ``series`` is compared against the module-level ``y_train``, which is
    looked up at call time, so it must hold the labels that belong to the
    rows of ``series``.
    """
    score = mutual_info_score(labels_true=series, labels_pred=y_train)
    return score

In [65]:
mi = (categorical
     .apply(mutual_info_price_score)
     .sort_values(ascending=False))
mi

model                0.462344
make                 0.239769
vehicle_style        0.084143
transmission_type    0.020958
dtype: float64

ANSWER: transmission_type has the lowest score.

### Question 4

- Now let's train a logistic regression.
- Remember that we have several categorical variables in the dataset. Include them using one-hot encoding.
- Fit the model on the training dataset.
    - To make sure the results are reproducible across different versions of Scikit-Learn, fit the model with these parameters:
    - `model = LogisticRegression(solver='liblinear', C=10, max_iter=1000, random_state=42)`
- Calculate the accuracy on the validation dataset and round it to 2 decimal digits.

What accuracy did you get?

- 0.60
- 0.72
- 0.84
- 0.95

In [70]:
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction import DictVectorizer

In [97]:
train_dicts = df_train.drop(columns='price').to_dict(orient='records')
val_dicts = df_val.drop(columns='price').to_dict(orient='records')
test_dicts = df_test.drop(columns='price').to_dict(orient='records')

In [145]:
dv = DictVectorizer(sparse=True)

In [106]:
# Learn the one-hot encoding from the training records only. The full `df`
# still contains the `above_average` target column and the validation/test
# rows, so fitting on it lets held-out data and the label shape the feature space.
dv.fit(train_dicts)

In [107]:
X_train = dv.transform(train_dicts)
X_val = dv.transform(val_dicts)
X_test = dv.transform(test_dicts)

In [108]:
model = LogisticRegression(solver='liblinear', C=10, max_iter=1000, random_state=42)

In [109]:
model.fit(X_train, y_train)

In [110]:
y_pred = model.predict(X_val)

In [111]:
round((y_pred == y_val).mean(), 2)

0.95

Answer: 0.95

### Question 5

- Let's find the least useful feature using the feature elimination technique.
- Train a model with all these features (using the same parameters as in Q4).
- Now exclude each feature from this set and train a model without it. Record the accuracy for each model.
- For each feature, calculate the difference between the original accuracy and the accuracy without the feature.

In [127]:
def feature_elemination(feature: str) -> float:
    """Return the validation accuracy of the Q4 model trained without ``feature``.

    Reads the module-level ``df_train`` / ``df_val`` frames and the binary
    ``y_train`` / ``y_val`` targets, so it must be called while those names
    still hold the classification labels.

    Parameters
    ----------
    feature : str
        Name of the column to leave out of the feature set.

    Returns
    -------
    float
        Fraction of validation rows whose predicted label equals ``y_val``.
    """
    # `price` defines the target, so it is dropped alongside the eliminated feature.
    train_dicts = (df_train
                   .drop(columns=feature)
                   .drop(columns='price')
                   .to_dict(orient='records'))
    val_dicts = (df_val
                 .drop(columns=feature)
                 .drop(columns='price')
                 .to_dict(orient='records'))

    # Fit the encoder on training rows only. `df_full_train` still carries the
    # `above_average` target column and includes the validation rows, so it
    # must not be used to define the feature space.
    dv = DictVectorizer(sparse=False)
    X_train = dv.fit_transform(train_dicts)
    X_val = dv.transform(val_dicts)

    # Same hyper-parameters as the Question 4 model, as the exercise requires.
    model = LogisticRegression(solver='liblinear', C=10, max_iter=1000, random_state=42)
    model.fit(X_train, y_train)

    y_pred = model.predict(X_val)

    return (y_pred == y_val).mean()

In [133]:
0.95 - feature_elemination('year')

0.001195971464540424

In [134]:
0.95 - feature_elemination('engine_hp')

0.025115400755350348

In [135]:
0.95 - feature_elemination('transmission_type')

0.005392362568191311

In [136]:
0.95 - feature_elemination('city_mpg')

0.01756189676877884

ANSWER: year

### Question 6

- For this question, we'll see how to use a linear regression model from Scikit-Learn.
- We'll need to use the original column price. Apply the logarithmic transformation to this column.
- Fit the Ridge regression model on the training data with a solver 'sag'. Set the seed to 42.
- This model also has a parameter alpha. Let's try the following values: [0, 0.01, 0.1, 1, 10].
- Round your RMSE scores to 3 decimal digits.

Which of these alphas leads to the best RMSE on the validation set?

- 0
- 0.01
- 0.1
- 1
- 10

In [138]:
from sklearn.linear_model import Ridge

In [140]:
y_train = np.log1p(df_train.price.values)
y_val = np.log1p(df_val.price.values)
y_test = np.log1p(df_test.price.values)

In [141]:
model = Ridge(solver='sag', random_state=42)

In [142]:
model.fit(X_train, y_train)



In [143]:
def rmse(y, y_pred):
    """Root mean squared error between targets and predictions.

    Parameters
    ----------
    y : array-like
        Observed values.
    y_pred : array-like
        Predicted values; must have the same shape as ``y``.

    Returns
    -------
    float
        ``sqrt(mean((y - y_pred) ** 2))``.

    Raises
    ------
    ValueError
        If the two inputs do not have the same shape.
    """
    # Coerce to arrays so plain lists work and values are compared by position.
    y = np.asarray(y, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)

    # Without this check, (n,) against (n, 1) would broadcast to an (n, n)
    # matrix and return a plausible-looking but meaningless number.
    if y.shape != y_pred.shape:
        raise ValueError(
            f'shape mismatch: y has shape {y.shape}, y_pred has shape {y_pred.shape}'
        )

    squared_error = (y - y_pred) ** 2
    return np.sqrt(squared_error.mean())

In [146]:
# Regularisation strengths listed in the Question 6 prompt.
alpha_list = [0, 0.01, 0.1, 1, 10]

# Refit Ridge once per alpha and print the validation RMSE rounded to 3 decimals.
# y_train / y_val must hold the log1p(price) targets at this point; the binary
# classification labels are stored under the same names earlier in the notebook.
# NOTE(review): the recorded scores are identical for every alpha and X_train is
# not scaled before fitting -- possibly 'sag' is not converging; confirm.
for alpha in alpha_list: 
    model = Ridge(solver='sag', random_state=42, alpha=alpha)
    model.fit(X_train, y_train)
    y_pred =  model.predict(X_val)
    rmse_score = round(rmse(y_val, y_pred), 3)
    # `{alpha=}` is the f-string debug form: it prints "alpha=<value>".
    print(f'RMSE for {alpha=}: {rmse_score}')



RMSE for alpha=0: 0.487




RMSE for alpha=0.01: 0.487




RMSE for alpha=0.1: 0.487




RMSE for alpha=1: 0.487
RMSE for alpha=10: 0.487


