In [3]:
import wget
import pandas as pd
import numpy as np

import seaborn as sns
from matplotlib import pyplot as plt
%matplotlib inline

In [4]:
data= "https://raw.githubusercontent.com/alexeygrigorev/mlbookcamp-code/master/chapter-02-car-price/data.csv"
data= wget.download(data)
df= pd.read_csv(data)

100% [..........................................................................] 1475504 / 1475504

# Preparing the dataset

In [6]:
df2 = df.copy()
df2= df[["Make","Model","Year","Engine HP","Engine Cylinders","Transmission Type","Vehicle Style","highway MPG","city mpg","MSRP"]]

In [8]:
df2.columns = df2.columns.str.lower().str.replace(' ', '_')

In [9]:
df2 = df2.fillna(0)

In [21]:
df2.rename(columns = {"msrp":"price"},inplace=True)

In [11]:
df2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11914 entries, 0 to 11913
Data columns (total 10 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   make               11914 non-null  object 
 1   model              11914 non-null  object 
 2   year               11914 non-null  int64  
 3   engine_hp          11914 non-null  float64
 4   engine_cylinders   11914 non-null  float64
 5   transmission_type  11914 non-null  object 
 6   vehicle_style      11914 non-null  object 
 7   highway_mpg        11914 non-null  int64  
 8   city_mpg           11914 non-null  int64  
 9   msrp               11914 non-null  int64  
dtypes: float64(2), int64(4), object(4)
memory usage: 930.9+ KB


The distribution has a right tail but it is not long

### Question 1

In [13]:
df2.transmission_type.value_counts()

transmission_type
AUTOMATIC           8266
MANUAL              2935
AUTOMATED_MANUAL     626
DIRECT_DRIVE          68
UNKNOWN               19
Name: count, dtype: int64

The most frequent category is "Automatic"

### Question 2

In [14]:
Numerical = ["engine_hp","year","engine_cylinders","highway_mpg","city_mpg"]

In [15]:
Categorical =["make","model","transmission_type","vehicle_style"]

In [16]:
df2[Numerical].corr()

Unnamed: 0,engine_hp,year,engine_cylinders,highway_mpg,city_mpg
engine_hp,1.0,0.338714,0.774851,-0.415707,-0.424918
year,0.338714,1.0,-0.040708,0.25824,0.198171
engine_cylinders,0.774851,-0.040708,1.0,-0.614541,-0.587306
highway_mpg,-0.415707,0.25824,-0.614541,1.0,0.886829
city_mpg,-0.424918,0.198171,-0.587306,0.886829,1.0


The variables with the biggest correlation are "highway_mpg" and "city_mpg"

#### Making price binary

In [39]:
df2["above_average"] = pd.Series(np.where(df2.price>df2.price.mean(),1,0))

In [42]:
df2

Unnamed: 0,make,model,year,engine_hp,engine_cylinders,transmission_type,vehicle_style,highway_mpg,city_mpg,price,above_average
0,BMW,1 Series M,2011,335.0,6.0,MANUAL,Coupe,26,19,46135,1
1,BMW,1 Series,2011,300.0,6.0,MANUAL,Convertible,28,19,40650,1
2,BMW,1 Series,2011,300.0,6.0,MANUAL,Coupe,28,20,36350,0
3,BMW,1 Series,2011,230.0,6.0,MANUAL,Coupe,28,18,29450,0
4,BMW,1 Series,2011,230.0,6.0,MANUAL,Convertible,28,18,34500,0
...,...,...,...,...,...,...,...,...,...,...,...
11909,Acura,ZDX,2012,300.0,6.0,AUTOMATIC,4dr Hatchback,23,16,46120,1
11910,Acura,ZDX,2012,300.0,6.0,AUTOMATIC,4dr Hatchback,23,16,56670,1
11911,Acura,ZDX,2012,300.0,6.0,AUTOMATIC,4dr Hatchback,23,16,50620,1
11912,Acura,ZDX,2013,300.0,6.0,AUTOMATIC,4dr Hatchback,23,16,50920,1


#### Split the data

In [43]:
from sklearn.model_selection import train_test_split

In [44]:
df2_train_full, df2_test = train_test_split(df2, test_size=0.2, random_state=42)

In [45]:
df2_train, df2_val = train_test_split(df2_train_full, test_size=0.25, random_state=42)

In [46]:
y_train = df2_train.above_average.values
y_val = df2_val.above_average.values
y_test = df2_test.above_average.values

In [47]:
del df2_train["above_average"]
del df2_val["above_average"]
del df2_test["above_average"]

In [125]:
df2_train_full

Unnamed: 0,make,model,year,engine_hp,engine_cylinders,transmission_type,vehicle_style,highway_mpg,city_mpg,price,above_average
3181,Cadillac,CT6,2016,265.0,4.0,AUTOMATIC,Sedan,31,22,53495,1
5357,Mercedes-Benz,GLS-Class,2017,449.0,8.0,AUTOMATIC,4dr SUV,18,14,93850,1
4874,Kia,Forte,2016,173.0,4.0,AUTOMATIC,Coupe,34,25,19890,0
8102,Dodge,RAM 250,1993,180.0,6.0,MANUAL,Regular Cab Pickup,16,11,2000,0
10400,Hyundai,Tiburon,2008,172.0,6.0,AUTOMATIC,2dr Hatchback,24,17,21270,0
...,...,...,...,...,...,...,...,...,...,...,...
11284,Toyota,Venza,2014,181.0,4.0,AUTOMATIC,Wagon,26,20,27950,0
5191,Pontiac,G6,2009,219.0,6.0,AUTOMATIC,Sedan,26,17,24710,0
5390,Volkswagen,Golf GTI,2016,220.0,4.0,AUTOMATED_MANUAL,2dr Hatchback,33,25,27590,0
860,Saab,9-5,2009,260.0,4.0,AUTOMATIC,Wagon,27,17,43270,1


### Question 3

In [48]:
from sklearn.metrics import mutual_info_score

In [60]:
def calculate_mi(series):
    return mutual_info_score(series, df2_train_full.above_average)

df_mi = df2_train_full[Categorical].apply(calculate_mi)
df_mi = df_mi.sort_values(ascending=False).to_frame(name='MI')



display(round(df_mi,2))

Unnamed: 0,MI
model,0.46
make,0.24
vehicle_style,0.08
transmission_type,0.02


The variable transmission_type has the lowest mutual information score

### Question 4

#### One hot encoding

In [68]:
from sklearn.feature_extraction import DictVectorizer

In [69]:
train_dict = df2_train[Categorical + Numerical].to_dict(orient='records')
# Recuerdese que puede pasarse tanto variables numericas omo categoricas 
# porque ela utomaticamente entiende que solo debe hot encodar categoricas

In [70]:
dv = DictVectorizer(sparse=False)
X_train = dv.fit_transform(train_dict)

#### Logistic regression

In [71]:
from sklearn.linear_model import LogisticRegression

In [72]:
model = LogisticRegression(solver='liblinear', C=10, max_iter=1000, random_state=42)
model.fit(X_train, y_train)

In [74]:
val_dict = df2_val[Categorical + Numerical].to_dict(orient='records')
X_val = dv.transform(val_dict)

In [75]:
model.predict_proba(X_val)

array([[9.99692966e-01, 3.07033780e-04],
       [2.27974745e-03, 9.97720253e-01],
       [9.99930206e-01, 6.97938861e-05],
       ...,
       [9.99898957e-01, 1.01043039e-04],
       [8.98629570e-03, 9.91013704e-01],
       [9.08719100e-03, 9.90912809e-01]])

In [76]:
y_pred = model.predict_proba(X_val)[:, 1] # Hacemos esto para solo tomar las
# predicciones de la probabilidad de la variable "price" igual a 1

In [98]:
global_acuracy = round((y_val == (y_pred > 0.5)).mean(),2)
global_acuracy

0.95

### Question 5

In [99]:
variables = ["year","engine_hp","transmission_type","city_mpg"]

In [104]:
total_variables= Categorical + Numerical
total_variables

['make',
 'model',
 'transmission_type',
 'vehicle_style',
 'engine_hp',
 'year',
 'engine_cylinders',
 'highway_mpg',
 'city_mpg']

In [117]:
for i in variables:
    Varables_filtradas = [j for j in total_variables if j !=i]
    train_dic2=df2_train[Varables_filtradas].to_dict(orient='records')
    
    X_train2 = dv.fit_transform(train_dic2)
    
    model.fit(X_train2, y_train)
    
    val_dict2 = df2_val[Varables_filtradas].to_dict(orient='records')
    X_val2 = dv.transform(val_dict2)
    
    y_pred = model.predict_proba(X_val2)[:, 1]
    feature_eliminated_acuracy = (y_val == (y_pred > 0.5)).mean()
    
    result = global_acuracy-feature_eliminated_acuracy
    print(f'{i}:{result}')

year:0.0020352496852705793
engine_hp:0.019660092320604283
transmission_type:0.004972723457826178
city_mpg:0.004133445237096023


The feature with the smallest difference is year, which means that it is the less important variable in the model for reaching high accuracy.

### Question 6

In [135]:
df2["log_price"]=np.log1p(df2.price)

In [129]:
df3_train_full, df3_test = train_test_split(df2, test_size=0.2, random_state=42)

df3_train, df3_val = train_test_split(df3_train_full, test_size=0.25, random_state=42)

y_train2 = df3_train.log_price.values
y_val2 = df3_val.log_price.values
y_test2 = df3_test.log_price.values

In [142]:
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error
from math import sqrt

In [140]:
alpha_values = [0, 0.01, 0.1, 1, 10]

In [145]:
np.random.seed(42)

for i in alpha_values:
    riged_linear_model = Ridge(alpha=i)
    riged_linear_model.fit(X_train, y_train2)
    y_pred=riged_linear_model.predict(X_val)
    print(f'{i} : {round(np.sqrt(mean_squared_error(y_val2,y_pred)),3)}')
    

0 : 4107614408242.89
0.01 : 0.224
0.1 : 0.215
1 : 0.23
10 : 0.32


In [146]:
np.random.seed(42)

for i in alpha_values:
    riged_linear_model = Ridge(alpha=i,solver='sag')
    riged_linear_model.fit(X_train, y_train2)
    y_pred=riged_linear_model.predict(X_val)
    print(f'{i} : {round(np.sqrt(mean_squared_error(y_val2,y_pred)),3)}')



0 : 0.487




0.01 : 0.487




0.1 : 0.487




1 : 0.487
10 : 0.487


