In [9]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt

In [10]:
# Load the raw dataset; 'data.csv' is resolved relative to the working directory.
df = pd.read_csv('data.csv')

In [11]:
# Raw CSV column names kept for the analysis (exact spelling and case of the file header).
features = [
    'Make', 'Model', 'Year',
    'Engine HP', 'Engine Cylinders',
    'Transmission Type', 'Vehicle Style',
    'highway MPG', 'city mpg',
    'MSRP',
]

In [12]:
# Keep only the selected columns, in the order listed in `features`.
df = df.loc[:, features]

In [13]:
# Normalise column labels: spaces become underscores, then everything is lower-cased.
df.columns = [label.replace(' ', '_').lower() for label in df.columns]

In [14]:
# Replace missing engine specs with 0.
# A single `fillna` mapping returns a new frame, so no column is set through
# attribute access (`df.col = ...`), which pandas discourages for assignment.
df = df.fillna({'engine_hp': 0, 'engine_cylinders': 0})

In [15]:
# Per-column flag: True if the column still contains at least one missing value.
df.isna().any(axis=0)

make                 False
model                False
year                 False
engine_hp            False
engine_cylinders     False
transmission_type    False
vehicle_style        False
highway_mpg          False
city_mpg             False
msrp                 False
dtype: bool

In [16]:
# The target column is referred to as `price` from here on.
df = df.rename({'msrp': 'price'}, axis='columns')

In [17]:
# Names of every column whose dtype is `object` (the string-valued columns).
categorical_columns = [
    name for name, dtype in df.dtypes.items() if dtype == 'object'
]

In [18]:
# Normalise the text values of every object column: lower-case, spaces -> underscores.
df[categorical_columns] = df[categorical_columns].apply(
    lambda column: column.str.lower().str.replace(' ', '_')
)

In [19]:
# Q1: frequency of each transmission_type value (value_counts sorts most common first).
df['transmission_type'].value_counts()

automatic           8266
manual              2935
automated_manual     626
direct_drive          68
unknown               19
Name: transmission_type, dtype: int64

In [20]:
# Inspect each column's dtype to separate numerical from categorical features.
df.dtypes

make                  object
model                 object
year                   int64
engine_hp            float64
engine_cylinders     float64
transmission_type     object
vehicle_style         object
highway_mpg            int64
city_mpg               int64
price                  int64
dtype: object

In [21]:
# Feature groups used below: numeric columns for correlation,
# string columns for mutual information and one-hot encoding.
numerical = [
    'year',
    'engine_hp',
    'engine_cylinders',
    'highway_mpg',
    'city_mpg',
]
categorical = [
    'make',
    'model',
    'transmission_type',
    'vehicle_style',
]

In [22]:
# Preview the first rows of the numerical features.
df.loc[:, numerical].head()

Unnamed: 0,year,engine_hp,engine_cylinders,highway_mpg,city_mpg
0,2011,335.0,6.0,26,19
1,2011,300.0,6.0,28,19
2,2011,300.0,6.0,28,20
3,2011,230.0,6.0,28,18
4,2011,230.0,6.0,28,18


In [23]:
# Q2: pairwise correlation matrix of the numerical features.
df.loc[:, numerical].corr()

Unnamed: 0,year,engine_hp,engine_cylinders,highway_mpg,city_mpg
year,1.0,0.338714,-0.040708,0.25824,0.198171
engine_hp,0.338714,1.0,0.774851,-0.415707,-0.424918
engine_cylinders,-0.040708,0.774851,1.0,-0.614541,-0.587306
highway_mpg,0.25824,-0.415707,-0.614541,1.0,0.886829
city_mpg,0.198171,-0.424918,-0.587306,0.886829,1.0


In [24]:
# Mean price over the whole dataset; used as the threshold for `above_average`.
df['price'].mean()

40594.737032063116

In [25]:
# Binary target: 1 when the price is strictly above the dataset-wide mean, else 0.
df['above_average'] = df['price'].gt(df['price'].mean()).astype(int)

In [26]:
from sklearn.model_selection import train_test_split

In [27]:
# Hold out 20% of the rows as the test set (fixed seed for reproducibility).
df_full_train, df_test = train_test_split(
    df,
    test_size=0.2,
    random_state=42,
)

In [28]:
# 25% of the remaining 80% becomes validation, i.e. 20% of the full data.
df_train, df_val = train_test_split(
    df_full_train,
    test_size=0.25,
    random_state=42,
)

In [29]:
# Row counts of the train, test and validation splits (in that order).
tuple(len(part) for part in (df_train, df_test, df_val))

(7148, 2383, 2383)

In [30]:
# Pull both targets out of each split as NumPy arrays before the columns are dropped.
y_train_price, y_val_price, y_test_price = (
    part['price'].values for part in (df_train, df_val, df_test)
)

y_train_above_average, y_val_above_average, y_test_above_average = (
    part['above_average'].values for part in (df_train, df_val, df_test)
)

In [31]:
# Remove both target columns from every split in place so they cannot leak into the features.
for split_frame in (df_train, df_val, df_test):
    for target_col in ('price', 'above_average'):
        del split_frame[target_col]

In [32]:
# Preview the training features.
df_train.head()

Unnamed: 0,make,model,year,engine_hp,engine_cylinders,transmission_type,vehicle_style,highway_mpg,city_mpg
3972,mitsubishi,endeavor,2011,225.0,6.0,automatic,4dr_suv,19,15
1997,kia,borrego,2009,276.0,6.0,automatic,4dr_suv,21,17
5216,lamborghini,gallardo,2012,570.0,10.0,manual,convertible,20,12
2805,chevrolet,colorado,2016,200.0,4.0,automatic,crew_cab_pickup,27,20
11369,pontiac,vibe,2009,158.0,4.0,automatic,4dr_hatchback,26,20


In [33]:
# Display the price target array for the training split.
y_train_price

array([ 33599,  26245, 248000, ...,  28345,   2000,  40220])

In [34]:
from sklearn.metrics import mutual_info_score

In [35]:
def mutual_info_above_avg_score(series):
    """Mutual information between ``series`` and the training target.

    The target is read from the module-level ``y_train_above_average``, so
    ``series`` must have the same length and row order as that array.

    Returns the score rounded to two decimal places.
    """
    return round(mutual_info_score(series, y_train_above_average), 2)

In [37]:
# Q3: mutual information of each categorical feature with `above_average` (training split).
pd.Series({
    column: mutual_info_above_avg_score(df_train[column])
    for column in categorical
})

make                 0.24
model                0.46
transmission_type    0.02
vehicle_style        0.08
dtype: float64

In [38]:
from sklearn.feature_extraction import DictVectorizer

In [44]:
# One-hot encoder for dict records; sparse=False requests a dense array instead of a sparse matrix.
dv = DictVectorizer(sparse=False)

In [45]:
# One dict per training row, mapping column name -> value, as DictVectorizer expects.
# NOTE(review): only the categorical columns are included; the numerical
# features are left out of the encoded matrix - confirm this is intended.
train_dict = df_train.loc[:, categorical].to_dict(orient='records')

In [46]:
# Fit the vectorizer on the training records and encode them in one step.
X_train = dv.fit_transform(train_dict)

In [47]:
# Display the encoded training matrix (NumPy abbreviates large arrays).
X_train

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 1., 0.]])