In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mutual_info_score, accuracy_score, mean_squared_error
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression, Ridge

In [2]:
df = pd.read_csv("data.csv")

In [3]:
df.head()

Unnamed: 0,Make,Model,Year,Engine Fuel Type,Engine HP,Engine Cylinders,Transmission Type,Driven_Wheels,Number of Doors,Market Category,Vehicle Size,Vehicle Style,highway MPG,city mpg,Popularity,MSRP
0,BMW,1 Series M,2011,premium unleaded (required),335.0,6.0,MANUAL,rear wheel drive,2.0,"Factory Tuner,Luxury,High-Performance",Compact,Coupe,26,19,3916,46135
1,BMW,1 Series,2011,premium unleaded (required),300.0,6.0,MANUAL,rear wheel drive,2.0,"Luxury,Performance",Compact,Convertible,28,19,3916,40650
2,BMW,1 Series,2011,premium unleaded (required),300.0,6.0,MANUAL,rear wheel drive,2.0,"Luxury,High-Performance",Compact,Coupe,28,20,3916,36350
3,BMW,1 Series,2011,premium unleaded (required),230.0,6.0,MANUAL,rear wheel drive,2.0,"Luxury,Performance",Compact,Coupe,28,18,3916,29450
4,BMW,1 Series,2011,premium unleaded (required),230.0,6.0,MANUAL,rear wheel drive,2.0,Luxury,Compact,Convertible,28,18,3916,34500


In [4]:
df = df[["Make",
"Model",
"Year",
"Engine HP",
"Engine Cylinders",
"Transmission Type",
"Vehicle Style",
"highway MPG",
"city mpg",
"MSRP"]]

In [5]:
df.columns = df.columns.str.replace(" ", "_").str.lower()

In [6]:
df =df.fillna(0)

In [7]:
df = df.rename(columns={"msrp": "price"})

### Q1

In [8]:
df["transmission_type"].value_counts()

transmission_type
AUTOMATIC           8266
MANUAL              2935
AUTOMATED_MANUAL     626
DIRECT_DRIVE          68
UNKNOWN               19
Name: count, dtype: int64

### Q2

In [9]:
df.head()

Unnamed: 0,make,model,year,engine_hp,engine_cylinders,transmission_type,vehicle_style,highway_mpg,city_mpg,price
0,BMW,1 Series M,2011,335.0,6.0,MANUAL,Coupe,26,19,46135
1,BMW,1 Series,2011,300.0,6.0,MANUAL,Convertible,28,19,40650
2,BMW,1 Series,2011,300.0,6.0,MANUAL,Coupe,28,20,36350
3,BMW,1 Series,2011,230.0,6.0,MANUAL,Coupe,28,18,29450
4,BMW,1 Series,2011,230.0,6.0,MANUAL,Convertible,28,18,34500


In [10]:
numeric_features = df._get_numeric_data().columns

In [11]:
df[numeric_features].corr()

Unnamed: 0,year,engine_hp,engine_cylinders,highway_mpg,city_mpg,price
year,1.0,0.338714,-0.040708,0.25824,0.198171,0.22759
engine_hp,0.338714,1.0,0.774851,-0.415707,-0.424918,0.650095
engine_cylinders,-0.040708,0.774851,1.0,-0.614541,-0.587306,0.526274
highway_mpg,0.25824,-0.415707,-0.614541,1.0,0.886829,-0.160043
city_mpg,0.198171,-0.424918,-0.587306,0.886829,1.0,-0.157676
price,0.22759,0.650095,0.526274,-0.160043,-0.157676,1.0


In [12]:
threshold = df["price"].mean()
df["above_average"] = df["price"].apply(lambda x: 1 if x > threshold else 0)

In [13]:
df = df.drop(columns="price")

In [14]:
df_train, df_test, y_train, y_test = train_test_split(df.drop(columns="above_average"), df["above_average"], test_size=0.2, random_state=42)

df_train, df_val, y_train, y_val = train_test_split(df_train, y_train, test_size=0.25, random_state=42)

### Q3

In [15]:
categorical_features = list(df.dtypes[df.dtypes == 'object'].index)
for feature in categorical_features:
    print(feature, mutual_info_score(y_train, df_train[feature]).round(2))

make 0.24
model 0.46
transmission_type 0.02
vehicle_style 0.08


### Q4

In [16]:
df_train

Unnamed: 0,make,model,year,engine_hp,engine_cylinders,transmission_type,vehicle_style,highway_mpg,city_mpg
3972,Mitsubishi,Endeavor,2011,225.0,6.0,AUTOMATIC,4dr SUV,19,15
1997,Kia,Borrego,2009,276.0,6.0,AUTOMATIC,4dr SUV,21,17
5216,Lamborghini,Gallardo,2012,570.0,10.0,MANUAL,Convertible,20,12
2805,Chevrolet,Colorado,2016,200.0,4.0,AUTOMATIC,Crew Cab Pickup,27,20
11369,Pontiac,Vibe,2009,158.0,4.0,AUTOMATIC,4dr Hatchback,26,20
...,...,...,...,...,...,...,...,...,...
9232,Toyota,Sienna,2016,266.0,6.0,AUTOMATIC,Passenger Minivan,25,18
5710,Chevrolet,HHR,2009,260.0,4.0,MANUAL,Wagon,29,21
11306,Hyundai,Veracruz,2012,260.0,6.0,AUTOMATIC,4dr SUV,22,17
4414,Mitsubishi,Expo,1993,136.0,4.0,MANUAL,2dr Hatchback,26,19


In [17]:
dv = DictVectorizer(sparse=False)

train_dict = df_train.to_dict(orient='records')
X_train = dv.fit_transform(train_dict)

val_dict = df_val.to_dict(orient='records')
X_val = dv.transform(val_dict)

In [18]:
model = LogisticRegression(solver='liblinear', C=10, max_iter=1000, random_state=42)
model.fit(X_train, y_train)
pred = model.predict(X_val)
original_accuracy = accuracy_score(y_val, pred)
original_accuracy.round(2)

0.93

### Q5

In [19]:
for feature in df_train.columns:
    print(feature)
    dv = DictVectorizer(sparse=False)

    train_dict = df_train.drop(columns=feature).to_dict(orient='records')
    X_train = dv.fit_transform(train_dict)

    val_dict = df_val.drop(columns=feature).to_dict(orient='records')
    X_val = dv.transform(val_dict)
    model = LogisticRegression(solver='liblinear', C=10, max_iter=1000, random_state=42)
    model.fit(X_train, y_train)

    pred = model.predict(X_val)
    new_accuracy = accuracy_score(y_val, pred)#.round(2)
    # print(new_accuracy)
    print(original_accuracy - new_accuracy)

make


-0.014267729752412972
model
0.018044481745698726
year
-0.013848090642047839
engine_hp
-0.00041963911036513313
engine_cylinders
-0.01258917331095255
transmission_type
-0.010490977759127218
vehicle_style
0.0025178346621904657
highway_mpg
-0.012169534200587528
city_mpg
-0.011330255979857373


### Q6

In [20]:
df = pd.read_csv("data.csv")
df = df[["Make",
"Model",
"Year",
"Engine HP",
"Engine Cylinders",
"Transmission Type",
"Vehicle Style",
"highway MPG",
"city mpg",
"MSRP"]]
df.columns = df.columns.str.replace(" ", "_").str.lower()
df = df.fillna(0)
df = df.rename(columns={"msrp": "price"})

In [21]:
df["price"] = np.log1p(df["price"])

In [22]:
df_train, df_test, y_train, y_test = train_test_split(df.drop(columns="price"), df["price"], test_size=0.2, random_state=42)

df_train, df_val, y_train, y_val = train_test_split(df_train, y_train, test_size=0.25, random_state=42)

In [25]:
dv = DictVectorizer(sparse=False)

train_dict = df_train.drop(columns=feature).to_dict(orient='records')
X_train = dv.fit_transform(train_dict)

val_dict = df_val.drop(columns=feature).to_dict(orient='records')
X_val = dv.transform(val_dict)


In [26]:
for alpha in [0, 0.01, 0.1, 1, 10]:
    
    model = Ridge(solver="sag",alpha=alpha, random_state=42)
    model.fit(X_train, y_train)
    pred = model.predict(X_val)
    print(alpha)
    print(mean_squared_error(y_val, pred).round(2))



0
0.24




0.01
0.24




0.1
0.24




1
0.24
10
0.24


