In [26]:
import numpy as np
import pandas as pd

# Load the raw listings; the MSRP column is exposed under the name `price`.
full_df = pd.read_csv("car_prices.csv").rename(columns={'MSRP': 'price'})

# Feature frame: selected columns only (price stays in full_df), with
# snake_case column names and every missing value replaced by 0.
df = (
    full_df[[
        "Make", "Model", "Year",
        "Engine HP", "Engine Cylinders",
        "Transmission Type", "Vehicle Style",
        "highway MPG", "city mpg",
    ]]
    .rename(columns=lambda name: name.lower().replace(' ', '_'))
    .fillna(0)
)

In [27]:
#Q1
print('Q1')
# value_counts() orders labels by descending frequency, so the first label
# is the most frequent transmission type.
transmission_counts = df['transmission_type'].value_counts()
print(transmission_counts.index[0])

Q1
AUTOMATIC


In [28]:
#Q2
print('Q2')
# Keep only the int64 / float64 feature columns.
df_numericals = df.select_dtypes(include=['int64', 'float64'])
# Rank those columns by the absolute value of their correlation with price
# (price is read from full_df) and show the two strongest.
abs_price_corr = df_numericals.corrwith(full_df.price).abs()
ranked_features = abs_price_corr.sort_values(ascending=False).index.to_list()
print(ranked_features[:2])


Q2
['engine_hp', 'engine_cylinders']


In [29]:
#ONE-HOT ENCODING
from sklearn.feature_extraction import DictVectorizer

dv = DictVectorizer(sparse=False)

# One dict per row; the vectorizer turns these into a dense feature matrix.
# NOTE(review): the vectorizer is fitted on the full frame, i.e. before the
# train/validation/test split. Consider fitting on the training rows only --
# confirm against the exercise requirements before changing it.
df_dict = df.to_dict(orient='records')
x_df = dv.fit_transform(df_dict)

#DATA PREPARATION
from sklearn.model_selection import train_test_split

# Binary target: 1 where the price is above the mean price, else 0.
above_average = (full_df.price > full_df.price.mean()).astype(int)

# Split the raw frame, the encoded matrix and the target in ONE call per
# stage so their rows are aligned by construction, instead of relying on
# three separate calls that happen to share the same seed and sizes.
# Stage 1: hold out 20% of the rows as the test set.
(df_train_valid, df_test,
 x_train_valid, x_test,
 abo_avg_train_valid, abo_avg_test) = train_test_split(
    df, x_df, above_average, test_size=0.2, random_state=42)

# Stage 2: 25% of the remaining 80% becomes the validation set
# (overall 60% train / 20% validation / 20% test).
(df_train, df_valid,
 x_train, x_valid,
 abo_avg_train, abo_avg_valid) = train_test_split(
    df_train_valid, x_train_valid, abo_avg_train_valid,
    test_size=0.25, random_state=42)


In [30]:
#Q3
print('Q3')
from sklearn.metrics import mutual_info_score

# Mutual information between the above-average target and every column of
# the training frame, ordered from lowest to highest score.
# NOTE(review): numeric columns are scored too, not only categorical ones --
# confirm that this is intended.
mut_scores = pd.Series(
    {col: mutual_info_score(abo_avg_train, df_train[col]) for col in df_train.columns}
).sort_values()
mut_scores.head(1)

Q3


transmission_type    0.020958
dtype: float64

In [31]:
#Q4

#logistic regression
from sklearn.linear_model import LogisticRegression

# Classifier for the above-average target, trained on the encoded training rows.
model = LogisticRegression(
    solver='liblinear',
    C=10,
    max_iter=1000,
    random_state=42,
)
model.fit(x_train, abo_avg_train)

# Class predictions for the validation rows.
y_pred = model.predict(x_valid)

# Validation accuracy: share of predictions equal to the true label.
# `acc` is read again by the feature-elimination cell (Q5).
is_correct = (y_pred == abo_avg_valid)
acc = is_correct.mean()
print(round(acc, 2))

0.95


In [32]:
#Q5
print('Q5')
def give_diff(c):
    """Return the change in validation accuracy when column(s) `c` are removed.

    The feature frame `df` is re-encoded without `c`, split with the same
    seed and sizes as the main split, and a logistic regression with the
    same hyper-parameters as the baseline model is trained on it.

    Reads the notebook globals `df`, `abo_avg_train`, `abo_avg_valid` and
    the baseline accuracy `acc`.

    Parameters
    ----------
    c : label or list of labels
        Column(s) of `df` to leave out.

    Returns
    -------
    float
        `acc - t_acc`: positive when accuracy is lower without the
        feature, negative when it is higher.
    """
    t_df = df.drop(c, axis=1)

    # Use a vectorizer local to this call. Re-fitting the shared `dv` here
    # would silently replace the encoder that produced x_train / x_valid.
    t_dv = DictVectorizer(sparse=False)
    t_df_dict = t_df.to_dict(orient='records')
    t_x_df = t_dv.fit_transform(t_df_dict)

    # Same seed and sizes as the main split, so the rows line up with
    # abo_avg_train / abo_avg_valid.
    t_x_train_valid, t_x_test = train_test_split(t_x_df, test_size=0.2, random_state=42)
    t_x_train, t_x_valid = train_test_split(t_x_train_valid, test_size=0.25, random_state=42)

    t_model = LogisticRegression(solver='liblinear', C=10, max_iter=1000, random_state=42)
    t_model.fit(t_x_train, abo_avg_train)
    t_y_pred = t_model.predict(t_x_valid)

    t_acc = (t_y_pred == abo_avg_valid).mean()
    return acc - t_acc

# One row per feature column: the column name and the accuracy difference
# returned by give_diff() when that column is left out.
cols_accs = pd.DataFrame(
    [(feature, give_diff(feature)) for feature in df.columns],
    columns=['feature', 'difference'],
)
cols_accs

Q5


Unnamed: 0,feature,difference
0,make,-0.003777
1,model,0.025598
2,year,-0.001259
3,engine_hp,0.022241
4,engine_cylinders,-0.001679
5,transmission_type,-0.000839
6,vehicle_style,0.001679
7,highway_mpg,-0.001679
8,city_mpg,0.012589


In [43]:
#Q6
print('Q6')
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error

# Regression target: log(1 + price).
price = np.log1p(full_df['price'])

# Split the target with the same seed and sizes used for the feature matrix.
price_train_valid, price_test = train_test_split(
    price, test_size=0.2, random_state=42)
price_train, price_valid = train_test_split(
    price_train_valid, test_size=0.25, random_state=42)

# Validation RMSE for each regularisation strength, keyed by alpha.
alphas = [0, 0.01, 0.1, 1, 10]
results = {}
for a in alphas:
    ridge_model = Ridge(alpha=a, random_state=42)
    ridge_model.fit(x_train, price_train)

    # NOTE: `y_pred` overwrites the classifier predictions stored under the
    # same name by the Q4 cell.
    y_pred = ridge_model.predict(x_valid)
    mse = mean_squared_error(price_valid, y_pred)
    rmse = np.sqrt(mse)
    results[a] = rmse

results

Q6


{0: 6049101209888.719,
 0.01: 0.22445872016685056,
 0.1: 0.2146592009879433,
 1: 0.22957450706455343,
 10: 0.32033174012236765}