In [3]:
import pandas as pd
import numpy as np

In [4]:
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import mutual_info_classif
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression

In [5]:
url = "https://raw.githubusercontent.com/alexeygrigorev/mlbookcamp-code/master/chapter-02-car-price/data.csv"
data = pd.read_csv(url)

In [6]:
data

Unnamed: 0,Make,Model,Year,Engine Fuel Type,Engine HP,Engine Cylinders,Transmission Type,Driven_Wheels,Number of Doors,Market Category,Vehicle Size,Vehicle Style,highway MPG,city mpg,Popularity,MSRP
0,BMW,1 Series M,2011,premium unleaded (required),335.0,6.0,MANUAL,rear wheel drive,2.0,"Factory Tuner,Luxury,High-Performance",Compact,Coupe,26,19,3916,46135
1,BMW,1 Series,2011,premium unleaded (required),300.0,6.0,MANUAL,rear wheel drive,2.0,"Luxury,Performance",Compact,Convertible,28,19,3916,40650
2,BMW,1 Series,2011,premium unleaded (required),300.0,6.0,MANUAL,rear wheel drive,2.0,"Luxury,High-Performance",Compact,Coupe,28,20,3916,36350
3,BMW,1 Series,2011,premium unleaded (required),230.0,6.0,MANUAL,rear wheel drive,2.0,"Luxury,Performance",Compact,Coupe,28,18,3916,29450
4,BMW,1 Series,2011,premium unleaded (required),230.0,6.0,MANUAL,rear wheel drive,2.0,Luxury,Compact,Convertible,28,18,3916,34500
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11909,Acura,ZDX,2012,premium unleaded (required),300.0,6.0,AUTOMATIC,all wheel drive,4.0,"Crossover,Hatchback,Luxury",Midsize,4dr Hatchback,23,16,204,46120
11910,Acura,ZDX,2012,premium unleaded (required),300.0,6.0,AUTOMATIC,all wheel drive,4.0,"Crossover,Hatchback,Luxury",Midsize,4dr Hatchback,23,16,204,56670
11911,Acura,ZDX,2012,premium unleaded (required),300.0,6.0,AUTOMATIC,all wheel drive,4.0,"Crossover,Hatchback,Luxury",Midsize,4dr Hatchback,23,16,204,50620
11912,Acura,ZDX,2013,premium unleaded (recommended),300.0,6.0,AUTOMATIC,all wheel drive,4.0,"Crossover,Hatchback,Luxury",Midsize,4dr Hatchback,23,16,204,50920


In [7]:
# Select the required features and clean the column names
selected_features = ['Make', 'Model', 'Year', 'Engine HP', 'Engine Cylinders', 'Transmission Type','Vehicle Style', 'highway MPG', 'city mpg', 'MSRP']


In [6]:
data = data[selected_features]


In [7]:
data

Unnamed: 0,Make,Model,Year,Engine HP,Engine Cylinders,Transmission Type,Vehicle Style,highway MPG,city mpg,MSRP
0,BMW,1 Series M,2011,335.0,6.0,MANUAL,Coupe,26,19,46135
1,BMW,1 Series,2011,300.0,6.0,MANUAL,Convertible,28,19,40650
2,BMW,1 Series,2011,300.0,6.0,MANUAL,Coupe,28,20,36350
3,BMW,1 Series,2011,230.0,6.0,MANUAL,Coupe,28,18,29450
4,BMW,1 Series,2011,230.0,6.0,MANUAL,Convertible,28,18,34500
...,...,...,...,...,...,...,...,...,...,...
11909,Acura,ZDX,2012,300.0,6.0,AUTOMATIC,4dr Hatchback,23,16,46120
11910,Acura,ZDX,2012,300.0,6.0,AUTOMATIC,4dr Hatchback,23,16,56670
11911,Acura,ZDX,2012,300.0,6.0,AUTOMATIC,4dr Hatchback,23,16,50620
11912,Acura,ZDX,2013,300.0,6.0,AUTOMATIC,4dr Hatchback,23,16,50920


In [8]:
data.columns = data.columns.str.replace (' ', '_').str.lower()

In [9]:
# Fill missing values with 0
data.fillna(0, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  downcast=downcast,


In [10]:
# Rename MSRP to price
data.rename(columns={'msrp': 'price'}, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


In [11]:
most_frequent_transmission_type = data['transmission_type'].mode()[0]

In [12]:
print(f"The most frequent observation for 'transmission_type' is: {most_frequent_transmission_type}")

The most frequent observation for 'transmission_type' is: AUTOMATIC


In [13]:
correlation_matrix = data.corr()
# Identify the two features with the highest correlation
highest_correlation = correlation_matrix.unstack().sort_values(ascending=False)[2:4]

In [14]:
highest_correlation

city_mpg     city_mpg       1.0
highway_mpg  highway_mpg    1.0
dtype: float64

In [15]:
# Calculate the mean of the 'Price' variable
mean_price = data['price'].mean()

# Create the 'above_average' binary variable
data['above_average'] = (data['price'] > mean_price).astype(int)

# Display the first few rows of the updated dataset
print(data.head())

  make       model  year  engine_hp  engine_cylinders transmission_type  \
0  BMW  1 Series M  2011      335.0               6.0            MANUAL   
1  BMW    1 Series  2011      300.0               6.0            MANUAL   
2  BMW    1 Series  2011      300.0               6.0            MANUAL   
3  BMW    1 Series  2011      230.0               6.0            MANUAL   
4  BMW    1 Series  2011      230.0               6.0            MANUAL   

  vehicle_style  highway_mpg  city_mpg  price  above_average  
0         Coupe           26        19  46135              1  
1   Convertible           28        19  40650              1  
2         Coupe           28        20  36350              0  
3         Coupe           28        18  29450              0  
4   Convertible           28        18  34500              0  


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """


In [16]:
data

Unnamed: 0,make,model,year,engine_hp,engine_cylinders,transmission_type,vehicle_style,highway_mpg,city_mpg,price,above_average
0,BMW,1 Series M,2011,335.0,6.0,MANUAL,Coupe,26,19,46135,1
1,BMW,1 Series,2011,300.0,6.0,MANUAL,Convertible,28,19,40650,1
2,BMW,1 Series,2011,300.0,6.0,MANUAL,Coupe,28,20,36350,0
3,BMW,1 Series,2011,230.0,6.0,MANUAL,Coupe,28,18,29450,0
4,BMW,1 Series,2011,230.0,6.0,MANUAL,Convertible,28,18,34500,0
...,...,...,...,...,...,...,...,...,...,...,...
11909,Acura,ZDX,2012,300.0,6.0,AUTOMATIC,4dr Hatchback,23,16,46120,1
11910,Acura,ZDX,2012,300.0,6.0,AUTOMATIC,4dr Hatchback,23,16,56670,1
11911,Acura,ZDX,2012,300.0,6.0,AUTOMATIC,4dr Hatchback,23,16,50620,1
11912,Acura,ZDX,2013,300.0,6.0,AUTOMATIC,4dr Hatchback,23,16,50920,1


In [17]:
num_rows = len(data)
print("Number of rows:", num_rows)

Number of rows: 11914


In [18]:
from sklearn.model_selection import train_test_split

In [19]:
df_full_train, df_test = train_test_split(data, test_size=0.2, random_state=42)
df_train, df_val = train_test_split(df_full_train, test_size=0.25, random_state=42)

In [20]:
len(df_train), len(df_val), len(df_test)

(7148, 2383, 2383)

In [21]:
df_train = df_train.reset_index (drop=True)
df_val = df_val.reset_index (drop=True)
df_test = df_test.reset_index (drop=True)

In [22]:
y_train = df_train.above_average.values
y_val = df_val.above_average.values
y_test = df_test.above_average.values

del df_train['above_average']
del df_val['above_average']
del df_test['above_average']

In [23]:
df_full_train = df_full_train.reset_index(drop=True)

In [24]:
df_full_train.isnull().sum()

make                 0
model                0
year                 0
engine_hp            0
engine_cylinders     0
transmission_type    0
vehicle_style        0
highway_mpg          0
city_mpg             0
price                0
above_average        0
dtype: int64

In [25]:
df_full_train.year.price(normalize=True)

AttributeError: 'Series' object has no attribute 'price'

In [26]:
df_full_train.above_average.mean()

0.2767810303221068

In [27]:
categorical = ['Make', 'Model', 'Year', 'Engine HP', 'Engine Cylinders', 'Transmission Type','Vehicle Style', 'highway MPG', 'city mpg', 'price']

In [28]:
y_train

array([0, 0, 1, ..., 0, 0, 0])

Q 3 - Mutual information

In [29]:
from IPython.display import display
from sklearn.metrics import mutual_info_score

In [30]:
global_above_average = df_full_train.above_average.mean()
global_above_average

0.2767810303221068

In [31]:
for c in categorical:
    print(c)
    df_group = df_full_train.groupby(c).above_average.agg(['mean', 'count'])
    df_group['diff'] = df_group['mean'] - global_above_average
    df_group['risk'] = df_group['mean'] / global_above_average
    display(df_group)
    print()
    print()

Make


KeyError: 'Make'

In [32]:
from sklearn.metrics import mutual_info_score

In [33]:
mutual_info_score(df_full_train.above_average, df_full_train.make)

0.2387236479073192

In [34]:
mutual_info_score(df_full_train.above_average, df_full_train.model)

0.46099440756035703

In [35]:
mutual_info_score(df_full_train.above_average, df_full_train.transmission_type)

0.020883575914971135

In [36]:
mutual_info_score(df_full_train.above_average, df_full_train.vehicle_style)

0.08339022741593435

In [37]:
categorical = ['make', 'model', 'transmission_type', 'vehicle_style']

In [38]:
df_full_train

Unnamed: 0,make,model,year,engine_hp,engine_cylinders,transmission_type,vehicle_style,highway_mpg,city_mpg,price,above_average
0,Cadillac,CT6,2016,265.0,4.0,AUTOMATIC,Sedan,31,22,53495,1
1,Mercedes-Benz,GLS-Class,2017,449.0,8.0,AUTOMATIC,4dr SUV,18,14,93850,1
2,Kia,Forte,2016,173.0,4.0,AUTOMATIC,Coupe,34,25,19890,0
3,Dodge,RAM 250,1993,180.0,6.0,MANUAL,Regular Cab Pickup,16,11,2000,0
4,Hyundai,Tiburon,2008,172.0,6.0,AUTOMATIC,2dr Hatchback,24,17,21270,0
...,...,...,...,...,...,...,...,...,...,...,...
9526,Toyota,Venza,2014,181.0,4.0,AUTOMATIC,Wagon,26,20,27950,0
9527,Pontiac,G6,2009,219.0,6.0,AUTOMATIC,Sedan,26,17,24710,0
9528,Volkswagen,Golf GTI,2016,220.0,4.0,AUTOMATED_MANUAL,2dr Hatchback,33,25,27590,0
9529,Saab,9-5,2009,260.0,4.0,AUTOMATIC,Wagon,27,17,43270,1


In [39]:
def mutual_info_above_average_score(series):
    return mutual_info_score(series, df_full_train.above_average)

In [40]:
mi = df_full_train[categorical].apply(mutual_info_above_average_score)
mi.sort_values(ascending=False)

model                0.460994
make                 0.238724
vehicle_style        0.083390
transmission_type    0.020884
dtype: float64

*QUESTION 4*

In [41]:
from sklearn.feature_extraction import DictVectorizer

In [42]:
dv = DictVectorizer(sparse=False)

*numerical and categorical variables*

In [43]:
df_full_train = df_full_train.reset_index(drop=True)

In [44]:
df_full_train.isnull().sum()

make                 0
model                0
year                 0
engine_hp            0
engine_cylinders     0
transmission_type    0
vehicle_style        0
highway_mpg          0
city_mpg             0
price                0
above_average        0
dtype: int64

In [45]:
df_full_train.above_average.value_counts(normalize=True)

0    0.723219
1    0.276781
Name: above_average, dtype: float64

In [46]:
df_full_train.dtypes

make                  object
model                 object
year                   int64
engine_hp            float64
engine_cylinders     float64
transmission_type     object
vehicle_style         object
highway_mpg            int64
city_mpg               int64
price                  int64
above_average          int32
dtype: object

In [47]:
df_full_train.columns

Index(['make', 'model', 'year', 'engine_hp', 'engine_cylinders',
       'transmission_type', 'vehicle_style', 'highway_mpg', 'city_mpg',
       'price', 'above_average'],
      dtype='object')

In [48]:
categorical = ['make', 'model', 'transmission_type', 'vehicle_style']

In [49]:
numerical = [ 'year', 'engine_hp', 'engine_cylinders', 'highway_mpg', 'city_mpg', 'price'] 

In [50]:
dv = DictVectorizer(sparse=False)

train_dict = df_train[categorical + numerical].to_dict(orient='records')
X_train = dv.fit_transform(train_dict)

val_dict = df_val[categorical + numerical].to_dict(orient='records')
X_val = dv.transform(val_dict)

In [51]:
def sigmoid(z):
    return 1 / (1 + np.exp(-z))

In [52]:
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score

In [53]:
model = LogisticRegression(solver='liblinear', C=10, max_iter=1000, random_state=42)

In [54]:
pipeline = Pipeline([
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
    ('model', model)
])

In [55]:
pipeline.fit(X_train, y_train)


Pipeline(steps=[('onehot', OneHotEncoder(handle_unknown='ignore')),
                ('model',
                 LogisticRegression(C=10, max_iter=1000, random_state=42,
                                    solver='liblinear'))])

In [56]:
y_val_pred = pipeline.predict(X_val)

In [57]:
accuracy = round(accuracy_score(y_val, y_val_pred), 2)

In [58]:
print("Validation Accuracy:", accuracy)

Validation Accuracy: 0.97


*QUESTION 5*

In [59]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [60]:
accuracy_differences = {}

In [61]:
model.fit(X_train, y_train)
y_pred_all_features = model.predict(X_val)
accuracy_all_features = accuracy_score(y_val, y_pred_all_features)

In [62]:
for feature in features:
    # Find the index of the current feature in the list of column names
    feature_index = data.columns.tolist().index(feature)


NameError: name 'features' is not defined

In [63]:
for feature in features:
    # Find the index of the current feature in the list of column names
    feature_index = data.columns.tolist().index(feature)

    # Exclude the current feature from the feature set
    X_train_without_feature = np.delete(X_train, feature_index, axis=1)
    X_val_without_feature = np.delete(X_val, feature_index, axis=1)

    # Train a model without the current feature
    model.fit(X_train_without_feature, y_train)
    y_pred_without_feature = model.predict(X_val_without_feature)
    accuracy_without_feature = accuracy_score(y_val, y_pred_without_feature)

    # Calculate the difference in accuracy
    accuracy_diff = accuracy_all_features - accuracy_without_feature

    # Store the feature name and its accuracy difference in the dictionary
    feature_accuracy_diff[feature] = accuracy_diff

NameError: name 'features' is not defined

In [64]:
least_useful_feature = min(feature_accuracy_diff, key=feature_accuracy_diff.get)

# Print the least useful feature and its accuracy difference
print("Least useful feature:", least_useful_feature)
print("Accuracy difference:", feature_accuracy_diff[least_useful_feature])

NameError: name 'feature_accuracy_diff' is not defined

In [65]:
# Find the feature with the smallest difference
smallest_difference_feature = min(accuracy_differences, key=accuracy_differences.get)

# Print the feature with the smallest difference
print("Feature with the smallest difference:", smallest_difference_feature)

ValueError: min() arg is an empty sequence

QUESTION 5.2

In [66]:
features = ['year', 'engine_hp', 'transmission_type', 'city_mpg']

In [67]:
feature_accuracy_diff = {}

In [68]:
model = LogisticRegression(solver='liblinear', C=10, max_iter=1000, random_state=42)
model.fit(X_train, y_train)
y_pred_all_features = model.predict(X_val)
accuracy_all_features = accuracy_score(y_val, y_pred_all_features)

In [69]:
for feature in ['year', 'engine_hp', 'transmission_type', 'city_mpg']:
    # Create a copy of the training and validation data without the current feature
    X_train_without_feature = X_train.drop(columns=[feature])
    X_val_without_feature = X_val.drop(columns=[feature])

    # Train a model without the current feature
    model.fit(X_train_without_feature, y_train)
    y_pred_without_feature = model.predict(X_val_without_feature)
    accuracy_without_feature = accuracy_score(y_val, y_pred_without_feature)

    # Calculate the difference in accuracy
    accuracy_diff = accuracy_all_features - accuracy_without_feature

    # Store the feature name and its accuracy difference in the dictionary
    feature_accuracy_diff[feature] = accuracy_diff

AttributeError: 'numpy.ndarray' object has no attribute 'drop'

In [70]:
# Find the feature with the smallest accuracy difference
least_useful_feature = min(feature_accuracy_diff, key=feature_accuracy_diff.get)

# Print the least useful feature and its accuracy difference
print("Least useful feature:", least_useful_feature)
print("Accuracy difference:", feature_accuracy_diff[least_useful_feature])

ValueError: min() arg is an empty sequence

*QUESTION 6*

In [71]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error

In [72]:
# Apply logarithmic transformation to the 'price' column
y_train_log = np.log1p(y_train)
y_val_log = np.log1p(y_val)

In [73]:
# List of alpha values to try
alphas = [0, 0.01, 0.1, 1, 10]

In [74]:
# Initialize variables to store best RMSE and best alpha
best_rmse = float('inf')
best_alpha = None

In [75]:
# Loop through alpha values and fit Ridge models
for alpha in alphas:
    model = Ridge(alpha=alpha, solver='sag', random_state=42)
    model.fit(X_train, y_train_log)
    
    # Predict on the validation set
    y_pred_log = model.predict(X_val)
    
    # Calculate RMSE
    rmse = np.sqrt(mean_squared_error(y_val_log, y_pred_log))
    
    # Check if this alpha resulted in a better RMSE
    if rmse < best_rmse:
        best_rmse = rmse
        best_alpha = alpha

In [76]:
best_rmse = round(best_rmse, 3)

In [77]:
print("Best alpha:", best_alpha)
print("Best RMSE:", best_rmse)

Best alpha: 0
Best RMSE: 0.283
