## Import liabraries

In [204]:
## Import required frameworks
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.metrics import mutual_info_score

## Import 'housing.csv' data set

In [205]:
# Load the 'housing.csv' file to the Data Frame
csv_file = 'car_price_data.csv'
df_original = pd.read_csv(csv_file, delimiter=',')

# Show the columns names from "df_original"
columns = list(df_original.columns)
# print(columns) 

# Set the following columns from the Data Frame "df_original"
selected_columns = ['Make', 'Model', 'Year', 'Engine Fuel Type', 'Engine HP', 'Engine Cylinders', 'Transmission Type', 'Vehicle Style', 'highway MPG', 'city mpg', 'MSRP']

df = df_original[selected_columns]

# Show the columns names from "df"
columns = list(df.columns)
# print(columns) 

# Make all columns' names lower cased
df.columns = df.columns.str.replace(' ', '_').str.lower()

# Fill missing values with 0 for the selected features
df = df.fillna(0)

# Rename the "MSRP" column to "price"
df.rename(columns={'msrp': 'price'}, inplace=True)

## Question 1
What is the most frequent observation (mode) for the column transmission_type?

In [206]:
# Calculate the mode of the "Transmission Type" column
mode_transmission_type = df['transmission_type'].mode()[0]

print(f"The most frequent observation for Transmission Type is: {mode_transmission_type}")

The most frequent observation for Transmission Type is: AUTOMATIC


## Answer for Question 1: The most frequent observation for Transmission Type is: AUTOMATIC

## Question 2
Create the correlation matrix for the numerical features of your dataset. In a correlation matrix, you compute the correlation coefficient between every pair of features in the dataset.

In [207]:
# Select the following features for the correlation matrix
selected_features = ['year', 'engine_hp', 'engine_cylinders', 'highway_mpg', 'city_mpg']

# Create a correlation matrix for the selected features
correlation_matrix = df[selected_features].corr()

print(correlation_matrix)

                      year  engine_hp  engine_cylinders  highway_mpg  city_mpg
year              1.000000   0.338714         -0.040708     0.258240  0.198171
engine_hp         0.338714   1.000000          0.774851    -0.415707 -0.424918
engine_cylinders -0.040708   0.774851          1.000000    -0.614541 -0.587306
highway_mpg       0.258240  -0.415707         -0.614541     1.000000  0.886829
city_mpg          0.198171  -0.424918         -0.587306     0.886829  1.000000


## Answer for Question 2: The features "highway_mpg" and "city_mpg" have the biggest correlation coefficient over all pairs of coefficients

Make price binary

In [208]:
# Convert prices of cars into a binary format

# Calculate the mean of the "price" column
mean_price = df['price'].mean()

# Convert values to binary distribution
df['above_average'] = df['price'].apply(lambda x: 1 if x >= mean_price else 0)

Split the data

In [209]:
# Separate the target variable from the features
X = df.drop(columns=['above_average'])
y = df['above_average']

# Split the data into train/val/test sets with a 60%/20%/20% distribution
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.4, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

# Print the shapes of the resulting sets to verify the distribution
print(f"X_train shape: {X_train.shape}, y_train shape: {y_train.shape}")
print(f"X_val shape: {X_val.shape}, y_val shape: {y_val.shape}")
print(f"X_test shape: {X_test.shape}, y_test shape: {y_test.shape}")

X_train shape: (7148, 11), y_train shape: (7148,)
X_val shape: (2383, 11), y_val shape: (2383,)
X_test shape: (2383, 11), y_test shape: (2383,)


In [210]:
print(X_train.head())

            make            model  year                engine_fuel_type  \
4949      Nissan         Frontier  2015                regular unleaded   
579         FIAT             500L  2016  premium unleaded (recommended)   
8364        Ford           Ranger  2011                regular unleaded   
8775   Chevrolet             S-10  2003        flex-fuel (unleaded/E85)   
7370  Mitsubishi  Outlander Sport  2016                regular unleaded   

      engine_hp  engine_cylinders transmission_type        vehicle_style  \
4949      261.0               6.0         AUTOMATIC      Crew Cab Pickup   
579       160.0               4.0            MANUAL                Wagon   
8364      207.0               6.0            MANUAL  Extended Cab Pickup   
8775      120.0               4.0            MANUAL  Extended Cab Pickup   
7370      168.0               4.0         AUTOMATIC              4dr SUV   

      highway_mpg  city_mpg  price  
4949           21        15  32560  
579            33 

In [211]:
print(y_train[:5])

4949    0
579     0
8364    0
8775    0
7370    0
Name: above_average, dtype: int64


## Question 3
Calculate the mutual information score

In [212]:
# "above_average" VS "make" variable
round(mutual_info_score(y_train, X_train.make), 2)

0.24

In [213]:
# "above_average" VS "model" variable
round(mutual_info_score(y_train, X_train.model), 2)

0.46

In [214]:
# "above_average" VS "transmission_type" variable
round(mutual_info_score(y_train, X_train.transmission_type), 2)

0.02

In [215]:
# "above_average" VS "vehicle_style" variable
round(mutual_info_score(y_train, X_train.vehicle_style), 2)

0.08

In [178]:
def mutual_info_churn_score(series):
    return mutual_info_score(series, y_train)

In [216]:
categorical = ["make", "model", "transmission_type", "vehicle_style"]

mi = X_train[categorical].apply(mutual_info_churn_score)
round(mi.sort_values(ascending=False), 2)

model                0.46
make                 0.24
vehicle_style        0.08
transmission_type    0.02
dtype: float64

## Answer for Question 3: The minimum information score has a pair of "above_average" (y_test) and "transmission_type" (X_test)

## Question 4
Train a Logistic regression with all categorical features

In [217]:
# Load the required libraries
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [218]:
# Define the categorical columns that need one-hot encoding
categorical_columns = ["make", "model", "transmission_type", "vehicle_style"]

In [219]:
# Define a transformer for one-hot encoding categorical features
categorical_transformer = Pipeline(steps=[
    ('onehot', OneHotEncoder(sparse=False, handle_unknown='ignore'))
])

In [220]:
# Use ColumnTransformer to apply the appropriate transformations to each column
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', categorical_transformer, categorical_columns)
    ])

In [221]:
# Define the logistic regression model with specified parameters
model = LogisticRegression(solver='liblinear', C=10, max_iter=1000, random_state=42)

In [222]:
# Create a pipeline that combines the preprocessor and the logistic regression model
clf = Pipeline(steps=[('preprocessor', preprocessor),
                      ('classifier', model)])

In [223]:
# Fit the model on the training dataset
clf.fit(X_train, y_train)



In [224]:
# Predict on the validation dataset
y_pred = clf.predict(X_val)

In [225]:
# Calculate the accuracy on the validation dataset and round it to 2 decimal digits
accuracy = round(accuracy_score(y_val, y_pred), 2)

In [226]:
# Print the accuracy
print(f"Accuracy on the validation dataset: {accuracy}")

Accuracy on the validation dataset: 0.94


## Answer for Question 4: The accuracy for Linear Regression model, including all features, equal to 0.94 (0.95)

## Question 5
Train a Logistic regression by eliminating each of categorical features

In [247]:
from sklearn.base import clone

# Assuming as we have X_train, y_train, X_val, and y_val as the training and validation sets

# Define the categorical columns for feature elimination
categorical_columns = ['make', 'model', 'transmission_type', 'vehicle_style']

# Define a transformer for one-hot encoding categorical features
categorical_transformer = Pipeline(steps=[
    ('onehot', OneHotEncoder(sparse=False, handle_unknown='ignore'))
])

# Use ColumnTransformer to apply the appropriate transformations to each column
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', categorical_transformer, categorical_columns)
    ])

# Define the logistic regression model with specified parameters
model = LogisticRegression(solver='liblinear', C=10, max_iter=1000, random_state=42)

# Create a pipeline that combines the preprocessor and the logistic regression model
clf = Pipeline(steps=[('preprocessor', preprocessor),
                      ('classifier', model)])

# Fit the model on the training dataset with all features
clf.fit(X_train, y_train)

# Calculate the accuracy with all features
original_accuracy = accuracy_score(y_val, clf.predict(X_val))

print(round(original_accuracy, 2))

0.94




In [249]:
# Clone the original model
cloned_model = clone(clf)

# Remove the current feature from X_train and X_val
X_train_no_feature = X_train.loc[:, X_train.columns != 'make']
X_val_no_feature = X_val.loc[:, X_val.columns != 'make']


# Fit the model without the current feature
cloned_model.fit(X_train_no_feature, y_train)

# Calculate accuracy without the current feature
accuracy_no_feature = accuracy_score(y_val, cloned_model.predict(X_val_no_feature))

# Calculate the difference in accuracy
accuracy_difference = original_accuracy - accuracy_no_feature

print(accuracy_difference)

ValueError: A given column is not a column of the dataframe

In [250]:
# Define the categorical columns for feature elimination
categorical_columns = ['make', 'model', 'transmission_type', 'vehicle_style']

# Dictionary to store accuracy differences
accuracy_differences = {}

# Iterate through each feature for elimination
for feature in categorical_columns:
    # Clone the original model
    cloned_model = clone(clf)
    
    # Remove the current feature from X_train and X_val
    X_train_no_feature = X_train.drop(columns=[feature])
    X_val_no_feature = X_val.drop(columns=[feature])
    
    # Fit the model without the current feature
    cloned_model.fit(X_train_no_feature, y_train)
    
    # Calculate accuracy without the current feature
    accuracy_no_feature = accuracy_score(y_val, cloned_model.predict(X_val_no_feature))
    
    # Calculate the difference in accuracy
    accuracy_difference = original_accuracy - accuracy_no_feature
    
    # Store the accuracy difference in the dictionary
    accuracy_differences[feature] = accuracy_difference

    # Print the accuracy differences
for feature, difference in accuracy_differences.items():
    print(f"Feature: {feature}, Accuracy Difference: {round(difference, 4)}")

ValueError: A given column is not a column of the dataframe