In [None]:
import pandas as pd
import numpy as np

In [None]:
df = pd.read_csv('https://raw.githubusercontent.com/alexeygrigorev/mlbookcamp-code/master/chapter-02-car-price/data.csv')

In [None]:
df.head(5)

In [None]:
selected_columns = ['Make', 'Model', 'Year', 'Engine HP', 'Engine Cylinders', 'Transmission Type', 'Vehicle Style', 'highway MPG', 'city mpg', 'MSRP']

In [None]:
df = df[selected_columns]

In [None]:
df

In [None]:
df.columns = df.columns.str.replace(' ', '_').str.lower()

In [None]:
df.columns

In [None]:
df.head(2)

In [None]:
df = df.fillna(0)

In [None]:
df

In [None]:
df = df.rename(columns = {'msrp' : 'price'})

In [None]:
df

In [None]:
df['transmission_type'].mode()

In [None]:
import pandas as pd

numerical_columns = ['year', 'engine_hp', 'engine_cylinders', 'highway_mpg', 'city_mpg']

# Pairwise correlations between the numerical features.
correlation_matrix = df[numerical_columns].corr()

# Finding the features with the largest correlation.
# Only the strict upper triangle is kept, so each unordered pair appears exactly
# once and the diagonal (a feature against itself) is excluded. De-duplicating on
# the correlation *value* instead could silently discard a different pair that
# happens to share the same coefficient.
upper_triangle = np.triu(np.ones(correlation_matrix.shape, dtype=bool), k=1)
highest_corr = (
    correlation_matrix.abs()        # strength matters, not the sign
    .where(upper_triangle)          # everything outside the triangle becomes NaN
    .unstack()                      # Series indexed by (feature, feature)
    .dropna()                       # remove the masked entries
    .sort_values(ascending=False)
)

# The first entry is the single most strongly correlated pair, i.e. the two
# features asked for (not the two best *pairs*).
top_corr_features = highest_corr.index[0]

# Print the correlation matrix and the two features with the biggest correlation
print("Correlation Matrix:")
print(correlation_matrix)
print("\nTwo features with the highest correlation:")
print(top_corr_features)


In [None]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction import DictVectorizer
from sklearn.metrics import mutual_info_score

# Making price binary: 1 when the price is above the mean price, 0 otherwise
df['above_average'] = (df['price'] > df['price'].mean()).astype(int)

# Splitting the data into train/val/test sets (60% / 20% / 20%)
train_data, test_data = train_test_split(df, test_size=0.2, random_state=42)
# 0.25 of the remaining 80% equals 20% of the original data for validation
train_data, val_data = train_test_split(train_data, test_size=0.25, random_state=42)

# Removing the target variable from the datasets.
# Note: 'price' is still present in these frames; they are not used for the
# mutual information computation below.
X_train = train_data.drop('above_average', axis=1)
X_val = val_data.drop('above_average', axis=1)

# Selecting categorical features
categorical_features = ['make', 'model', 'transmission_type', 'vehicle_style']

# Calculating mutual information scores on the training set only.
# mutual_info_score works on label vectors, so each raw categorical column is
# compared with the binary target directly; one-hot encoding is not needed
# (the previous version also referenced an undefined name and raised NameError).
# Unrounded scores are stored so that rounding cannot create artificial ties
# when the minimum is selected.
mi_scores = [
    (feature, mutual_info_score(train_data[feature], train_data['above_average']))
    for feature in categorical_features
]

# Printing the mutual information scores (rounded to 2 decimals for display only)
print("Mutual Information Scores:")
for feature, score in mi_scores:
    print(f"{feature}: {round(score, 2)}")

# Finding the variable with the lowest mutual information score
lowest_mi_variable = min(mi_scores, key=lambda x: x[1])[0]
print("\nVariable with the lowest mutual information score:", lowest_mi_variable)


In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# Text columns that have to be one-hot encoded before they reach the classifier.
categorical_features = ['make', 'model', 'transmission_type', 'vehicle_style']

# Inputs are every column except the raw price and the label derived from it;
# the label itself ('above_average') is the classification target.
X_train, y_train = (
    train_data.drop(columns=['price', 'above_average']),
    train_data['above_average'],
)
X_val, y_val = (
    val_data.drop(columns=['price', 'above_average']),
    val_data['above_average'],
)

# One-hot encode the categorical columns; categories unseen during fit are
# ignored at predict time, and the remaining columns pass through unchanged.
preprocessor = ColumnTransformer(
    transformers=[('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)],
    remainder='passthrough',
)

# Encoding and logistic regression chained into a single estimator.
model = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('classifier', LogisticRegression(solver='liblinear', C=10, max_iter=1000, random_state=42)),
    ]
)

# Train on the training split, then predict the validation split
# (fit() returns the fitted pipeline, so the calls can be chained).
y_val_pred = model.fit(X_train, y_train).predict(X_val)

# Share of validation rows whose predicted label matches the true label.
accuracy_val = accuracy_score(y_true=y_val, y_pred=y_val_pred)

# Report the accuracy rounded to two decimals.
print("Accuracy on the validation dataset:", round(accuracy_val, 2))


In [None]:
from sklearn.base import clone

def build_classifier(categorical_columns):
    """Return an unfitted one-hot-encoding + logistic-regression pipeline.

    Parameters
    ----------
    categorical_columns : list of str
        Columns to one-hot encode. Every name must exist in the frame later
        passed to ``fit``; all other columns are passed through unchanged.

    Returns
    -------
    sklearn.pipeline.Pipeline
        Pipeline with the steps ``'preprocessor'`` and ``'classifier'``.
    """
    encoder = ColumnTransformer(
        transformers=[
            ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_columns)
        ],
        remainder='passthrough'
    )
    return Pipeline([
        ('preprocessor', encoder),
        ('classifier', LogisticRegression(solver='liblinear', C=10, max_iter=1000, random_state=42))
    ])


# Separate the target variable 'above_average' and features
X_train = train_data.drop(['price', 'above_average'], axis=1)
y_train = train_data['above_average']
X_val = val_data.drop(['price', 'above_average'], axis=1)
y_val = val_data['above_average']

# Define categorical features for one-hot encoding
categorical_features = ['make', 'model', 'transmission_type', 'vehicle_style']

# Baseline model trained on all features
model = build_classifier(categorical_features)
preprocessor = model.named_steps['preprocessor']

# Fit the model on the training dataset
model.fit(X_train, y_train)

# Calculate accuracy on the validation dataset (original accuracy)
accuracy_original = accuracy_score(y_val, model.predict(X_val))

# Calculate the difference for each feature
feature_differences = {}
for feature in X_train.columns:
    # The encoder may only reference columns that are still in the frame.
    # Re-using the baseline column list after dropping a categorical feature
    # makes ColumnTransformer fail on the missing column.
    remaining_categorical = [name for name in categorical_features if name != feature]
    model_reduced = build_classifier(remaining_categorical)

    # Exclude the current feature
    X_train_excluded = X_train.drop(feature, axis=1)
    X_val_excluded = X_val.drop(feature, axis=1)

    # Fit the model without the current feature
    model_reduced.fit(X_train_excluded, y_train)

    # Calculate accuracy without the current feature
    accuracy_excluded = accuracy_score(y_val, model_reduced.predict(X_val_excluded))

    # Positive difference: accuracy dropped when the feature was removed
    feature_differences[feature] = accuracy_original - accuracy_excluded

# Find the feature with the smallest (signed) difference
smallest_difference_feature = min(feature_differences, key=feature_differences.get)

# Print the differences for each feature
print("Differences for each feature:")
for feature, difference in feature_differences.items():
    print(f"{feature}: {round(difference, 2)}")

print("\nFeature with the smallest difference:", smallest_difference_feature)


In [None]:
import numpy as np
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

# Separate the regression target 'price' from the features
# ('above_average' is derived from the price, so it is dropped as well)
X_train = train_data.drop(['price', 'above_average'], axis=1)
y_train = train_data['price']
X_val = val_data.drop(['price', 'above_average'], axis=1)
y_val = val_data['price']

# Apply logarithmic transformation to the target variable 'price'
y_train_log = np.log1p(y_train)
y_val_log = np.log1p(y_val)

# Define alpha values to try
alphas = [0, 0.01, 0.1, 1, 10]

# The feature frame mixes text and numeric columns. StandardScaler cannot be
# applied to text, so the categorical columns are one-hot encoded and only the
# numeric columns are standardised.
categorical_features = ['make', 'model', 'transmission_type', 'vehicle_style']
numerical_features = [name for name in X_train.columns if name not in categorical_features]

ridge_preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features),
        ('num', StandardScaler(), numerical_features)
    ]
)

# Create a pipeline with Ridge regression model
pipeline = Pipeline([
    ('preprocessor', ridge_preprocessor),
    ('regressor', Ridge(solver='sag', random_state=42))
])

# Dictionary to store RMSE scores for each alpha
rmse_scores = {}

# Fit the model for each alpha and calculate RMSE on the validation set
# (the RMSE is measured on the log1p scale, matching the transformed target)
for alpha in alphas:
    pipeline.set_params(regressor__alpha=alpha)
    pipeline.fit(X_train, y_train_log)
    y_val_log_pred = pipeline.predict(X_val)
    rmse = np.sqrt(mean_squared_error(y_val_log, y_val_log_pred))
    rmse_scores[alpha] = rmse

# Find the alpha with the best RMSE; min() returns the first minimum, so an
# exact tie resolves to the smallest alpha because alphas is in ascending order
best_alpha = min(rmse_scores, key=rmse_scores.get)

# Print the RMSE scores and the best alpha
print("RMSE scores for each alpha:")
for alpha, rmse in rmse_scores.items():
    print(f"Alpha {alpha}: {round(rmse, 3)}")

print("\nBest alpha:", best_alpha)