In [21]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.metrics import mean_squared_error, r2_score

In [22]:
# Read ./merged_df.csv (relative to the notebook's working directory) into `df`.
df = pd.read_csv(filepath_or_buffer='./merged_df.csv')

In [23]:
# Columns fed to the model without any encoding step.
# The encoding loop in a later cell appends the categorical columns to this list.
clean_features = [
    'Engine_Displacement_L',
    'Engine_Cylinders',
    'Air_Pollution_Score',
    'Greenhouse_Gas_Score',
    'Combined_Co2',
]

In [24]:
# Integer-encode the categorical columns in place and register them as features.
#
# The fitted encoders are kept in `label_encoders` (keyed by column name) so the
# category -> code mapping can be re-applied to new data or inverted later;
# previously each encoder was thrown away right after fitting.
#
# NOTE(review): scikit-learn documents LabelEncoder as intended for target
# labels; OrdinalEncoder is the feature-side equivalent - consider switching.
# NOTE: re-running this cell after `df` has already been encoded re-fits the
# encoders on the integer codes; reload `df` first to recover the original classes.
label_encoders = {}
for col in ['Transmission_Type', 'Drive', 'Fuel', 'Cert_Region', 'Veh_Class']:
  encoder = LabelEncoder()
  df[col] = encoder.fit_transform(df[col])
  label_encoders[col] = encoder
  # Guard the append so that re-running the cell does not duplicate feature names.
  if col not in clean_features:
    clean_features.append(col)

In [25]:
# Build the target (City_Mpg) and the feature matrix from the selected columns,
# then hold out 20% of the rows for evaluation (fixed seed for reproducibility).
y = df['City_Mpg']
X_clean = df[clean_features]
(X_train_clean, X_test_clean,
 y_train_clean, y_test_clean) = train_test_split(
    X_clean, y, test_size=0.2, random_state=42)

# Fit the decision tree regressor on the training split and predict the held-out rows.
dt_clean = DecisionTreeRegressor(random_state=42).fit(X_train_clean, y_train_clean)
y_pred_clean = dt_clean.predict(X_test_clean)

# Evaluation on the held-out split: RMSE is the square root of the MSE.
# NOTE(review): the recorded R^2 (~0.995) is very high; Combined_Co2 and
# Greenhouse_Gas_Score may be near-deterministic functions of MPG - confirm
# this is not target leakage.
mse = mean_squared_error(y_test_clean, y_pred_clean)
rmse = np.sqrt(mse)
dt_clean_metrics = dict(
    rmse=rmse,
    r2=r2_score(y_test_clean, y_pred_clean),
)

dt_clean_metrics



{'rmse': np.float64(0.6954676811023496), 'r2': 0.9953074018121737}

Save Decision Tree

In [26]:
import joblib
from pathlib import Path

# Persist the fitted decision tree (`dt_clean`) to disk with joblib.
model_path = './model/mpg_decision_tree_model.pkl'
# joblib.dump does not create missing parent directories, so make sure ./model
# exists first; otherwise a fresh checkout fails with FileNotFoundError.
Path(model_path).parent.mkdir(parents=True, exist_ok=True)
joblib.dump(dt_clean, model_path)

['./model/mpg_decision_tree_model.pkl']