In [2]:
import pandas as pd
import numpy as np
import joblib
import os
import sys
import subprocess
 
# 1. Ensure XGBoost is installed
try:
    from xgboost import XGBClassifier
except ImportError:
    print("Installing XGBoost...")
    subprocess.check_call([sys.executable, "-m", "pip", "install", "xgboost"])
    from xgboost import XGBClassifier
 
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, matthews_corrcoef
 
# 2. Load the BMW Dataset
df = pd.read_csv('BMW_cars_2012_2025.csv')
 
# 3. Target Creation: Classify as High-End (1) vs Standard (0) based on Median MSRP
median_msrp = df['MSRP_USD'].median()
df['Target'] = (df['MSRP_USD'] > median_msrp).astype(int)
 
# 4. Handle Missing Values
df['Fuel_Economy_City_mpg'] = df['Fuel_Economy_City_mpg'].fillna(df['Fuel_Economy_City_mpg'].median())
df['Fuel_Economy_Highway_mpg'] = df['Fuel_Economy_Highway_mpg'].fillna(df['Fuel_Economy_Highway_mpg'].median())
 
# 5. Label Encoding for Categorical Columns
le = LabelEncoder()
cat_cols = ['Series', 'Body_Type', 'Engine_Type', 'Drivetrain', 'Transmission']
for col in cat_cols:
    df[col] = le.fit_transform(df[col].astype(str))
 
# 6. Feature Selection (Min 12 Features Required) [cite: 30]
features = [
    'Year', 'Displacement_L', 'Cylinders', 'Horsepower', 'Torque_lb_ft',
    '0_60_mph_sec', 'Top_Speed_mph', 'Fuel_Economy_City_mpg', 
    'Fuel_Economy_Highway_mpg', 'Seating_Capacity', 'Series', 'Body_Type'
]
 
X = df[features]
y = df['Target']
 
# 7. Split and Scale
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
 
# 8. Define Models Dictionary [cite: 34-39]
models = {
    "Logistic Regression": LogisticRegression(random_state=42),
    "Decision Tree": DecisionTreeClassifier(random_state=42),
    "kNN": KNeighborsClassifier(),
    "Naive Bayes": GaussianNB(),
    "Random Forest": RandomForestClassifier(random_state=42),
    "XGBoost": XGBClassifier(random_state=42, eval_metric='logloss')
}
 
# 9. Create 'model' folder and Train/Save Models 
if not os.path.exists('model'):
    os.makedirs('model')
 
results = []
 
print("Training models and saving .pkl files...")
for name, model in models.items():
    model.fit(X_train_scaled, y_train)
    # Save Model
    filename = name.lower().replace(" ", "_") + ".pkl"
    joblib.dump(model, os.path.join('model', filename))
    # Evaluation
    y_pred = model.predict(X_test_scaled)
    y_prob = model.predict_proba(X_test_scaled)[:, 1] if hasattr(model, "predict_proba") else y_pred
    results.append({
        "ML Model Name": name,
        "Accuracy": accuracy_score(y_test, y_pred),
        "AUC": roc_auc_score(y_test, y_prob),
        "Precision": precision_score(y_test, y_pred),
        "Recall": recall_score(y_test, y_pred),
        "F1": f1_score(y_test, y_pred),
        "MCC": matthews_corrcoef(y_test, y_pred)
    })
 
# 10. Output Comparison Table [cite: 70-71]
results_df = pd.DataFrame(results)
print("\n--- Model Comparison Table ---")
print(results_df.to_string(index=False))
 
# Verify file creation
print("\nFiles in 'model' directory:", os.listdir('model'))

Installing XGBoost...
Training models and saving .pkl files...

--- Model Comparison Table ---
      ML Model Name  Accuracy      AUC  Precision   Recall       F1      MCC
Logistic Regression   0.96875 0.963563   0.950000 1.000000 0.974359 0.936442
      Decision Tree   0.96875 0.961538   0.950000 1.000000 0.974359 0.936442
                kNN   0.84375 0.910931   0.888889 0.842105 0.864865 0.681397
        Naive Bayes   0.87500 0.951417   0.857143 0.947368 0.900000 0.741001
      Random Forest   0.96875 0.995951   0.950000 1.000000 0.974359 0.936442
            XGBoost   0.96875 0.995951   0.950000 1.000000 0.974359 0.936442

Files in 'model' directory: ['decision_tree.pkl', 'knn.pkl', 'logistic_regression.pkl', 'naive_bayes.pkl', 'random_forest.pkl', 'xgboost.pkl']
