In [7]:
%pip install xgboost

Note: you may need to restart the kernel to use updated packages.


In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from xgboost import XGBClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, f1_score

# Load Dataset
df = pd.read_csv("data.csv")  # Ensure the correct path

df.drop(columns=['id'], inplace=True)  # Drop ID column

# Encode categorical features
label_encoders = {}
categorical_columns = ['Gender', 'Vehicle_Age', 'Vehicle_Damage']

for col in categorical_columns:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    label_encoders[col] = le

# Define features and target
X = df.drop(columns=['Response'])
y = df['Response']

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Scale numerical features
scaler = StandardScaler()
numerical_columns = ['Age', 'Region_Code', 'Annual_Premium', 'Policy_Sales_Channel', 'Vintage']
X_train[numerical_columns] = scaler.fit_transform(X_train[numerical_columns])
X_test[numerical_columns] = scaler.transform(X_test[numerical_columns])

# Define models
models = {
    "Random Forest": RandomForestClassifier(random_state=42),
    "XGBoost": XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42),
    "Decision Tree": DecisionTreeClassifier(random_state=42),
    "Gradient Boosting": GradientBoostingClassifier(random_state=42)
}

# Hyperparameter grids
param_grids = {
    "Random Forest": {"n_estimators": [50, 100, 200], "max_depth": [10, 20, None]},
    "XGBoost": {"n_estimators": [50, 100, 200], "max_depth": [3, 6, 10], "learning_rate": [0.01, 0.1, 0.2]},
    "Decision Tree": {"max_depth": [5, 10, 20, None], "min_samples_split": [2, 5, 10]},
    "Gradient Boosting": {"n_estimators": [50, 100, 200], "learning_rate": [0.01, 0.1, 0.2], "max_depth": [3, 6, 10]}
}

# Train models with hyperparameter tuning
best_models = {}
model_performance = {}

for name, model in models.items():
    random_search = RandomizedSearchCV(model, param_grids[name], n_iter=5, scoring='f1', cv=3, random_state=42, n_jobs=-1)
    random_search.fit(X_train, y_train)
    
    # Best model from tuning
    best_model = random_search.best_estimator_
    best_models[name] = best_model

    # Evaluate on test set
    y_pred = best_model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    
    model_performance[name] = {"Accuracy": accuracy, "F1 Score": f1, "Best Params": random_search.best_params_}

# Display best model performance
for model, metrics in model_performance.items():
    print(f"{model}: Accuracy = {metrics['Accuracy']:.4f}, F1 Score = {metrics['F1 Score']:.4f}")
    print(f"Best Params: {metrics['Best Params']}")
    print("-" * 60)


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.


Random Forest: Accuracy = 0.8670, F1 Score = 0.1827
Best Params: {'n_estimators': 100, 'max_depth': None}
------------------------------------------------------------
XGBoost: Accuracy = 0.8774, F1 Score = 0.0089
Best Params: {'n_estimators': 50, 'max_depth': 6, 'learning_rate': 0.2}
------------------------------------------------------------
Decision Tree: Accuracy = 0.8240, F1 Score = 0.2983
Best Params: {'min_samples_split': 2, 'max_depth': None}
------------------------------------------------------------
Gradient Boosting: Accuracy = 0.8765, F1 Score = 0.0167
Best Params: {'n_estimators': 50, 'max_depth': 6, 'learning_rate': 0.2}
------------------------------------------------------------


In [6]:
%pip install optuna

Note: you may need to restart the kernel to use updated packages.


In [6]:
import pandas as pd
import numpy as np
import optuna
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from xgboost import XGBClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, f1_score
from imblearn.over_sampling import SMOTE

# Load Dataset
df = pd.read_csv("data.csv")  # Ensure the correct path

df.drop(columns=['id'], inplace=True)  # Drop ID column

# Encode categorical features
label_encoders = {}
categorical_columns = ['Gender', 'Vehicle_Age', 'Vehicle_Damage']

for col in categorical_columns:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    label_encoders[col] = le

# Define features and target
X = df.drop(columns=['Response'])
y = df['Response']

# Balance dataset using SMOTE
smote = SMOTE(random_state=42)
X, y = smote.fit_resample(X, y)

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Scale numerical features
scaler = StandardScaler()
numerical_columns = ['Age', 'Region_Code', 'Annual_Premium', 'Policy_Sales_Channel', 'Vintage']
X_train[numerical_columns] = scaler.fit_transform(X_train[numerical_columns])
X_test[numerical_columns] = scaler.transform(X_test[numerical_columns])

# Define models
models = {
    "Random Forest": RandomForestClassifier,
    "XGBoost": XGBClassifier,
    "Decision Tree": DecisionTreeClassifier,
    "Gradient Boosting": GradientBoostingClassifier
}

# Optuna objective function
def objective(trial, model_name):
    params = {}
    if model_name == "Random Forest":
        params = {
            "n_estimators": trial.suggest_int("rf_n_estimators", 50, 500, step=50),
            "max_depth": trial.suggest_int("rf_max_depth", 5, 50, step=5),
            "min_samples_split": trial.suggest_int("rf_min_samples_split", 2, 50, step=2),
            "min_samples_leaf": trial.suggest_int("rf_min_samples_leaf", 1, 20, step=1),
            "max_features": trial.suggest_categorical("rf_max_features", ["sqrt", "log2", None])
        }
    elif model_name == "XGBoost":
        params = {
            "n_estimators": trial.suggest_int("xgb_n_estimators", 50, 500, step=50),
            "max_depth": trial.suggest_int("xgb_max_depth", 3, 20, step=2),
            "learning_rate": trial.suggest_float("xgb_learning_rate", 0.01, 0.5, log=True),
            "subsample": trial.suggest_float("xgb_subsample", 0.5, 1.0, step=0.1),
            "colsample_bytree": trial.suggest_float("xgb_colsample_bytree", 0.5, 1.0, step=0.1)
        }
    elif model_name == "Decision Tree":
        params = {
            "max_depth": trial.suggest_int("dt_max_depth", 5, 50, step=5),
            "min_samples_split": trial.suggest_int("dt_min_samples_split", 2, 50, step=2),
            "min_samples_leaf": trial.suggest_int("dt_min_samples_leaf", 1, 20, step=1),
            "max_features": trial.suggest_categorical("dt_max_features", ["sqrt", "log2", None])
        }
    elif model_name == "Gradient Boosting":
        params = {
            "n_estimators": trial.suggest_int("gb_n_estimators", 50, 500, step=50),
            "learning_rate": trial.suggest_float("gb_learning_rate", 0.01, 0.5, log=True),
            "max_depth": trial.suggest_int("gb_max_depth", 3, 20, step=2),
            "subsample": trial.suggest_float("gb_subsample", 0.5, 1.0, step=0.1),
            "min_samples_split": trial.suggest_int("gb_min_samples_split", 2, 50, step=2),
            "min_samples_leaf": trial.suggest_int("gb_min_samples_leaf", 1, 20, step=1)
        }
    
    model = models[model_name](**params)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    return f1_score(y_test, y_pred)

# Run Optuna optimization
best_models = {}
model_performance = {}

for model_name in models.keys():
    study = optuna.create_study(direction="maximize")
    study.optimize(lambda trial: objective(trial, model_name), n_trials=2)
    
    best_params = {key.split('_', 1)[-1]: value for key, value in study.best_params.items()}
    best_model = models[model_name](**best_params)
    best_model.fit(X_train, y_train)
    
    y_pred = best_model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    
    best_models[model_name] = best_model
    model_performance[model_name] = {"Accuracy": accuracy, "F1 Score": f1, "Best Params": best_params}

# Display best model performance
for model, metrics in model_performance.items():
    print(f"{model}: Accuracy = {metrics['Accuracy']:.4f}, F1 Score = {metrics['F1 Score']:.4f}")
    print(f"Best Params: {metrics['Best Params']}")
    print("-" * 60)

[I 2025-04-02 13:51:50,582] A new study created in memory with name: no-name-322eef11-5ff4-4f39-a08c-c230e7786eb4
[I 2025-04-02 13:54:45,007] Trial 0 finished with value: 0.8757370950017597 and parameters: {'rf_n_estimators': 300, 'rf_max_depth': 45, 'rf_min_samples_split': 16, 'rf_min_samples_leaf': 7, 'rf_max_features': 'log2'}. Best is trial 0 with value: 0.8757370950017597.
[I 2025-04-02 13:56:38,968] Trial 1 finished with value: 0.8726142475524776 and parameters: {'rf_n_estimators': 200, 'rf_max_depth': 45, 'rf_min_samples_split': 26, 'rf_min_samples_leaf': 8, 'rf_max_features': 'sqrt'}. Best is trial 0 with value: 0.8757370950017597.
[I 2025-04-02 13:59:32,527] A new study created in memory with name: no-name-f11ac1ce-ef02-43db-82d9-0513cfe0734a
[I 2025-04-02 13:59:34,183] Trial 0 finished with value: 0.8688490925442044 and parameters: {'xgb_n_estimators': 100, 'xgb_max_depth': 13, 'xgb_learning_rate': 0.028366715675972178, 'xgb_subsample': 0.5, 'xgb_colsample_bytree': 0.8}. Best

Random Forest: Accuracy = 0.8709, F1 Score = 0.8758
Best Params: {'n_estimators': 300, 'max_depth': 45, 'min_samples_split': 16, 'min_samples_leaf': 7, 'max_features': 'log2'}
------------------------------------------------------------
XGBoost: Accuracy = 0.8774, F1 Score = 0.8791
Best Params: {'n_estimators': 400, 'max_depth': 9, 'learning_rate': 0.15387091494428515, 'subsample': 0.7, 'colsample_bytree': 0.5}
------------------------------------------------------------
Decision Tree: Accuracy = 0.8481, F1 Score = 0.8515
Best Params: {'max_depth': 35, 'min_samples_split': 4, 'min_samples_leaf': 9, 'max_features': 'log2'}
------------------------------------------------------------
Gradient Boosting: Accuracy = 0.8822, F1 Score = 0.8859
Best Params: {'n_estimators': 100, 'learning_rate': 0.02416172643766952, 'max_depth': 17, 'subsample': 0.8, 'min_samples_split': 22, 'min_samples_leaf': 18}
------------------------------------------------------------
