In [4]:
# Load the dataset (cross_sell_rai.csv)

# Perform preprocessing and feature engineering

# Build an ML model

import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, roc_auc_score, classification_report, confusion_matrix
from imblearn.over_sampling import RandomOverSampler, SMOTE


# Load the raw cross-sell training data.
# NOTE(review): this is an absolute, machine-specific path; the notebook will not
# run elsewhere until it is pointed at a location relative to the project.
df = pd.read_csv('C:/Users/drbha/Downloads/cross_sell_rai (1).csv')

# Encode the three categorical columns as integers.
# `.map` is used for every column (the original used `.replace` for Vehicle_Age):
# `.replace` with a str -> int dict depends on pandas' implicit downcasting, which
# is deprecated, and it silently leaves any unexpected label as a string.
df['Gender'] = df['Gender'].map({'Male': 1, 'Female': 0})
df['Vehicle_Age'] = df['Vehicle_Age'].map({'< 1 Year': 1, '1-2 Year': 2, '> 2 Years': 3})
df['Vehicle_Damage'] = df['Vehicle_Damage'].map({'Yes': 1, 'No': 0})

# `.map` turns labels missing from the dictionaries into NaN; fail here with a
# clear message instead of deep inside SMOTE / model fitting.
encoded_columns = ['Gender', 'Vehicle_Age', 'Vehicle_Damage']
assert df[encoded_columns].notna().all().all(), (
    "Unmapped or missing category in one of: " + ", ".join(encoded_columns)
)


from sklearn.preprocessing import StandardScaler

# Predictors and target. `id` is a row identifier, not a feature.
features = df.drop(['id', 'Response'], axis=1)
target = df['Response']

# Split FIRST. Fitting the scaler or SMOTE on the full data set leaks test
# information into training: the scaler would see test statistics, and SMOTE would
# both interpolate synthetic training rows from test points and place synthetic
# rows in the test set. The test set therefore keeps its original class balance.
X_train, X_test, y_train, y_test = train_test_split(
    features, target, random_state=101, stratify=target
)
# Work on explicit copies so the column assignments below never touch `features`.
X_train = X_train.copy()
X_test = X_test.copy()

# Standardize the continuous columns: fit on the training rows only, then apply
# the same fitted transformation to the test rows.
features_to_standardize = ['Age', 'Annual_Premium', 'Policy_Sales_Channel', 'Vintage']
scaler = StandardScaler()
X_train[features_to_standardize] = scaler.fit_transform(X_train[features_to_standardize])
X_test[features_to_standardize] = scaler.transform(X_test[features_to_standardize])

# Oversample the minority class in the TRAINING data only.
smote = SMOTE(random_state=101)
X_resampled, y_resampled = smote.fit_resample(X_train, y_train)
# Show the class counts after resampling (a bare expression mid-cell displays nothing).
print(y_resampled.value_counts())

# The models are trained on the resampled training set.
X_train, y_train = X_resampled, y_resampled

from sklearn.metrics import roc_auc_score, roc_curve
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.metrics import roc_auc_score, f1_score, roc_curve, precision_score, recall_score

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix

# Keyword arguments forwarded unchanged to LGBMClassifier.
params = dict(
    objective="binary",
    metric="auc",
    max_depth=-1,
    num_leaves=10,
    min_data_in_leaf=20,
    learning_rate=0.03,
    bagging_fraction=0.9,
    feature_fraction=0.35,
    feature_fraction_seed=20,
    bagging_freq=10,
    bagging_seed=30,
    min_child_weight=0.09,
    lambda_l1=0.01,
    verbosity=-1,
)

# Build the four candidate classifiers; the names are reused by later cells.
lgb_model = LGBMClassifier(**params)                                   # LightGBM
lr_model = LogisticRegression(max_iter=1000)                           # Logistic Regression
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)   # Random Forest
xgb_model = XGBClassifier(eval_metric='auc', use_label_encoder=False)  # XGBoost

# Fit every classifier on the same training data, in the original order.
for estimator in (lgb_model, lr_model, rf_model, xgb_model):
    estimator.fit(X_train, y_train)



def model_evaluation(model, model_name):
    """Print evaluation metrics for a fitted classifier and return its AUC and F1.

    The data is read from the notebook-level variables ``X_train``, ``y_train``,
    ``X_test`` and ``y_test``.

    Parameters
    ----------
    model
        Fitted classifier exposing ``predict`` and ``predict_proba``.
    model_name : str
        Label printed as the heading of the metrics block.

    Returns
    -------
    tuple
        ``(val_auc, val_f1)`` computed on ``X_test`` / ``y_test``.
    """
    # Hard labels and positive-class probabilities on the test set.
    # (The original also ran `model.predict(X_train)` but never used the result,
    # costing a full pass over the training set for every model.)
    y_test_pred = model.predict(X_test)
    y_test_prob = model.predict_proba(X_test)[:, 1]

    # Train AUC is reported next to the validation AUC to expose over-fitting.
    train_auc = roc_auc_score(y_train, model.predict_proba(X_train)[:, 1])
    val_auc = roc_auc_score(y_test, y_test_prob)
    val_f1 = f1_score(y_test, y_test_pred)
    val_precision = precision_score(y_test, y_test_pred)
    val_recall = recall_score(y_test, y_test_pred)
    val_accuracy = accuracy_score(y_test, y_test_pred)

    print(f'{model_name} Metrics:')
    print(f'Train AUC: {train_auc:.4f}')
    print(f'Val AUC: {val_auc:.4f}')
    print(f'Val F1: {val_f1:.4f}')
    print(f'Val Precision: {val_precision:.4f}')
    print(f'Val Recall: {val_recall:.4f}')
    print(f'Val Accuracy: {val_accuracy:.4f}')
    print(f'Confusion Matrix:\n{confusion_matrix(y_test, y_test_pred)}\n')

    return val_auc, val_f1

# Evaluate all models
# Evaluate every fitted model in turn; each entry of `results` is
# (display name, validation AUC, validation F1).
fitted_models = [
    ("LightGBM", lgb_model),
    ("Logistic Regression", lr_model),
    ("Random Forest", rf_model),
    ("XGBoost", xgb_model),
]
results = [(label, *model_evaluation(clf, label)) for label, clf in fitted_models]

ImportError: cannot import name '_MissingValues' from 'sklearn.utils._param_validation' (C:\Users\drbha\anaconda3\lib\site-packages\sklearn\utils\_param_validation.py)

In [None]:
import pickle

import lime
import lime.lime_tabular
import shap

# ---------------------------------------------------------------------------
# LIME: local explanation of one test-set prediction
# ---------------------------------------------------------------------------
# Kept under its own name so the SHAP explainer below does not overwrite it.
lime_explainer = lime.lime_tabular.LimeTabularExplainer(
    training_data=X_train.values,
    feature_names=X_train.columns,
    class_names=['Not Interested', 'Interested'],
    mode='classification',
    random_state=101,  # LIME samples perturbations; seed it so re-runs agree
)
# Choose an instance to explain
instance = X_test.iloc[0]

# Explain the instance's prediction
lime_explanation = lime_explainer.explain_instance(
    data_row=instance.values,
    predict_fn=lgb_model.predict_proba
)

# Display the explanation
lime_explanation.show_in_notebook(show_table=True)
lime_explanation.as_pyplot_figure()
plt.show()

# Persist the explanation object, then keep only its (feature, weight) pairs.
with open('lime_explanation.pkl', 'wb') as f:
    pickle.dump(lime_explanation, f)
lime_explanation = lime_explanation.as_list()

# ---------------------------------------------------------------------------
# SHAP: per-feature attribution summary over the test set
# ---------------------------------------------------------------------------
# Initialize the TreeExplainer
explainer = shap.TreeExplainer(lgb_model)

# The original cell used `shap_values` without ever computing it (NameError).
shap_values = explainer.shap_values(X_test)
# Depending on the installed SHAP version a binary classifier can yield a list
# with one array per class, or a 3-D array with a trailing class axis. Reduce
# both to the positive-class attributions so the columns line up with features.
if isinstance(shap_values, list):
    shap_values = shap_values[-1]
shap_values = np.asarray(shap_values)
if shap_values.ndim == 3:
    shap_values = shap_values[..., -1]
shap_values_2d = shap_values.reshape(-1, shap_values.shape[-1])

# Create DataFrame
# NOTE(review): this is the SIGNED mean SHAP value per feature, as in the original;
# a conventional "importance" would be the mean of absolute values — confirm intent.
mean_shap_values = pd.DataFrame(shap_values_2d, columns=X_test.columns).mean().reset_index()
mean_shap_values.columns = ['Feature', 'SHAP Importance']
# Export to CSV
mean_shap_values.to_csv('shap_summary.csv', index=False)

In [None]:
# Read in the holdout dataset
# Read in the holdout dataset
holdout_df = pd.read_csv('cross_sell_holdout.csv')

# Apply the same categorical encoding as the training data. `.map` is used for
# every column so the result is numeric and unknown labels surface as NaN.
holdout_df['Gender'] = holdout_df['Gender'].map({'Male': 1, 'Female': 0})
holdout_df['Vehicle_Age'] = holdout_df['Vehicle_Age'].map({'< 1 Year': 1, '1-2 Year': 2, '> 2 Years': 3})
holdout_df['Vehicle_Damage'] = holdout_df['Vehicle_Damage'].map({'Yes': 1, 'No': 0})

# Standardize with the scaler that was ALREADY FITTED in the training cell.
# Fitting a fresh StandardScaler on the holdout rows (as the original did) scales
# them with different means/variances than the model was trained on, so the
# model would see shifted inputs.
features_to_standardize = ['Age', 'Annual_Premium', 'Policy_Sales_Channel', 'Vintage']
holdout_df[features_to_standardize] = scaler.transform(holdout_df[features_to_standardize])

# Prepare the features for prediction: exactly the training columns, in training
# order. This drops `id` (and any other extra column) and fails loudly with a
# KeyError if an expected feature is missing.
X_holdout = holdout_df[X_train.columns]

# Make predictions using the LightGBM model (or any other model you prefer)
holdout_predictions = lgb_model.predict(X_holdout)

# Create a DataFrame with the predictions
prediction_df = pd.DataFrame({'id': holdout_df['id'], 'Response': holdout_predictions})

# Export the predictions to a CSV file
prediction_df.to_csv('predictions.csv', index=False)

print("Predictions have been saved to predictions.csv")