# Predictive Challenge: Upsell Campaign

This notebook demonstrates the process for building predictive models to identify customers likely to accept upsell offers.
The steps include:
- Data loading and preparation
- Model implementation with hyperparameter tuning
- Evaluation of models using validation metrics
- Predictions on the test dataset



# Data loading and preparation


In [None]:
# Import libraries
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import f1_score, roc_auc_score, classification_report

# Locations of the labelled training data and the unlabelled test data.
train_data_path, test_data_path = (
    'upsell_train_corrected.csv',
    'upsell_test_corrected_WITHOUT_TARGET.csv',
)

# NOTE(review): the training file is parsed with the default delimiter while
# the test file is parsed with ';' - confirm both files really differ.
train_df = pd.read_csv(train_data_path)
test_df = pd.read_csv(test_data_path, sep=';')

We now handle missing values, encode categorical variables, and standardize numerical features.

In [None]:
# Separate features and target in the training set.
X = train_df.drop(columns=['upsell'])
y = train_df['upsell']

# Build the test feature matrix as a new frame instead of transforming
# `test_df` in place. Re-running this cell then always starts from the raw
# test data (transforming already-encoded columns again would fail), and
# `test_df['Id']` stays available for the export step.
test_features = test_df.drop(columns=['Id'])

# Encode categorical variables. The encoders are fitted on every training row:
# the category -> integer mapping uses neither the target nor any statistic of
# the data, and fitting on all rows avoids unseen-label errors in validation.
categorical_columns = ['subscription_type', 'region', 'device_type']
encoders = {col: LabelEncoder() for col in categorical_columns}
for col in categorical_columns:
    X[col] = encoders[col].fit_transform(X[col])
    test_features[col] = encoders[col].transform(test_features[col])

# Split BEFORE fitting the imputation medians and the scaler, so statistics of
# the validation rows cannot leak into preprocessing. The split depends only on
# the number of rows and random_state, so the row assignment is the same as
# splitting after preprocessing.
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_val = X_train.copy(), X_val.copy()  # own the data before assigning columns

# Handle missing values with medians learned from the training split only.
impute_columns = ['income', 'engagement_score']
train_medians = X_train[impute_columns].median()
for frame in (X_train, X_val, test_features):
    frame[impute_columns] = frame[impute_columns].fillna(train_medians)

# Standardize numerical variables with a scaler fitted on the training split only.
scaler = StandardScaler()
numerical_columns = ['age', 'income', 'account_age', 'clicks_last_month',
                     'promo_clicks', 'engagement_score', 'last_login_days',
                     'previous_upsell_attempts', 'average_session_time']

scaler.fit(X_train[numerical_columns])
for frame in (X_train, X_val, test_features):
    frame[numerical_columns] = scaler.transform(frame[numerical_columns])

# Model Implementation and Hyperparameter Tuning  
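Each model is trained on the training split. Where a grid search is used (currently only KNN), hyperparameters are tuned with cross-validation. Every model is then evaluated on the validation split using F1-score, AUC, and the capture rate in the top 10% of scored customers.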



In [None]:
# Calculate Capture Rate
def calculate_capture_rate(y_true, y_pred_prob, top_fraction=0.1):
    """Return the share of positives among the highest-scored rows.

    Rows are ranked by predicted probability (descending) and the top
    ``top_fraction`` of them is kept; the result is the sum of ``y_true``
    over those rows divided by the number of rows kept.

    Parameters
    ----------
    y_true : sequence of 0/1 labels (list, array or Series).
    y_pred_prob : sequence of scores, same length as ``y_true``.
    top_fraction : float in (0, 1], share of rows to keep (default 0.1).

    Returns
    -------
    float
        Proportion of positives in the selected rows, or NaN for empty input.

    Raises
    ------
    ValueError
        If ``top_fraction`` is outside (0, 1] or the inputs differ in length.
    """
    if not 0 < top_fraction <= 1:
        raise ValueError("top_fraction must be in the interval (0, 1]")

    n_rows = len(y_true)
    if len(y_pred_prob) != n_rows:
        raise ValueError("y_true and y_pred_prob must have the same length")
    if n_rows == 0:
        return float('nan')

    # Pair labels and scores by position. Stripping any pandas index first
    # prevents index alignment from matching a label with the wrong score.
    results = pd.DataFrame({
        'y_true': pd.Series(y_true).to_numpy(),
        'y_pred_prob': pd.Series(y_pred_prob).to_numpy(),
    })

    # Stable sort: rows with equal scores keep their input order, so the
    # result is deterministic for models that emit many tied probabilities.
    results = results.sort_values(by='y_pred_prob', ascending=False, kind='mergesort')

    # Keep at least one row so small inputs do not cause a division by zero.
    top_count = max(1, int(n_rows * top_fraction))
    top_results = results.head(top_count)

    return float(top_results['y_true'].sum() / top_count)

# Maps a model name to its (predicted classes, predicted probabilities) on the test set.
model_predictions = dict()

## **Logistic Regression**

In [None]:
# Fit a logistic regression on the training split (`fit` returns the estimator).
logit_model = LogisticRegression(max_iter=1000, random_state=42).fit(X_train, y_train)

# Score the validation split: hard labels for F1, class-1 probabilities for AUC.
logit_val_prob = logit_model.predict_proba(X_val)[:, 1]
logit_val_pred = logit_model.predict(X_val)
print(
    "Logistic Regression Validation Metrics",
    f"F1 Score: {f1_score(y_val, logit_val_pred)}",
    f"AUC: {roc_auc_score(y_val, logit_val_prob)}",
    sep="\n",
)

# Share of positives among the top-scored validation rows.
logit_capture_rate = calculate_capture_rate(y_val, logit_val_prob)
print(f"Logistic Regression Capture Rate: {logit_capture_rate}")

# Score the test set and register the results for the export step.
y_test_pred_logit = logit_model.predict(test_features)
y_test_prob_logit = logit_model.predict_proba(test_features)[:, 1]
model_predictions['Logistic Regression'] = (y_test_pred_logit, y_test_prob_logit)



Logistic Regression Validation Metrics
F1 Score: 0.29991980753809144
AUC: 0.7311637840317577
Logistic Regression Capture Rate: 0.644


## **K-Nearest Neighbors**

In [None]:
# Grid-search the number of neighbours with 5-fold cross-validation, scored by F1.
knn_params = {'n_neighbors': [3, 5, 7]}
knn_model = KNeighborsClassifier()
grid_knn = GridSearchCV(estimator=knn_model, param_grid=knn_params, scoring='f1', cv=5)
grid_knn.fit(X_train, y_train)

# Evaluate the best estimator found by the search on the validation split.
best_knn = grid_knn.best_estimator_
knn_val_prob = best_knn.predict_proba(X_val)[:, 1]
knn_val_pred = best_knn.predict(X_val)
print(
    "KNN Validation Metrics",
    f"F1 Score: {f1_score(y_val, knn_val_pred)}",
    f"AUC: {roc_auc_score(y_val, knn_val_prob)}",
    sep="\n",
)

# Share of positives among the top-scored validation rows.
knn_capture_rate = calculate_capture_rate(y_val, knn_val_prob)
print(f"KNN Capture Rate: {knn_capture_rate}")

# Score the test set and register the results for the export step.
y_test_pred_knn = best_knn.predict(test_features)
y_test_prob_knn = best_knn.predict_proba(test_features)[:, 1]
model_predictions['KNN'] = (y_test_pred_knn, y_test_prob_knn)

KNN Validation Metrics
F1 Score: 0.5453474676089517
AUC: 0.7747786499123716
KNN Capture Rate: 0.748


## **Decision Tree & Random Forest**

In [None]:
# Fit a single decision tree on the training split (`fit` returns the estimator).
dt_model = DecisionTreeClassifier(random_state=42).fit(X_train, y_train)

# Score the validation split: hard labels for F1, class-1 probabilities for AUC.
dt_val_prob = dt_model.predict_proba(X_val)[:, 1]
dt_val_pred = dt_model.predict(X_val)
print(
    "Decision Tree Validation Metrics",
    f"F1 Score: {f1_score(y_val, dt_val_pred)}",
    f"AUC: {roc_auc_score(y_val, dt_val_prob)}",
    sep="\n",
)

# Share of positives among the top-scored validation rows.
dt_capture_rate = calculate_capture_rate(y_val, dt_val_prob)
print(f"Decision Tree Capture Rate: {dt_capture_rate}")

# Score the test set and register the results for the export step.
y_test_pred_dt = dt_model.predict(test_features)
y_test_prob_dt = dt_model.predict_proba(test_features)[:, 1]
model_predictions['Decision Tree'] = (y_test_pred_dt, y_test_prob_dt)

# Random Forest
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)

# Score the validation split first. The capture rate below needs rf_val_prob,
# which was previously assigned only after its first use and therefore raised
# a NameError on a fresh kernel.
rf_val_pred = rf_model.predict(X_val)
rf_val_prob = rf_model.predict_proba(X_val)[:, 1]

# Capture Rate for Random Forest
rf_capture_rate = calculate_capture_rate(y_val, rf_val_prob)
print(f"Random Forest Capture Rate: {rf_capture_rate}")

# Validation Metrics
print("Random Forest Validation Metrics")
print(f"F1 Score: {f1_score(y_val, rf_val_pred)}")
print(f"AUC: {roc_auc_score(y_val, rf_val_prob)}")

# Test Predictions
y_test_prob_rf = rf_model.predict_proba(test_features)[:, 1]
y_test_pred_rf = rf_model.predict(test_features)
model_predictions['Random Forest'] = (y_test_pred_rf, y_test_prob_rf)

Decision Tree Validation Metrics
F1 Score: 0.6034725480994838
AUC: 0.7521996801346964
Decision Tree Capture Rate: 0.586
Random Forest Capture Rate: 0.95
Random Forest Validation Metrics
F1 Score: 0.6845878136200717
AUC: 0.9207704166298922


## **Support Vector Machine (SVM)**





In [None]:
# Fit an SVM; probability=True is required for the predict_proba calls below.
svm_model = SVC(probability=True, random_state=42).fit(X_train, y_train)

# Score the validation split: hard labels for F1, class-1 probabilities for AUC.
svm_val_prob = svm_model.predict_proba(X_val)[:, 1]
svm_val_pred = svm_model.predict(X_val)
print(
    "SVM Validation Metrics",
    f"F1 Score: {f1_score(y_val, svm_val_pred)}",
    f"AUC: {roc_auc_score(y_val, svm_val_prob)}",
    sep="\n",
)

# Share of positives among the top-scored validation rows.
svm_capture_rate = calculate_capture_rate(y_val, svm_val_prob)
print(f"SVM Capture Rate: {svm_capture_rate}")

# Score the test set and register the results for the export step.
y_test_pred_svm = svm_model.predict(test_features)
y_test_prob_svm = svm_model.predict_proba(test_features)[:, 1]
model_predictions['SVM'] = (y_test_pred_svm, y_test_prob_svm)

SVM Validation Metrics
F1 Score: 0.5867549668874172
AUC: 0.8987745851595301
SVM Capture Rate: 0.936


## **Artificial Neural Network (ANN)**

In [None]:
# Fit a multi-layer perceptron with one hidden layer of 100 units.
ann_model = MLPClassifier(
    hidden_layer_sizes=(100,),
    max_iter=500,
    random_state=42,
).fit(X_train, y_train)

# Score the validation split: hard labels for F1, class-1 probabilities for AUC.
ann_val_prob = ann_model.predict_proba(X_val)[:, 1]
ann_val_pred = ann_model.predict(X_val)
print(
    "ANN Validation Metrics",
    f"F1 Score: {f1_score(y_val, ann_val_pred)}",
    f"AUC: {roc_auc_score(y_val, ann_val_prob)}",
    sep="\n",
)

# Share of positives among the top-scored validation rows.
ann_capture_rate = calculate_capture_rate(y_val, ann_val_prob)
print(f"ANN Capture Rate: {ann_capture_rate}")

# Score the test set and register the results for the export step.
y_test_pred_ann = ann_model.predict(test_features)
y_test_prob_ann = ann_model.predict_proba(test_features)[:, 1]
model_predictions['ANN'] = (y_test_pred_ann, y_test_prob_ann)

ANN Validation Metrics
F1 Score: 0.7509025270758123
AUC: 0.9300503171981428
ANN Capture Rate: 0.956


# Predictions on the test dataset
We export predictions for each model to separate sheets in a single Excel file.


In [None]:
# Write one worksheet per model into a single Excel workbook.
output_file = 'Upsell_Predictions.xlsx'
with pd.ExcelWriter(output_file) as writer:
    for model_name, (pred_class, pred_prob) in model_predictions.items():
        # Start from the test-set identifiers, then attach this model's outputs.
        results = pd.DataFrame({'ID': test_df['Id']})
        results['Predicted Class'] = pred_class
        results['Predicted Probability'] = pred_prob
        results.to_excel(writer, sheet_name=model_name, index=False)

print(f"Predictions saved to {output_file}")

Predictions saved to Upsell_Predictions.xlsx
