In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report

# Read the training and holdout CSVs. The CustomerID column is discarded
# from each frame as soon as it is loaded.
train_data = pd.read_csv("../data/cell2celltrain.csv").drop(columns='CustomerID')
holdout_data = pd.read_csv("../data/cell2cellholdout.csv").drop(columns='CustomerID')

# Handle missing values BEFORE Label Encoding, then convert the categorical
# features to numerical features.
#
# Only feature columns are processed here. 'Churn' is the target: it is left
# untouched so that missing labels (e.g. in an unlabeled holdout file) are
# never replaced by a made-up 'Unknown' class, and so that it is encoded
# exactly once, by the dedicated target encoder after the train/validation
# split.
feature_cols = [col for col in train_data.columns if col != 'Churn']
categorical_cols = [col for col in feature_cols if train_data[col].dtype == 'object']
numeric_feature_cols = [col for col in feature_cols if col not in categorical_cols]

# Categorical features: missing values become their own 'Unknown' category.
for col in categorical_cols:
    train_data[col] = train_data[col].fillna('Unknown')
    holdout_data[col] = holdout_data[col].fillna('Unknown')

# Numeric features: impute BOTH frames with the mean learned from the
# training data. Using the holdout's own mean would make the same missing
# value map to different numbers in train and holdout, and would let holdout
# statistics drive the preprocessing.
for col in numeric_feature_cols:
    train_mean = train_data[col].mean()
    train_data[col] = train_data[col].fillna(train_mean)
    holdout_data[col] = holdout_data[col].fillna(train_mean)

# Label-encode each categorical feature. The encoder is fitted on the values
# of both frames so that a category appearing only in the holdout does not
# raise an "unseen label" error in transform().
for col in categorical_cols:
    encoder = LabelEncoder()
    encoder.fit(pd.concat([train_data[col], holdout_data[col]], axis=0))
    train_data[col] = encoder.transform(train_data[col])
    holdout_data[col] = encoder.transform(holdout_data[col])

# Feature Engineering
# Usage aggregates built from every column whose name contains a given
# substring (assuming such columns exist). When no column matches a
# substring, the corresponding totals and averages are all 0.0.
derived_usage_cols = ('TotalCallDuration', 'TotalDataUsage', 'AvgCallDuration',
                      'AvgDataUsage', 'NumCalls', 'NumDataSessions')

# Resolve the source columns ONCE, from the training frame, before any derived
# column is added. The derived names are excluded explicitly because they
# contain the very substrings being searched for ('TotalCallDuration' matches
# 'CallDuration', etc.); counting them would inflate the averages' denominator
# and make a re-run sum a column into itself. Using one shared list also
# guarantees train and holdout features are built from the same inputs.
usage_sources = {
    pattern: [col for col in train_data.columns
              if pattern in col and col not in derived_usage_cols]
    for pattern in ('CallDuration', 'DataUsage', 'NumCalls', 'DataSession')
}

for frame in (train_data, holdout_data):
    # Total call duration
    frame['TotalCallDuration'] = frame[usage_sources['CallDuration']].sum(axis=1)

    # Total data usage
    frame['TotalDataUsage'] = frame[usage_sources['DataUsage']].sum(axis=1)

    # Averages divide by the number of SOURCE columns only; max(..., 1) keeps
    # the no-match case at 0.0 instead of dividing by zero.
    frame['AvgCallDuration'] = frame['TotalCallDuration'] / max(len(usage_sources['CallDuration']), 1)
    frame['AvgDataUsage'] = frame['TotalDataUsage'] / max(len(usage_sources['DataUsage']), 1)

    # Number of calls
    frame['NumCalls'] = frame[usage_sources['NumCalls']].sum(axis=1)

    # Number of data sessions
    frame['NumDataSessions'] = frame[usage_sources['DataSession']].sum(axis=1)

# New Features
# The same derived columns are added, in the same order, to the training and
# holdout frames. Ratio features add 1e-9 to the denominator (presumably to
# avoid dividing by exactly zero).
for frame in (train_data, holdout_data):
    minutes = frame['MonthlyMinutes'] + 1e-9

    # Revenue left after subtracting the recurring charge.
    frame['AvgMonthlyIncome'] = frame['MonthlyRevenue'] - frame['TotalRecurringCharge']

    # Plain copy of MonthsInService under a second name.
    frame['TenureInMonths'] = frame['MonthsInService']

    # Dropped / blocked calls relative to monthly minutes.
    frame['PctDroppedCalls'] = frame['DroppedCalls'] / minutes
    frame['PctBlockedCalls'] = frame['BlockedCalls'] / minutes

    # Peak versus off-peak call volume.
    frame['RatioPeakToOffPeakCalls'] = frame['PeakCallsInOut'] / (frame['OffPeakCallsInOut'] + 1e-9)

    # Revenue per monthly minute.
    frame['AvgCallPrice'] = frame['MonthlyRevenue'] / minutes

    # Customer-care calls plus director-assisted calls.
    frame['TotalCustomerCareInteractions'] = frame['CustomerCareCalls'] + frame['DirectorAssistedCalls']

# Split the training data into training and validation sets
X = train_data.drop('Churn', axis=1)
y = train_data['Churn']
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Work on explicit copies: the frames returned by train_test_split are row
# selections of X, and assigning scaled columns into them directly can raise
# pandas' SettingWithCopyWarning (the write may target a temporary object).
X_train = X_train.copy()
X_val = X_val.copy()

# Scale numerical features. The scaler is fitted on the training split only
# and then applied unchanged to the validation split.
numerical_cols = X_train.select_dtypes(include=['number']).columns
scaler = StandardScaler()
X_train[numerical_cols] = scaler.fit_transform(X_train[numerical_cols])
X_val[numerical_cols] = scaler.transform(X_val[numerical_cols])

# Encode the target as consecutive integer class ids; fitted on the training
# labels and reused for the validation labels.
le = LabelEncoder()
y_train = le.fit_transform(y_train)
y_val = le.transform(y_val)

# Train a model without the new features
# Names of the engineered columns; any that are absent are skipped by
# errors='ignore' when dropping.
engineered_cols = [
    'TotalCallDuration', 'TotalDataUsage', 'AvgCallDuration', 'AvgDataUsage',
    'NumCalls', 'NumDataSessions', 'AvgMonthlyIncome', 'TenureInMonths',
    'PctDroppedCalls', 'PctBlockedCalls', 'RatioPeakToOffPeakCalls',
    'AvgCallPrice', 'TotalCustomerCareInteractions',
]
X_train_base = X_train.drop(columns=engineered_cols, errors='ignore')
X_val_base = X_val.drop(columns=engineered_cols, errors='ignore')

# Fit the baseline classifier and predict the validation set
model_base = XGBClassifier(random_state=42)
model_base.fit(X_train_base, y_train)
y_pred_base = model_base.predict(X_val_base)

# Score the baseline predictions against the validation labels
accuracy_base = accuracy_score(y_val, y_pred_base)
report_base = classification_report(y_val, y_pred_base)

print(
    "\nModel without new features:",
    f"Accuracy: {accuracy_base}",
    f"Classification Report:\n{report_base}",
    sep="\n",
)

# Train a model with the new features
# Same classifier settings as the baseline, fitted on the full feature set.
model_new = XGBClassifier(random_state=42).fit(X_train, y_train)

# Predict the validation set and score the predictions
y_pred_new = model_new.predict(X_val)
accuracy_new = accuracy_score(y_val, y_pred_new)
report_new = classification_report(y_val, y_pred_new)

print(
    "\nModel with new features:",
    f"Accuracy: {accuracy_new}",
    f"Classification Report:\n{report_new}",
    sep="\n",
)


Model without new features:
Accuracy: 0.7198824681684622
Classification Report:
              precision    recall  f1-score   support

           0       0.75      0.92      0.82      7308
           1       0.52      0.21      0.30      2902

    accuracy                           0.72     10210
   macro avg       0.63      0.57      0.56     10210
weighted avg       0.68      0.72      0.68     10210




Model with new features:
Accuracy: 0.7199804113614103
Classification Report:
              precision    recall  f1-score   support

           0       0.75      0.92      0.83      7308
           1       0.52      0.21      0.29      2902

    accuracy                           0.72     10210
   macro avg       0.63      0.56      0.56     10210
weighted avg       0.68      0.72      0.67     10210

