In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from xgboost import XGBClassifier
from imblearn.over_sampling import RandomOverSampler
from imblearn.pipeline import Pipeline
from sklearn.metrics import classification_report, f1_score

In [3]:
customer_id = list(range(10000))
age = np.random.randint(low=18, high=100, size=len(customer_id))
gender = np.random.choice(['Male', 'Female'], size=len(customer_id))
location = np.random.choice(['Urban', 'Suburban', 'Rural'], size=len(customer_id))
income = np.random.normal(loc=50000, scale=20000, size=len(customer_id))
credit_score = np.random.randint(low=300, high=850, size=len(customer_id))

customer_df = pd.DataFrame({'customer_id': customer_id, 'age': age, 'gender': gender, 'location': location, 'income': income, 'credit_score': credit_score})


In [4]:
monthly_usage = np.random.normal(loc=500, scale=200, size=len(customer_id))
average_session_time = np.random.normal(loc=10, scale=2, size=len(customer_id))
num_sessions_per_month = np.random.normal(loc=50, scale=10, size=len(customer_id))

usage_df = pd.DataFrame({'customer_id': customer_id, 'monthly_usage': monthly_usage, 'average_session_time': average_session_time, 'num_sessions_per_month': num_sessions_per_month})


In [5]:
customer_service_calls = np.random.randint(low=0, high=10, size=len(customer_id))
churn = np.random.binomial(1, 0.1, size=len(customer_id))



In [6]:
service_df = pd.DataFrame({'customer_id': customer_id, 'customer_service_calls': customer_service_calls, 'churn': churn})


In [7]:
df = pd.merge(customer_df, usage_df, on='customer_id')
df = pd.merge(df, service_df, on='customer_id')

In [8]:
df['total_usage_time'] = df['monthly_usage'] * df['num_sessions_per_month']


In [9]:
df = pd.get_dummies(df, columns=['gender', 'location'])



In [14]:
X = df.drop(['customer_id', 'churn'], axis=1)
y = df['churn']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)


In [15]:
ros = RandomOverSampler(random_state=42)
pipeline = Pipeline([('ros', ros), ('xgb', XGBClassifier())])
params = {
    'xgb__learning_rate': [0.01, 0.1, 1],
    'xgb__max_depth': [3, 5, 7],
    'xgb__n_estimators': [50, 100, 150],
    'xgb__scale_pos_weight': [1, 3, 5]
}

clf = GridSearchCV(pipeline, param_grid=params, scoring='f1', cv=5, n_jobs=-1)
clf.fit(X_train, y_train)

In [16]:
y_pred = clf.predict(X_test)
print("XGBoost Classification Report:")
print(classification_report(y_test, y_pred))
print("F1 Score:", f1_score(y_test, y_pred))

XGBoost Classification Report:
              precision    recall  f1-score   support

           0       0.84      0.02      0.03      2700
           1       0.10      0.97      0.18       300

    accuracy                           0.11      3000
   macro avg       0.47      0.49      0.11      3000
weighted avg       0.77      0.11      0.05      3000

F1 Score: 0.17940813810110975
