In [None]:
import os
from pathlib import Path

import pandas as pd

# Location of the Telco customer-churn CSV.
# Set the TELCO_CHURN_CSV environment variable to point at your own copy of the
# file; when it is not set, the original author's local path is used so that
# existing runs keep working unchanged.
DATA_PATH = Path(os.environ.get(
    "TELCO_CHURN_CSV",
    "C:/Users/anjan/OneDrive/Desktop/MSC Project reference papers/WA_Fn-UseC_-Telco-Customer-Churn.csv",
))

# Load the dataset and preview the first five rows
df = pd.read_csv(DATA_PATH)
df.head()

In [None]:
# Dimensions of the loaded frame as (rows, columns)
df.shape

In [None]:
# Checking null values: preview the boolean missing-value mask for the first
# five rows (True marks a missing cell)
df.isnull().head()

In [None]:
# Number of missing values in each column
df.isnull().sum()

In [None]:
# Checking duplicates: number of rows that exactly repeat an earlier row
df.duplicated().sum()

In [None]:
# Data type pandas assigned to each column
df.dtypes

In [None]:
# Type conversion
# Treat 'SeniorCitizen' as a categorical label rather than a number
df['SeniorCitizen'] = df['SeniorCitizen'].astype('category')
# Parse 'TotalCharges' as numeric; errors='coerce' turns any value that cannot
# be parsed into NaN instead of raising
df['TotalCharges'] = pd.to_numeric(df['TotalCharges'], errors = 'coerce')
# Confirm the new dtypes of the two converted columns
df[['SeniorCitizen', 'TotalCharges']].dtypes

In [None]:
# Checking whether the numeric conversion created any missing values in 'TotalCharges'
df['TotalCharges'].isnull().sum()

In [None]:
# Dropping the rows where the column 'TotalCharges' has null values
df = df.dropna(subset=['TotalCharges'])
# Sanity check: the number of nulls left in 'TotalCharges' after the drop
df['TotalCharges'].isnull().sum()

In [None]:
# Dropping the identifier column 'customerID' as it is not useful as a feature.
# errors='ignore' keeps this cell safe to re-run: without it, a second execution
# raises KeyError because the column has already been removed.
df = df.drop(columns=['customerID'], errors='ignore')
df.head()


In [None]:
# Finding categorical columns: names of the columns with object or category dtype
df.select_dtypes(include=['object', 'category']).columns


In [None]:
# One-hot encoding: replace the categorical columns with indicator columns.
# drop_first=True drops the first level of every encoded column, so a
# two-level column collapses into a single indicator.
df_model = pd.get_dummies(df, drop_first=True)
df_model.head()

In [None]:
# Train/test split
from sklearn.model_selection import train_test_split

# Separate the predictors from the churn indicator
X = df_model.drop(columns='Churn_Yes')  # every column except the target
y = df_model['Churn_Yes']               # target

# Hold out 20% of the rows for testing; stratifying on y keeps the class
# proportions the same in both parts, and the fixed seed makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)


In [None]:
# Relative frequency of each target class (normalize=True returns shares, not counts)
y.value_counts(normalize=True)

In [None]:
# from collections import Counter
# from imblearn.over_sampling import SMOTE
# print('Original dataset shape %s' % Counter(y_train))
# sm = SMOTE(random_state=42)
# X_train_sm, y_train_sm = sm.fit_resample(X_train, y_train)
# print('Resampled dataset shape %s' % Counter(y_train_sm))

In [None]:
#Installing XgBoost
!pip install xgboost

In [None]:
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, confusion_matrix

# Initialize the baseline model: default hyperparameters, log-loss as the
# evaluation metric and a fixed seed for repeatability.
# `use_label_encoder` is intentionally not passed: it is deprecated, and recent
# XGBoost releases ignore it and warn that the parameter is unused.
xgb = XGBClassifier(eval_metric='logloss', random_state=42)

# Train on the training split
xgb.fit(X_train, y_train)

# Predict class labels for the held-out test split
y_pred = xgb.predict(X_test)

# Evaluate: confusion matrix followed by per-class precision / recall / F1
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

In [None]:
# #SMOTE:# Initialize model
# xgb = XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42)

# # Train
# xgb.fit(X_train_sm, y_train_sm)

# # Predict
# y_pred = xgb.predict(X_test)

# # Evaluate
# print(confusion_matrix(y_test, y_pred))
# print(classification_report(y_test, y_pred))

In [None]:
from sklearn.metrics import accuracy_score

# Share of test rows whose predicted label equals the true label
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f"Test Accuracy: {accuracy:.2f}")

In [None]:
# import numpy as np
# importance_scores = xgb.feature_importances_
# indices = np.argsort(importance_scores)[::-1]
# for f in range(15):
#     print("%d. feature %d (%f)" % (f + 1, indices[f], importance_scores[indices[f]]))
# top_feature_names = [X_train.columns[i] for i in indices[:20]][::-1]

In [None]:
# Show the feature names the fitted model recorded from the training frame
print(xgb.feature_names_in_)

In [None]:

# import matplotlib.pyplot as plt
# plt.figure(figsize=(8, 6))
# plt.barh(range(20), importance_scores[indices[:20][::-1]])  
# plt.yticks(range(20), top_feature_names)
# plt.xlabel("Importance Score")
# plt.title("Top 20 Feature Importances")
# plt.tight_layout()
# plt.show()

In [None]:
#top_features = [X_train.columns[i] for i in indices[:15]]  

# Filter datasets to keep only top features
#X_train_top = X_train[top_features]
#X_test_top = X_test[top_features]

In [None]:
#from imblearn.over_sampling import SMOTE
#sm = SMOTE(random_state=42)
#X_train_top_sm, y_train_sm = sm.fit_resample(X_train_top, y_train)


In [None]:
import numpy as np
from sklearn.metrics import accuracy_score, classification_report
from xgboost import XGBClassifier

# Step 1: Calculate scale_pos_weight from the actual class imbalance.
# np.bincount returns the count of label 0 followed by the count of label 1,
# so the ratio below is (number of negatives) / (number of positives).
neg, pos = np.bincount(y_train)
scale = neg / pos

# Step 2: Rebuild the model with scale_pos_weight so errors on the positive
# class weigh more during training.
# `use_label_encoder` is intentionally not passed: it is deprecated, and recent
# XGBoost releases ignore it and warn that the parameter is unused.
xgb = XGBClassifier(
    n_estimators=200,
    max_depth=5,
    learning_rate=0.05,
    subsample=0.9,
    colsample_bytree=0.9,
    scale_pos_weight=scale,
    eval_metric='logloss',
    random_state=42
)

# Step 3: Fit on original X_train (NOT SMOTE)
xgb.fit(X_train, y_train)

# Step 4: Predict and evaluate on the held-out test split
y_pred = xgb.predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))


In [None]:
# Predicted probability of the positive class (column 1 of predict_proba)
y_proba = xgb.predict_proba(X_test)[:, 1]

# Try several decision thresholds: a row is labelled 1 when its probability is
# at least `thresh`, and the resulting test accuracy is printed for each.
# NOTE(review): the thresholds are compared on the test set, so an accuracy
# picked from this loop is optimistic — consider a separate validation split.
for thresh in [0.45, 0.5, 0.55, 0.88, 0.85, 0.80, 0.78, 0.79]:
    y_pred_thresh = (y_proba >= thresh).astype(int)
    print(f"\nThreshold: {thresh}")
    print("Accuracy:", accuracy_score(y_test, y_pred_thresh))


In [None]:
# Label a row 1 when its predicted probability is at least 0.80, then print
# the per-class precision / recall / F1 for that threshold
y_pred_08 = (y_proba >= 0.80).astype(int)
print(classification_report(y_test, y_pred_08))