In [1]:
from datasets import load_dataset
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score

In [2]:
ds = load_dataset("aai510-group1/telco-customer-churn")

In [3]:
ds['train']

Dataset({
    features: ['Age', 'Avg Monthly GB Download', 'Avg Monthly Long Distance Charges', 'Churn', 'Churn Category', 'Churn Reason', 'Churn Score', 'City', 'CLTV', 'Contract', 'Country', 'Customer ID', 'Customer Status', 'Dependents', 'Device Protection Plan', 'Gender', 'Internet Service', 'Internet Type', 'Lat Long', 'Latitude', 'Longitude', 'Married', 'Monthly Charge', 'Multiple Lines', 'Number of Dependents', 'Number of Referrals', 'Offer', 'Online Backup', 'Online Security', 'Paperless Billing', 'Partner', 'Payment Method', 'Phone Service', 'Population', 'Premium Tech Support', 'Quarter', 'Referred a Friend', 'Satisfaction Score', 'Senior Citizen', 'State', 'Streaming Movies', 'Streaming Music', 'Streaming TV', 'Tenure in Months', 'Total Charges', 'Total Extra Data Charges', 'Total Long Distance Charges', 'Total Refunds', 'Total Revenue', 'Under 30', 'Unlimited Data', 'Zip Code'],
    num_rows: 4225
})

In [4]:
pd.DataFrame(ds['train']).isna().sum()

Age                                     0
Avg Monthly GB Download                 0
Avg Monthly Long Distance Charges       0
Churn                                   0
Churn Category                       3104
Churn Reason                         3104
Churn Score                             0
City                                    0
CLTV                                    0
Contract                                0
Country                                 0
Customer ID                             0
Customer Status                         0
Dependents                              0
Device Protection Plan                  0
Gender                                  0
Internet Service                        0
Internet Type                         886
Lat Long                                0
Latitude                                0
Longitude                               0
Married                                 0
Monthly Charge                          0
Multiple Lines                    

In [7]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

train_df = pd.DataFrame(ds["train"])
test_df  = pd.DataFrame(ds["test"])

# drop useless / leakage columns
drop_cols = ["Customer ID", "Churn Category", "Churn Reason", "Lat Long","Customer Status", "Churn Score", "Satisfaction Score"]
train_df = train_df.drop(columns=drop_cols)
test_df  = test_df.drop(columns=drop_cols)

# fill missing values
train_df = train_df.fillna("None")
test_df  = test_df.fillna("None")

# encode categoricals safely (handles unseen labels)
cat_cols = train_df.select_dtypes(include="object").columns

for col in cat_cols:
    all_vals = pd.concat([train_df[col], test_df[col]], axis=0).astype(str).unique()
    mapping = {v: i for i, v in enumerate(all_vals)}
    train_df[col] = train_df[col].astype(str).map(mapping)
    test_df[col]  = test_df[col].astype(str).map(mapping)

X_train = train_df.drop("Churn", axis=1)
y_train = train_df["Churn"]
X_test  = test_df.drop("Churn", axis=1)
y_test  = test_df["Churn"]

# model
model = XGBClassifier(
    n_estimators=200,
    max_depth=4,
    learning_rate=0.1,
    eval_metric="logloss",
    random_state=42
)

model.fit(X_train, y_train)

# evaluate ON TEST (NOT TRAIN)
y_pred = model.predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))


Accuracy: 0.8601845280340668
[[959  76]
 [121 253]]
              precision    recall  f1-score   support

           0       0.89      0.93      0.91      1035
           1       0.77      0.68      0.72       374

    accuracy                           0.86      1409
   macro avg       0.83      0.80      0.81      1409
weighted avg       0.86      0.86      0.86      1409

