In [12]:
import joblib
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix

In [13]:
# Read the processed Telco churn table from its parquet file.
processed_parquet_path = '../data/processed/Telco_customer_churn_ML.parquet'
dataset = pd.read_parquet(processed_parquet_path)

In [14]:
# Label column, kept as a one-element list because later cells pass it
# straight to DataFrame.drop(...) and dataset[...].
target = ['churn_value']

# Columns removed from the frame before it is split into X and y.
columns_to_drop = [
    'customer_id',
    'count',
    'country',
    'state',
    'city',
    'zip_code',
    'lat_long',
    'latitude',
    'longitude',
    'churn_label',
    'churn_reason',
]

# Column groups by type. They are not referenced by any other visible cell;
# presumably they mirror what the saved pipeline expects (TODO confirm).
# NOTE(review): the displayed frame also has 'paperless_billing' and
# 'tenure_bin', which appear in neither list; verify that is intentional.
categorical_features = [
    'gender',
    'senior_citizen',
    'partner',
    'dependents',
    'phone_service',
    'multiple_lines',
    'internet_service',
    'online_security',
    'online_backup',
    'device_protection',
    'tech_support',
    'streaming_tv',
    'streaming_movies',
    'contract',
    'payment_method',
]

# NOTE(review): 'churn_score' looks like it could be derived from the churn
# outcome; confirm it is legitimately available at prediction time.
numerical_features = [
    'tenure_months',
    'monthly_charges',
    'total_charges',
    'cltv',
    'churn_score',
]

In [None]:
# Remove rows with a missing total_charges value, then drop the columns listed
# in columns_to_drop.
# This cell reassigns `dataset`, so it must be safe to run more than once:
# errors='ignore' stops the second run from raising KeyError on columns that
# the first run already removed. The trade-off is that a misspelled name in
# columns_to_drop is skipped silently.
dataset = (
    dataset
    .dropna(subset=['total_charges'])
    .drop(columns=columns_to_drop, errors='ignore')
)

In [None]:
# Preview the first rows only instead of rendering the whole frame.
dataset.head()

Unnamed: 0,gender,senior_citizen,partner,dependents,tenure_months,phone_service,multiple_lines,internet_service,online_security,online_backup,...,streaming_movies,contract,paperless_billing,payment_method,monthly_charges,total_charges,churn_value,churn_score,cltv,tenure_bin
0,Male,No,No,No,2,Yes,No,DSL,Yes,Yes,...,No,Month-to-month,Yes,Mailed check,53.85,108.15,1,86,3239,"(-0.072, 18.0]"
1,Female,No,No,Yes,2,Yes,No,Fiber optic,No,No,...,No,Month-to-month,Yes,Electronic check,70.7,151.65,1,67,2701,"(-0.072, 18.0]"
2,Female,No,No,Yes,8,Yes,Yes,Fiber optic,No,No,...,Yes,Month-to-month,Yes,Electronic check,99.65,820.5,1,86,5372,"(-0.072, 18.0]"
3,Female,No,Yes,Yes,28,Yes,Yes,Fiber optic,No,No,...,Yes,Month-to-month,Yes,Electronic check,104.8,3046.05,1,84,5003,"(18.0, 36.0]"
4,Male,No,No,Yes,49,Yes,Yes,Fiber optic,No,Yes,...,Yes,Month-to-month,Yes,Bank transfer (automatic),103.7,5036.3,1,89,5340,"(36.0, 54.0]"


In [17]:
# `target` is a one-element list, so dataset[target] is a one-column frame;
# squeeze() collapses it into the label vector.
y = dataset[target].squeeze()
# Everything except the label column is used as model input.
X = dataset.drop(columns=target)

In [18]:
# Hold out 20% of the rows, stratified on the label, with a fixed seed so the
# split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

In [None]:
# Load the saved pipeline from disk; it is used below for prediction without
# being fitted in this notebook.
# NOTE(security): joblib.load unpickles the file, which can execute arbitrary
# code. Only load model files that come from a trusted source.
# NOTE(review): if this pipeline was fitted on rows that now land in X_test,
# the metrics printed below are optimistic. Confirm it was trained with the
# same split (test_size=0.2, stratify=y, random_state=42).
pipeline = joblib.load('../models/best_ImbPipeline.pkl')

In [20]:
# Predict labels for the held-out test rows with the loaded pipeline.
y_pred = pipeline.predict(X_test)

In [21]:
# Per-class precision/recall/F1 followed by the raw confusion matrix for the
# held-out test set.
test_report = classification_report(y_test, y_pred)
test_confusion = confusion_matrix(y_test, y_pred)
print(test_report)
print(test_confusion)

              precision    recall  f1-score   support

           0       0.96      0.94      0.95      1033
           1       0.84      0.89      0.86       374

    accuracy                           0.92      1407
   macro avg       0.90      0.91      0.91      1407
weighted avg       0.93      0.92      0.93      1407

[[968  65]
 [ 41 333]]


Prediction on new customers (the input data was generated by AI)

In [23]:
# Read the rows to be scored from CSV.
new_customers_path = '../data/processed/new_customers.csv'
new_data = pd.read_csv(new_customers_path)

In [None]:
# Hard class predictions for the new rows.
new_preds = pipeline.predict(new_data)

# predict_proba returns one column per class, ordered like pipeline.classes_.
# Look up the column for label 1 instead of hard-coding position 1, so a
# different class ordering fails loudly rather than silently returning the
# wrong probability.
churn_class_idx = list(pipeline.classes_).index(1)
new_probs = pipeline.predict_proba(new_data)[:, churn_class_idx]

In [None]:
# Build a separate frame carrying the predictions so that `new_data` itself
# stays untouched; assign() returns a new frame with the two extra columns.
new_data_with_preds = new_data.assign(
    churn_prediction=new_preds,
    churn_probability=new_probs,
)

In [None]:
# Show the first five predictions next to their probabilities.
print(new_data_with_preds.loc[:, ['churn_prediction', 'churn_probability']].head(5))

   churn_prediction  churn_probability
0                 1           0.790000
1                 0           0.026667
2                 0           0.003333
3                 0           0.416667
4                 0           0.003333


In [28]:
# Save the scored rows. index=False keeps the positional index out of the
# file; by default to_csv writes it as an unnamed first column, which comes
# back as 'Unnamed: 0' when the CSV is read again.
new_data_with_preds.to_csv('../reports/predictions.csv', index=False)