In [102]:
# 1. Import libraries

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix


In [103]:
# 2. Load and merge datasets

telco_demog = pd.read_csv('telecom_demographics.csv')
telco_usage = pd.read_csv('telecom_usage.csv')

churn_df = telco_demog.merge(telco_usage, on='customer_id')

In [104]:
# 3. Define features and target

numeric_features = ['age', 'num_dependents', 'estimated_salary',
                    'calls_made', 'sms_sent', 'data_used']

categorical_features = ['telecom_partner', 'gender', 'state', 'city', 'registration_event']

X = churn_df[numeric_features + categorical_features]
y = churn_df['churn']

In [105]:
# 4. Train-test split (stratify to keep churn ratio)

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

In [106]:
# 5. Preprocessing pipeline

preprocessor = ColumnTransformer(transformers=[
    ('num', StandardScaler(), numeric_features),
    ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
])

In [107]:
# 6. Modeling pipelines

logreg_pipeline = Pipeline([
    ('preprocess', preprocessor),
    ('classifier', LogisticRegression(random_state=42, max_iter=1000))
])

rf_pipeline = Pipeline([
    ('preprocess', preprocessor),
    ('classifier', RandomForestClassifier(random_state=42))
])

In [108]:
# 7. Train models

logreg_pipeline.fit(X_train, y_train)
rf_pipeline.fit(X_train, y_train)

In [109]:
# 8. Predictions

logreg_pred = logreg_pipeline.predict(X_test)
rf_pred = rf_pipeline.predict(X_test)

In [110]:
# 9. Evaluate models

logreg_acc = accuracy_score(y_test, logreg_pred)
rf_acc = accuracy_score(y_test, rf_pred)

higher_accuracy = "LogisticRegression" if logreg_acc > rf_acc else "RandomForest"

print("Logistic Regression accuracy:", logreg_acc)
print("Random Forest accuracy:", rf_acc)
print("Higher accuracy model:", higher_accuracy)

print("\nLogistic Regression classification report:\n", classification_report(y_test, logreg_pred))
print("\nRandom Forest classification report:\n", classification_report(y_test, rf_pred))

Logistic Regression accuracy: 0.7938461538461539
Random Forest accuracy: 0.796923076923077
Higher accuracy model: RandomForest

Logistic Regression classification report:
               precision    recall  f1-score   support

           0       0.80      0.99      0.89      1039
           1       0.00      0.00      0.00       261

    accuracy                           0.79      1300
   macro avg       0.40      0.50      0.44      1300
weighted avg       0.64      0.79      0.71      1300


Random Forest classification report:
               precision    recall  f1-score   support

           0       0.80      1.00      0.89      1039
           1       0.00      0.00      0.00       261

    accuracy                           0.80      1300
   macro avg       0.40      0.50      0.44      1300
weighted avg       0.64      0.80      0.71      1300

