In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import classification_report, accuracy_score

# 1. Simulate Dataset
np.random.seed(42)
n_samples = 500

data = {
    'age': np.random.randint(18, 70, size=n_samples),
    'gender': np.random.choice(['Male', 'Female'], size=n_samples),
    'tenure': np.random.randint(1, 60, size=n_samples),  # months
    'monthly_usage': np.random.normal(20, 5, size=n_samples).round(2),  # hours
    'contract_type': np.random.choice(['Monthly', 'Yearly'], size=n_samples),
    'payment_method': np.random.choice(['Credit Card', 'PayPal', 'Bank Transfer'], size=n_samples),
    'churn': np.random.choice([0, 1], size=n_samples, p=[0.75, 0.25])  # 25% churn rate
}

df = pd.DataFrame(data)

# 2. Preprocessing
le_gender = LabelEncoder()
le_contract = LabelEncoder()
le_payment = LabelEncoder()

df['gender'] = le_gender.fit_transform(df['gender'])
df['contract_type'] = le_contract.fit_transform(df['contract_type'])
df['payment_method'] = le_payment.fit_transform(df['payment_method'])

# 3. Features & Target
X = df.drop('churn', axis=1)
y = df['churn']

# 4. Split Data
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# 5. Scale Features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# 6. Train Models

# Logistic Regression
lr = LogisticRegression()
lr.fit(X_train_scaled, y_train)
y_pred_lr = lr.predict(X_test_scaled)

# Random Forest
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)  # Tree models don't need scaling
y_pred_rf = rf.predict(X_test)

# Gradient Boosting
gb = GradientBoostingClassifier(n_estimators=100, learning_rate=0.1, random_state=42)
gb.fit(X_train, y_train)
y_pred_gb = gb.predict(X_test)

# 7. Evaluation
print("=== Logistic Regression ===")
print(classification_report(y_test, y_pred_lr))

print("=== Random Forest ===")
print(classification_report(y_test, y_pred_rf))

print("=== Gradient Boosting ===")
print(classification_report(y_test, y_pred_gb))

# 8. Accuracy Comparison
print("Accuracy Scores:")
print("Logistic Regression:", accuracy_score(y_test, y_pred_lr))
print("Random Forest:", accuracy_score(y_test, y_pred_rf))
print("Gradient Boosting:", accuracy_score(y_test, y_pred_gb))


=== Logistic Regression ===
              precision    recall  f1-score   support

           0       0.74      1.00      0.85        74
           1       0.00      0.00      0.00        26

    accuracy                           0.74       100
   macro avg       0.37      0.50      0.43       100
weighted avg       0.55      0.74      0.63       100

=== Random Forest ===
              precision    recall  f1-score   support

           0       0.76      0.97      0.85        74
           1       0.60      0.12      0.19        26

    accuracy                           0.75       100
   macro avg       0.68      0.54      0.52       100
weighted avg       0.72      0.75      0.68       100

=== Gradient Boosting ===
              precision    recall  f1-score   support

           0       0.76      0.97      0.85        74
           1       0.60      0.12      0.19        26

    accuracy                           0.75       100
   macro avg       0.68      0.54      0.52       10

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
