In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
import statsmodels.api as sm
import re
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from statsmodels.stats.outliers_influence import variance_inflation_factor
import scipy.stats as stats
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report
from imblearn.over_sampling import SMOTE

In [2]:
data = pd.read_csv('cleaned_data_telecom.csv')  

In [3]:
data_no_total = data.drop(['total_charges'], axis=1).reset_index(drop=True)

In [4]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split

# Assume `data` is the DataFrame and 'contract' is the feature where we want 'Month-to-month' to remain
features = data_no_total.columns.drop('churn')
target = 'churn'

# Split the data
X_train, X_temp, y_train, y_temp = train_test_split(data_no_total[features], data_no_total[target], test_size=0.4, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

# Separate out 'contract' and other categorical/numerical features
contract_feature = ['contract']
other_categorical_features = [col for col in X_train.select_dtypes(include=['object']).columns if col != 'contract']
numerical_features = X_train.select_dtypes(include=[np.number]).columns.tolist()

# Define separate transformers for contract and other categorical features
contract_transformer = OneHotEncoder(drop=['Two year'], sparse_output=False)  # Dropping 'Two year' to keep 'Month-to-month'
other_categorical_transformer = OneHotEncoder(drop='first', sparse_output=False)  # Default drop first for other categories

# Define the ColumnTransformer
train_preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_features),
        ('contract', contract_transformer, contract_feature),
        ('cat', other_categorical_transformer, other_categorical_features)
    ]
)

val_preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_features),
        ('contract', contract_transformer, contract_feature),
        ('cat', other_categorical_transformer, other_categorical_features)
    ]
)

test_preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_features),
        ('contract', contract_transformer, contract_feature),
        ('cat', other_categorical_transformer, other_categorical_features)
    ]
)

# Fit and transform the datasets
X_train_preprocessed = train_preprocessor.fit_transform(X_train)
X_val_preprocessed = val_preprocessor.fit_transform(X_val)
X_test_preprocessed = test_preprocessor.fit_transform(X_test)

# Output the shapes of the processed datasets
print("Training set shape:", X_train_preprocessed.shape)
print("Validation set shape:", X_val_preprocessed.shape)
print("Test set shape:", X_test_preprocessed.shape)


Training set shape: (4206, 29)
Validation set shape: (1402, 29)
Test set shape: (1402, 29)


In [7]:
features = data_no_total.columns.drop('churn')
target = 'churn'

X_train, X_temp, y_train, y_temp = train_test_split(data_no_total[features], data_no_total[target], test_size=0.4, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

categorical_features = X_train.select_dtypes(include=['object']).columns.tolist()
numerical_features = X_train.select_dtypes(include=[np.number]).columns.tolist()

# Define a preprocessor that will be fit only on the training data
train_preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_features),
        ('cat', OneHotEncoder(drop='first'), categorical_features)
    ]
)

# Fit the preprocessor on the training data only
train_preprocessor.fit(X_train)

# Transform the training, validation, and test sets using the same preprocessor
X_train_preprocessed = train_preprocessor.transform(X_train)
X_val_preprocessed = train_preprocessor.transform(X_val)
X_test_preprocessed = train_preprocessor.transform(X_test)

# Output the shapes of the processed datasets to confirm transformation
print("Training set shape:", X_train_preprocessed.shape)
print("Validation set shape:", X_val_preprocessed.shape)
print("Test set shape:", X_test_preprocessed.shape)

Training set shape: (4206, 29)
Validation set shape: (1402, 29)
Test set shape: (1402, 29)


In [7]:
features = data_no_total.columns.drop('churn')
target = 'churn'

# Step 1: Split data into training, validation, and test sets
X_train, X_temp, y_train, y_temp = train_test_split(data_no_total[features], data_no_total[target], test_size=0.4, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

# Step 2: Define categorical and numerical features
categorical_features = X_train.select_dtypes(include=['object']).columns.tolist()
numerical_features = X_train.select_dtypes(include=[np.number]).columns.tolist()

# Step 3: Define a preprocessor for scaling and encoding
train_preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_features),
        ('cat', OneHotEncoder(drop='first'), categorical_features)
    ]
)

# Fit the preprocessor on the training data only
train_preprocessor.fit(X_train)

# Transform the training set, then apply SMOTE
X_train_preprocessed = train_preprocessor.transform(X_train)
X_val_preprocessed = train_preprocessor.transform(X_val)
X_test_preprocessed = train_preprocessor.transform(X_test)

# Step 4: Apply SMOTE only on the preprocessed training data
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train_preprocessed, y_train)

# Output the shapes to confirm transformations
print("Training set shape after SMOTE:", X_train_resampled.shape)
print("Validation set shape (untouched):", X_val_preprocessed.shape)
print("Test set shape (untouched):", X_test_preprocessed.shape)

Training set shape after SMOTE: (6182, 29)
Validation set shape (untouched): (1402, 29)
Test set shape (untouched): (1402, 29)


In [9]:
models = {
    'Random Forest': RandomForestClassifier(n_estimators=100, random_state=42),
    'SVM': SVC(kernel='linear', probability=True, random_state=42),
    'Logistic Regression': LogisticRegression(max_iter=1000, random_state=42),
    'Decision Tree': DecisionTreeClassifier(random_state=42),
    'Gradient Boosting': GradientBoostingClassifier(n_estimators=100, random_state=42)
}

model_performance = {}
feature_importance_df = pd.DataFrame()

for model_name, model in models.items():
    print(f"Training and evaluating {model_name}...")
    model.fit(X_train_preprocessed, y_train)
    y_val_pred = model.predict(X_val_preprocessed)
    accuracy = accuracy_score(y_val, y_val_pred)
    model_performance[model_name] = accuracy
    print(f"{model_name} Validation Accuracy: {accuracy * 100:.2f}%")
    print(f"\n{model_name} Classification Report:")
    print(classification_report(y_val, y_val_pred))
    if model_name in ['Random Forest', 'Decision Tree', 'Gradient Boosting']:
        importances = model.feature_importances_
        feature_importance_df[model_name] = importances
if not feature_importance_df.empty:
    feature_names = train_preprocessor.get_feature_names_out()
    feature_importance_df['Feature'] = feature_names
    feature_importance_df.set_index('Feature', inplace=True)

print("\nModel Performance Summary:")
print(model_performance)

if not feature_importance_df.empty:
    print("\nFeature Importances Summary:")
    print(feature_importance_df)

Training and evaluating Random Forest...
Random Forest Validation Accuracy: 79.96%

Random Forest Classification Report:
              precision    recall  f1-score   support

          No       0.83      0.92      0.87      1037
         Yes       0.66      0.47      0.55       365

    accuracy                           0.80      1402
   macro avg       0.75      0.69      0.71      1402
weighted avg       0.79      0.80      0.79      1402

Training and evaluating SVM...
SVM Validation Accuracy: 81.81%

SVM Classification Report:
              precision    recall  f1-score   support

          No       0.85      0.91      0.88      1037
         Yes       0.69      0.55      0.61       365

    accuracy                           0.82      1402
   macro avg       0.77      0.73      0.75      1402
weighted avg       0.81      0.82      0.81      1402

Training and evaluating Logistic Regression...
Logistic Regression Validation Accuracy: 82.03%

Logistic Regression Classification Rep

In [11]:
# List of selected features, based on their transformed names
selected_feature_names = [
    'num__tenure', 
    'num__monthly_charges', 
    'cat__internet_service_Fiber optic', 
    'contract__contract_Month-to-month', 
    'cat__contract_One year', 
    'cat__paperless_billing_Yes', 
    'cat__payment_method_Electronic check', 
    'cat__online_security_Yes', 
    'cat__dependents_Yes', 
    'cat__payment_method_Credit card (automatic)',
    'cat__gender_Male',
    'cat__partner_Yes',
    'cat__tech_support_Yes',
    'cat__online_backup_Yes',
    'cat__streaming_movies_Yes'
    
]

# Get all transformed feature names from the preprocessor
all_feature_names = train_preprocessor.get_feature_names_out()

# Find the indices of selected features in the preprocessed dataset
selected_feature_indices = [i for i, feature in enumerate(all_feature_names) if feature in selected_feature_names]


In [13]:
# Filter each preprocessed dataset to keep only selected features
X_train_selected = X_train_preprocessed[:, selected_feature_indices]
X_val_selected = X_val_preprocessed[:, selected_feature_indices]
X_test_selected = X_test_preprocessed[:, selected_feature_indices]

# Check shapes to confirm
print("Filtered Training set shape:", X_train_selected.shape)
print("Filtered Validation set shape:", X_val_selected.shape)
print("Filtered Test set shape:", X_test_selected.shape)


Filtered Training set shape: (4206, 14)
Filtered Validation set shape: (1402, 14)
Filtered Test set shape: (1402, 14)


In [15]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score, classification_report

# Define models
models = {
    'Random Forest': RandomForestClassifier(n_estimators=100, random_state=42),
    'SVM': SVC(kernel='linear', probability=True, random_state=42),
    'Logistic Regression': LogisticRegression(max_iter=1000, random_state=42),
    'Decision Tree': DecisionTreeClassifier(random_state=42),
    'Gradient Boosting': GradientBoostingClassifier(n_estimators=100, random_state=42)
}

# Dictionary to store model performance
model_performance_selected = {}

for model_name, model in models.items():
    print(f"Training and evaluating {model_name} with selected features...")
    
    # Train the model
    model.fit(X_train_selected, y_train)
    
    # Evaluate on validation set
    y_val_pred = model.predict(X_val_selected)
    accuracy = accuracy_score(y_val, y_val_pred)
    model_performance_selected[model_name] = accuracy
    print(f"{model_name} Validation Accuracy with selected features: {accuracy * 100:.2f}%")
    print(f"\n{model_name} Classification Report with selected features:")
    print(classification_report(y_val, y_val_pred))

print("\nModel Performance Summary with Selected Features:")
print(model_performance_selected)


Training and evaluating Random Forest with selected features...
Random Forest Validation Accuracy with selected features: 77.60%

Random Forest Classification Report with selected features:
              precision    recall  f1-score   support

          No       0.82      0.89      0.86      1037
         Yes       0.59      0.44      0.50       365

    accuracy                           0.78      1402
   macro avg       0.71      0.67      0.68      1402
weighted avg       0.76      0.78      0.76      1402

Training and evaluating SVM with selected features...
SVM Validation Accuracy with selected features: 80.74%

SVM Classification Report with selected features:
              precision    recall  f1-score   support

          No       0.84      0.91      0.88      1037
         Yes       0.67      0.51      0.58       365

    accuracy                           0.81      1402
   macro avg       0.76      0.71      0.73      1402
weighted avg       0.80      0.81      0.80      14

In [9]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.compose import ColumnTransformer
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report

# Step 1: Data Preparation - Split data
features = data_no_total.columns.drop('churn')
target = 'churn'
X_train, X_temp, y_train, y_temp = train_test_split(data_no_total[features], data_no_total[target], test_size=0.4, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

# Step 2: Identify categorical and numerical features
categorical_features = X_train.select_dtypes(include=['object']).columns.tolist()
numerical_features = X_train.select_dtypes(include=[np.number]).columns.tolist()

# Step 3: Define preprocessors for each set
train_preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_features),
        ('cat', OneHotEncoder(drop='first'), categorical_features)])

val_preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_features),
        ('cat', OneHotEncoder(drop='first'), categorical_features)])

test_preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_features),
        ('cat', OneHotEncoder(drop='first'), categorical_features)])

# Step 4: Preprocess the data
X_train_preprocessed = train_preprocessor.fit_transform(X_train)
X_val_preprocessed = val_preprocessor.fit_transform(X_val)
X_test_preprocessed = test_preprocessor.fit_transform(X_test)

# Step 5: Encode target variable
label_encoder = LabelEncoder()
y_train_encoded = label_encoder.fit_transform(y_train)  # Encode "Yes"/"No" as 1/0
y_val_encoded = label_encoder.transform(y_val)
y_test_encoded = label_encoder.transform(y_test)

# Step 6: Apply SMOTE to the training data
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train_preprocessed, y_train_encoded)

# Verify class distribution after SMOTE
print("Class distribution after SMOTE:", np.bincount(y_train_resampled))

# Step 7: Define models to evaluate
models = {
    'Random Forest': RandomForestClassifier(n_estimators=100, random_state=42),
    'SVM': SVC(kernel='linear', probability=True, random_state=42),
    'Logistic Regression': LogisticRegression(max_iter=1000, random_state=42),
    'Decision Tree': DecisionTreeClassifier(random_state=42),
    'Gradient Boosting': GradientBoostingClassifier(n_estimators=100, random_state=42)
}

# Step 8: Train and evaluate each model on the resampled data
model_performance = {}
feature_importance_df = pd.DataFrame()

for model_name, model in models.items():
    print(f"Training and evaluating {model_name}...")
    # Fit model on the SMOTE-resampled training data
    model.fit(X_train_resampled, y_train_resampled)
    
    # Predict and evaluate on the original validation set
    y_val_pred = model.predict(X_val_preprocessed)
    accuracy = accuracy_score(y_val_encoded, y_val_pred)
    model_performance[model_name] = accuracy
    print(f"{model_name} Validation Accuracy: {accuracy * 100:.2f}%")
    print(f"\n{model_name} Classification Report:")
    print(classification_report(y_val_encoded, y_val_pred))
    
    # Capture feature importances for applicable models
    if model_name in ['Random Forest', 'Decision Tree', 'Gradient Boosting']:
        importances = model.feature_importances_
        feature_importance_df[model_name] = importances

# Add feature names to feature importance DataFrame if applicable
if not feature_importance_df.empty:
    feature_names = train_preprocessor.get_feature_names_out()
    feature_importance_df['Feature'] = feature_names
    feature_importance_df.set_index('Feature', inplace=True)

# Summary
print("\nModel Performance Summary:")
print(model_performance)

if not feature_importance_df.empty:
    print("\nFeature Importances Summary:")
    print(feature_importance_df)


Class distribution after SMOTE: [3091 3091]
Training and evaluating Random Forest...
Random Forest Validation Accuracy: 78.60%

Random Forest Classification Report:
              precision    recall  f1-score   support

           0       0.85      0.86      0.86      1037
           1       0.59      0.56      0.58       365

    accuracy                           0.79      1402
   macro avg       0.72      0.71      0.72      1402
weighted avg       0.78      0.79      0.78      1402

Training and evaluating SVM...
SVM Validation Accuracy: 69.61%

SVM Classification Report:
              precision    recall  f1-score   support

           0       0.91      0.65      0.76      1037
           1       0.45      0.82      0.58       365

    accuracy                           0.70      1402
   macro avg       0.68      0.74      0.67      1402
weighted avg       0.79      0.70      0.71      1402

Training and evaluating Logistic Regression...
Logistic Regression Validation Accuracy: 74

In [9]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import precision_recall_curve, classification_report, accuracy_score

# Fit the RandomForest model on resampled training data
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train_resampled, y_train_resampled)

# Get predicted probabilities on the validation set
y_val_probs = rf_model.predict_proba(X_val_preprocessed)[:, 1]  # Probabilities for class 1

# Determine optimal threshold using the precision-recall curve
precision, recall, thresholds = precision_recall_curve(y_val_encoded, y_val_probs)
# Set a custom threshold, e.g., 0.3, to increase recall
custom_threshold = 0.3
y_val_pred_custom = (y_val_probs >= custom_threshold).astype(int)

# Evaluate the model with the custom threshold
print(f"Validation Accuracy with threshold {custom_threshold}: {accuracy_score(y_val_encoded, y_val_pred_custom) * 100:.2f}%")
print("Classification Report with Custom Threshold:")
print(classification_report(y_val_encoded, y_val_pred_custom))


Validation Accuracy with threshold 0.3: 71.61%
Classification Report with Custom Threshold:
              precision    recall  f1-score   support

           0       0.89      0.71      0.79      1037
           1       0.47      0.75      0.58       365

    accuracy                           0.72      1402
   macro avg       0.68      0.73      0.68      1402
weighted avg       0.78      0.72      0.73      1402



In [11]:
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

# Define parameter grid for RandomForest
param_grid_rf = {
    'n_estimators': [100, 200, 300],
    'max_depth': [10, 20, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}

# Use RandomizedSearchCV for Random Forest with cross-validation
grid_search_rf = RandomizedSearchCV(
    estimator=RandomForestClassifier(random_state=42),
    param_distributions=param_grid_rf,
    n_iter=20,
    scoring='recall',  # Focusing on recall for the churn class
    cv=3,
    random_state=42,
    n_jobs=-1
)

# Fit the grid search on the resampled training data
grid_search_rf.fit(X_train_resampled, y_train_resampled)

# Output the best parameters and best recall score
print("Best Parameters for Random Forest:", grid_search_rf.best_params_)
print("Best Recall Score from Cross-Validation:", grid_search_rf.best_score_)

# Evaluate best model on validation set with custom threshold if needed
best_rf_model = grid_search_rf.best_estimator_
y_val_probs_rf = best_rf_model.predict_proba(X_val_preprocessed)[:, 1]
y_val_pred_rf_custom = (y_val_probs_rf >= custom_threshold).astype(int)  # Using the same custom threshold from above

print("Validation Classification Report with Tuned Random Forest:")
print(classification_report(y_val_encoded, y_val_pred_rf_custom))


Best Parameters for Random Forest: {'n_estimators': 100, 'min_samples_split': 5, 'min_samples_leaf': 1, 'max_depth': 10, 'class_weight': 'balanced_subsample'}
Best Recall Score from Cross-Validation: 0.8622307182833772
Validation Classification Report with Tuned Random Forest:
              precision    recall  f1-score   support

           0       0.93      0.64      0.76      1037
           1       0.46      0.86      0.60       365

    accuracy                           0.70      1402
   macro avg       0.69      0.75      0.68      1402
weighted avg       0.81      0.70      0.72      1402



In [13]:
from sklearn.ensemble import VotingClassifier

# Define individual models with tuned hyperparameters if available
rf_model = RandomForestClassifier(n_estimators=100, random_state=42, class_weight='balanced')
log_reg_model = LogisticRegression(max_iter=1000, random_state=42, class_weight='balanced')
gb_model = GradientBoostingClassifier(n_estimators=100, random_state=42)

# Create a VotingClassifier ensemble using soft voting and custom weights
ensemble_model = VotingClassifier(
    estimators=[('rf', rf_model), ('lr', log_reg_model), ('gb', gb_model)],
    voting='soft',  # 'soft' voting averages predicted probabilities
    weights=[2, 1, 2]  # Adjust these weights as needed based on performance
)

# Fit the ensemble model on the SMOTE-resampled training data
ensemble_model.fit(X_train_resampled, y_train_resampled)

# Evaluate the ensemble model on the validation set
y_val_probs_ensemble = ensemble_model.predict_proba(X_val_preprocessed)[:, 1]
custom_threshold = 0.3
y_val_pred_ensemble = (y_val_probs_ensemble >= custom_threshold).astype(int)

print("Ensemble Validation Accuracy:", accuracy_score(y_val_encoded, y_val_pred_ensemble) * 100)
print("Ensemble Classification Report with Custom Threshold:")
print(classification_report(y_val_encoded, y_val_pred_ensemble))


Ensemble Validation Accuracy: 70.9700427960057
Ensemble Classification Report with Custom Threshold:
              precision    recall  f1-score   support

           0       0.93      0.66      0.77      1037
           1       0.47      0.86      0.61       365

    accuracy                           0.71      1402
   macro avg       0.70      0.76      0.69      1402
weighted avg       0.81      0.71      0.73      1402



In [15]:
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline

# Define a pipeline combining SMOTE and RandomUnderSampler
resampling_pipeline = Pipeline([
    ('smote', SMOTE(random_state=42, sampling_strategy=0.5)),  # Apply SMOTE with a target ratio
    ('undersample', RandomUnderSampler(random_state=42))  # Then apply undersampling
])

# Apply the resampling pipeline to the training data
X_train_resampled, y_train_resampled = resampling_pipeline.fit_resample(X_train_preprocessed, y_train_encoded)

# Verify the new class distribution after combined resampling
print("Class distribution after SMOTE + undersampling:", np.bincount(y_train_resampled))

# Train and evaluate models with this new balanced data
for model_name, model in models.items():
    print(f"Training and evaluating {model_name} with combined SMOTE and undersampling...")
    model.fit(X_train_resampled, y_train_resampled)
    
    # Predict and evaluate on the original validation set
    y_val_pred = model.predict(X_val_preprocessed)
    accuracy = accuracy_score(y_val_encoded, y_val_pred)
    print(f"{model_name} Validation Accuracy: {accuracy * 100:.2f}%")
    print(f"\n{model_name} Classification Report:")
    print(classification_report(y_val_encoded, y_val_pred))


Class distribution after SMOTE + undersampling: [1545 1545]
Training and evaluating Random Forest with combined SMOTE and undersampling...
Random Forest Validation Accuracy: 76.82%

Random Forest Classification Report:
              precision    recall  f1-score   support

           0       0.87      0.80      0.84      1037
           1       0.54      0.67      0.60       365

    accuracy                           0.77      1402
   macro avg       0.71      0.74      0.72      1402
weighted avg       0.79      0.77      0.78      1402

Training and evaluating SVM with combined SMOTE and undersampling...
SVM Validation Accuracy: 69.61%

SVM Classification Report:
              precision    recall  f1-score   support

           0       0.91      0.65      0.76      1037
           1       0.45      0.82      0.58       365

    accuracy                           0.70      1402
   macro avg       0.68      0.74      0.67      1402
weighted avg       0.79      0.70      0.71      1402

In [7]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.compose import ColumnTransformer
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report
from imblearn.over_sampling import ADASYN

# Step 1: Data Preparation - Split data
features = data_no_total.columns.drop('churn')
target = 'churn'
X_train, X_temp, y_train, y_temp = train_test_split(data_no_total[features], data_no_total[target], test_size=0.4, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

# Step 2: Identify categorical and numerical features
categorical_features = X_train.select_dtypes(include=['object']).columns.tolist()
numerical_features = X_train.select_dtypes(include=[np.number]).columns.tolist()

# Step 3: Define preprocessors for each set
train_preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_features),
        ('cat', OneHotEncoder(drop='first'), categorical_features)])

val_preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_features),
        ('cat', OneHotEncoder(drop='first'), categorical_features)])

test_preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_features),
        ('cat', OneHotEncoder(drop='first'), categorical_features)])

# Step 4: Preprocess the data
X_train_preprocessed = train_preprocessor.fit_transform(X_train)
X_val_preprocessed = val_preprocessor.fit_transform(X_val)
X_test_preprocessed = test_preprocessor.fit_transform(X_test)

# Step 5: Encode target variable
label_encoder = LabelEncoder()
y_train_encoded = label_encoder.fit_transform(y_train)  # Encode "Yes"/"No" as 1/0
y_val_encoded = label_encoder.transform(y_val)
y_test_encoded = label_encoder.transform(y_test)

# Step 6: Apply SMOTE to the training data
ada = ADASYN(random_state=42)
X_train_resampled, y_train_resampled = ada.fit_resample(X_train_preprocessed, y_train)


# Verify class distribution after SMOTE
print("Class distribution after SMOTE:", np.bincount(y_train_resampled))

# Step 7: Define models to evaluate
models = {
    'Random Forest': RandomForestClassifier(n_estimators=100, random_state=42),
    'SVM': SVC(kernel='linear', probability=True, random_state=42),
    'Logistic Regression': LogisticRegression(max_iter=1000, random_state=42),
    'Decision Tree': DecisionTreeClassifier(random_state=42),
    'Gradient Boosting': GradientBoostingClassifier(n_estimators=100, random_state=42)
}

# Step 8: Train and evaluate each model on the resampled data
model_performance = {}
feature_importance_df = pd.DataFrame()

for model_name, model in models.items():
    print(f"Training and evaluating {model_name}...")
    # Fit model on the SMOTE-resampled training data
    model.fit(X_train_resampled, y_train_resampled)
    
    # Predict and evaluate on the original validation set
    y_val_pred = model.predict(X_val_preprocessed)
    accuracy = accuracy_score(y_val_encoded, y_val_pred)
    model_performance[model_name] = accuracy
    print(f"{model_name} Validation Accuracy: {accuracy * 100:.2f}%")
    print(f"\n{model_name} Classification Report:")
    print(classification_report(y_val_encoded, y_val_pred))
    
    # Capture feature importances for applicable models
    if model_name in ['Random Forest', 'Decision Tree', 'Gradient Boosting']:
        importances = model.feature_importances_
        feature_importance_df[model_name] = importances

# Add feature names to feature importance DataFrame if applicable
if not feature_importance_df.empty:
    feature_names = train_preprocessor.get_feature_names_out()
    feature_importance_df['Feature'] = feature_names
    feature_importance_df.set_index('Feature', inplace=True)

# Summary
print("\nModel Performance Summary:")
print(model_performance)

if not feature_importance_df.empty:
    print("\nFeature Importances Summary:")
    print(feature_importance_df)


ValueError: invalid literal for int() with base 10: 'Yes'

In [9]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.compose import ColumnTransformer
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report
from imblearn.over_sampling import ADASYN

# Step 1: Data Preparation - Split data
features = data_no_total.columns.drop('churn')
target = 'churn'
X_train, X_temp, y_train, y_temp = train_test_split(data_no_total[features], data_no_total[target], test_size=0.4, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

# Step 2: Identify categorical and numerical features
categorical_features = X_train.select_dtypes(include=['object']).columns.tolist()
numerical_features = X_train.select_dtypes(include=[np.number]).columns.tolist()

# Step 3: Define preprocessors for each set
train_preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_features),
        ('cat', OneHotEncoder(drop='first'), categorical_features)])

val_preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_features),
        ('cat', OneHotEncoder(drop='first'), categorical_features)])

test_preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_features),
        ('cat', OneHotEncoder(drop='first'), categorical_features)])

# Step 4: Preprocess the data
X_train_preprocessed = train_preprocessor.fit_transform(X_train)
X_val_preprocessed = val_preprocessor.fit_transform(X_val)
X_test_preprocessed = test_preprocessor.fit_transform(X_test)

# Step 5: Encode target variable
label_encoder = LabelEncoder()
y_train_encoded = label_encoder.fit_transform(y_train)  # Encode "Yes"/"No" as 1/0
y_val_encoded = label_encoder.transform(y_val)
y_test_encoded = label_encoder.transform(y_test)

# Step 6: Apply SMOTE to the training data
ada = ADASYN(random_state=42)
X_train_resampled, y_train_resampled = ada.fit_resample(X_train_preprocessed, y_train_encoded)
# Verify class distribution after SMOTE
print("Class distribution after SMOTE:", np.bincount(y_train_resampled))

# Step 7: Define models to evaluate
models = {
    'Random Forest': RandomForestClassifier(n_estimators=100, random_state=42),
    'SVM': SVC(kernel='linear', probability=True, random_state=42),
    'Logistic Regression': LogisticRegression(max_iter=1000, random_state=42),
    'Decision Tree': DecisionTreeClassifier(random_state=42),
    'Gradient Boosting': GradientBoostingClassifier(n_estimators=100, random_state=42)
}

# Step 8: Train and evaluate each model on the resampled data
model_performance = {}
feature_importance_df = pd.DataFrame()

for model_name, model in models.items():
    print(f"Training and evaluating {model_name}...")
    # Fit model on the SMOTE-resampled training data
    model.fit(X_train_resampled, y_train_resampled)
    
    # Predict and evaluate on the original validation set
    y_val_pred = model.predict(X_val_preprocessed)
    accuracy = accuracy_score(y_val_encoded, y_val_pred)
    model_performance[model_name] = accuracy
    print(f"{model_name} Validation Accuracy: {accuracy * 100:.2f}%")
    print(f"\n{model_name} Classification Report:")
    print(classification_report(y_val_encoded, y_val_pred))
    
    # Capture feature importances for applicable models
    if model_name in ['Random Forest', 'Decision Tree', 'Gradient Boosting']:
        importances = model.feature_importances_
        feature_importance_df[model_name] = importances

# Add feature names to feature importance DataFrame if applicable
if not feature_importance_df.empty:
    feature_names = train_preprocessor.get_feature_names_out()
    feature_importance_df['Feature'] = feature_names
    feature_importance_df.set_index('Feature', inplace=True)

# Summary
print("\nModel Performance Summary:")
print(model_performance)

if not feature_importance_df.empty:
    print("\nFeature Importances Summary:")
    print(feature_importance_df)


Class distribution after SMOTE: [3091 3109]
Training and evaluating Random Forest...
Random Forest Validation Accuracy: 78.53%

Random Forest Classification Report:
              precision    recall  f1-score   support

           0       0.85      0.87      0.86      1037
           1       0.60      0.55      0.57       365

    accuracy                           0.79      1402
   macro avg       0.72      0.71      0.71      1402
weighted avg       0.78      0.79      0.78      1402

Training and evaluating SVM...
SVM Validation Accuracy: 69.61%

SVM Classification Report:
              precision    recall  f1-score   support

           0       0.91      0.65      0.76      1037
           1       0.45      0.82      0.58       365

    accuracy                           0.70      1402
   macro avg       0.68      0.74      0.67      1402
weighted avg       0.79      0.70      0.71      1402

Training and evaluating Logistic Regression...
Logistic Regression Validation Accuracy: 72

In [13]:
from imblearn.ensemble import EasyEnsembleClassifier
from sklearn.metrics import classification_report, accuracy_score

# Define and fit the EasyEnsembleClassifier
model = EasyEnsembleClassifier(n_estimators=100, random_state=42)
model.fit(X_train_preprocessed, y_train)

# Predict on the validation set
y_val_pred = model.predict(X_val_preprocessed)

# Evaluate the model
print("EasyEnsemble Validation Accuracy:", accuracy_score(y_val, y_val_pred) * 100)
print("EasyEnsemble Classification Report:")
print(classification_report(y_val, y_val_pred))


EasyEnsemble Validation Accuracy: 75.320970042796
EasyEnsemble Classification Report:
              precision    recall  f1-score   support

          No       0.91      0.74      0.82      1037
         Yes       0.52      0.80      0.63       365

    accuracy                           0.75      1402
   macro avg       0.72      0.77      0.72      1402
weighted avg       0.81      0.75      0.77      1402

