# Model Optimization

In this notebook, we will further optimize the performance of our selected models (Logistic Regression and SVM) using advanced techniques such as feature engineering, handling imbalanced data, ensemble methods, and further hyperparameter tuning.

In [35]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline
import joblib

In [2]:
# Load the preprocessed dataset
data_path = "../data/processed/cleaned_data.csv"
df = pd.read_csv(data_path)

In [3]:
# Separate features and target variable
X = df.drop(columns=['customerID', 'Churn'])
y = df['Churn']

In [4]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print("Training set size:", X_train.shape)
print("Testing set size:", X_test.shape)

Training set size: (5634, 19)
Testing set size: (1409, 19)


## Feature Engineering

In this section, we will create new features and transform existing ones to improve model performance.


In [5]:
# Example: Creating interaction terms
X_train['MonthlyTenureInteraction'] = X_train['MonthlyCharges'] * X_train['tenure']
X_test['MonthlyTenureInteraction'] = X_test['MonthlyCharges'] * X_test['tenure']

# Example: Transforming categorical variables using OneHotEncoder
categorical_features = ['gender', 'Partner', 'Dependents', 'PhoneService', 'MultipleLines', 'InternetService',
                        'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV',
                        'StreamingMovies', 'Contract', 'PaperlessBilling', 'PaymentMethod']

numeric_features = ['SeniorCitizen', 'tenure', 'MonthlyCharges', 'TotalCharges', 'MonthlyTenureInteraction']

In [6]:
# Preprocessing pipelines for numerical and categorical features
numeric_transformer = StandardScaler()
categorical_transformer = OneHotEncoder(handle_unknown='ignore')

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ])

In [7]:
# Apply transformations
X_train_transformed = preprocessor.fit_transform(X_train)
X_test_transformed = preprocessor.transform(X_test)

## Handling Imbalanced Data

We will use SMOTE (Synthetic Minority Over-sampling Technique) to address class imbalance.


In [8]:
# Apply SMOTE to training data
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train_transformed, y_train)

# Check the distribution of the target variable after resampling
print("Distribution of target variable after SMOTE:")
print(y_train_resampled.value_counts())

found 0 physical cores < 1
  File "C:\Users\Kahla\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\LocalCache\local-packages\Python311\site-packages\joblib\externals\loky\backend\context.py", line 282, in _count_physical_cores
    raise ValueError(f"found {cpu_count_physical} physical cores < 1")


Distribution of target variable after SMOTE:
Churn
0    4138
1    4138
Name: count, dtype: int64


## Ensemble Methods

We will explore ensemble methods such as Random Forest and Gradient Boosting to improve model performance.


In [9]:
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier

# Initialize ensemble models
ensemble_models = {
    'Random Forest': RandomForestClassifier(random_state=42),
    'Gradient Boosting': GradientBoostingClassifier(random_state=42)
}

In [11]:
# Function to evaluate model performance
def evaluate_model(model, X_train, y_train, X_test, y_test):
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    return accuracy, precision, recall, f1, y_pred

In [12]:
# Dictionary to store evaluation results
ensemble_results = {}

In [13]:
# Evaluate each ensemble model
for model_name, model in ensemble_models.items():
    accuracy, precision, recall, f1, y_pred = evaluate_model(model, X_train_resampled, y_train_resampled, X_test_transformed, y_test)
    ensemble_results[model_name] = {
        'Accuracy': accuracy,
        'Precision': precision,
        'Recall': recall,
        'F1 Score': f1,
        'Confusion Matrix': confusion_matrix(y_test, y_pred)
    }

In [14]:
# Display evaluation results
for model_name, metrics in ensemble_results.items():
    print(f"\n{model_name} Results:")
    print(f"Accuracy: {metrics['Accuracy']:.4f}")
    print(f"Precision: {metrics['Precision']:.4f}")
    print(f"Recall: {metrics['Recall']:.4f}")
    print(f"F1 Score: {metrics['F1 Score']:.4f}")
    print(f"Confusion Matrix:\n{metrics['Confusion Matrix']}")


Random Forest Results:
Accuracy: 0.7928
Precision: 0.6134
Recall: 0.5871
F1 Score: 0.6000
Confusion Matrix:
[[898 138]
 [154 219]]

Gradient Boosting Results:
Accuracy: 0.7942
Precision: 0.5954
Recall: 0.6944
F1 Score: 0.6411
Confusion Matrix:
[[860 176]
 [114 259]]


## Further Hyperparameter Tuning

We will perform extensive hyperparameter tuning using RandomizedSearchCV for our best models.


In [16]:
# Define parameter grid for RandomizedSearchCV for Logistic Regression
param_dist_lr = {
    'C': np.logspace(-4, 4, 20),
    'solver': ['liblinear', 'saga']
}


In [17]:

# Initialize RandomizedSearchCV for Logistic Regression
random_search_lr = RandomizedSearchCV(LogisticRegression(max_iter=1000, random_state=42), param_dist_lr, cv=5, n_iter=50, scoring='accuracy', random_state=42, n_jobs=-1)


In [18]:

# Perform Randomized Search for Logistic Regression
random_search_lr.fit(X_train_resampled, y_train_resampled)




In [19]:

# Best parameters and best score for Logistic Regression
best_params_lr_random = random_search_lr.best_params_
best_score_lr_random = random_search_lr.best_score_

print("Best Parameters for Logistic Regression (RandomizedSearchCV):", best_params_lr_random)
print("Best Cross-Validation Accuracy for Logistic Regression (RandomizedSearchCV):", best_score_lr_random)


Best Parameters for Logistic Regression (RandomizedSearchCV): {'solver': 'liblinear', 'C': 545.5594781168514}
Best Cross-Validation Accuracy for Logistic Regression (RandomizedSearchCV): 0.7709026956813638


In [23]:
# Define parameter grid for RandomizedSearchCV for SVM
"""
param_dist_svm = {
    'C': np.logspace(-4, 4, 20),
    'gamma': ['scale', 'auto'],
    'kernel': ['linear', 'rbf']
}
"""
# Define a reduced parameter grid for RandomizedSearchCV for SVM
param_dist_svm = {
    'C': np.logspace(-2, 2, 10),  # Reduce the range
    'gamma': ['scale', 'auto'],
    'kernel': ['linear', 'rbf']
}


In [24]:

# Initialize RandomizedSearchCV for SVM
"""
random_search_svm = RandomizedSearchCV(SVC(random_state=42), param_dist_svm, cv=5, n_iter=50, scoring='accuracy', random_state=42, n_jobs=-1)
"""
# Initialize RandomizedSearchCV for SVM with fewer iterations and folds
random_search_svm = RandomizedSearchCV(
    SVC(random_state=42), 
    param_dist_svm, 
    cv=3,  
    n_iter=20,  
    scoring='accuracy', 
    random_state=42, 
    n_jobs=-1  
)

In [25]:

# Perform Randomized Search for SVM
"""
random_search_svm.fit(X_train_resampled, y_train_resampled)
"""
# Use a smaller subset of the training data for hyperparameter tuning
X_train_small, _, y_train_small, _ = train_test_split(X_train_resampled, y_train_resampled, train_size=0.2, random_state=42)

# Perform Randomized Search for SVM on the smaller subset
random_search_svm.fit(X_train_small, y_train_small)

In [26]:

# Best parameters and best score for SVM
best_params_svm_random = random_search_svm.best_params_
best_score_svm_random = random_search_svm.best_score_


print("Best Parameters for SVM (RandomizedSearchCV):", best_params_svm_random)
print("Best Cross-Validation Accuracy for SVM (RandomizedSearchCV):", best_score_svm_random)


Best Parameters for SVM (RandomizedSearchCV): {'kernel': 'rbf', 'gamma': 'scale', 'C': 0.5994842503189409}
Best Cross-Validation Accuracy for SVM (RandomizedSearchCV): 0.795153958108665


## Final Model Evaluation with Cross-Validation

We evaluate the final models on the test set using cross-validation to estimate their performance on unseen data. This step provides an unbiased estimate of the models' accuracy and generalization ability.


In [27]:
# Initialize final models with best parameters
final_model_lr_random = LogisticRegression(**best_params_lr_random, max_iter=1000, random_state=42)
final_model_svm_random = SVC(**best_params_svm_random, random_state=42)


In [28]:

# Cross-validation for Logistic Regression
cv_scores_lr = cross_val_score(final_model_lr_random, X_train_resampled, y_train_resampled, cv=5, scoring='accuracy')
print("Cross-Validation Accuracy for Logistic Regression (RandomizedSearchCV):", np.mean(cv_scores_lr))


Cross-Validation Accuracy for Logistic Regression (RandomizedSearchCV): 0.7709026956813638


In [29]:

# Cross-validation for SVM
cv_scores_svm = cross_val_score(final_model_svm_random, X_train_resampled, y_train_resampled, cv=5, scoring='accuracy')
print("Cross-Validation Accuracy for SVM (RandomizedSearchCV):", np.mean(cv_scores_svm))


Cross-Validation Accuracy for SVM (RandomizedSearchCV): 0.7873383977698966


In [30]:

# Train final models on entire training data
final_model_lr_random.fit(X_train_resampled, y_train_resampled)
final_model_svm_random.fit(X_train_resampled, y_train_resampled)


In [31]:

# Evaluate final models on test data
y_pred_final_lr_random = final_model_lr_random.predict(X_test_transformed)
y_pred_final_svm_random = final_model_svm_random.predict(X_test_transformed)


In [32]:

# Final evaluation metrics for Logistic Regression
final_accuracy_lr_random = accuracy_score(y_test, y_pred_final_lr_random)
final_precision_lr_random = precision_score(y_test, y_pred_final_lr_random)
final_recall_lr_random = recall_score(y_test, y_pred_final_lr_random)
final_f1_lr_random = f1_score(y_test, y_pred_final_lr_random)


In [33]:

# Final evaluation metrics for SVM
final_accuracy_svm_random = accuracy_score(y_test, y_pred_final_svm_random)
final_precision_svm_random = precision_score(y_test, y_pred_final_svm_random)
final_recall_svm_random = recall_score(y_test, y_pred_final_svm_random)
final_f1_svm_random = f1_score(y_test, y_pred_final_svm_random)


In [34]:

# Display final results
print("\nFinal Logistic Regression Model (RandomizedSearchCV) Results:")
print(f"Accuracy: {final_accuracy_lr_random:.4f}")
print(f"Precision: {final_precision_lr_random:.4f}")
print(f"Recall: {final_recall_lr_random:.4f}")
print(f"F1 Score: {final_f1_lr_random:.4f}")
print(f"Confusion Matrix:\n{confusion_matrix(y_test, y_pred_final_lr_random)}")

print("\nFinal SVM Model (RandomizedSearchCV) Results:")
print(f"Accuracy: {final_accuracy_svm_random:.4f}")
print(f"Precision: {final_precision_svm_random:.4f}")
print(f"Recall: {final_recall_svm_random:.4f}")
print(f"F1 Score: {final_f1_svm_random:.4f}")
print(f"Confusion Matrix:\n{confusion_matrix(y_test, y_pred_final_svm_random)}")



Final Logistic Regression Model (RandomizedSearchCV) Results:
Accuracy: 0.7551
Precision: 0.5239
Recall: 0.8231
F1 Score: 0.6403
Confusion Matrix:
[[757 279]
 [ 66 307]]

Final SVM Model (RandomizedSearchCV) Results:
Accuracy: 0.7630
Precision: 0.5367
Recall: 0.7641
F1 Score: 0.6305
Confusion Matrix:
[[790 246]
 [ 88 285]]


In [36]:
# Save the final model
joblib.dump(final_model_lr_random, 'final_logistic_regression_model.pkl')

print("Logistic Regression model saved successfully.")

Logistic Regression model saved successfully.
