In [1]:
import pandas as pd
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report
from sklearn.feature_selection import RFE

# Load the data
data = pd.read_csv('data1')

# Encode categorical variables
data_encoded = pd.get_dummies(data, drop_first=True)


In [2]:
# 1. Data Preparation
X = data_encoded.drop(columns='Churn_Yes')  # Features
y = data_encoded['Churn_Yes']  # Target (Churn column encoded as 0/1)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [3]:
# Model dictionary with increased max_iter for Logistic Regression
models = {
    'Logistic Regression': LogisticRegression(max_iter=5000),  # Start with 500, increase if needed
    'Decision Tree': DecisionTreeClassifier(),
    'Random Forest': RandomForestClassifier()
}

# Dictionary to store the evaluation metrics
model_scores = {}

for name, model in models.items():
    # Train the model
    model.fit(X_train, y_train)
    
    # Make predictions
    y_pred = model.predict(X_test)
    
    # Evaluate the model
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    
    # Store the results
    model_scores[name] = {
        'Accuracy': accuracy,
        'Precision': precision,
        'Recall': recall,
        'F1 Score': f1
    }

# Display model performance
for model_name, scores in model_scores.items():
    print(f"Model: {model_name}")
    for metric, score in scores.items():
        print(f"{metric}: {score:.4f}")
    print("\n")

Model: Logistic Regression
Accuracy: 0.8247
Precision: 0.7006
Recall: 0.5898
F1 Score: 0.6405


Model: Decision Tree
Accuracy: 0.7750
Precision: 0.5875
Recall: 0.5040
F1 Score: 0.5426


Model: Random Forest
Accuracy: 0.7921
Precision: 0.6818
Recall: 0.4021
F1 Score: 0.5059




In [4]:
# 3. Feature Selection using Recursive Feature Elimination (RFE)
# Using Decision Tree for faster RFE and increased step size
selector = RFE(DecisionTreeClassifier(), n_features_to_select=10, step=10)  # Increased step to 5
selector = selector.fit(X_train, y_train)

# Get the selected features
selected_features = X.columns[selector.support_]
print(f"Selected Features: {selected_features.tolist()}")


Selected Features: ['SeniorCitizen', 'tenure', 'MonthlyCharges', 'gender_Male', 'Partner_Yes', 'Dependents_Yes', 'InternetService_Fiber optic', 'OnlineBackup_Yes', 'PaperlessBilling_Yes', 'PaymentMethod_Electronic check']


In [5]:
# 4. Hyperparameter Tuning for the Best Model (e.g., Random Forest)
# Use RandomizedSearchCV for quicker tuning
param_grid = {
    'n_estimators': [50, 100, 150],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5]
}

# RandomizedSearchCV for faster hyperparameter tuning
random_search = RandomizedSearchCV(RandomForestClassifier(), param_grid, cv=5, scoring='f1', n_iter=10, n_jobs=-1)
random_search.fit(X_train, y_train)

# Best parameters from Random Search
print(f"Best Parameters: {random_search.best_params_}")

# Evaluate the best model
best_model = random_search.best_estimator_
y_pred_best = best_model.predict(X_test)

print("\nClassification Report for Best Model:")
print(classification_report(y_test, y_pred_best))

# Model performance for the best model
best_model_scores = {
    'Accuracy': accuracy_score(y_test, y_pred_best),
    'Precision': precision_score(y_test, y_pred_best),
    'Recall': recall_score(y_test, y_pred_best),
    'F1 Score': f1_score(y_test, y_pred_best)
}

print(f"Best Model Performance:\n{best_model_scores}")

Best Parameters: {'n_estimators': 100, 'min_samples_split': 2, 'max_depth': None}

Classification Report for Best Model:
              precision    recall  f1-score   support

       False       0.82      0.94      0.87      1036
        True       0.70      0.42      0.52       373

    accuracy                           0.80      1409
   macro avg       0.76      0.68      0.70      1409
weighted avg       0.79      0.80      0.78      1409

Best Model Performance:
{'Accuracy': 0.7984386089425124, 'Precision': 0.6995515695067265, 'Recall': 0.41823056300268097, 'F1 Score': 0.5234899328859061}
