In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

In [2]:
# Load the churn dataset. The path is absolute, so adjust it when running
# outside the environment this notebook was written in.
churn_csv_path = '/content/Telco-Customer-Churn.csv'
df = pd.read_csv(churn_csv_path)

In [3]:
# Preview the first five rows of the loaded frame.
df.head(n=5)

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


In [4]:
# Preprocessing
# TotalCharges: coerce to numeric; entries that cannot be parsed become NaN
# and are then replaced with 0.
df['TotalCharges'] = pd.to_numeric(df['TotalCharges'], errors='coerce').fillna(0)

# Churn: encode the label as 0/1. The identity entries (0 -> 0, 1 -> 1) keep
# this cell safe to re-run: with only the 'No'/'Yes' keys, a second execution
# would map every already-encoded label to NaN.
df['Churn'] = df['Churn'].map({'No': 0, 'Yes': 1, 0: 0, 1: 1})

# Surface unexpected or missing labels here instead of deep inside model.fit.
assert df['Churn'].notna().all(), "Churn contains values other than 'No'/'Yes' (or 0/1)"

In [5]:
# Splitting the data
# Target column first, then every remaining column as predictors.
y = df['Churn']
X = df.drop(columns='Churn')

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=1,
)

In [6]:
# Feature engineering
# Columns handled by the one-hot encoder.
categorical_features = [
    'gender', 'SeniorCitizen', 'Partner', 'Dependents',
    'PhoneService', 'MultipleLines', 'InternetService', 'OnlineSecurity',
    'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV',
    'StreamingMovies', 'Contract', 'PaperlessBilling', 'PaymentMethod',
]

# Columns handled by the standard scaler.
numerical_features = [
    'tenure',
    'MonthlyCharges',
    'TotalCharges',
]

In [7]:
# Scaling numerical features
# The scaler learns its statistics from the training split only and is then
# applied unchanged to both splits.
scaler = StandardScaler().fit(X_train[numerical_features])

# Each frame is rebuilt from the transformed array, so it carries a fresh
# default index rather than the index of X_train / X_test.
X_train_scaled = pd.DataFrame(
    data=scaler.transform(X_train[numerical_features]),
    columns=numerical_features,
)
X_test_scaled = pd.DataFrame(
    data=scaler.transform(X_test[numerical_features]),
    columns=numerical_features,
)

In [8]:
# One-hot encoding categorical features
# The encoder keeps its default (sparse) output and the result is densified
# with .toarray(). The `sparse=False` keyword used previously was renamed to
# `sparse_output` in scikit-learn 1.2 and removed in 1.4, so this form runs on
# old and new releases alike.
encoder = OneHotEncoder()

# Categories are learned from the training split only.
encoder.fit(X_train[categorical_features])

# Resolve the generated column names once and share them between both frames.
encoded_columns = encoder.get_feature_names_out(categorical_features)

X_train_encoded = pd.DataFrame(
    encoder.transform(X_train[categorical_features]).toarray(),
    columns=encoded_columns,
)
X_test_encoded = pd.DataFrame(
    encoder.transform(X_test[categorical_features]).toarray(),
    columns=encoded_columns,
)



In [9]:
# Combining scaled and encoded features
# Scaled numeric columns come first, followed by the one-hot columns.
X_train_prepared = pd.concat((X_train_scaled, X_train_encoded), axis='columns')
X_test_prepared = pd.concat((X_test_scaled, X_test_encoded), axis='columns')


In [10]:
# Training the models
# Every estimator uses its default settings with the same fixed seed.
models = {
    label: estimator_cls(random_state=1)
    for label, estimator_cls in [
        ('RandomForest', RandomForestClassifier),
        ('ExtraTrees', ExtraTreesClassifier),
        ('XGBoost', XGBClassifier),
        ('LightGBM', LGBMClassifier),
    ]
}

In [11]:
# Fit the models and evaluate
# Each model is trained on the prepared training features and scored on the
# prepared test features.
for name, model in models.items():
    model.fit(X_train_prepared, y_train)
    test_accuracy = model.score(X_test_prepared, y_test)
    print(f"{name} Test Accuracy: {test_accuracy}")


RandomForest Test Accuracy: 0.7913413768630234
ExtraTrees Test Accuracy: 0.7672107877927609
XGBoost Test Accuracy: 0.7934705464868701
[LightGBM] [Info] Number of positive: 1521, number of negative: 4113
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001465 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 670
[LightGBM] [Info] Number of data points in the train set: 5634, number of used features: 46
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.269968 -> initscore=-0.994785
[LightGBM] [Info] Start training from score -0.994785
LightGBM Test Accuracy: 0.8034066713981547


In [12]:
# To improve the Extra Trees Classifier, the following parameters form the
# hyperparameter grid for a Randomized Cross Validation Search
# (RandomizedSearchCV): number of estimators, minimum number of samples to
# split a node, minimum number of samples for a leaf node, and the number of
# features to consider when looking for the best split.
#
# The candidate list for the number of estimators is commented out in this cell:
# n_estimators = [50, 100, 300, 500, 1000]
#
# NOTE(review): 'auto' in max_features was deprecated in scikit-learn 1.1 and
# removed in 1.3 (for this estimator it meant the same as 'sqrt'); newer
# releases reject it - confirm the installed version before searching this grid.

min_samples_split = [5, 10, 15, 100]  # minimum samples required to split a node
min_samples_leaf = [1, 2, 5, 10]  # minimum samples required at a leaf node
max_features = ['auto', 'sqrt', 'log2', None]  # features considered per split

In [14]:
# Train a new ExtraTreesClassifier with the hyperparameters from the
# RandomizedSearchCV (random_state = 1) and check whether its accuracy is
# higher or lower than that of the initial ExtraTreesClassifier trained
# without hyperparameter tuning.
# NOTE(review): the search that produced these values is not among the visible
# cells - confirm they match its best_params_.

# New ExtraTreesClassifier with hyperparameters
# max_features='sqrt' replaces 'auto': for this estimator the two select the
# same number of features, but 'auto' was deprecated in scikit-learn 1.1 and
# removed in 1.3, where it raises an error.
new_model = ExtraTreesClassifier(
    min_samples_split=15,
    min_samples_leaf=5,
    max_features='sqrt',
    random_state=1,
)
new_model.fit(X_train_prepared, y_train)
new_accuracy = new_model.score(X_test_prepared, y_test)

# Comparison with initial model (the untuned estimator fitted in the loop over `models`)
initial_accuracy = models['ExtraTrees'].score(X_test_prepared, y_test)

print(f"Initial ExtraTreesClassifier Accuracy: {initial_accuracy}")
print(f"New ExtraTreesClassifier Accuracy: {new_accuracy}")

# Check if new accuracy is higher or lower; a tie is reported as a tie instead
# of falling through to the "lower" message.
if new_accuracy > initial_accuracy:
    print("The new model with hyperparameter tuning has higher accuracy.")
elif new_accuracy < initial_accuracy:
    print("The new model with hyperparameter tuning has lower accuracy.")
else:
    print("The new model with hyperparameter tuning has the same accuracy.")


  warn(


Initial ExtraTreesClassifier Accuracy: 0.7672107877927609
New ExtraTreesClassifier Accuracy: 0.8062455642299503
The new model with hyperparameter tuning has higher accuracy.
