In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.ensemble import ExtraTreesClassifier, RandomForestClassifier
import xgboost as xgb
import lightgbm as lgb

In [17]:
df=pd.read_csv('telecom_customer.csv')

df.head()

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


## TODO:
Preprocessing:

Perform initial data preparation by converting the 'TotalCharges' column to numeric values and filling missing values with 0.

Convert the 'Churn' column to binary values, where 'No' is mapped to 0 and 'Yes' is mapped to 1.<br>

Split the data into an 80-20 train-test split with a random state of “1”.<br>

Select these features:  
`categorical` = ['gender', 'SeniorCitizen', 'Partner', 'Dependents', 'PhoneService', 'MultipleLines', 'InternetService','OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV', 'StreamingMovies','Contract', 'PaperlessBilling', 'PaymentMethod']

`numerical` = ['tenure', 'MonthlyCharges', 'TotalCharges']<br>

In [18]:
df['TotalCharges']=df['TotalCharges'].replace(' ',np.nan).astype('float')

In [19]:
df['Churn']=df['Churn'].replace(['No','Yes'],[0,1]).astype('int')

In [20]:
X=df.drop(columns=['customerID','Churn'])
y=df['Churn']

In [21]:
df=df.fillna(0)

In [22]:
#split the training and test data set using 80-20, and a random state of 1
X_train, X_test, y_train, y_test=train_test_split(X,y,test_size=.2, random_state=1)

In [23]:
print('train data shape',X_train.shape)
print('test data shape',X_test.shape)

train data shape (5634, 19)
test data shape (1409, 19)


## Feature engineering:
The numerical features should be scaled using StandardScaler, convert the output back to a dataframe and put back the column names.

The categorical features are one-hot encoded using OneHotEncoder(set sparse_output to false), convert the output back to a dataframe and put back the column names.

Combine scaled numerical and one-hot encoded categorical features into train and test set dataframes (use pd.concat)

Use scikit learn to train a random forest and extra trees classifier, and use xgboost and lightgbm to train an extreme boosting model and a light gradient boosting model. Use random_state = 1 for training all models and evaluate on the test set. Answer the following questions:

In [24]:
numerical = ['tenure', 'MonthlyCharges', 'TotalCharges']

In [25]:
#scale numerical values for training dataset
scaler=StandardScaler()

X_numerical=X_train[numerical]
scaler.fit(X_numerical)

scaled_X_train=scaler.transform(X_numerical)

In [26]:
categorical = ['gender', 'SeniorCitizen', 'Partner', 'Dependents', 'PhoneService', 
               'MultipleLines', 'InternetService','OnlineSecurity', 'OnlineBackup', 
               'DeviceProtection', 'TechSupport', 'StreamingTV', 
               'StreamingMovies','Contract', 'PaperlessBilling', 'PaymentMethod']

In [27]:
#encode categorical values for testing dataset
encoder=OneHotEncoder(sparse=False)

X_categorical=X_train[categorical]
encoder.fit(X_categorical)

encoded_X_train=encoder.transform(X_categorical)

In [28]:
#convert the encoded and scaled dataset into a dataframe and concatenate them into one dataframe

numeric_train=pd.DataFrame(scaled_X_train,columns=X_numerical.columns)

# Create a DataFrame with column names
encoded_df = pd.DataFrame(encoded_X_train, columns=encoder.
                          get_feature_names_out(input_features=X_categorical.columns))


X_train_ready = pd.concat([numeric_train, encoded_df], axis=1)

X_train_ready.head()

Unnamed: 0,tenure,MonthlyCharges,TotalCharges,gender_Female,gender_Male,SeniorCitizen_0,SeniorCitizen_1,Partner_No,Partner_Yes,Dependents_No,...,StreamingMovies_Yes,Contract_Month-to-month,Contract_One year,Contract_Two year,PaperlessBilling_No,PaperlessBilling_Yes,PaymentMethod_Bank transfer (automatic),PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check
0,-0.825884,-1.49753,-0.892383,0.0,1.0,1.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0
1,0.395961,0.302996,0.388267,1.0,0.0,1.0,0.0,1.0,0.0,1.0,...,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
2,1.577078,0.01232,1.059525,0.0,1.0,1.0,0.0,0.0,1.0,1.0,...,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0
3,1.577078,0.686687,1.773983,0.0,1.0,1.0,0.0,0.0,1.0,0.0,...,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
4,-0.092777,0.186726,-0.104101,0.0,1.0,1.0,0.0,1.0,0.0,1.0,...,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0


In [30]:
#fill missing values with 0
X_train_ready=X_train_ready.fillna(0)

In [31]:
#perform similar operation for X_test
X_test_numerical=X_test[numerical]

scaled_X_test=scaler.transform(X_test_numerical)

In [32]:
X_test_categorical=X_test[categorical]

encoded_X_test=encoder.transform(X_test_categorical)

In [33]:
#convert the encoded and scaled dataset into a dataframe and concatenate them into one dataframe

numeric_test=pd.DataFrame(scaled_X_test,columns=X_test_numerical.columns)

# Create a DataFrame with column names
encoded_test_df = pd.DataFrame(encoded_X_test, columns=encoder.
                          get_feature_names_out(input_features=X_test_categorical.columns))


X_test_ready = pd.concat([numeric_test, encoded_test_df], axis=1)

X_test_ready.head()

Unnamed: 0,tenure,MonthlyCharges,TotalCharges,gender_Female,gender_Male,SeniorCitizen_0,SeniorCitizen_1,Partner_No,Partner_Yes,Dependents_No,...,StreamingMovies_Yes,Contract_Month-to-month,Contract_One year,Contract_Two year,PaperlessBilling_No,PaperlessBilling_Yes,PaymentMethod_Bank transfer (automatic),PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check
0,0.355233,0.500655,0.458958,1.0,0.0,1.0,0.0,1.0,0.0,1.0,...,1.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0
1,1.373437,1.249767,1.84944,1.0,0.0,0.0,1.0,1.0,0.0,1.0,...,1.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0
2,-0.825884,-0.657063,-0.775006,1.0,0.0,1.0,0.0,1.0,0.0,1.0,...,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0
3,-1.110981,-0.471031,-0.89609,1.0,0.0,1.0,0.0,1.0,0.0,1.0,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
4,-0.90734,0.037235,-0.715125,1.0,0.0,1.0,0.0,0.0,1.0,0.0,...,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0


In [35]:
#fill missing values with 0
X_test_ready=X_test_ready.fillna(0)

In [36]:
#Train a random forest

random_forest=RandomForestClassifier(random_state=1)

#fit the model
random_forest.fit(X_train_ready,y_train)

RandomForestClassifier(random_state=1)

In [37]:
#evaluate random forest on training dataset
random_forest.score(X_train_ready,y_train)

0.9980475683351083

### Q1: What is the accuracy on the test set using the random forest classifier?

In [38]:
#test accuracy
random_forest.score(X_test_ready,y_test)

0.7927608232789212

In [39]:
#train an ExtraTreesClassifier

extra_tree=ExtraTreesClassifier(random_state=1)

#fit the model
extra_tree.fit(X_train_ready,y_train)

ExtraTreesClassifier(random_state=1)

In [40]:
#evaluate on training dataset
extra_tree.score(X_train_ready,y_train)

0.9980475683351083

In [41]:
#evaluate on test dataset
extra_tree.score(X_test_ready,y_test)

0.7721788502484032

In [45]:
#train an xgbost model
xgb=xgb.XGBClassifier(random_state=1)
xgb.fit(X_train_ready, y_train)

XGBClassifier(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=None, early_stopping_rounds=None,
              enable_categorical=False, eval_metric=None, feature_types=None,
              gamma=None, gpu_id=None, grow_policy=None, importance_type=None,
              interaction_constraints=None, learning_rate=None, max_bin=None,
              max_cat_threshold=None, max_cat_to_onehot=None,
              max_delta_step=None, max_depth=None, max_leaves=None,
              min_child_weight=None, missing=nan, monotone_constraints=None,
              n_estimators=100, n_jobs=None, num_parallel_tree=None,
              predictor=None, random_state=1, ...)

In [46]:
#evaluate on training dataset
xgb.score(X_train_ready,y_train)

0.938054668086617

### Q2: What is the accuracy on the test set using the xgboost classifier? 

In [47]:
#evaluate on test dataset
xgb.score(X_test_ready,y_test)

0.8005677785663591

In [48]:
#train an lgb model
lgb=lgb.LGBMClassifier(random_state=1)
lgb.fit(X_train_ready, y_train)

LGBMClassifier(random_state=1)

In [49]:
#evaluate on training dataset
lgb.score(X_train_ready,y_train)

0.8768193113241036

### Q3: What is the accuracy on the test set using the LGBM classifier?

In [50]:
#evaluate on test dataset
lgb.score(X_test_ready,y_test)

0.8069552874378992

### Q4

To improve the Extra Trees Classifier, you will use the following parameters (number of estimators, minimum number of samples, minimum number of samples for leaf node and the number of features to consider when looking for the best split) for the hyperparameter grid needed to run a Randomized Cross Validation Search (RandomizedSearchCV). 

n_estimators = [50, 100, 300, 500, 1000]

min_samples_split = [2, 3, 5, 7, 9]

min_samples_leaf = [1, 2, 4, 6, 8]

max_features = ['auto', 'sqrt', 'log2', None] 

hyperparameter_grid = {'n_estimators': n_estimators,

                       'min_samples_leaf': min_samples_leaf,

                       'min_samples_split': min_samples_split,

                       'max_features': max_features}

Using the ExtraTreesClassifier as your estimator with cv=5, n_iter=10, scoring = 'accuracy', n_jobs = -1, verbose = 1 and random_state = 1. What are the best hyperparameters from the randomized search CV?

Options
N_estimators = 1000 , min_samples_split = 2 , min_samples_leaf = 8, max_features = None

N_estimators = 500 , min_samples_split = 2 , min_samples_leaf = 8, max_features = ‘log2‘

N_estimators = 300 , min_samples_split = 5 , min_samples_leaf = 6, max_features = ‘auto’

N_estimators = 1000 , min_samples_split = 9 , min_samples_leaf = 8, max_features = None

In [66]:
from sklearn.model_selection import RandomizedSearchCV

# Define the hyperparameter grid
n_estimators = [50, 100, 300, 500, 1000]
min_samples_split = [2, 3, 5, 7, 9]
min_samples_leaf = [1, 2, 4, 6, 8]
max_features = ['auto', 'sqrt', 'log2', None]

hyperparameter_grid = {
    'n_estimators': n_estimators,
    'min_samples_leaf': min_samples_leaf,
    'min_samples_split': min_samples_split,
    'max_features': max_features
}

# Create an Extra Trees Classifier
extra_trees = ExtraTreesClassifier(random_state=1)

# Create a RandomizedSearchCV object
random_search = RandomizedSearchCV(
    extra_trees,  # Estimator
    hyperparameter_grid,  # Hyperparameter grid
    cv=5,  # Cross-validation with 5 folds
    n_iter=10,  # Number of parameter settings to sample
    scoring='accuracy',  # Evaluation metric
    n_jobs=-1,  # Use all available CPU cores
    verbose=1,  # Show progress
    random_state=1  # Random seed for reproducibility
)

# Fit the RandomizedSearchCV on your data
random_search.fit(X_train_ready, y_train)

# Get the best hyperparameters
best_hyperparameters = random_search.best_params_


Fitting 5 folds for each of 10 candidates, totalling 50 fits


In [67]:
print(best_hyperparameters)

{'n_estimators': 1000, 'min_samples_split': 9, 'min_samples_leaf': 8, 'max_features': 'sqrt'}


### Q5: 
Train a new ExtraTreesClassifier Model with the new Hyperparameters from the RandomizedSearchCV 
(with random_state = 1). Is the accuracy of the new optimal model higher or lower than the initial ExtraTreesClassifier model with no hyperparameter tuning?

In [68]:
extra_tree=ExtraTreesClassifier(random_state=1,
                               n_estimators=100,
                               min_samples_split=9,
                               min_samples_leaf=8,
                               max_features='sqrt')

In [69]:
extra_tree.fit(X_train_ready,y_train)

ExtraTreesClassifier(max_features='sqrt', min_samples_leaf=8,
                     min_samples_split=9, random_state=1)

In [70]:
#score on the training dataset
extra_tree.score(X_train_ready,y_train)

0.8384806531771388

In [71]:
#score on the test dataset
extra_tree.score(X_test_ready,y_test)

0.8041163946061036

In [72]:
#there was an improvement on the test dataset. The model is no longer overfitting on the training dataset

### Q6:
Find the feature importance using the optimal ExtraTreesClassifier model. Which features are the two most important respectively?

In [76]:
pd.Series(extra_tree.feature_importances_,index=X_train_ready.columns).sort_values(ascending=False).head(5)

Contract_Month-to-month        0.146798
OnlineSecurity_No              0.087070
tenure                         0.085165
Contract_Two year              0.064490
InternetService_Fiber optic    0.060689
dtype: float64