In [2]:
import pandas as pd                                                   # to load csv file and data manipulation 
from sklearn.model_selection import train_test_split                  # to split data for training and testiing
from sklearn.linear_model import LogisticRegression                   # model for classificiation
from sklearn.metrics import accuracy_score, classification_report     # performance metrices for classification, logistic regression
from sklearn.preprocessing import StandardScaler                      # helps to scale all the feature value to similar range
from sklearn.model_selection import GridSearchCV                      # helps in training with different hyperparameter, hyperparameter tuning
import joblib


In [3]:
df = pd.read_csv('Telco-Customer-Churn.csv') #load customer-churn dataset
df.drop('customerID', axis=1, inplace=True) #drop customerID, customerID not useful in prediction
df['TotalCharges'] = pd.to_numeric(df['TotalCharges'], errors='coerce') #convert object type to int, coerce changes empty spaces to NaN
df['TotalCharges'].fillna(df['TotalCharges'].median(), inplace=True) #handle missing value directly in data
df['Churn'] = df['Churn'].map({'Yes':1,'No':0})
category_cols = df.select_dtypes(include=['object']).columns.to_list() #get column with categories
df = pd.get_dummies(df, columns=category_cols, drop_first=True) #one-hot encoding, convert categorical data to numerical format


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['TotalCharges'].fillna(df['TotalCharges'].median(), inplace=True) #handle missing value directly in data


In [4]:
X = df.drop('Churn', axis=1) #all features except churn, training data
y = df['Churn'] #churn feature, target value

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42) #80% train, 20% test
scaler = StandardScaler() #scales the datavalue to similar scale
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)


In [5]:
# logistic regression

model = LogisticRegression(max_iter=1000) #choosing the model
model.fit(X_train, y_train) # train the model

y_pred = model.predict(X_test) # predict the target value using moddel parameter

print("Accuracy: ", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred)) #recall, precison, f1-score, support


Accuracy:  0.8197303051809794

Classification Report:
               precision    recall  f1-score   support

           0       0.86      0.90      0.88      1036
           1       0.68      0.60      0.64       373

    accuracy                           0.82      1409
   macro avg       0.77      0.75      0.76      1409
weighted avg       0.81      0.82      0.82      1409



In [6]:
param_grid = {              # different hyperparameter to train model on
    'C':[0.01, 0.1, 1, 10], # c=1/lamda, lamda is regularization strength, greater c, less lambda, more flexible
    'penalty':['l1','l2'], # penalties to big or unnecessary model coefficients, l1=lasso, l2=ridge
    'solver':['liblinear'], #finds best weights
}

grid_search = GridSearchCV(
    estimator=LogisticRegression(max_iter=1000), #model to use with how many iteration model may use for optimized weights
    param_grid = param_grid,  
    cv = 5,                                      # cross-validation
    scoring = 'f1',
    verbose=1,
    n_jobs=-1                                    # how many cpu cores to use, -1 means all
)

grid_search.fit(X_train, y_train)
print("Best Parameters:", grid_search.best_params_)
print("Best F1 Score:", grid_search.best_score_)


Fitting 5 folds for each of 8 candidates, totalling 40 fits
Best Parameters: {'C': 10, 'penalty': 'l2', 'solver': 'liblinear'}
Best F1 Score: 0.5928882039527489


In [8]:
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))
print(df.info())
print(X.columns.to_list())

Accuracy: 0.8190205819730305
              precision    recall  f1-score   support

           0       0.86      0.90      0.88      1036
           1       0.68      0.59      0.63       373

    accuracy                           0.82      1409
   macro avg       0.77      0.75      0.76      1409
weighted avg       0.81      0.82      0.81      1409

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 31 columns):
 #   Column                                 Non-Null Count  Dtype  
---  ------                                 --------------  -----  
 0   SeniorCitizen                          7043 non-null   int64  
 1   tenure                                 7043 non-null   int64  
 2   MonthlyCharges                         7043 non-null   float64
 3   TotalCharges                           7043 non-null   float64
 4   Churn                                  7043 non-null   int64  
 5   gender_Male                            7043 non-null   b

In [8]:
joblib.dump(best_model, 'churn_model.pkl')
joblib.dump(scaler, 'scaler.pkl')



['scaler.pkl']