<a href="https://colab.research.google.com/github/ASWANTH-J/ASWANTH-J/blob/master/telecom_churn_prediction_modelling.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
# libraries for data manipulation
import pandas as pd
import numpy as np

# libraries for data visualisation
import matplotlib.pyplot as plt
import seaborn as sns
#for multple output in one cell
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

from  sklearn.model_selection import train_test_split
from  sklearn.preprocessing   import  LabelEncoder
from  sklearn.linear_model    import  LogisticRegression
from  sklearn.metrics         import  f1_score,accuracy_score,precision_score,recall_score,confusion_matrix

In [3]:
#importing dataset
data = pd.read_csv("/content/drive/MyDrive/MSC/mini_project/telcom_customer_dataset.csv")

##**Data Preprocessing**

In [4]:
data.iloc[lambda data: data['TotalCharges'] == ' ',19] = np.nan

In [5]:
data['TotalCharges'] = data['TotalCharges'].astype(float)

In [6]:
data.head()

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,Yes,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,No,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,Yes,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,No,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,No,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


In [7]:
features = [col for col in data.columns if col not in ['customerID','Churn']]
TARGET = 'Churn'

In [8]:
num_cols = [
            'tenure',
            'MonthlyCharges',
            'TotalCharges'
]

In [9]:
cat_cols = [col for col in features if col not in num_cols]
cat_cols

['gender',
 'SeniorCitizen',
 'Partner',
 'Dependents',
 'PhoneService',
 'MultipleLines',
 'InternetService',
 'OnlineSecurity',
 'OnlineBackup',
 'DeviceProtection',
 'TechSupport',
 'StreamingTV',
 'StreamingMovies',
 'Contract',
 'PaperlessBilling',
 'PaymentMethod']

In [10]:
le=dict()
for col in cat_cols:
  le[col] = LabelEncoder()
  data[col] = le[col].fit_transform(data[col])

In [11]:
data[TARGET] = data[TARGET].map({'Yes':1,'No':0})

In [12]:
data = data.dropna()

train_test_split

In [23]:
train,test = train_test_split(data,test_size=0.2,random_state=43)

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
4073,7683-CBDKJ,1,0,1,1,14,1,0,0,2,2,0,0,2,0,0,1,2,65.45,937.60,1
156,9167-APMXZ,0,0,0,0,22,1,2,1,0,0,0,0,2,0,0,1,0,84.15,1821.95,0
6036,8008-ESFLK,0,0,1,0,53,1,0,1,2,2,2,2,2,2,1,0,2,110.50,5835.50,0
5019,3320-VEOYC,1,1,0,0,14,1,2,1,0,0,0,0,2,2,0,1,2,95.60,1273.30,0
4948,3446-QDSZF,0,0,0,0,4,1,0,0,0,0,0,0,2,0,0,0,1,55.50,227.35,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6211,4439-JMPMT,0,0,1,1,5,1,0,1,0,0,0,2,2,0,0,1,2,85.75,470.95,1
2330,1104-FEJAM,1,0,1,1,28,1,2,0,0,2,0,0,0,2,0,0,2,64.40,1802.15,0
2308,3324-OIRTO,1,0,1,1,71,1,0,1,2,2,2,0,2,2,1,1,0,104.65,7288.40,0
3398,3096-WPXBT,0,0,1,1,61,1,2,1,2,2,2,0,0,2,1,1,1,100.70,6018.65,0



##**Modelling**

In [27]:
def run_clf_kflod(clf,train,test,features,target,scaling=False):
  N_SPLITS = 5

  from sklearn.model_selection import StratifiedKFold
  from sklearn.preprocessing import StandardScaler
  oofs = np.zeros(train.shape[0])
  preds = np.zeros(test.shape[0])

  folds = StratifiedKFold(n_splits = N_SPLITS)

  for fold_,(trn_idx,val_idx) in enumerate(folds.split(train,train[target])):
    print(f'\n-------------------fold{fold_+1}----------------------')

    #train data
    X_trn,y_trn = train[features].iloc[trn_idx],train[target].iloc[trn_idx]

    #Validation data
    X_val,y_val = train[features].iloc[val_idx],train[target].iloc[val_idx]

    #Test data
    X_test = test[features]

    #Scaling data if required
    if scaling:
      scaler = StandardScaler()
      _ = scaler.fit(X_trn)

      X_trn = scaler.transform(X_trn)
      X_val = scaler.transform(X_val)
      X_test = scaler.transform(X_test)
    #Train the models
    _ = clf.fit(X_trn,y_trn)
    pred_val=clf.predict_proba(X_val)[:,1]
    pred_test = clf.predict_proba(X_test)[:,1]

    fold_score = f1_score(y_val,pred_val.round())
    print(f"f1_score for validation set of fold{fold_+1} : {fold_score}")

    oofs[val_idx] = pred_val
    preds += pred_test.round() / N_SPLITS
  oofs_score = f1_score(train[target],oofs.round())
  print(f'f1_score score of oofs : {oofs_score}')

  return oofs,preds

    

LogisticRegression

In [15]:
le = LogisticRegression(max_iter=300)
_ =le.fit(X_train,y_train)
y_pred = le.predict(X_test)
print('f1_score : '+str(f1_score(y_test,y_pred)))

NameError: ignored

In [28]:
oofs,preds = run_clf_kflod(le,train,test,features,TARGET)


-------------------fold1----------------------
f1_score for validation set of fold1 : 0.6110124333925399

-------------------fold2----------------------
f1_score for validation set of fold2 : 0.5966850828729281

-------------------fold3----------------------
f1_score for validation set of fold3 : 0.6032906764168191

-------------------fold4----------------------
f1_score for validation set of fold4 : 0.6208112874779541

-------------------fold5----------------------
f1_score for validation set of fold5 : 0.5692883895131086
f1_score score of oofs : 0.6005809731299927


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
