# <a href="https://colab.research.google.com/github/GR-Tang/IBM-AI0403-Team3/blob/main/TelcoChurnV10.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
#importing the libraries needed for fiddling with the dataset
import pixiedust
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib.pylab import rcParams
%matplotlib inline
sns.set_theme(style="darkgrid")

In [None]:
#importing the necessary libraries for later
#!pip install imblearn
from imblearn.over_sampling import SMOTE
import xgboost as xgb
from xgboost import plot_tree
from xgboost import plot_importance
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_squared_error
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_curve
from sklearn.metrics import roc_auc_score
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import f1_score
from sklearn.metrics import auc
from scipy.stats import sem
import graphviz
from sklearn.ensemble import GradientBoostingClassifier 
from lightgbm import LGBMClassifier
import lightgbm as lgbm

In [None]:
#download the Telco churn CSV from the project repository and load it
#(later cells treat the result as a pandas DataFrame)
TELCO_DATA_URL = 'https://raw.githubusercontent.com/GR-Tang/IBM-AI0403-Team3/main/telco-data.csv'
raw_df = pixiedust.sampleData(TELCO_DATA_URL)

In [None]:
#sanity check of the loaded dataset: print its summary (columns, dtypes, non-null counts)
raw_df.info()

In [None]:
#preview the first rows to see the raw column names and values before any encoding
raw_df.head()

In [None]:
#can skip, for visualisation
#display(raw_df)

In [None]:
#can skip, for visualisation
#raw_df.groupby('Churn')[['MonthlyContract', 'OneYearContract', 'TwoYearContract']].sum()
#raw_df.groupby('Churn')[['PhoneService', 'MultipleLines', 'DeviceProtection']].sum()
#raw_df.groupby('Churn')[['InternetService', 'FiberOptic', 'DSL']].sum()
#raw_df.groupby('Churn')[['OnlineSecurity', 'OnlineBackup', 'TechSupport']].sum()
#raw_df.groupby('Churn')[['StreamingTV', 'StreamingMovies']].sum()
#raw_df.groupby('Churn')[['PayByBankTransfer', 'PayByCC', 'PayByElectronicCheque', 'PayByMailedCheque']].sum()

In [None]:
#recode the raw categorical columns, then one-hot encode them
#SeniorCitizen arrives as 1/0; turn it into Yes/No so it is encoded like the other categoricals
raw_df['SeniorCitizen'] = raw_df['SeniorCitizen'].map({1: 'Yes', 0: 'No'})

#'No phone service' is folded into 'No'
raw_df['MultipleLines'] = raw_df['MultipleLines'].map({'Yes': 'Yes', 'No': 'No', 'No phone service': 'No'})

#'No internet service' is folded into 'No' for every internet add-on column
internet_addon_map = {'Yes': 'Yes', 'No': 'No', 'No internet service': 'No'}
for addon_col in ['OnlineSecurity', 'OnlineBackup', 'DeviceProtection',
                  'TechSupport', 'StreamingTV', 'StreamingMovies']:
    raw_df[addon_col] = raw_df[addon_col].map(internet_addon_map)

#one 0/1 (int64) dummy column per category value
categorical_cols = [
    'gender', 'Partner', 'SeniorCitizen', 'Dependents', 'PhoneService', 'MultipleLines',
    'InternetService', 'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport',
    'StreamingTV', 'StreamingMovies', 'Contract', 'PaperlessBilling', 'PaymentMethod',
]
raw_df = pd.get_dummies(raw_df, columns=categorical_cols, dtype='int64')

#keeping Churn as a single binary column since it is the target
raw_df['Churn'] = raw_df['Churn'].map({'Yes': 1, 'No': 0})
#single-space TotalCharges entries become '0' before the column is converted to float
raw_df['TotalCharges'] = raw_df['TotalCharges'].replace({' ': '0'}).astype(float)
#customerID is removed from the frame
raw_df = raw_df.drop(columns=['customerID'])

In [None]:
#bucket tenure, MonthlyCharges and TotalCharges into integer bins: value / bin width, rounded up
for bin_col, (source_col, bin_width) in {
    'Bintenure': ('tenure', 6),
    'BinMonthlyCharges': ('MonthlyCharges', 10),
    'BinTotalCharges': ('TotalCharges', 500),
}.items():
    raw_df[bin_col] = np.ceil(raw_df[source_col] / bin_width).astype("int64")

In [None]:
#move the target column to position 0 so later cells can split features/target by position
churn_df = raw_df.pop('Churn')
raw_df.insert(0, 'Churn', churn_df)

In [None]:
#build two reduced feature sets from the encoded frame
#set 1: without the gender dummies and the raw continuous columns (their binned versions stay)
first_drop_cols = ['gender_Male', 'gender_Female', 'MonthlyCharges', 'TotalCharges', 'tenure']
imbaldrop2bin_df = raw_df.drop(columns=first_drop_cols)

#set 2: set 1 minus the phone/streaming/multiple-lines dummies and the TotalCharges bin
second_drop_cols = [
    'PhoneService_Yes', 'PhoneService_No',
    'StreamingTV_Yes', 'StreamingTV_No',
    'StreamingMovies_Yes', 'StreamingMovies_No',
    'MultipleLines_Yes', 'MultipleLines_No',
    'BinTotalCharges',
]
imbaldrop7bin_df = imbaldrop2bin_df.drop(columns=second_drop_cols)

In [None]:
#preview the first reduced feature set
imbaldrop2bin_df.head()

In [None]:
#noticed target data is imbalanced, we can either undersample (randomly reduce negatives to match the number of positives)
#or oversample (generate synthetic positives to bring them closer to the number of negatives)
#however, almost all articles point to balancing after splitting into test/train sets.
#Series.value_counts() replaces the deprecated top-level pd.value_counts()
raw_df['Churn'].value_counts()

In [None]:
#selecting which reduced feature set to use from here on
#(despite its name, raw_df refers to the reduced frame after this line, not to the loaded CSV)
raw_df=imbaldrop2bin_df

In [None]:
#hold out 10% of the rows as test data, stratified on Churn; the remaining 90% is for training
train_df, test_df = train_test_split(
    raw_df,
    test_size=0.1,
    stratify=raw_df['Churn'],
    random_state=55,
)

#report the shape of both splits
print("Size of the training dataset = ", train_df.shape)
print("Size of the testing dataset = ", test_df.shape)

#display the first training rows as the cell output
print("\n\nSample of the training dataset \n")
train_df.head()

In [None]:
#class counts of the training split (Series.value_counts() replaces the deprecated pd.value_counts())
train_df['Churn'].value_counts()

In [None]:
#bar chart of the Churn class counts in the training split, before any resampling
rcParams['figure.figsize'] = (6, 5)
imbalance_ax = sns.countplot(data=train_df, x='Churn')
imbalance_ax.set_title('Imbalanced Churns')
plt.show()

#undersampling NOTE - Choose the one below or this, do not use both
#shuffle the dataset (frac=1 returns all the rows, in random order)
shuffled_df = train_df.sample(frac=1,random_state=555)

#pull out all the churn positives
churnPos_df = shuffled_df.loc[shuffled_df['Churn'] == 1]

#randomly pick as many churn negatives (majority) as there are positives;
#derived from the data instead of a hard-coded 1682 so it stays correct if the split changes
churnNeg_df = shuffled_df.loc[shuffled_df['Churn'] == 0].sample(n = len(churnPos_df), random_state = 55)

#joining them back again
train_df = pd.concat([churnNeg_df, churnPos_df])


In [None]:
#oversampling NOTE - Choose the one above or this, do not use both
#SMOTE generates synthetic minority-class rows until minority/majority reaches the target ratio
SMOTE_TARGET_RATIO = 0.6667
sm = SMOTE(sampling_strategy=SMOTE_TARGET_RATIO, random_state=55)

#number of synthetic rows SMOTE would have to create; when it is not positive the data is already
#at/above the target ratio (e.g. undersampling was applied) and SMOTE would raise instead of resampling
churn_counts = train_df['Churn'].value_counts()
n_synthetic = int(churn_counts.max() * SMOTE_TARGET_RATIO - churn_counts.min())

if n_synthetic > 0:
    #fit_resample is the current name of the former fit_sample, which no longer exists in imbalanced-learn
    oversam_train_X, oversam_train_Y = sm.fit_resample(train_df.drop('Churn', axis=1), train_df['Churn'])
    #put Churn back in column 0, in front of the features
    train_df = pd.concat([pd.DataFrame(oversam_train_Y), pd.DataFrame(oversam_train_X)], axis=1)
else:
    print('Training data is already at or above the SMOTE target ratio, skipping oversampling')

In [None]:
#class counts of the training data after resampling (Series.value_counts() replaces the deprecated pd.value_counts())
train_df['Churn'].value_counts()

In [None]:
#bar chart of the Churn class counts in the training data after resampling
rcParams['figure.figsize'] = (6, 5)
balanced_ax = sns.countplot(data=train_df, x='Churn')
balanced_ax.set_title('Balanced Churns')
plt.show()

In [None]:
#split both frames by position: column 0 is the target (Churn), every later column is a feature
train_X = train_df.iloc[:, 1:]
train_Y = train_df.iloc[:, 0]
test_X = test_df.iloc[:, 1:]
test_Y = test_df.iloc[:, 0]

In [None]:
#baseline XGBoost classifier; the two parameters are only passed to suppress messages, not to tune the model
model1 = xgb.XGBClassifier(use_label_encoder=False, verbosity=0)

#fitting the model to the training dataset (the fitted model is also displayed as the cell output)
model1.fit(train_X, train_Y)

In [None]:
def assess_model(model, test_X, test_Y, folds, cv_X=None, cv_Y=None):
    """Print hold-out and cross-validation scores for a fitted classifier and plot diagnostics.

    Parameters
    ----------
    model : fitted estimator providing ``predict``, ``predict_proba`` and
        ``feature_importances_``.
    test_X, test_Y : hold-out features and true labels used for the test-set scores.
    folds : passed as ``cv`` to ``cross_val_score`` (e.g. 3 for three folds).
    cv_X, cv_Y : optional features and labels used for the cross-validation; ``cv_X.columns``
        also labels the feature-importance plot, so it must be a DataFrame. When omitted,
        the notebook-level ``train_X`` / ``train_Y`` are used, which is what this function
        previously read implicitly.

    Returns
    -------
    None. Scores are printed; the confusion matrix, ROC / precision-recall curves and
    feature importances are shown as matplotlib figures.
    """
    #fall back to the notebook-level training split so existing 4-argument calls keep working
    if cv_X is None:
        cv_X = train_X
    if cv_Y is None:
        cv_Y = train_Y

    #hold-out predictions: hard labels for accuracy/F1, probability of class 1 for the curves
    model_preds = model.predict(test_X)
    model_probs = model.predict_proba(test_X)[:, 1]

    accuracy = accuracy_score(test_Y, model_preds)
    model_auc = roc_auc_score(test_Y, model_probs)
    model_fpr, model_tpr, _ = roc_curve(test_Y, model_probs)
    model_precision, model_recall, _ = precision_recall_curve(test_Y, model_probs)
    model_f1 = f1_score(test_Y, model_preds)

    #cross-validated ROC AUC on the training data
    score = cross_val_score(model, cv_X, cv_Y, cv=folds, scoring='roc_auc')

    print('Model: %s\n' % (model))
    print((folds),'Folds Cross Validation ROC AUC Scores: ', (score))
    print('Mean ROC AUC score: {0:.3f} (+/-{1:.3f})'.format(np.mean(score), sem(score)))
    print('\nTest Set score:')
    print('Accuracy: %.2f' % (accuracy * 100.0))
    print('ROC AUC=%.3f' % (model_auc))
    print('F1=%.3f' % (model_f1))

    #plotting confusion matrix (rows = actual class, columns = predicted class)
    rcParams['figure.figsize'] = 6,5
    fig, ax = plt.subplots()
    cm = confusion_matrix(test_Y, model_preds)
    sns.heatmap(cm, annot=True, fmt="d", linewidths=.5, ax = ax)
    ax.set_title('Confusion Matrix')
    ax.set_xlabel("Predicted class")
    ax.set_ylabel("Actual class")
    plt.show()

    #plot the ROC curve and the precision/recall curve on shared axes, hence the two-line axis labels
    rcParams['figure.figsize'] = 10,8
    plt.plot(model_fpr, model_tpr, marker='.', label='ROC')
    plt.plot(model_recall, model_precision, marker='.', label='Recall/Precision')
    plt.xlabel('Recall\nFalse Positive Rate')
    plt.ylabel('Precision\nTrue Positive Rate')
    plt.legend()
    plt.show()

    #plot feature importances, largest on top; at most the 38 largest are drawn
    #(presumably chosen to cover every feature of the feature sets used here)
    rcParams['figure.figsize'] = 8,8
    feat_importances = pd.Series(model.feature_importances_, index=cv_X.columns)
    feat_importances.nlargest(38).plot(kind='barh').invert_yaxis()
    plt.show()
 

#calculate root mean squared error (np.sqrt of the MSE); probably not meaningful for binary classification, kept for reference
#MSE = np.sqrt(mean_squared_error(test_Y, xgb1_pred_Y))
#print("XGBoost1 MSE: %f" % (MSE))

In [None]:
#visualising a decision tree of model1 out of curiosity, drawn left-to-right (rankdir='LR')
#the large figure size is set so the tree nodes stay readable
rcParams['figure.figsize'] = 30,40
xgb.plot_tree(model1, rankdir='LR')
plt.show()

In [None]:
#assess the baseline XGBoost model (model1) on the test set, with 3-fold cross-validation
assess_model(model1,test_X,test_Y,3)

In [None]:
#hyper-parameter search for XGBoost
#this takes very long to run: every combination of the values below is tried
#(5 x 5 x 5 x 5 = 625 candidates, each scored on 3 folds)
#using more cores was considered but left out, for portability

model2 = xgb.XGBClassifier(use_label_encoder=False, verbosity=0)

param_grid = {
    'eta': [0.1, 0.2, 0.3, 0.4, 0.5],
    'max_depth': [3, 4, 5, 6, 7],
    'gamma': [0, 3, 6, 9, 12],
    'subsample': [0.1, 0.25, 0.5, 0.75, 1],
}

#stratified, shuffled 3-fold split with a fixed seed; candidates are ranked by ROC AUC
kfold = StratifiedKFold(n_splits=3, shuffle=True, random_state=55)
grid_search = GridSearchCV(model2, param_grid, scoring='roc_auc', cv=kfold)
grid_result = grid_search.fit(train_X, train_Y)

# summarize results: best candidate first, then mean (std) of the fold scores for every candidate
print("Best ROC AUC: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
cv_results = grid_result.cv_results_
means, stds, params = (cv_results[key] for key in ('mean_test_score', 'std_test_score', 'params'))
for mean, stdev, param in zip(means, stds, params):
    print("ROC AUC %f (%f) with: %r" % (mean, stdev, param))

In [None]:
#refit XGBoost with the best parameters found by the grid search
#grid_result is reassigned by the later grid-search cells, so it must still hold the XGBoost search here
best_xgb_params = dict(grid_result.best_params_)
model3 = xgb.XGBClassifier(use_label_encoder=False, verbosity=0, **best_xgb_params)

#train on the training dataset
model3.fit(train_X, train_Y)

#score the tuned model the same way as the baseline
assess_model(model3, test_X, test_Y, 3)

In [None]:
#selecting the alternative model (LightGBM's LGBMClassifier) and fitting it with train data, using default params
model4 = LGBMClassifier()
model4.fit(train_X, train_Y)

In [None]:
#assess the default-parameter LGBM model (model4) to get a baseline score before tuning
assess_model(model4,test_X,test_Y,3)

In [None]:
#hyper-parameter search for the LGBM model over a few parameters
#this takes very long: every combination of the values below is tried
#(5 x 2 x 5 x 5 = 250 candidates, each scored on 3 folds)
#params for which the default turned out better: 'boosting_type':['gbdt', 'rf'],'learning_rate':[0.025, 0.050, 0.075, 0.1, 0.2]

model5 = LGBMClassifier(verbosity=-1)

param_grid = {
    'learning_rate': [0.025, 0.05, 0.1, 0.2, 0.3],
    'extra_trees': [True, False],
    'max_bin': [10, 55, 155, 255, 280],
    'max_delta_step': [-1, 10, 20, 30, 50],
}

#stratified, shuffled 3-fold split with a fixed seed; candidates are ranked by ROC AUC
kfold = StratifiedKFold(n_splits=3, shuffle=True, random_state=55)
grid_search = GridSearchCV(model5, param_grid, scoring='roc_auc', cv=kfold)
grid_result = grid_search.fit(train_X, train_Y)

# summarize results: best candidate first, then mean (std) of the fold scores for every candidate
print("Best ROC AUC: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
cv_results = grid_result.cv_results_
means, stds, params = (cv_results[key] for key in ('mean_test_score', 'std_test_score', 'params'))
for mean, stdev, param in zip(means, stds, params):
    print("ROC AUC %f (%f) with: %r" % (mean, stdev, param))

In [None]:
#refit the LGBM model with the best parameters from the grid search
#(grid_result must still hold the LGBM search when this cell runs; later cells reassign it)
model6 = LGBMClassifier(verbosity=-1, **grid_result.best_params_)
model6.fit(train_X, train_Y)

In [None]:
#assess the LGBM model refitted with the grid-searched parameters (model6)
assess_model(model6,test_X,test_Y,3)

In [None]:
#third model: GradientBoostingClassifier with default parameters, fitted on the training data
model7 = GradientBoostingClassifier()
model7.fit(train_X, train_Y)

In [None]:
#assess the default-parameter GradientBoostingClassifier (model7) to get a baseline score
assess_model(model7,test_X,test_Y,3)

In [None]:
#hyper-parameter search for the GradientBoostingClassifier over a few parameters
#this takes very long: every combination of the values below is tried
#(5 x 5 x 5 x 5 = 625 candidates, each scored on 3 folds)

model8 = GradientBoostingClassifier()

param_grid = {
    'learning_rate': [0.01, 0.05, 0.1, 0.2, 0.3],
    'n_estimators': [80, 90, 100, 110, 120],
    'subsample': [0.01, 0.25, 0.5, 0.75, 1.0],
    'max_depth': [1, 2, 3, 5, 7],
}

#stratified, shuffled 3-fold split with a fixed seed; candidates are ranked by ROC AUC
kfold = StratifiedKFold(n_splits=3, shuffle=True, random_state=55)
grid_search = GridSearchCV(model8, param_grid, scoring='roc_auc', cv=kfold)
grid_result = grid_search.fit(train_X, train_Y)

# summarize results: best candidate first, then mean (std) of the fold scores for every candidate
print("Best ROC AUC: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
cv_results = grid_result.cv_results_
means, stds, params = (cv_results[key] for key in ('mean_test_score', 'std_test_score', 'params'))
for mean, stdev, param in zip(means, stds, params):
    print("ROC AUC %f (%f) with: %r" % (mean, stdev, param))

In [None]:
#refit the GradientBoostingClassifier with the best grid-search parameters, then assess it
#grid_result must still hold the GradientBoostingClassifier search when this cell runs
best_gb_params = dict(grid_result.best_params_)
model9 = GradientBoostingClassifier(**best_gb_params)
model9.fit(train_X, train_Y)
assess_model(model9, test_X, test_Y, 3)

In [None]:
import joblib
import pickle

#pickling the tuned LGBM model (model6) to disk
f='model.pkl'
with open(f,'wb') as file:
    pickle.dump(model6, file)

#read it back with pickle as well: the file is written by pickle.dump, not joblib.dump
#only unpickle files you created yourself - unpickling can execute arbitrary code
with open(f,'rb') as file:
    loaded_model = pickle.load(file)

In [None]:
#test the reloaded model by predicting one hand-made row of 37 feature values
#(the values must follow the column order of the training features)
ChurnPred = loaded_model.predict([[1,0,1,0,1,0,0,1,1,0,1,0,0,0,1,0,1,1,0,1,0,1,0,1,0,1,0,0,0,1,0,0,0,1,1,6,1]])
#predict returns one entry per input row; format that single prediction instead of the whole array
print('Churn prediction = %d\n(1=Yes, 0=No)' % (ChurnPred[0]))

In [None]:
from google.cloud import storage
import os

In [None]:
#point GOOGLE_APPLICATION_CREDENTIALS at the local JSON file (presumably a service-account key - confirm),
#then create the Cloud Storage client
GCP_KEY_FILE = "./IMVAI0403-DS-Project-7521954d761a.json"
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = GCP_KEY_FILE

storage_client = storage.Client()

In [None]:
#reuse the bucket when it already exists, so re-running the notebook does not fail on create_bucket
BUCKET_NAME = 'telco-churn-bucket'
bucket = storage_client.lookup_bucket(BUCKET_NAME)
if bucket is None:
    bucket = storage_client.create_bucket(BUCKET_NAME)

In [None]:
#list the buckets visible to this client
#the loop variable deliberately has its own name so it does not overwrite `bucket`, which the upload uses
for existing_bucket in storage_client.list_buckets():
    print(existing_bucket)

In [None]:
#upload the local pickled model file to the bucket as object 'model.pkl'
blob = bucket.blob('model.pkl')
blob.upload_from_filename('./model.pkl')

#Prep for deployment on IBM Cloud
#IMPORTANT - READ BELOW:
#IMPORTANT - ALL CODE BELOW IS FOR TECH DEMONSTRATION ONLY, DO NOT RUN. THESE CELLS WILL BE CONVERTED TO MARKDOWN IN FUTURE VERSIONS.
#IMPORTANT - READ ABOVE
#!pip install ibm-watson-machine-learning
from ibm_watson_machine_learning import APIClient

#creating the login credentials and the API client for IBM Cloud
#NOTE(review): the API key below is a placeholder; presumably the real key should be supplied outside the notebook - confirm
# @hidden_cell
api_key = "<removed>"
wml_credentials = {
    "apikey": api_key,
    "url": 'https://us-south.ml.cloud.ibm.com'
}
client = APIClient(wml_credentials)

#getting the list of spaces available on my cloud account
client.spaces.list()

#setting the space this client will be working on (hard-coded space ID)
client.set.default_space("163085a0-83fa-4fed-8c4e-07e486aa743c")

#verifying my own system environment (Python version)
import sys
sys.version

#checking my xgboost package version
xgb.__version__

#getting a list of software specifications available on IBM cloud
#note that compatibility is limited, possibly because this is still a free account
#free stuff is good stuff, at least for learning purposes
client.software_specifications.list()

#Storing the trained model in the cloud space together with the training data
#NOTE(review): model1 is created with xgb.XGBClassifier while TYPE is 'scikit-learn_0.22' - confirm this is the intended model type
metadata = {
    client.repository.ModelMetaNames.NAME: 'Telco Churn Prediction Model',
    client.repository.ModelMetaNames.TYPE: 'scikit-learn_0.22',
    client.repository.ModelMetaNames.SOFTWARE_SPEC_UID: "154010fa-5b3b-4ac1-82af-4d5ee5abbc85"
}
published_model = client.repository.store_model(
    model=model1,
    meta_props=metadata,
    training_data=train_X,
    training_target=train_Y)

#listing the stored model(s) in my repository
models_details = client.repository.list_models()

#selecting the stored model by its (hard-coded) UID and deploying it online
#note: `metadata` is reused here for the deployment configuration
metadata = {
    client.deployments.ConfigurationMetaNames.NAME: "Deployment of Telco Churn Prediction Model",
    client.deployments.ConfigurationMetaNames.ONLINE: {}
}

created_deployment = client.deployments.create("8597546e-fac9-4cc4-99e0-ae5646f586cc", meta_props=metadata)

#listing the instances of deployed models
client.deployments.list()

#sending one row of values to the deployed model (by hard-coded deployment ID) for a prediction
#NOTE(review): this row has 26 values while the local test prediction uses 37 - presumably from an earlier feature set, confirm
scoring_payload = {"input_data": [{"values": [[0,0,0,0,3,1,1,1,0,0,1,1,1,1,1,0,10,50,1,1,0,0,0,1,0,0]]}]}
predictions = client.deployments.score("7c6da0f0-ed51-4c0d-8434-0594b3024ca7", scoring_payload)