# Members:
### Syed Asad Rizvi (ERP ID 25365)
### Fareed Hassan Khan (ERP ID 25367)

_____

Importing Libraries

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import roc_curve, roc_auc_score
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import MinMaxScaler
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.ensemble import StackingClassifier
from sklearn.model_selection import train_test_split, RepeatedKFold, GridSearchCV, cross_val_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from numpy import mean
from sklearn.preprocessing import LabelEncoder
from sklearn.naive_bayes import CategoricalNB, GaussianNB
from mixed_naive_bayes import MixedNB
from sklearn.model_selection import KFold
from keras.models import Sequential
from keras.layers import Dense

Importing dataset

In [None]:
# Read the raw telecom churn data; the path is relative to the working directory.
csv_path = '../Telecom_customer churn.csv'
df = pd.read_csv(csv_path)

In [None]:
# Shape of the frame exactly as loaded, before any cleaning is applied.
df.shape

Cleaning the dataset

In [None]:
# Rows missing any of these fields are discarded rather than imputed.
required_columns = ['rev_Mean', 'kid11_15', 'dualband', 'area', 'hnd_price', 'change_mou']

# Columns removed from the analysis entirely.
excluded_columns = ['avg6mou', 'avg6qty', 'avg6rev', 'prizm_social_one', 'ownrent', 'lor', 'dwlltype',
                    'adults', 'infobase', 'numbcars', 'HHstatin', 'dwllsize', 'income', 'hnd_webcap']

df = df.dropna(subset=required_columns)
# errors='ignore' keeps this cell re-runnable: on a second run the columns are
# already gone and a plain drop would raise KeyError.
df = df.drop(columns=excluded_columns, errors='ignore')

# Total number of missing values left in the frame.
df.isna().sum().sum()

In [None]:
# Shape after the incomplete rows and the excluded columns have been removed.
df.shape

One-hot Encoding

In [None]:
# Expand the non-numeric columns of df into indicator (dummy) columns.
df_onehot = pd.get_dummies(data=df)

In [None]:
# Inspect the encoded frame.
df_onehot

Fitting Model Code

In [None]:
def fit_model(model, model_name):
    """Fit ``model`` on the training split and print its ROC AUC on the test split.

    The data is not passed in: the function reads the module-level names
    ``trainX``, ``trainy``, ``testX`` and ``testy``, so the cell that creates
    the train/test split must have been run first.

    Parameters
    ----------
    model : estimator
        Any object exposing ``fit(X, y)`` and ``predict_proba(X)``.
    model_name : str
        Label printed in front of the score.
    """
    model.fit(trainX, trainy)
    # Keep only the second probability column, i.e. the score for classes_[1]
    # (presumably churn == 1 -- confirm against the label encoding).
    md_probs = model.predict_proba(testX)[:, 1]
    md_auc = roc_auc_score(testy, md_probs)
    print(model_name, " : ", md_auc)

Gradient Boosting

In [None]:
# Predictors: every encoded column except the target.
df_onehot = df_onehot.loc[:, df_onehot.columns != 'churn']
# Target as a 1-D Series. A one-column DataFrame makes scikit-learn estimators
# emit a DataConversionWarning ("column-vector y was passed") on every fit.
y = df['churn']

# 70/30 hold-out split with a fixed seed so every model sees the same rows.
trainX, testX, trainy, testy = train_test_split(df_onehot, y, test_size=0.3, random_state=2)

In [None]:
# Seeded so the printed AUC is reproducible from run to run; the label typo
# ("Graident") in the printed output is corrected as well.
gb = GradientBoostingClassifier(max_depth=5, n_estimators=200, random_state=0)
fit_model(gb, "Gradient Boosting")

In [None]:
# Logistic regression preceded by min-max scaling, bundled into one pipeline so
# the scaler is fitted together with the model inside fit_model.
logistic_steps = [
    ("scaler", MinMaxScaler()),
    ("Logistic", LogisticRegression()),
]
pipe_lg = Pipeline(steps=logistic_steps)
fit_model(pipe_lg, "Logistic")

In [None]:
# Single decision tree limited to five levels.
tree_depth = 5
dt = DecisionTreeClassifier(max_depth=tree_depth)
fit_model(model=dt, model_name="Decision Tree")

In [None]:
# Random forests are stochastic (bootstrap samples and feature subsets), so the
# seed is fixed to make the printed AUC reproducible. n_jobs=-1 builds the 1000
# trees on all available cores; it does not change the fitted model.
rf = RandomForestClassifier(max_depth=20, n_estimators=1000, random_state=0, n_jobs=-1)
fit_model(rf, "Random Forest")

In [None]:
# k-nearest neighbours (k = 500) on min-max scaled features, so that every
# feature contributes to the distance on the same [0, 1] range.
knn_steps = [
    ("scaler", MinMaxScaler()),
    ("KNN", KNeighborsClassifier(n_neighbors=500)),
]
pipe_kn = Pipeline(steps=knn_steps)
fit_model(pipe_kn, "KNN")

Naive Bayes

In [None]:
# Partition the column names by dtype: object-typed columns are treated as
# categorical, everything else as numeric.
is_object_column = df.dtypes == 'object'
numeric_columns = df.columns[~is_object_column].tolist()
# The target 'churn' is added to the categorical list so that it is passed
# through unbinned by convert_categorical.
categorical_columns = df.columns[is_object_column].tolist() + ['churn']

In [None]:
def convert_categorical(df1):
    """Return a copy of ``df1`` in which every non-categorical column is binned.

    Each column not listed in the module-level ``categorical_columns`` is cut
    into (up to) five quantile bins, the bins are label-encoded, and the codes
    are stored as strings. Columns named in ``categorical_columns`` are carried
    over unchanged and placed after the binned columns.
    """
    encoder = LabelEncoder()
    binned = pd.DataFrame()
    for name in df1:
        if name in categorical_columns:
            # Already categorical: passed through untouched below.
            continue
        # duplicates='drop' merges bins whose quantile edges coincide.
        binned[name] = pd.qcut(df1[name], 5, duplicates='drop')
        binned[name] = encoder.fit_transform(binned[name])
        binned[name] = binned[name].astype('str')

    passthrough = df1[categorical_columns]
    return pd.concat([binned, passthrough], axis=1)

 
# Build the fully categorical version of the cleaned frame and preview it.
temp_df1 = convert_categorical(df1=df)
temp_df1.head(5)

In [None]:
# Categorical naive Bayes.
# NOTE(review): fit_model trains on the module-level trainX/testX split, not on
# temp_df1 -- confirm that is intended; CategoricalNB expects non-negative,
# integer-coded categories.
nb_c = CategoricalNB(min_categories=100)
fit_model(model=nb_c, model_name="Naive Bayes Categorical")

In [None]:
# Mixed (categorical + Gaussian) naive Bayes.
# NOTE(review): 1, 2, 3 are presumably column positions within trainX that
# should be treated as categorical -- confirm they match the intended features.
mixed_categorical_positions = [1, 2, 3]
nb_mix = MixedNB(categorical_features=mixed_categorical_positions)
fit_model(model=nb_mix, model_name="Naive Bayes Mixed")

In [None]:
# Gaussian naive Bayes with default settings, scored like the other models.
nb_g = GaussianNB()
fit_model(model=nb_g, model_name="Gaussian")

Neural Network

In [None]:
# Standardise the features for the network. The scaled arrays get their own
# names so the shared trainX/testX split is not overwritten for cells that run
# (or are re-run) after this one.
scaler = StandardScaler()
trainX_scaled = scaler.fit_transform(trainX)
testX_scaled = scaler.transform(testX)

# Size the layers from the data instead of a hard-coded 187, so the cell keeps
# working if the cleaning/encoding steps change the number of columns.
n_features = trainX_scaled.shape[1]

# Two fully connected ReLU layers followed by a sigmoid output unit.
model = Sequential()
model.add(Dense(n_features, input_dim=n_features, activation='relu'))
model.add(Dense(n_features, activation='relu'))
model.add(Dense(1, activation='sigmoid'))

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model.fit(trainX_scaled, trainy, epochs=5, batch_size=10)

# evaluate returns [loss, accuracy] because 'accuracy' is the only metric.
test_loss, accuracy = model.evaluate(testX_scaled, testy)
print('Accuracy: %.2f' % (accuracy*100))

Bagging

In [None]:
# 10-fold cross-validation; the seed makes the fold assignment reproducible.
cv = RepeatedKFold(n_splits=10, n_repeats=1, random_state=1)

# The base learner is passed positionally: the keyword was renamed from
# `base_estimator` to `estimator` in newer scikit-learn releases, and the
# positional form is accepted by both.
reg_bg = BaggingClassifier(
    GradientBoostingClassifier(max_depth=5, n_estimators=200),
    n_estimators=20,
    random_state=0,
)

# Default scoring for a classifier is accuracy.
scores = cross_val_score(reg_bg, df_onehot, y, cv=cv)
score = format(mean(scores), '.4f')
print(score)

Stacking

In [None]:
# Stacked ensemble: three base learners whose predictions feed a gradient
# boosting meta-learner. The fold assignment and the stochastic base learners
# are seeded so the cross-validated score is reproducible.
cv = RepeatedKFold(n_splits=10, n_repeats=1, random_state=1)
estimators = [
    ('lr', LogisticRegression()),
    ('dt', DecisionTreeClassifier(max_depth=5, random_state=0)),
    ('rf', RandomForestClassifier(max_depth=20, n_estimators=1000, random_state=0)),
]

reg_sr = StackingClassifier(
    estimators=estimators,
    final_estimator=GradientBoostingClassifier(max_depth=5, n_estimators=200, random_state=42),
)

# Default scoring for a classifier is accuracy.
scores = cross_val_score(reg_sr, df_onehot, y, cv=cv)
score = format(mean(scores), '.4f')
print(score)

Voting Classifier

In [None]:
# Majority-vote ensemble of a decision tree, a random forest and gradient
# boosting. The fold assignment and each estimator are seeded so the
# cross-validated score is reproducible.
cv = RepeatedKFold(n_splits=10, n_repeats=1, random_state=1)
r1 = DecisionTreeClassifier(max_depth=5, random_state=0)
r2 = RandomForestClassifier(max_depth=20, n_estimators=1000, random_state=0)
r3 = GradientBoostingClassifier(max_depth=5, n_estimators=200, random_state=0)

reg_vr = VotingClassifier([('dt', r1), ('rf', r2), ('gb', r3)])

# Default scoring for a classifier is accuracy.
scores = cross_val_score(reg_vr, df_onehot, y, cv=cv)
score = format(mean(scores), '.4f')
print(score)

Filling Missing Values

In [None]:
# Impute missing values with each column's most frequent value (its mode).
# A single loop replaces the copy-pasted statements, which
#   * filled 'dwlltype' with the mode of 'dwllsize' (wrong variable reused),
#   * overwrote the target variable `y` with a mode Series, and
#   * relied on chained `df[col].fillna(..., inplace=True)`, which newer pandas
#     versions warn about and which may not write back to `df`.
float_fill_columns = ['avg6mou', 'avg6qty', 'avg6rev', 'lor', 'adults', 'income', 'numbcars']
categorical_fill_columns = ['prizm_social_one', 'hnd_webcap', 'ownrent', 'infobase',
                            'HHstatin', 'dwllsize', 'dwlltype']

for fill_col in float_fill_columns + categorical_fill_columns:
    if fill_col not in df.columns:
        # The cleaning cell drops these columns; when it has already run there
        # is nothing to impute, so skip instead of raising KeyError.
        continue
    col_modes = df[fill_col].mode()
    if col_modes.empty:
        # Column is entirely missing: no mode exists to fill with.
        continue
    df[fill_col] = df[fill_col].fillna(col_modes[0])

Grid Search

In [None]:
# Predictors and target. The target is read from df because the train/test
# split cell removes 'churn' from df_onehot, so df_onehot[['churn']] raises
# KeyError when the notebook is run top to bottom. get_dummies leaves the
# numeric 'churn' column untouched, so both frames hold the same values.
X = df_onehot.loc[:, df_onehot.columns != 'churn']
y = df['churn']

cv = RepeatedKFold(n_splits=10, n_repeats=3, random_state=1)
regRF = GradientBoostingClassifier(max_depth=5, random_state=0)

# 'bootstrap' is a random-forest option, not a GradientBoostingClassifier
# parameter; leaving it in the grid makes GridSearchCV fail with an
# "invalid parameter" error, so it is omitted.
param_grid = {
    'max_depth': [5, 10, 15],
    'max_features': [2, 3, 4],
    'min_samples_split': [8, 10, 12],
    'n_estimators': [100, 200, 300]
}

# Exhaustive search over the grid, using all cores.
grid_search = GridSearchCV(estimator = regRF, param_grid=param_grid, cv = cv, n_jobs = -1, verbose = 2)
grid_search.fit(X, y)
best_grid = grid_search.best_estimator_
print(best_grid)

Cross Validation K-Fold

In [None]:
# Repeat 10-fold cross-validation with ten different shuffling seeds and record
# the mean ROC AUC of a default logistic regression for each seed.
mean_auc_by_seed = {}
for i in range(10):
    cv = KFold(n_splits=10, shuffle=True, random_state=i)
    reg = LogisticRegression()
    scores = cross_val_score(reg, df_onehot, y, scoring='roc_auc', cv=cv)
    mean_auc_by_seed[i] = mean(scores)

# Dicts preserve insertion order, so both lists are ordered by seed.
s_no = list(mean_auc_by_seed.keys())
score_onehot = list(mean_auc_by_seed.values())

scores_df = pd.DataFrame({'S #': s_no, 'onehot': score_onehot})
scores_df.head(10)

Feature Importance

In [None]:
# Predictors and target for the feature-importance fit. The target is read from
# df because the train/test split cell removes 'churn' from df_onehot, so
# df_onehot[['churn']] raises KeyError when the notebook is run top to bottom.
X = df_onehot.loc[:, df_onehot.columns != 'churn']
y = df['churn']

In [None]:
# Fit gradient boosting on the full data set and rank the features by the
# importance the fitted model assigns to them, highest first.
clf = GradientBoostingClassifier(max_depth=5, n_estimators=200, random_state=0)
clf.fit(X, y)

importances = pd.Series(data=clf.feature_importances_, index=X.columns)
feature_scores = importances.sort_values(ascending=False)
feature_scores

## Winner Model

In [None]:
# Re-fit the best-performing configuration. Seeded so the printed AUC is
# reproducible; the label typo ("Graident") in the printed output is corrected.
gb = GradientBoostingClassifier(max_depth=5, n_estimators=200, random_state=0)
fit_model(gb, "Gradient Boosting")