In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

In [None]:
# Load the raw dataset; the path is relative to the notebook's working directory.
churn = pd.read_csv('churn.csv')
# Preview the first rows to check that the file parsed as expected.
churn.head()

In [None]:
churn.info()

In [None]:
churn['State'].value_counts(ascending = False)

In [None]:
churn['State'].unique()

In [None]:
# Codes of the 15 most frequent states, most common first.
top_15 = churn['State'].value_counts(ascending=False).index[:15].tolist()
top_15

In [None]:
# Add one 0/1 indicator column per frequent state, named after the bare state code.
for label in top_15:
    churn[label] = (churn['State'] == label).astype(int)

# Show the original column next to its indicators to eyeball the encoding.
churn[['State', *top_15]].head(30)

In [None]:
def one_hot_top_15(df, variable, top_15_labels):
    """Add a 0/1 indicator column to ``df`` for each label in ``top_15_labels``.

    Each new column is named ``<variable>_<label>`` and holds 1 where
    ``df[variable]`` equals that label and 0 elsewhere.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to extend; it is modified in place.
    variable : str
        Name of the column in ``df`` to encode.
    top_15_labels : iterable of str
        Labels that each get their own indicator column.

    Returns
    -------
    None
    """
    for label in top_15_labels:
        # Compare against the frame that was passed in, not a notebook-level global,
        # so the helper works for any DataFrame.
        df[variable+'_'+label] = np.where(df[variable] == label, 1, 0)
        

one_hot_top_15(churn, 'State', top_15)

In [None]:
churn.head()

In [None]:
churn.columns

In [None]:
# Remove the bare state-code indicator columns and the original 'State' column;
# the prefixed 'State_<code>' indicators stay. The list is derived from top_15 so
# it always matches the columns that were actually created for this dataset.
churn.drop(columns=top_15 + ['State'], inplace=True)

In [None]:
# churn.drop_duplicates(inplace = True)

In [None]:
churn.shape

In [None]:
churn.head()

In [None]:
churn.columns

In [None]:
# using label encoder on State column of churn dataset

In [None]:
#churn['State'] = churn['State'].astype('category') # change in category datatype
#churn.info()

In [None]:
#churn['State'] = churn['State'].cat.codes

In [None]:
churn.head()

In [None]:
# using one hot encoder on Area Code column of churn dataset

In [None]:
churn['Area Code'].value_counts()

In [None]:
# One-hot encode 'Area Code', dropping the first level to avoid a redundant column.
# dtype=int keeps the indicators as 0/1 integers like the State_* columns; without it
# recent pandas versions emit bool columns, and a frame mixing bool with numeric
# columns converts to an object array when .values is taken.
churn = pd.get_dummies(churn, columns=['Area Code'], drop_first = True, dtype = int)

In [None]:
churn.head()

In [None]:
churn.drop(['Phone'], axis = 1, inplace = True)

In [None]:
# Using find-and-replace on the 'Int\'l Plan', 'VMail Plan' and 'Churn?' columns of the churn dataset

In [None]:
churn['Int\'l Plan'].value_counts() # Count the rows in each category of the 'Int\'l Plan' column

In [None]:
churn['VMail Plan'].value_counts() # Count the rows in each category of the 'VMail Plan' column

In [None]:
churn['Churn?'].value_counts() # Count the rows in each category of the 'Churn?' column

In [None]:
# Per-column lookup tables that turn the textual labels into 0/1 codes.
replace_cols = {
    'Int\'l Plan': {'no': 0, 'yes': 1},
    'VMail Plan': {'no': 0, 'yes': 1},
    'Churn?': {'False.': 0, 'True.': 1},
}

In [None]:
# Apply the per-column label-to-code mappings defined in replace_cols.
churn = churn.replace(replace_cols)

In [None]:
churn.head()

In [None]:
churn.describe()

In [None]:
# Columns kept for the correlation and distribution plots: every usage and plan
# column, the area-code indicators and the target; the State_* indicators are left out.
plot_columns = [
    'Account Length', 'Area Code_415', 'Area Code_510',
    'Int\'l Plan', 'VMail Plan', 'VMail Message',
    'Day Mins', 'Day Calls', 'Day Charge',
    'Eve Mins', 'Eve Calls', 'Eve Charge',
    'Night Mins', 'Night Calls', 'Night Charge',
    'Intl Mins', 'Intl Calls', 'Intl Charge',
    'CustServ Calls', 'Churn?',
]
churn_cols = churn[plot_columns]

In [None]:
# Draw the pairwise correlation matrix of the selected columns as an annotated heatmap.
f,ax = plt.subplots(figsize = (18,18))
# fmt='.1f' prints each coefficient rounded to one decimal place inside its cell.
sns.heatmap(churn_cols.corr(), annot = True, fmt = '.1f', ax = ax, linewidths = 0.1)

In [None]:
# churn.corr()

In [None]:
churn.columns

In [None]:
# Histogram of every column in churn_cols (the same selection used for the heatmap).
churn_cols.hist(bins=30, figsize=(25, 20))

In [None]:
churn.shape

In [None]:
churn.columns

In [None]:
churn.head()

In [None]:
# Model inputs: account/usage columns, then the State_* indicators, then the
# area-code indicators. The order here fixes the column order of real_x.
feature_columns = [
    'Account Length', 'Int\'l Plan', 'VMail Plan', 'VMail Message',
    'Day Mins', 'Day Calls', 'Day Charge',
    'Eve Mins', 'Eve Calls', 'Eve Charge',
    'Night Mins', 'Night Calls', 'Night Charge',
    'Intl Mins', 'Intl Calls', 'Intl Charge',
    'CustServ Calls',
    'State_WV', 'State_MN', 'State_NY', 'State_AL', 'State_OR',
    'State_WI', 'State_OH', 'State_VA', 'State_WY', 'State_CT',
    'State_VT', 'State_ID', 'State_MI', 'State_UT', 'State_TX',
    'Area Code_415', 'Area Code_510',
]
real_x = churn[feature_columns].values
# Target vector: the 'Churn?' column as a NumPy array.
real_y = churn['Churn?'].values

In [None]:
# real_x = churn.iloc[:,0:18].values
# real_y = churn.iloc[:,19].values

In [None]:
# Hold out 30% of the rows for testing; the fixed random_state makes the split repeatable.
training_x, testing_x, training_y, testing_y = train_test_split(real_x, real_y, test_size = 0.30, random_state = 23)

In [None]:
training_x

In [None]:
# feature scaling of training_x and testing_x

In [None]:
# Standardize the features. The scaler learns its mean/variance from the training
# split only; the test split is transformed with those same statistics so that both
# splits share one scale and no information from the test set leaks into preprocessing.
stnd_scaler = StandardScaler()
training_x = stnd_scaler.fit_transform(training_x)
testing_x = stnd_scaler.transform(testing_x)

In [None]:
# stnd_scaler = StandardScaler()
# stan_scalar_training_x = stnd_scaler.fit_transform(training_x)
# stan_scalar_testing_x = stnd_scaler.fit_transform(testing_x)

In [None]:
# PCA

In [None]:
# pca = PCA(n_components = 2)

In [None]:
# training_x = pca.fit_transform(stan_scalar_training_x)

In [None]:
# testing_x = pca.fit_transform(stan_scalar_testing_x)

In [None]:
training_x

In [None]:
testing_x

# Logistic Regression

In [None]:
classifier_log_reg = LogisticRegression()

In [None]:
classifier_log_reg.fit(training_x, training_y)

In [None]:
classifier_log_reg.score(training_x, training_y)

In [None]:
y_predict = classifier_log_reg.predict(testing_x)
y_predict

In [None]:
testing_y

In [None]:
classifier_log_reg.score(testing_x, testing_y)

In [None]:
con_mat = confusion_matrix(testing_y ,y_predict)

In [None]:
sns.heatmap(con_mat, annot= True, fmt = '')

In [None]:
# cross-validation

In [None]:
# 10-fold cross-validated accuracy for logistic regression. real_x is unscaled, so the
# classifier is wrapped in a pipeline that standardizes each fold using only that
# fold's training part, matching the scaling applied to the model fitted earlier.
cross_val_score(make_pipeline(StandardScaler(), classifier_log_reg) , real_x , real_y , cv = 10 ).mean()

# KNN

In [None]:
from sklearn.neighbors import KNeighborsClassifier

In [None]:
classifier_knn = KNeighborsClassifier(n_neighbors = 7, metric = 'minkowski')

In [None]:
classifier_knn.fit(training_x, training_y)

In [None]:
classifier_knn.score(training_x, training_y)

In [None]:
classifier_knn.score(testing_x, testing_y)

In [None]:
pred = classifier_knn.predict(testing_x)

In [None]:
con_mat = confusion_matrix(testing_y , pred)

In [None]:
sns.heatmap(con_mat, annot= True, fmt = '')

In [None]:
# cross validation 

In [None]:
# Candidate neighbour counts to compare.
k_range = list(range(3,26))

# Mean 10-fold accuracy (in percent) for each k, in the same order as k_range.
k_score = []

for i in k_range:
    classifier_knn = KNeighborsClassifier(n_neighbors = i)
    # KNN is distance based and real_x is unscaled, so standardize inside each fold;
    # otherwise the columns with the largest numeric range dominate the distances.
    scaled_knn = make_pipeline(StandardScaler(), classifier_knn)
    cvs = cross_val_score(scaled_knn , real_x , real_y , cv  = 10 )
    k_score.append(cvs.mean()*100)

plt.plot(k_range, k_score, 'o--')
plt.xlabel('value of k')
plt.ylabel('score')


In [None]:
# Grid Search CV

In [None]:
param_grid = dict(n_neighbors = k_range)

In [None]:
# Search over the n_neighbors values in param_grid with 10-fold CV, scored by accuracy.
# classifier_knn is whatever instance the preceding cells last assigned; the grid
# supplies the n_neighbors value for each candidate.
grid_search = GridSearchCV(estimator = classifier_knn, param_grid = param_grid, cv = 10, scoring = 'accuracy')
# Fit on the (already standardized) training split only.
grid_search.fit(training_x, training_y)

In [None]:
grid_search.best_score_

In [None]:
grid_search.best_params_

In [None]:
grid_search.best_estimator_

# SVM

In [None]:
from sklearn.svm import SVC

In [None]:
classifier_svc = SVC(random_state=23)
classifier_svc.fit(training_x,training_y)

In [None]:
classifier_svc.score(training_x, training_y)

In [None]:
classifier_svc.score(testing_x, testing_y)

In [None]:
predict = classifier_svc.predict(testing_x)

In [None]:
con_mat = confusion_matrix(testing_y , predict)

In [None]:
sns.heatmap(con_mat, annot= True, fmt = '')

In [None]:
# Cross Validation 

In [None]:
# 10-fold cross-validated accuracy for the SVM. real_x is unscaled, so the classifier
# is wrapped in a pipeline that standardizes each fold using only that fold's training
# part, matching the scaling applied to the model fitted earlier.
cross_val_score(make_pipeline(StandardScaler(), classifier_svc) , real_x , real_y , cv  = 10 ).mean()

In [None]:
# Grid Search CV

In [None]:
# RBF-kernel search space: regularization strength C crossed with kernel width gamma.
# (The key must be upper-case 'C'; earlier drafts of this grid used lower-case 'c'.)
parameters = {
    'C': [0.1, 1, 10, 100, 1000],
    'gamma': [1, 0.1, 0.01, 0.001, 0.0001],
    'kernel': ['rbf'],
}

# refit=True retrains the best combination on the full training split after the search.
grid_search = GridSearchCV(
    estimator=classifier_svc,
    param_grid=parameters,
    refit=True,
)
grid_search.fit(training_x, training_y)

In [None]:
grid_search.best_score_

# Naive Bayes

In [None]:
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import BernoulliNB

In [None]:
gaussian_classifier = GaussianNB()

In [None]:
gaussian_classifier.fit(training_x, training_y)

In [None]:
gaussian_classifier.score(training_x , training_y)

In [None]:
gaussian_classifier.score(testing_x , testing_y)

In [None]:
predict = gaussian_classifier.predict(testing_x)

In [None]:
con_mat = confusion_matrix(testing_y , predict)

In [None]:
sns.heatmap(con_mat, annot= True, fmt = '')

In [None]:
cross_val_score(gaussian_classifier , real_x , real_y , cv  = 10 ).mean()

# Decision Tree

In [None]:
from sklearn.tree import DecisionTreeClassifier

In [None]:
dec_tree = DecisionTreeClassifier(random_state = 23)
dec_tree.fit(training_x, training_y)

In [None]:
dec_tree.score(training_x , training_y)

In [None]:
dec_tree.score(testing_x , testing_y)

In [None]:
predict = dec_tree.predict(testing_x)

In [None]:
confusion_mat = confusion_matrix(testing_y, predict)

In [None]:
# Plot the decision tree's confusion matrix (confusion_mat); con_mat still holds the
# matrix computed for the Naive Bayes model.
sns.heatmap(confusion_mat, annot= True, fmt = '')

In [None]:
cross_val_score(dec_tree , real_x , real_y , cv  = 10 ).mean()