# Import the necessary libraries

In [49]:
from sklearn.model_selection import train_test_split
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings("ignore")
from sklearn.preprocessing import LabelEncoder

In [50]:
np.random.seed(40)

### Load the Customer Data into dataframe called 'data'

In [51]:
data = pd.read_excel("Customer_Data.xlsx",sep=',',index_col=0)

In [52]:
data = data.reset_index()

> + Reseting the indexing `Location`

### Looking for top five records

In [53]:
data.head()

Unnamed: 0,Location,Account_Seq,Code_415,Code_510,International_Plan,Voice_mail_Plan,Num_of_Voice_mail_Messages,Total_Day_Minutes,Total_Day_Calls,Total_day_Charge,...,Total_Eve_Calls,Total_Eve_Charge,Total_Night_Minutes,Total_Night_Calls_,Total_Night_Charge,Total_International_Minutes,Total_Intl_Calls,Total_Intl_Charge,Number_Customer_Service_calls_,Satisfied
0,KS,128,1,0,0,1,25,265.1,110,45.07,...,99,16.78,244.7,91,11.01,10.0,3,2.7,1,False.
1,OH,107,1,0,0,1,26,161.6,123,27.47,...,103,16.62,254.4,103,11.45,13.7,3,3.7,1,False.
2,NJ,137,1,0,0,0,0,243.4,114,41.38,...,110,10.3,162.6,104,7.32,12.2,5,3.29,0,False.
3,OH,84,0,0,1,0,0,299.4,71,50.9,...,88,5.26,196.9,89,8.86,6.6,7,1.78,2,False.
4,OK,75,1,0,1,0,0,166.7,113,28.34,...,122,12.61,186.9,121,8.41,10.1,3,2.73,3,False.


In [54]:
data.shape

(5000, 21)

> + Dataset is having `5000` records with `21` columns

## Let's look for the datatype and other information of our data

In [55]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 21 columns):
Location                          5000 non-null object
Account_Seq                       5000 non-null int64
Code_415                          5000 non-null int64
Code_510                          5000 non-null int64
International_Plan                5000 non-null int64
Voice_mail_Plan                   5000 non-null int64
Num_of_Voice_mail_Messages        5000 non-null int64
Total_Day_Minutes                 5000 non-null float64
Total_Day_Calls                   5000 non-null int64
Total_day_Charge                  5000 non-null float64
Total_Eve_Minutes                 5000 non-null float64
Total_Eve_Calls                   5000 non-null int64
Total_Eve_Charge                  5000 non-null float64
Total_Night_Minutes               5000 non-null float64
Total_Night_Calls_                5000 non-null int64
Total_Night_Charge                5000 non-null float64
Total_Internatio

> - All columns except `Location` and `Satisfied` are numerical 

## Let's convert the categorical column to numerical columns
> - We can use label encoder

In [56]:
y = data['Satisfied']
X = data.drop('Satisfied',axis=1)

> - X is having all the input feature assigned to it and target(`Satisfied`) is assign to y.

In [57]:
encoder = LabelEncoder()
X['Location'] = encoder.fit_transform(X['Location'])
y = pd.DataFrame(encoder.fit_transform(y))

## Overview of our transformed data

In [58]:
X.head()

Unnamed: 0,Location,Account_Seq,Code_415,Code_510,International_Plan,Voice_mail_Plan,Num_of_Voice_mail_Messages,Total_Day_Minutes,Total_Day_Calls,Total_day_Charge,Total_Eve_Minutes,Total_Eve_Calls,Total_Eve_Charge,Total_Night_Minutes,Total_Night_Calls_,Total_Night_Charge,Total_International_Minutes,Total_Intl_Calls,Total_Intl_Charge,Number_Customer_Service_calls_
0,16,128,1,0,0,1,25,265.1,110,45.07,197.4,99,16.78,244.7,91,11.01,10.0,3,2.7,1
1,35,107,1,0,0,1,26,161.6,123,27.47,195.5,103,16.62,254.4,103,11.45,13.7,3,3.7,1
2,31,137,1,0,0,0,0,243.4,114,41.38,121.2,110,10.3,162.6,104,7.32,12.2,5,3.29,0
3,35,84,0,0,1,0,0,299.4,71,50.9,61.9,88,5.26,196.9,89,8.86,6.6,7,1.78,2
4,36,75,1,0,1,0,0,166.7,113,28.34,148.3,122,12.61,186.9,121,8.41,10.1,3,2.73,3


In [59]:
y[0:5]

Unnamed: 0,0
0,0
1,0
2,0
3,0
4,0


## Normalizing data for better performance.

In [60]:
from sklearn.preprocessing import StandardScaler

In [61]:
sc=StandardScaler()
scaled_X = pd.DataFrame(sc.fit_transform(X))

In [62]:
np.random.seed(42)
X_train1, X_test, y_train1, y_test = train_test_split(scaled_X, y, test_size = 0.3)

In [63]:
X_train1.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3500 entries, 1840 to 860
Data columns (total 20 columns):
0     3500 non-null float64
1     3500 non-null float64
2     3500 non-null float64
3     3500 non-null float64
4     3500 non-null float64
5     3500 non-null float64
6     3500 non-null float64
7     3500 non-null float64
8     3500 non-null float64
9     3500 non-null float64
10    3500 non-null float64
11    3500 non-null float64
12    3500 non-null float64
13    3500 non-null float64
14    3500 non-null float64
15    3500 non-null float64
16    3500 non-null float64
17    3500 non-null float64
18    3500 non-null float64
19    3500 non-null float64
dtypes: float64(20)
memory usage: 574.2 KB


# Question 1
> - **a. Implement a 10-Fold cross-validation**

In [64]:
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
skf = StratifiedKFold(n_splits=10,shuffle = True)
skf.get_n_splits(X_train1, y_train1)

10

In [65]:
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression as LR
from sklearn.tree import DecisionTreeClassifier as DT
from sklearn.ensemble import RandomForestClassifier as RF
from sklearn.metrics import confusion_matrix, roc_auc_score ,roc_curve,auc
from sklearn.model_selection import ShuffleSplit
from sklearn.metrics import recall_score,precision_score,confusion_matrix,classification_report

In [66]:
def evaluation_process(X_train, y_train, n_folds, shuffled):
    
    seed = np.random.seed(42)
    st = StratifiedKFold(n_splits=n_folds,shuffle=shuffled,random_state=seed)
    n = []
    a = cross_val_score(LR(), X_train, y_train, cv=n_folds, scoring='accuracy').mean()
    n.append(a)
    
    b = cross_val_score(DT(), X_train, y_train, cv=n_folds, scoring='accuracy').mean()
    n.append(b)
    
    c = cross_val_score(RF(), X_train, y_train, cv=n_folds, scoring='accuracy').mean()
    n.append(c)
    
    best = n.index(max(n))
    models = [LR(), DT(), RF()]
    best_model = models[best]
    
    # Output should be one of the classifiers in the models list
    return best_model

In [67]:
best_model = evaluation_process(X_train1, y_train1, 10, True)

In [68]:
best_model

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators='warn', n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [74]:
def get_evaluation_test_set(model, X_test, y_test):
    
    np.random.seed(42)
    accuracy = 0
    precision = 0
    auroc = 0
    model.fit(X_test,y_test)
    accuracy = model.score(X_test,y_test)
    prediction_1 = model.predict(X_test)
    precision = precision_score(y_test,prediction_1)
    auroc = roc_auc_score(y_test,prediction_1)
    print(accuracy, precision, auroc)
    return accuracy, precision, auroc

get_evaluation_test_set(best_model,X_test, y_test) == (0.9966666666666667, 1.0, 0.9879227053140096) #Should equal true

0.9946666666666667 1.0 0.9806763285024155


False