# Import the necessary libraries

In [27]:
from sklearn.model_selection import train_test_split
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings("ignore")
from sklearn.preprocessing import LabelEncoder

In [28]:
np.random.seed(40)

### Load the Customer Data into dataframe called 'data'

In [29]:
data = pd.read_excel("Customer_Data.xlsx",sep=',',index_col=0)

In [30]:
data = data.reset_index()

> + Reseting the indexing `Location`

### Looking for top five records

In [31]:
data.head()

Unnamed: 0,Location,Account_Seq,Code_415,Code_510,International_Plan,Voice_mail_Plan,Num_of_Voice_mail_Messages,Total_Day_Minutes,Total_Day_Calls,Total_day_Charge,...,Total_Eve_Calls,Total_Eve_Charge,Total_Night_Minutes,Total_Night_Calls_,Total_Night_Charge,Total_International_Minutes,Total_Intl_Calls,Total_Intl_Charge,Number_Customer_Service_calls_,Satisfied
0,KS,128,1,0,0,1,25,265.1,110,45.07,...,99,16.78,244.7,91,11.01,10.0,3,2.7,1,False.
1,OH,107,1,0,0,1,26,161.6,123,27.47,...,103,16.62,254.4,103,11.45,13.7,3,3.7,1,False.
2,NJ,137,1,0,0,0,0,243.4,114,41.38,...,110,10.3,162.6,104,7.32,12.2,5,3.29,0,False.
3,OH,84,0,0,1,0,0,299.4,71,50.9,...,88,5.26,196.9,89,8.86,6.6,7,1.78,2,False.
4,OK,75,1,0,1,0,0,166.7,113,28.34,...,122,12.61,186.9,121,8.41,10.1,3,2.73,3,False.


In [32]:
data.shape

(5000, 21)

> + Dataset is having `5000` records with `21` columns

## Let's look for the datatype and other information of our data

In [33]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 21 columns):
Location                          5000 non-null object
Account_Seq                       5000 non-null int64
Code_415                          5000 non-null int64
Code_510                          5000 non-null int64
International_Plan                5000 non-null int64
Voice_mail_Plan                   5000 non-null int64
Num_of_Voice_mail_Messages        5000 non-null int64
Total_Day_Minutes                 5000 non-null float64
Total_Day_Calls                   5000 non-null int64
Total_day_Charge                  5000 non-null float64
Total_Eve_Minutes                 5000 non-null float64
Total_Eve_Calls                   5000 non-null int64
Total_Eve_Charge                  5000 non-null float64
Total_Night_Minutes               5000 non-null float64
Total_Night_Calls_                5000 non-null int64
Total_Night_Charge                5000 non-null float64
Total_Internatio

> - All columns except `Location` and `Satisfied` are numerical 

## Let's convert the categorical column to numerical columns
> - We can use label encoder

In [34]:
y = data['Satisfied']
X = data.drop('Satisfied',axis=1)

> - X is having all the input feature assigned to it and target(`Satisfied`) is assign to y.

In [35]:
encoder = LabelEncoder()
X['Location'] = encoder.fit_transform(X['Location'])
y = pd.DataFrame(encoder.fit_transform(y))

## Overview of our transformed data

In [36]:
X.head()

Unnamed: 0,Location,Account_Seq,Code_415,Code_510,International_Plan,Voice_mail_Plan,Num_of_Voice_mail_Messages,Total_Day_Minutes,Total_Day_Calls,Total_day_Charge,Total_Eve_Minutes,Total_Eve_Calls,Total_Eve_Charge,Total_Night_Minutes,Total_Night_Calls_,Total_Night_Charge,Total_International_Minutes,Total_Intl_Calls,Total_Intl_Charge,Number_Customer_Service_calls_
0,16,128,1,0,0,1,25,265.1,110,45.07,197.4,99,16.78,244.7,91,11.01,10.0,3,2.7,1
1,35,107,1,0,0,1,26,161.6,123,27.47,195.5,103,16.62,254.4,103,11.45,13.7,3,3.7,1
2,31,137,1,0,0,0,0,243.4,114,41.38,121.2,110,10.3,162.6,104,7.32,12.2,5,3.29,0
3,35,84,0,0,1,0,0,299.4,71,50.9,61.9,88,5.26,196.9,89,8.86,6.6,7,1.78,2
4,36,75,1,0,1,0,0,166.7,113,28.34,148.3,122,12.61,186.9,121,8.41,10.1,3,2.73,3


In [37]:
y[0:5]

Unnamed: 0,0
0,0
1,0
2,0
3,0
4,0


In [67]:
np.random.seed(42)
X_train1, X_test, y_train1, y_test = train_test_split(scaled_X, y, test_size = 0.3)

In [68]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3500 entries, 1840 to 860
Data columns (total 20 columns):
Location                          3500 non-null int32
Account_Seq                       3500 non-null int64
Code_415                          3500 non-null int64
Code_510                          3500 non-null int64
International_Plan                3500 non-null int64
Voice_mail_Plan                   3500 non-null int64
Num_of_Voice_mail_Messages        3500 non-null int64
Total_Day_Minutes                 3500 non-null float64
Total_Day_Calls                   3500 non-null int64
Total_day_Charge                  3500 non-null float64
Total_Eve_Minutes                 3500 non-null float64
Total_Eve_Calls                   3500 non-null int64
Total_Eve_Charge                  3500 non-null float64
Total_Night_Minutes               3500 non-null float64
Total_Night_Calls_                3500 non-null int64
Total_Night_Charge                3500 non-null float64
Total_Internati

## Normalizing data for better performance.

In [61]:
from sklearn.preprocessing import StandardScaler

In [66]:
sc=StandardScaler()
scaled_X = pd.DataFrame(sc.fit_transform(X))

# Question 1
> - **a. Implement a 10-Fold cross-validation**

In [47]:
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
skf = StratifiedKFold(n_splits=10,shuffle = True)
skf.get_n_splits(X_train, y_train)

10

In [49]:
from sklearn.tree import DecisionTreeClassifier      # Import Decision Tree Classifier
from sklearn.model_selection import train_test_split # Import train_test_split function
from sklearn import metrics                          # Import scikit-learn metrics module for accuracy calculation

# Create Decision Tree classifer object
clf = DecisionTreeClassifier()

# Train Decision Tree Classifer
scores = cross_val_score(clf, X_train, y_train, cv=10, scoring='accuracy')

In [50]:
print('Accuracy: ', scores.mean())

Accuracy:  0.9099999999999999


In [51]:
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, roc_auc_score ,roc_curve,auc
from sklearn.model_selection import ShuffleSplit

In [73]:
def evaluation_process(X_train, y_train, n_folds, shuffled):
    
    seed = np.random.seed(42)
    models = []
    # Create Decision Tree classifer object
    dt = DecisionTreeClassifier()
    # Train Decision Tree Classifer
    dtsc = cross_val_score(dt, X_train, y_train, cv=n_folds, scoring='accuracy').mean()
    models.append(dtsc)
    return (cross_val_score(dt, X_train, y_train, cv=n_folds, scoring='accuracy').mean())
        
    # Create Logistic Regression classifer object
    lr = LogisticRegression()
    # Train Logistic Regression Classifer
    lrsc = cross_val_score(lr, X_train, y_train, cv=n_folds, scoring='accuracy').mean()
    models.append(lrsc)
    return (cross_val_score(lr, X_train, y_train, cv=n_folds, scoring='accuracy').mean())
    
    
    # Create Random Forest Classifier object
    rf = RandomForestClassifier()
    # Train Random Forest Classifier
    rfsc = cross_val_score(rf, X_train, y_train, cv=n_folds, scoring='accuracy').mean()
    models.append(rfsc)
    return (cross_val_score(rf, X_train, y_train, cv=n_folds, scoring='accuracy').mean())
    st = StratifiedKFold(n_splits=n_folds,shuffle=shuffled,random_state=seed)
    
    dt_1 = DecisionTreeClassifier()
    dtsc_1 = []
    for (train, test), i in zip(st.split(X_train, y_train), range(10)):
    dt_1.fit(X.iloc[train], y.iloc[train])
    score_3 = roc_auc_score(y.iloc[test],dt_1.predict(X.iloc[test]))
    dtsc_1.append(score_3)
    models.append(np.mean(dtsc_1))
    return (np.mean(dtsc_1))
    
    
        # model
        lr_1 = LogisticRegression(C=2)
        lrsc_1 =[]
        for train_index,test_index in st.split(X_train, y_train):
            xtr,xvl = X_train.loc[train_index],X_train.loc[test_index]
            ytr,yvl = y_train.loc[train_index],y_train.loc[test_index]
        
            lr_1.fit(xtr,ytr)
            score_1 = roc_auc_score(yvl,lr_1.predict(xvl))
            lrsc_1.append(score_1)
        models.append(np.mean(lrsc_1))
        return (np.mean(lrsc_1))    
    
    
        # model
        rf_1 = RandomForestClassifier()
        rfsc_1 = []
        for (train, test), i in zip(st.split(X_train, y_train), range(10)):
            rf_1.fit(X.iloc[train], y.iloc[train])
            score_2 = roc_auc_score(y.iloc[test],rf_1.predict(X.iloc[test]))
            rfsc_1.append(score_2)
        models.append(np.mean(rfsc_1))
        return (np.mean(rfsc_1))    
    
    rs = ShuffleSplit(n_splits=n_folds, test_size=.30, random_state=seed)
    
        dt_2 = DecisionTreeClassifier()
        dtsc_2 = []
        for (train, test), i in zip(rs.split(X_train, y_train), range(10)):
            dt_2.fit(X.iloc[train], y.iloc[train])
            score_4 = roc_auc_score(y.iloc[test],dt_2.predict(X.iloc[test]))
            dtsc_2.append(score_4)
        models.append(np.mean(dtsc_2))
        return (np.mean(dtsc_2))
    
    
        lr_2 = LogisticRegression(C=2)
        lrsc_2 = []
        for (train, test), i in zip(rs.split(X_train, y_train), range(10)):
            lr_2.fit(X.iloc[train], y.iloc[train])
            score_5 = roc_auc_score(y.iloc[test],lr_2.predict(X.iloc[test]))
            lrsc_2.append(score_5)
        models.append(np.mean(lrsc_2))
        return (np.mean(lrsc_2))
    
    
        rf_2 = RandomForestClassifier()
        rfsc_2 = []
        for (train, test), i in zip(rs.split(X_train, y_train), range(10)):
            rf_2.fit(X.iloc[train], y.iloc[train])
            score_6 = roc_auc_score(y.iloc[test],rf_2.predict(X.iloc[test]))
            rfsc_2.append(score_6)
        models.append(np.mean(rfsc_2))
        return (np.mean(rfsc_2))
    
    best = n.index(max(n))
    models = [dt,lr,rf,dt_1,lr_1,rf_1,dt_2,lr_2,rf_2]
    best_model = models[best]
    
    return model[a]

IndentationError: expected an indented block (<ipython-input-73-c1dbf272837d>, line 31)

In [71]:
print(evaluation_process(scaled_X, y, 10, True))

2
0.950804276017104


In [59]:
def get_evaluation_test_set(model, X_test, y_test):
    
    np.random.seed(42)
    accuracy = 0
    precision = 0
    auroc = 0
    
    return accuracy, precision, auroc

get_evaluation_test_set(best_model,X_test, y_test) == (0.9966666666666667, 1.0, 0.9879227053140096) #Should equal true

NameError: name 'best_model' is not defined