# In [None]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import KFold,StratifiedKFold
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, recall_score, precision_score,f1_score,roc_auc_score

import warnings
warnings.filterwarnings('ignore')

# Load the dataset from the CSV file in the working directory.
df = pd.read_csv('train.csv')

# Preprocessing:
#   * 'Sex' is label-encoded into integer codes,
#   * missing 'Age' values are replaced with the column median,
#   * 'Pclass' is one-hot encoded into 'Pclass_<value>' columns.
le = LabelEncoder()
df['Sex'] = le.fit_transform(df['Sex'])
age_median = df['Age'].median()
df['Age'] = df['Age'].fillna(age_median)
df = pd.get_dummies(df, columns=['Pclass'])
df.columns  # notebook leftover: evaluates the column index without using it

# Columns fed to the model and the label column to predict.
target = 'Survived'
features = ['Pclass_1', 'Pclass_2', 'Pclass_3', 'Age', 'Fare', 'Sex']

X = df[features]
y = df[target]

# Stratified K-fold cross-validation with a fixed seed.
# shuffle=True is required for random_state to have any effect; recent
# scikit-learn releases raise ValueError when random_state is passed while
# shuffle is left at its default of False.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)

for i, (train_index, test_index) in enumerate(skf.split(X, y)):
    print("Fold:", i)

    # split() yields positional row numbers, so select with .iloc; label-based
    # lookup (.loc / y[...]) is only correct on a default RangeIndex.
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    # Fit the scaler on the training fold only, then apply the same
    # transformation to the held-out fold.
    sc = MinMaxScaler()
    X_train_scaled = sc.fit_transform(X_train)
    X_test_scaled = sc.transform(X_test)

    # Decision tree with max depth and random state fixed.
    clf = DecisionTreeClassifier(max_depth=3, random_state=12)
    model = clf.fit(X_train_scaled, y_train)

    pred = model.predict(X_test_scaled)
    # recall_score expects (y_true, y_pred); swapping the arguments would
    # report precision instead of recall.
    print("recall : ", recall_score(y_test, pred))



# Same process as above, expressed with a Pipeline and GridSearchCV.

from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import make_pipeline
from sklearn.pipeline import Pipeline

X = df[features]
y = df[target]

# Single-point grid: max_depth=3, matching the manual loop above.
# The 'model__' prefix routes the parameter to the pipeline step named 'model'.
parameters = {'model__max_depth': [3]}

# Same random state (12) as the classifier used in the manual loop.
clf_tree = DecisionTreeClassifier(random_state=12)

# Scaling followed by the tree, bundled as one estimator.
pipeline = Pipeline(steps=[
    ('scaling', MinMaxScaler()),
    ('model', clf_tree),
])

# Grid search scored on recall, reusing the stratified K-fold splitter (skf)
# defined earlier in the file.
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=parameters,
    scoring="recall",
    cv=skf,
    return_train_score=True,
)
grid_search.fit(X, y)

print("\n\nGrid Search Results\nScores for 5 splits : ")
# One test-score entry per split: split0_test_score ... split4_test_score.
for split_no in range(5):
    print(grid_search.cv_results_['split%d_test_score' % split_no])