# Importing Libraries

In [1]:
from sklearn import tree
from sklearn.preprocessing import OrdinalEncoder
import pandas as pd
import numpy as np
import re
import csv

# Modifying Data Input

In [2]:
# Read the training set (path is relative to the current working directory).
trainingData = pd.read_csv("../Data/train.csv")
# Pass every Cabin / Embarked entry through str() so both columns hold text only.
trainingData = trainingData.assign(
    Cabin=lambda frame: frame["Cabin"].apply(str),
    Embarked=lambda frame: frame["Embarked"].apply(str),
)

# Right-closed bins of width 15 covering (0, 120].
AgeBins = pd.IntervalIndex.from_tuples(
    [(lower, lower + 15) for lower in range(0, 120, 15)]
)
# Right-closed bins of width 50000 covering (0, 350000].
TicketBins = pd.IntervalIndex.from_tuples(
    [(lower, lower + 50000) for lower in range(0, 350000, 50000)]
)
# First bin starts at -1 so that a value of exactly 0 falls inside it;
# the remaining bins are width 50 up to 550.
CostBins = pd.IntervalIndex.from_tuples(
    [(-1, 50)] + [(lower, lower + 50) for lower in range(50, 550, 50)]
)

# Each column is binned with pd.cut; values that land in no bin (including
# missing ones) come back as NaN and are replaced by the sentinel interval
# (-2, -1], which first has to be registered as an extra category.
trainingData["AgeGroup"] = (
    pd.cut(trainingData['Age'], bins=AgeBins)
    .cat.add_categories(pd.Interval(-2,-1))
    .fillna(pd.Interval(-2,-1))
)
# Tickets that are not purely numeric are coerced to NaN before binning.
trainingData['TicketGroup'] = (
    pd.cut(pd.to_numeric(trainingData['Ticket'], errors="coerce"), bins=TicketBins)
    .cat.add_categories(pd.Interval(-2,-1))
    .fillna(pd.Interval(-2,-1))
)
trainingData['CostGroup'] = (
    pd.cut(trainingData['Fare'], bins=CostBins)
    .cat.add_categories(pd.Interval(-2,-1))
    .fillna(pd.Interval(-2,-1))
)

def nameClass(row):
    """Return the title found in ``row["Name"]``.

    The name is searched for "Mr.", "Mrs." and "Miss." in that order and the
    label of the first match is returned.

    Args:
        row: Mapping-like object (e.g. a DataFrame row) whose "Name" entry is
            a string.

    Returns:
        "Mr.", "Mrs.", "Miss.", or "No Title" when none of them occurs.
    """
    # Raw strings: ``\.`` must reach the regex engine as an escaped dot; in a
    # normal string literal it is an invalid escape sequence and triggers a
    # warning on current Python versions.
    titlePatterns = (
        (r"Mr\.", "Mr."),
        (r"Mrs\.", "Mrs."),
        (r"Miss\.", "Miss."),
    )
    name = row["Name"]
    for pattern, title in titlePatterns:
        if re.search(pattern, name):
            return title
    return "No Title"
        
# Derive the title from each row's name, then discard the raw columns that
# have been replaced by derived ones (Title, AgeGroup, TicketGroup, CostGroup).
trainingData["Title"] = trainingData.apply(nameClass, axis=1)
trainingData.drop(columns=["Name", "Age", "Ticket", "Fare"], inplace=True)

# Columns fed to the model.
features = [
    "Pclass",
    "Sex",
    "SibSp",
    "Parch",
    "Cabin",
    "Embarked",
    "AgeGroup",
    "TicketGroup",
    "CostGroup",
    "Title",
]
# Keep the labels before trainingData is replaced by the encoded feature array.
results = trainingData["Survived"]
# fit() returns the encoder itself, so it can be chained onto the constructor.
encoder = OrdinalEncoder().fit(trainingData[features])
# From here on trainingData is the encoder's output, no longer the DataFrame.
trainingData = encoder.transform(trainingData[features])

# Cross-Validation (CV) Sets

In [3]:
# Split the encoded data into NUM_SETS contiguous folds. For fold i,
# TestSets[i] holds [features, labels] of the held-out slice and
# TrainingSets[i] holds [features, labels] of everything else.
TrainingSets = []
TestSets = []
NUM_SETS = 5

for index in range(NUM_SETS):
    # Folds are taken from the end of the data towards the start:
    # index 0 holds out the last slice, index NUM_SETS-1 the first.
    start = (NUM_SETS-index-1)*len(trainingData)//NUM_SETS
    end = (NUM_SETS-index)*len(trainingData)//NUM_SETS

    temp = []
    temp.append(np.append(trainingData[0:start], trainingData[end:], axis=0))
    # Series.append was removed in pandas 2.0; pd.concat joins the two label
    # slices in the same order and keeps their original index values.
    temp.append(pd.concat([results[0:start], results[end:]]))
    TrainingSets.append(temp)

    temp = []
    temp.append(trainingData[start:end])
    temp.append(results[start:end])
    TestSets.append(temp)

# Accuracy Function