In [365]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score

from sklearn.linear_model import LogisticRegression
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.linear_model import SGDClassifier
from sklearn.svm import LinearSVC
from sklearn.neighbors import RadiusNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier

In [366]:
# Read the already-processed Titanic training table from disk.
titanic_df = pd.read_csv(
    filepath_or_buffer='datasets/titanic/processed/train_processed.csv'
)

# Show the first five rows as a quick sanity check of the loaded frame.
titanic_df.head(n=5)

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked_C,Embarked_Q,Embarked_S
0,1,1,0,24.0,0,0,69.3,True,False,False
1,0,3,0,30.5,0,0,7.75,False,True,False
2,1,2,1,32.0,1,0,26.0,False,False,True
3,0,2,1,51.0,0,0,12.525,False,False,True
4,0,3,1,24.5,0,0,8.05,False,False,True


In [367]:
# Feature names: every column of titanic_df except the first one.
# NOTE(review): this is positional -- it relies on the target column being
# the first column of the CSV; confirm if the file layout ever changes.
FEATURES = titanic_df.columns[1:].tolist()

FEATURES

['Pclass',
 'Sex',
 'Age',
 'SibSp',
 'Parch',
 'Fare',
 'Embarked_C',
 'Embarked_Q',
 'Embarked_S']

In [368]:
# Registry of results, keyed by a model label; each value is the dict
# returned by build_model(). compare_results() reads from it.
result_dict = dict()

In [369]:
def summarize_classification(y_test, y_pred):
    """Score a set of predictions against the true labels.

    Parameters
    ----------
    y_test : array-like
        True labels.
    y_pred : array-like
        Predicted labels, aligned with ``y_test``.

    Returns
    -------
    dict
        Keys, in order: ``'accuracy'`` (``accuracy_score`` with
        ``normalize=True``), ``'precision'``, ``'recall'`` and
        ``'accuracy_count'`` (``accuracy_score`` with ``normalize=False``).
        Precision and recall are computed with the library's default
        settings.
    """
    fraction_correct = accuracy_score(y_test, y_pred, normalize=True)
    number_correct = accuracy_score(y_test, y_pred, normalize=False)

    return dict(accuracy=fraction_correct,
                precision=precision_score(y_test, y_pred),
                recall=recall_score(y_test, y_pred),
                accuracy_count=number_correct)

In [370]:
def build_model(classifier_fn,
                name_of_y_col,
                names_of_x_cols,
                dataset,
                test_frac=0.2,
                random_state=None):
    """Fit a classifier on a random train/test split and score both parts.

    Parameters
    ----------
    classifier_fn : callable
        Called as ``classifier_fn(x_train, y_train)``; must return a fitted
        object that exposes ``predict``.
    name_of_y_col : str
        Column of ``dataset`` holding the target labels.
    names_of_x_cols : list of str
        Columns of ``dataset`` used as features.
    dataset : pandas.DataFrame
        Table containing both the feature columns and the target column.
    test_frac : float, default 0.2
        Forwarded to ``train_test_split`` as ``test_size``.
    random_state : int or None, default None
        Forwarded to ``train_test_split``. ``None`` keeps the previous
        behaviour (a different random split on every call); pass an integer
        to make the split, and therefore the reported metrics, reproducible
        across runs.

    Returns
    -------
    dict
        ``'training'`` and ``'test'`` hold the dicts produced by
        ``summarize_classification`` for the respective split;
        ``'confusion-matrix'`` is a ``pandas.crosstab`` of the test split
        with predicted labels as rows and true labels as columns.
    """
    X = dataset[names_of_x_cols]
    Y = dataset[name_of_y_col]

    # Seeding the split is what makes results comparable between runs and
    # between the different classifiers evaluated with this helper.
    x_train, x_test, y_train, y_test = train_test_split(
        X, Y, test_size=test_frac, random_state=random_state)

    model = classifier_fn(x_train, y_train)

    y_pred = model.predict(x_test)
    y_pred_train = model.predict(x_train)

    # Scoring the training split as well makes over-fitting visible.
    train_summary = summarize_classification(y_train, y_pred_train)
    test_summary = summarize_classification(y_test, y_pred)

    pred_results = pd.DataFrame({'y_test': y_test,
                                 'y_pred': y_pred})

    model_crosstab = pd.crosstab(pred_results.y_pred, pred_results.y_test)

    return {'training': train_summary,
            'test': test_summary,
            'confusion-matrix': model_crosstab}

In [371]:
def compare_results():
    """Print the stored training and test metrics of every recorded model.

    Reads the module-level ``result_dict``; for each entry it prints the
    model label, then the ``'training'`` metrics and the ``'test'`` metrics,
    one ``name value`` pair per line, with blank lines between sections.
    """
    sections = (('Training data', 'training'),
                ('Test data', 'test'))

    for label, summaries in result_dict.items():
        print('Classification: ', label)

        for heading, split in sections:
            print()
            print(heading)
            for metric, value in summaries[split].items():
                print(metric, value)

        print()

In [372]:
def logistic_fn(x_train, y_train):
    """Return a LogisticRegression (``solver='liblinear'``) fitted on the data.

    Has the ``classifier_fn(x_train, y_train)`` shape that ``build_model``
    expects.
    """
    # fit() hands back the estimator itself, so it can be returned directly.
    return LogisticRegression(solver='liblinear').fit(x_train, y_train)

In [373]:
# Logistic-regression baseline trained on the full feature list; the result
# is stored under its label and all recorded results are then printed.
result_dict['survived ~ logistic'] = build_model(
    classifier_fn=logistic_fn,
    name_of_y_col='Survived',
    names_of_x_cols=FEATURES,
    dataset=titanic_df,
)

compare_results()

Classification:  survived ~ logistic

Training data
accuracy 0.7908611599297012
precision 0.7647058823529411
recall 0.6872246696035242
accuracy_count 450.0

Test data
accuracy 0.7972027972027972
precision 0.8076923076923077
recall 0.6885245901639344
accuracy_count 114.0



In [374]:
def linear_discriminant_fn(x_train, y_train, solver='svd'):
    """Return a LinearDiscriminantAnalysis model fitted on the data.

    ``solver`` is passed straight to the estimator (default ``'svd'``).
    """
    # fit() hands back the estimator itself, so it can be returned directly.
    return LinearDiscriminantAnalysis(solver=solver).fit(x_train, y_train)

In [375]:
# Linear discriminant analysis, trained on every feature except the last
# entry of FEATURES.
# NOTE(review): presumably the last one-hot column is dropped to avoid
# perfectly collinear inputs -- confirm.
result_dict['survived ~ linear_discriminant_analysis'] = build_model(
    classifier_fn=linear_discriminant_fn,
    name_of_y_col='Survived',
    names_of_x_cols=FEATURES[:-1],
    dataset=titanic_df,
)

compare_results()

Classification:  survived ~ logistic

Training data
accuracy 0.7908611599297012
precision 0.7647058823529411
recall 0.6872246696035242
accuracy_count 450.0

Test data
accuracy 0.7972027972027972
precision 0.8076923076923077
recall 0.6885245901639344
accuracy_count 114.0

Classification:  survived ~ linear_discriminant_analysis

Training data
accuracy 0.8014059753954306
precision 0.7880184331797235
recall 0.7184873949579832
accuracy_count 456.0

Test data
accuracy 0.7622377622377622
precision 0.66
recall 0.66
accuracy_count 109.0



In [376]:
def quadratic_discriminant_fn(x_train, y_train):
    """Return a QuadraticDiscriminantAnalysis model fitted on the data.

    The estimator is created with its default settings.
    """
    # fit() hands back the estimator itself, so it can be returned directly.
    return QuadraticDiscriminantAnalysis().fit(x_train, y_train)

In [377]:
# Quadratic discriminant analysis, trained on every feature except the last
# entry of FEATURES.
# NOTE(review): presumably the last one-hot column is dropped to avoid
# perfectly collinear inputs -- confirm.
result_dict['survived ~ quadratic_discriminant_analysis'] = build_model(
    classifier_fn=quadratic_discriminant_fn,
    name_of_y_col='Survived',
    names_of_x_cols=FEATURES[:-1],
    dataset=titanic_df,
)

compare_results()

Classification:  survived ~ logistic

Training data
accuracy 0.7908611599297012
precision 0.7647058823529411
recall 0.6872246696035242
accuracy_count 450.0

Test data
accuracy 0.7972027972027972
precision 0.8076923076923077
recall 0.6885245901639344
accuracy_count 114.0

Classification:  survived ~ linear_discriminant_analysis

Training data
accuracy 0.8014059753954306
precision 0.7880184331797235
recall 0.7184873949579832
accuracy_count 456.0

Test data
accuracy 0.7622377622377622
precision 0.66
recall 0.66
accuracy_count 109.0

Classification:  survived ~ quadratic_discriminant_analysis

Training data
accuracy 0.8066783831282952
precision 0.7857142857142857
recall 0.717391304347826
accuracy_count 459.0

Test data
accuracy 0.8041958041958042
precision 0.8
recall 0.6896551724137931
accuracy_count 115.0



In [378]:
def sgd_fn(x_train, y_train, max_iter=10000, tol=1e-3):
    """Return an SGDClassifier fitted on the data.

    ``max_iter`` and ``tol`` are passed straight to the estimator; every
    other setting is left at the library default.
    """
    # fit() hands back the estimator itself, so it can be returned directly.
    return SGDClassifier(max_iter=max_iter, tol=tol).fit(x_train, y_train)

In [379]:
# Linear classifier trained with stochastic gradient descent on the full
# feature list; the result is stored and all recorded results are printed.
result_dict['survived ~ sgd'] = build_model(
    classifier_fn=sgd_fn,
    name_of_y_col='Survived',
    names_of_x_cols=FEATURES,
    dataset=titanic_df,
)

compare_results()

Classification:  survived ~ logistic

Training data
accuracy 0.7908611599297012
precision 0.7647058823529411
recall 0.6872246696035242
accuracy_count 450.0

Test data
accuracy 0.7972027972027972
precision 0.8076923076923077
recall 0.6885245901639344
accuracy_count 114.0

Classification:  survived ~ linear_discriminant_analysis

Training data
accuracy 0.8014059753954306
precision 0.7880184331797235
recall 0.7184873949579832
accuracy_count 456.0

Test data
accuracy 0.7622377622377622
precision 0.66
recall 0.66
accuracy_count 109.0

Classification:  survived ~ quadratic_discriminant_analysis

Training data
accuracy 0.8066783831282952
precision 0.7857142857142857
recall 0.717391304347826
accuracy_count 459.0

Test data
accuracy 0.8041958041958042
precision 0.8
recall 0.6896551724137931
accuracy_count 115.0

Classification:  survived ~ sgd

Training data
accuracy 0.7451669595782073
precision 0.7784810126582279
recall 0.5278969957081545
accuracy_count 424.0

Test data
accuracy 0.720279720279

In [380]:
def linear_svc_fn(x_train, y_train, C=1.0, max_iter=10000, tol=1e-3):
    """Return a LinearSVC fitted on the data.

    ``C``, ``max_iter`` and ``tol`` are passed straight to the estimator,
    which is always created with ``dual=False``.
    """
    svc = LinearSVC(C=C, max_iter=max_iter, tol=tol, dual=False)
    # fit() hands back the estimator itself, so it can be returned directly.
    return svc.fit(x_train, y_train)

In [381]:
# Linear support-vector classifier trained on the full feature list; the
# result is stored and all recorded results are printed.
result_dict['survived ~ linear_svc'] = build_model(
    classifier_fn=linear_svc_fn,
    name_of_y_col='Survived',
    names_of_x_cols=FEATURES,
    dataset=titanic_df,
)

compare_results()

Classification:  survived ~ logistic

Training data
accuracy 0.7908611599297012
precision 0.7647058823529411
recall 0.6872246696035242
accuracy_count 450.0

Test data
accuracy 0.7972027972027972
precision 0.8076923076923077
recall 0.6885245901639344
accuracy_count 114.0

Classification:  survived ~ linear_discriminant_analysis

Training data
accuracy 0.8014059753954306
precision 0.7880184331797235
recall 0.7184873949579832
accuracy_count 456.0

Test data
accuracy 0.7622377622377622
precision 0.66
recall 0.66
accuracy_count 109.0

Classification:  survived ~ quadratic_discriminant_analysis

Training data
accuracy 0.8066783831282952
precision 0.7857142857142857
recall 0.717391304347826
accuracy_count 459.0

Test data
accuracy 0.8041958041958042
precision 0.8
recall 0.6896551724137931
accuracy_count 115.0

Classification:  survived ~ sgd

Training data
accuracy 0.7451669595782073
precision 0.7784810126582279
recall 0.5278969957081545
accuracy_count 424.0

Test data
accuracy 0.720279720279