In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import accuracy_score, precision_score, recall_score
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
from sklearn.svm import LinearSVC
from sklearn.neighbors import RadiusNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
%matplotlib inline

In [2]:
datasets = pd.read_csv('../../../DataSets/titanic_processed.csv')

In [3]:
datasets.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked_C,Embarked_Q,Embarked_S
0,0,3,0,14.0,0,0,7.8542,0,0,1
1,1,1,1,28.0,0,0,26.55,0,0,1
2,1,1,0,36.0,1,2,120.0,0,0,1
3,0,3,1,17.0,1,0,7.0542,0,0,1
4,0,3,1,4.0,4,2,31.275,0,0,1


In [4]:
datasets.tail()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked_C,Embarked_Q,Embarked_S
707,1,1,0,42.0,0,0,227.525,1,0,0
708,1,2,1,3.0,1,1,26.0,0,0,1
709,1,1,0,22.0,0,2,49.5,1,0,0
710,0,3,0,47.0,1,0,14.5,0,0,1
711,1,1,1,45.0,0,0,26.55,0,0,1


In [5]:
datasets.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 712 entries, 0 to 711
Data columns (total 10 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Survived    712 non-null    int64  
 1   Pclass      712 non-null    int64  
 2   Sex         712 non-null    int64  
 3   Age         712 non-null    float64
 4   SibSp       712 non-null    int64  
 5   Parch       712 non-null    int64  
 6   Fare        712 non-null    float64
 7   Embarked_C  712 non-null    int64  
 8   Embarked_Q  712 non-null    int64  
 9   Embarked_S  712 non-null    int64  
dtypes: float64(2), int64(8)
memory usage: 55.8 KB


In [6]:
datasets[datasets.isnull().any(axis = 1)].count()

Survived      0
Pclass        0
Sex           0
Age           0
SibSp         0
Parch         0
Fare          0
Embarked_C    0
Embarked_Q    0
Embarked_S    0
dtype: int64

In [7]:
feature = list(datasets.columns[1:])
feature

['Pclass',
 'Sex',
 'Age',
 'SibSp',
 'Parch',
 'Fare',
 'Embarked_C',
 'Embarked_Q',
 'Embarked_S']

In [8]:
result_dict = {}

In [9]:
def summarize_classification(test_y, y_predict):
    acc = accuracy_score(test_y, y_predict, normalize=True)
    prec = precision_score(test_y, y_predict)
    recall = recall_score(test_y, y_predict)
    num_acc = accuracy_score(test_y, y_predict, normalize=False)
    
    return {'Accuracy':acc,
           'Precision':prec,
           'Recall':recall,
           'Number of accuracy':num_acc}



In [10]:
def build_model(classifier_function, name_of_y_col, name_of_x_col, datasets, test_frac = 0.2):
    
    x = datasets[name_of_x_col]
    y = datasets[name_of_y_col]
    
    train_x, test_x, train_y, test_y = train_test_split(x, y,test_size = test_frac)
    
    model = classifier_function(train_x, train_y)
    
    y_pred = model.predict(test_x)
    y_pred_train = model.predict(train_x)
    
    train_summary = summarize_classification(train_y, y_pred_train)
    test_summary = summarize_classification(test_y,y_pred)
    
    pred_results = pd.DataFrame({'y_test':test_y,
                                'y_pred':y_pred})
    
    model_crosstab = pd.crosstab(pred_results.y_pred, pred_results.y_test)
    
    
    return {'training':train_summary,
           'test':test_summary,
           'Confusion Matrix':model_crosstab}

In [11]:
def compare_results():
    
    for key in result_dict:
        print('Classification: ', key)
        print()
        print('Training data')
        for score in result_dict[key]['training']:
            print(score, result_dict[key]['training'][score])
            
        print()
        print('Test data')
        for score in result_dict[key]['test']:
            print(score, result_dict[key]['test'][score])
            
        print()

In [12]:
def logistic_function(train_x, train_y):
    model = LogisticRegression(solver = 'liblinear')
    model.fit(train_x, train_y)
    
    return model

In [13]:
result_dict['survived ~ logistic'] = build_model(logistic_function, 'Survived',feature, datasets)

In [14]:
compare_results()

Classification:  survived ~ logistic

Training data
Accuracy 0.7943760984182777
Precision 0.7821782178217822
Recall 0.683982683982684
Number of accuracy 452

Test data
Accuracy 0.8321678321678322
Precision 0.8113207547169812
Recall 0.7543859649122807
Number of accuracy 119



In [15]:
def linear_discriminatic_function(train_x, train_y, solver = 'svd'):
    model = LinearDiscriminantAnalysis(solver = solver)
    model.fit(train_x, train_y)
    
    return model

In [16]:
result_dict['survived ~ linear_discriminatic_function'] = build_model(linear_discriminatic_function, 'Survived', feature, datasets)
compare_results()

Classification:  survived ~ logistic

Training data
Accuracy 0.7943760984182777
Precision 0.7821782178217822
Recall 0.683982683982684
Number of accuracy 452

Test data
Accuracy 0.8321678321678322
Precision 0.8113207547169812
Recall 0.7543859649122807
Number of accuracy 119

Classification:  survived ~ linear_discriminatic_function

Training data
Accuracy 0.7926186291739895
Precision 0.7699115044247787
Recall 0.725
Number of accuracy 451

Test data
Accuracy 0.7972027972027972
Precision 0.7021276595744681
Recall 0.6875
Number of accuracy 114



In [18]:
def quardratic_discriminatic_funtion(train_x, train_y):
    model = QuadraticDiscriminantAnalysis()
    model.fit(train_x, train_y)
    
    return model

In [21]:
result_dict['survived ~ quardratic_discriminatic_function'] = build_model(quardratic_discriminatic_funtion,
                                                                          'Survived',
                                                                          feature[:-1],
                                                                          datasets)
compare_results()

Classification:  survived ~ logistic

Training data
Accuracy 0.7943760984182777
Precision 0.7821782178217822
Recall 0.683982683982684
Number of accuracy 452

Test data
Accuracy 0.8321678321678322
Precision 0.8113207547169812
Recall 0.7543859649122807
Number of accuracy 119

Classification:  survived ~ linear_discriminatic_function

Training data
Accuracy 0.7926186291739895
Precision 0.7699115044247787
Recall 0.725
Number of accuracy 451

Test data
Accuracy 0.7972027972027972
Precision 0.7021276595744681
Recall 0.6875
Number of accuracy 114

Classification:  survived ~ quardratic_discriminatic_function

Training data
Accuracy 0.8084358523725835
Precision 0.775609756097561
Recall 0.7162162162162162
Number of accuracy 460

Test data
Accuracy 0.7762237762237763
Precision 0.84
Recall 0.6363636363636364
Number of accuracy 111



In [22]:
def sgd_function(train_x, train_y,max_iter = 1000, tol = 1e-3):
    model = SGDClassifier(max_iter=max_iter, tol=tol)
    model.fit(train_x, train_y)
    
    return model

In [23]:
result_dict['survived ~ sgd_function'] = build_model(sgd_function,'Survived',feature,datasets)

In [24]:
compare_results()

Classification:  survived ~ logistic

Training data
Accuracy 0.7943760984182777
Precision 0.7821782178217822
Recall 0.683982683982684
Number of accuracy 452

Test data
Accuracy 0.8321678321678322
Precision 0.8113207547169812
Recall 0.7543859649122807
Number of accuracy 119

Classification:  survived ~ linear_discriminatic_function

Training data
Accuracy 0.7926186291739895
Precision 0.7699115044247787
Recall 0.725
Number of accuracy 451

Test data
Accuracy 0.7972027972027972
Precision 0.7021276595744681
Recall 0.6875
Number of accuracy 114

Classification:  survived ~ quardratic_discriminatic_function

Training data
Accuracy 0.8084358523725835
Precision 0.775609756097561
Recall 0.7162162162162162
Number of accuracy 460

Test data
Accuracy 0.7762237762237763
Precision 0.84
Recall 0.6363636363636364
Number of accuracy 111

Classification:  survived ~ sgd_function

Training data
Accuracy 0.7855887521968365
Precision 0.7209302325581395
Recall 0.788135593220339
Number of accuracy 447

Test 

In [25]:
def radious_neighbor(train_x, train_y, radius = 40.0):
    model = RadiusNeighborsClassifier(radius=radius)
    model.fit(train_x, train_y)
    
    return model

In [26]:
result_dict['survived ~ radious_neighbor'] = build_model(radious_neighbor, 'Survived',
                                                        feature, datasets)
compare_results()

Classification:  survived ~ logistic

Training data
Accuracy 0.7943760984182777
Precision 0.7821782178217822
Recall 0.683982683982684
Number of accuracy 452

Test data
Accuracy 0.8321678321678322
Precision 0.8113207547169812
Recall 0.7543859649122807
Number of accuracy 119

Classification:  survived ~ linear_discriminatic_function

Training data
Accuracy 0.7926186291739895
Precision 0.7699115044247787
Recall 0.725
Number of accuracy 451

Test data
Accuracy 0.7972027972027972
Precision 0.7021276595744681
Recall 0.6875
Number of accuracy 114

Classification:  survived ~ quardratic_discriminatic_function

Training data
Accuracy 0.8084358523725835
Precision 0.775609756097561
Recall 0.7162162162162162
Number of accuracy 460

Test data
Accuracy 0.7762237762237763
Precision 0.84
Recall 0.6363636363636364
Number of accuracy 111

Classification:  survived ~ sgd_function

Training data
Accuracy 0.7855887521968365
Precision 0.7209302325581395
Recall 0.788135593220339
Number of accuracy 447

Test 

In [27]:
def decision_tree_function(train_x, train_y, max_depth = None, max_feature = None):
    model = DecisionTreeClassifier(max_depth=max_depth, max_features=max_feature)
    model.fit(train_x, train_y)
    
    return model

In [28]:
result_dict['survived ~ decision_tree_function'] = build_model(decision_tree_function,
                                                              'Survived',
                                                              feature,
                                                              datasets)
compare_results()

Classification:  survived ~ logistic

Training data
Accuracy 0.7943760984182777
Precision 0.7821782178217822
Recall 0.683982683982684
Number of accuracy 452

Test data
Accuracy 0.8321678321678322
Precision 0.8113207547169812
Recall 0.7543859649122807
Number of accuracy 119

Classification:  survived ~ linear_discriminatic_function

Training data
Accuracy 0.7926186291739895
Precision 0.7699115044247787
Recall 0.725
Number of accuracy 451

Test data
Accuracy 0.7972027972027972
Precision 0.7021276595744681
Recall 0.6875
Number of accuracy 114

Classification:  survived ~ quardratic_discriminatic_function

Training data
Accuracy 0.8084358523725835
Precision 0.775609756097561
Recall 0.7162162162162162
Number of accuracy 460

Test data
Accuracy 0.7762237762237763
Precision 0.84
Recall 0.6363636363636364
Number of accuracy 111

Classification:  survived ~ sgd_function

Training data
Accuracy 0.7855887521968365
Precision 0.7209302325581395
Recall 0.788135593220339
Number of accuracy 447

Test 

In [29]:
def naive_bayes_function(train_x, train_y, priors = None):
    model = GaussianNB(priors=priors)
    model.fit(train_x, train_y)
    
    return model

In [30]:
result_dict['survived ~ naive_bayes_function'] = build_model(naive_bayes_function,
                                                            'Survived',
                                                            feature,
                                                            datasets)
compare_results()

Classification:  survived ~ logistic

Training data
Accuracy 0.7943760984182777
Precision 0.7821782178217822
Recall 0.683982683982684
Number of accuracy 452

Test data
Accuracy 0.8321678321678322
Precision 0.8113207547169812
Recall 0.7543859649122807
Number of accuracy 119

Classification:  survived ~ linear_discriminatic_function

Training data
Accuracy 0.7926186291739895
Precision 0.7699115044247787
Recall 0.725
Number of accuracy 451

Test data
Accuracy 0.7972027972027972
Precision 0.7021276595744681
Recall 0.6875
Number of accuracy 114

Classification:  survived ~ quardratic_discriminatic_function

Training data
Accuracy 0.8084358523725835
Precision 0.775609756097561
Recall 0.7162162162162162
Number of accuracy 460

Test data
Accuracy 0.7762237762237763
Precision 0.84
Recall 0.6363636363636364
Number of accuracy 111

Classification:  survived ~ sgd_function

Training data
Accuracy 0.7855887521968365
Precision 0.7209302325581395
Recall 0.788135593220339
Number of accuracy 447

Test 