In [32]:
import numpy as np 
import pandas as pd 

from sklearn import preprocessing

import warnings
# NOTE(review): this installs an 'ignore' filter for every warning category,
# so deprecation and convergence warnings raised later in the notebook are
# hidden as well. Consider narrowing it to specific categories.
warnings.simplefilter(action='ignore')

import random

# Single source of truth for every random_state / seed used in the notebook.
SEED = 42
# Seed the stdlib generator and NumPy's global generator: scikit-learn falls
# back to NumPy's global RNG whenever an estimator is built without an
# explicit random_state, and random.seed() alone does not affect it.
random.seed(SEED)
np.random.seed(SEED)

In [33]:
# add logger and timer decorators

from functools import wraps


def my_logger(orig_func):
    """Decorator that records every call of ``orig_func`` in its own log file.

    Each call appends one INFO line with the positional and keyword arguments
    to ``<orig_func.__name__>.log`` in the current working directory.

    Args:
        orig_func: The callable to wrap.

    Returns:
        A wrapper with the same signature and metadata as ``orig_func`` that
        logs the call and then returns ``orig_func``'s result unchanged.
    """
    import logging

    # logging.basicConfig() only configures the root logger the first time it
    # is called; later calls are silently ignored, so every decorated function
    # would end up writing into the first function's file. A dedicated logger
    # with its own file handler keeps one log file per function.
    logger = logging.getLogger('my_logger.{}'.format(orig_func.__name__))
    logger.setLevel(logging.INFO)
    # Keep the records out of the root logger so they land only in this file.
    logger.propagate = False
    # Re-running the cell decorates the function again; do not stack handlers,
    # otherwise each call would be written several times.
    if not logger.handlers:
        handler = logging.FileHandler('{}.log'.format(orig_func.__name__))
        handler.setFormatter(logging.Formatter(logging.BASIC_FORMAT))
        logger.addHandler(handler)

    @wraps(orig_func)
    def wrapper(*args, **kwargs):
        logger.info(
            'Ran with args: {}, and kwargs: {}'.format(args, kwargs))
        return orig_func(*args, **kwargs)

    return wrapper


def my_timer(orig_func):
    """Decorator that prints how long each call of ``orig_func`` took.

    Args:
        orig_func: The callable to wrap.

    Returns:
        A wrapper with the same signature and metadata as ``orig_func``. It
        prints ``'<name> ran in: <seconds> sec'`` after a successful call and
        returns ``orig_func``'s result unchanged. Nothing is printed when
        ``orig_func`` raises.
    """
    import time

    @wraps(orig_func)
    def wrapper(*args, **kwargs):
        # perf_counter() is monotonic and high resolution; time.time() follows
        # the wall clock and can jump when the system clock is adjusted.
        started = time.perf_counter()
        result = orig_func(*args, **kwargs)
        elapsed = time.perf_counter() - started
        print('{} ran in: {} sec'.format(orig_func.__name__, elapsed))
        return result

    return wrapper

In [34]:

# Load the raw train and test splits from the local data directory.
train_df, test_df = (
    pd.read_csv("./data/train.csv"),
    pd.read_csv("./data/test.csv"),
)

In [35]:
# Work on a copy so the raw frame stays available for computing fill values.
train_data = train_df.copy()
# Assign the filled columns back instead of calling fillna(inplace=True) on a
# column selection: that chained form warns on recent pandas and leaves the
# frame untouched once Copy-on-Write is active.
# Missing ages are replaced by the median age of the training set.
train_data["Age"] = train_data["Age"].fillna(train_df["Age"].median(skipna=True))
# Missing embarkation ports are replaced by the most frequent port.
train_data["Embarked"] = train_data["Embarked"].fillna(train_df['Embarked'].value_counts().idxmax())
train_data = train_data.drop(columns='Cabin')


In [36]:
# A passenger travels alone when they have neither siblings/spouses (SibSp)
# nor parents/children (Parch) aboard; the flag replaces both source columns.
has_family_aboard = (train_data["SibSp"] + train_data["Parch"]) > 0
train_data['TravelAlone'] = np.where(has_family_aboard, 0, 1)
train_data.drop(['SibSp', 'Parch'], axis=1, inplace=True)

In [37]:
# One-hot encode the categorical columns, then remove the redundant
# Sex_female indicator and the identifier/text columns.
training = pd.get_dummies(
    train_data, columns=["Pclass", "Embarked", "Sex"]
).drop(columns=['Sex_female', 'PassengerId', 'Name', 'Ticket'])

# final_train is an alias of the same frame, not a copy.
final_train = training

In [38]:
# Apply the same preprocessing as the training cells to the test split.
test_data = test_df.copy()
# Assign the filled columns back instead of calling fillna(inplace=True) on a
# column selection: that chained form warns on recent pandas and leaves the
# frame untouched once Copy-on-Write is active.
# Fill values come from the TRAINING set, not from the test set itself.
test_data["Age"] = test_data["Age"].fillna(train_df["Age"].median(skipna=True))
test_data["Fare"] = test_data["Fare"].fillna(train_df["Fare"].median(skipna=True))
test_data = test_data.drop(columns='Cabin')

# 1 when the passenger has no siblings/spouses or parents/children aboard.
test_data['TravelAlone'] = np.where((test_data["SibSp"] + test_data["Parch"]) > 0, 0, 1)
test_data = test_data.drop(columns=['SibSp', 'Parch'])

# One-hot encode the categorical columns, then remove the redundant
# Sex_female indicator and the identifier/text columns.
testing = pd.get_dummies(test_data, columns=["Pclass", "Embarked", "Sex"])
testing = testing.drop(columns=['Sex_female', 'PassengerId', 'Name', 'Ticket'])

# final_test is an alias of the same frame, not a copy.
final_test = testing

In [39]:
# Flag passengers aged 16 or younger (1 = minor, 0 = otherwise).
final_train['IsMinor'] = final_train['Age'].le(16).astype(int)
final_test['IsMinor'] = final_test['Age'].le(16).astype(int)

In [40]:
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import RFE

# Candidate predictors for recursive feature elimination.
cols = ["Age","Fare","TravelAlone","Pclass_1","Pclass_2","Embarked_C","Embarked_S","Sex_male","IsMinor"] 
X = final_train[cols]
y = final_train['Survived']
model = LogisticRegression()
# n_features_to_select is passed by keyword: newer scikit-learn releases only
# accept it as a keyword argument, so the positional form RFE(model, 8) fails.
rfe = RFE(model, n_features_to_select=8)
rfe = rfe.fit(X, y)

In [41]:
from sklearn.feature_selection import RFECV
# Recursive feature elimination with 10-fold cross-validation, removing one
# feature per step and ranking feature subsets by accuracy.
rfecv = RFECV(
    estimator=LogisticRegression(),
    step=1,
    cv=10,
    scoring='accuracy',
)
rfecv.fit(X, y)

RFECV(cv=10, estimator=LogisticRegression(), scoring='accuracy')

RFECV kept eight of the nine candidate variables; only `Fare` was dropped. The selected features are listed in the next cell.

In [42]:
# Features used for the final model: every candidate column except Fare.
Selected_features = [
    'Age',
    'TravelAlone',
    'Pclass_1',
    'Pclass_2',
    'Embarked_C',
    'Embarked_S',
    'Sex_male',
    'IsMinor',
]

In [43]:
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score, classification_report, precision_score, recall_score 
from sklearn.metrics import confusion_matrix, precision_recall_curve, roc_curve, log_loss

In [44]:

# custom class to use decorators my_logger and my_timer for logging and testing

class TitanicLogisticRegression:
    """Logistic-regression wrapper bound to one fixed train/test split.

    The constructor, ``fit`` and ``predict`` are decorated with ``my_logger``
    and ``my_timer``. ``fit`` and ``predict`` take no data arguments; they
    always operate on the split handed to the constructor.
    """

    @my_logger
    @my_timer
    def __init__(self, X_train, X_test, y_train, y_test):
        """Store the split and create the (unfitted) classifier."""
        self.X_train = X_train
        self.y_train = y_train
        self.X_test = X_test
        self.y_test = y_test
        self.clf = LogisticRegression(random_state=SEED)

    @my_logger
    @my_timer
    def fit(self, *args, **kwargs):
        """Fit on the training split and return the training accuracy.

        Extra positional/keyword arguments are accepted but not used.
        """
        self.clf.fit(self.X_train, self.y_train)
        train_predictions = self.clf.predict(self.X_train)
        self.train_accuracy = accuracy_score(self.y_train, train_predictions)
        return self.train_accuracy

    @my_logger
    @my_timer
    def predict(self, *args, **kwargs):
        """Score the test split and return the test accuracy.

        Also stores the confusion matrix and the classification report so
        they can be read back through the getters. Extra positional/keyword
        arguments are accepted but not used.
        """
        test_predictions = self.clf.predict(self.X_test)
        self.test_accuracy = accuracy_score(self.y_test, test_predictions)
        self.confusion_matrix = confusion_matrix(self.y_test, test_predictions)
        self.report = classification_report(self.y_test, test_predictions)
        return self.test_accuracy

    def get_report(self):
        """Return the classification report stored by ``predict``."""
        return self.report

    def get_confusion_matrix(self):
        """Return the confusion matrix stored by ``predict``."""
        return self.confusion_matrix

In [45]:

# Feature matrix (selected columns only) and response vector.
X = final_train[Selected_features]
y = final_train['Survived']

# Hold out 20% of the rows for testing. The resulting scores depend on
# random_state: a single train/test split is a high-variance estimate, so a
# different seed can move the accuracy noticeably.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=SEED,
)

# Train and evaluate through the custom wrapper class.
logreg = TitanicLogisticRegression(X_train, X_test, y_train, y_test)
train_acc = logreg.fit()
test_acc = logreg.predict()

print(f"Training Accuracy => {train_acc}")
print(f"Test Accuracy => {test_acc}")

__init__ ran in: 1.9073486328125e-05 sec
fit ran in: 0.03681492805480957 sec
predict ran in: 0.009638309478759766 sec
Training Accuracy => 0.800561797752809
Test Accuracy => 0.7988826815642458


# Test Cases

In [46]:
import unittest
import time

class TestInput(unittest.TestCase):
    """Regression tests for TitanicLogisticRegression on the notebook's split.

    The tests read ``X_train``, ``X_test``, ``y_train`` and ``y_test`` from
    the notebook namespace. The no-op setUpClass/tearDownClass/tearDown hooks
    were dropped; unittest.TestCase already provides empty defaults.
    """

    def setUp(self):
        # Reference accuracy obtained earlier in the notebook on the test split.
        self.test_accuracy = 0.7988826815642458
        self.test_run_duration = 0.15  # average test duration in seconds

    @my_timer
    @my_logger
    def test_fit(self):
        """Constructing and fitting the model stays within the time budget."""
        # perf_counter() is monotonic, so a clock adjustment cannot skew the
        # measured duration.
        t1 = time.perf_counter()
        self.ta = TitanicLogisticRegression(X_train, X_test, y_train, y_test)
        self.ta.fit()
        t2 = time.perf_counter() - t1

        # Threshold: the measured duration plus a 20% margin.
        limit_value = t2 * 1.2
        self.assertGreater(
            self.test_run_duration,
            limit_value,
            "Measured fit duration plus 20% margin ({:.4f} sec) must stay "
            "below the expected duration ({} sec)".format(
                limit_value, self.test_run_duration),
        )

    def test_predict(self):
        """The test accuracy matches the previously obtained value."""
        self.ta = TitanicLogisticRegression(X_train, X_test, y_train, y_test)
        self.ta.fit()
        test_acc = self.ta.predict()
        # Compare with a tolerance: exact float equality breaks on tiny
        # numeric differences between platforms or library versions.
        self.assertAlmostEqual(test_acc, self.test_accuracy, places=6)
      
    
# Run the tests inside the notebook: argv is replaced with a dummy program
# name so unittest does not parse the kernel's own command-line arguments, and
# exit=False stops unittest from calling sys.exit() when the run finishes.
unittest.main(argv=['first-arg-is-ignored'], exit=False)

.

__init__ ran in: 2.1457672119140625e-05 sec
fit ran in: 0.047261714935302734 sec
test_fit ran in: 0.07599425315856934 sec
__init__ ran in: 1.9073486328125e-05 sec


.

fit ran in: 0.05874013900756836 sec
predict ran in: 0.006888628005981445 sec



----------------------------------------------------------------------
Ran 2 tests in 0.162s

OK


<unittest.main.TestProgram at 0x7f977f844cd0>