In [15]:
%load_ext autoreload
%autoreload 2

from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import r2_score, accuracy_score, roc_curve, auc
import pandas as pd
import os
import numpy as np
from matplotlib import pyplot as plt
import random
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from scipy.stats import randint
from input_sampling import InputSampler

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [16]:
# Set up testing dataframes

"""                                    Data Dictionary
Variable	    Definition	                                        Key
Survived	    Survival	                                        0 = No, 1 = Yes
Pclass	        Ticket class	                                    1 = 1st, 2 = 2nd, 3 = 3rd
Name            Passenger Name
Sex	            Sex
Age	            Age in years	
SibSp	        # of siblings / spouses aboard the Titanic	
Parch	        # of parents / children aboard the Titanic	
Ticket	        Ticket number	
Fare	        Passenger fare	
Cabin	        Cabin number	
Embarked	    Port of Embarkation                                 C = Cherbourg, Q = Queenstown, S = Southampton
"""

train_set = pd.read_csv('train.csv')
test_set = pd.read_csv('test.csv')

cat = ['Sex', 'Embarked']
num = ['Pclass', 'Age', 'Parch', 'Fare']
y = train_set['Survived']
X = train_set[cat + num]

In [17]:
train_set

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


In [18]:
# Clean & reset variables
# Useful so I can play around with num, cat, X, y, and train_set as much as I want and quickly reset them for further tests

def reset_vars():
    global cat, num, y, X, train_set
    # Will deal with 'Ticket' and 'Cabin' columns later, because these columns will need to be classified. Possibly 'Name' as well
    train_set = pd.read_csv('train.csv')
    cat = ['Embarked']
    num = ['Pclass', 'Age', 'Parch', 'Fare']

# Test vars in logistic regression model
    y = train_set['Survived']
    X = train_set[cat + num]
    X = X.fillna({
    'Pclass': X['Pclass'].mean(),
    'Age': X['Age'].mean(),
    'Parch': X['Parch'].mean(),
    'Fare': X['Fare'].mean()
})

reset_vars()

In [19]:
# Preprocessor setup
# ColumnTransformer object for scaling numerical vars and encoding non-ordinal categorical vars

cat_trans = Pipeline(steps=[
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])
num_trans = Pipeline(steps=[
    ('scaler', StandardScaler())
])

# update preprocessor with current 'cat' and 'num' inputs
def update_preprocessor():
    global num, cat
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', num_trans, num),
            ('cat', cat_trans, cat)
        ]
    )
    return preprocessor

preprocessor = update_preprocessor()

In [20]:
model_m = Pipeline(steps=[('preprocessor', preprocessor),
                        ('classifier', RandomForestClassifier(
                            n_estimators=1000,
                            bootstrap=False,
                            max_depth=20,
                            min_samples_split=10,
                            min_samples_leaf=5,
                            random_state=42
))])
model_f = Pipeline(steps=[('preprocessor', preprocessor),
                        ('classifier', RandomForestClassifier(
                            n_estimators=1000,
                            bootstrap=False,
                            max_depth=20,
                            min_samples_split=10,
                            min_samples_leaf=5,
                            random_state=42
))])


In [21]:
male_df = train_set[train_set['Sex'] == 'male']
male_df.reset_index()
X_male = male_df[cat+num]
X_male = X_male.fillna({
    'Pclass': X_male['Pclass'].mean(),
    'Age': X_male['Age'].mean(),
    'Parch': X_male['Parch'].mean(),
    'Fare': X_male['Fare'].mean()
})
y_male = male_df['Survived']

female_df = train_set[train_set['Sex'] == 'female']
female_df.reset_index()
X_female = female_df
X_female = X_female.fillna({
    'Pclass': X_female['Pclass'].mean(),
    'Age': X_female['Age'].mean(),
    'Parch': X_female['Parch'].mean(),
    'Fare': X_female['Fare'].mean()
})
y_female = female_df['Survived']


test_f = test_set[test_set['Sex'] == 'female']
test_set_f = test_f[cat+num]
test_set_f.reset_index()
test_set_f = test_set_f.fillna({
    'Pclass': test_set_f['Pclass'].mean(),
    'Age': test_set_f['Age'].mean(),
    'Parch': test_set_f['Parch'].mean(),
    'Fare': test_set_f['Fare'].mean()
})

test_m = test_set[test_set['Sex'] == 'male']
test_set_m = test_m[cat+num]
test_set_m.reset_index()
test_set_m = test_set_m.fillna({
    'Pclass': test_set_m['Pclass'].mean(),
    'Age': test_set_m['Age'].mean(),
    'Parch': test_set_m['Parch'].mean(),
    'Fare': test_set_m['Fare'].mean()
})

In [22]:
X_female, test_set_f

(     PassengerId  Survived  Pclass  \
 1              2         1       1   
 2              3         1       3   
 3              4         1       1   
 8              9         1       3   
 9             10         1       2   
 ..           ...       ...     ...   
 880          881         1       2   
 882          883         0       3   
 885          886         0       3   
 887          888         1       1   
 888          889         0       3   
 
                                                   Name     Sex        Age  \
 1    Cumings, Mrs. John Bradley (Florence Briggs Th...  female  38.000000   
 2                               Heikkinen, Miss. Laina  female  26.000000   
 3         Futrelle, Mrs. Jacques Heath (Lily May Peel)  female  35.000000   
 8    Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)  female  27.000000   
 9                  Nasser, Mrs. Nicholas (Adele Achem)  female  14.000000   
 ..                                                 ...     ..

In [23]:
model_f.fit(X_female, y_female)
fp = model_f.predict(test_set_f)

In [24]:
model_m.fit(X_male, y_male)
mp = model_m.predict(test_set_m)

In [25]:
m_pred = pd.DataFrame({'PassengerId': test_m['PassengerId'], 'Survived': mp})
f_pred = pd.DataFrame({'PassengerId': test_f['PassengerId'], 'Survived': fp})
m_pred, f_pred

(     PassengerId  Survived
 0            892         0
 2            894         0
 3            895         0
 5            897         0
 7            899         0
 ..           ...       ...
 407         1299         0
 413         1305         0
 415         1307         0
 416         1308         0
 417         1309         0
 
 [266 rows x 2 columns],
      PassengerId  Survived
 1            893         0
 4            896         1
 6            898         1
 8            900         1
 12           904         1
 ..           ...       ...
 409         1301         1
 410         1302         1
 411         1303         1
 412         1304         0
 414         1306         1
 
 [152 rows x 2 columns])

In [26]:
pred = pd.concat([m_pred, f_pred], axis=0).sort_index()

In [27]:
pred.to_csv(os.path.join('submissions', 'split_df_1.csv'), index=False)