In [1]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import r2_score, accuracy_score, roc_curve, auc
import pandas as pd
import os
import numpy as np
from matplotlib import pyplot as plt
import random
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from scipy.stats import randint
from input_sampling import InputSampler

In [2]:
# Set up testing dataframes

"""                                    Data Dictionary
Variable	    Definition	                                        Key
Survived	    Survival	                                        0 = No, 1 = Yes
Pclass	        Ticket class	                                    1 = 1st, 2 = 2nd, 3 = 3rd
Name            Passenger Name
Sex	            Sex
Age	            Age in years	
SibSp	        # of siblings / spouses aboard the Titanic	
Parch	        # of parents / children aboard the Titanic	
Ticket	        Ticket number	
Fare	        Passenger fare	
Cabin	        Cabin number	
Embarked	    Port of Embarkation                                 C = Cherbourg, Q = Queenstown, S = Southampton
"""

train_set = pd.read_csv('train.csv')
train, test = train_test_split(train_set, random_state=42, test_size=0.25)

cat = ['Sex', 'Embarked']
num = ['Pclass', 'Age', 'Parch', 'Fare']
y = train_set['Survived']
X = train_set[cat + num]

In [3]:
# Clean & reset variables
# Useful so I can play around with num, cat, X, y, and train_set as much as I want and quickly reset them for further tests

def reset_vars():
    global cat, num, y, X, train_set
    # Will deal with 'Ticket' and 'Cabin' columns later, because these columns will need to be classified. Possibly 'Name' as well
    train_set = pd.read_csv('train.csv')
    cat = ['Sex', 'Embarked']
    num = ['Pclass', 'Age', 'Parch', 'Fare']

# Test vars in logistic regression model
    y = train_set['Survived']
    X = train_set[cat + num]
    X = X.fillna({
    'Pclass': X['Pclass'].mean(),
    'Age': X['Age'].mean(),
    'Parch': X['Parch'].mean(),
    'Fare': X['Fare'].mean()
})

reset_vars()

In [4]:
# Preprocessor setup
# ColumnTransformer object for scaling numerical vars and encoding non-ordinal categorical vars

cat_trans = Pipeline(steps=[
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])
num_trans = Pipeline(steps=[
    ('scaler', StandardScaler())
])

# update preprocessor with current 'cat' and 'num' inputs
def update_preprocessor():
    global num, cat
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', num_trans, num),
            ('cat', cat_trans, cat)
        ]
    )
    return preprocessor

preprocessor = update_preprocessor()

In [6]:
model = Pipeline(steps=[('preprocessor', preprocessor),
                        ('classifier', RandomForestClassifier(
                            n_estimators=1000,
                            max_depth=150,
                            min_samples_split=50,
                            min_samples_leaf=25,
                            random_state=420
                        ))])

In [7]:
model.fit(X, y)

In [8]:
"""
# test for most accurate arguments for this set
def rf_arg_test(n, md, mss, msl):
    model_rf_test = Pipeline(steps=[('preprocessor', preprocessor),
                            ('classifier', RandomForestClassifier(
                                n_estimators=n,
                                max_depth=md,
                                min_samples_split=mss,
                                min_samples_leaf=msl,
                                random_state=42
                            ))])

    model_rf_test = model_rf_test.fit(X[cat+num], y)
    cv = cross_val_score(model_rf_test, X, y, cv=5)
    return [cv.mean(), cv.std(), [n, md, mss, msl]]
"""

"\n# test for most accurate arguments for this set\ndef rf_arg_test(n, md, mss, msl):\n    model_rf_test = Pipeline(steps=[('preprocessor', preprocessor),\n                            ('classifier', RandomForestClassifier(\n                                n_estimators=n,\n                                max_depth=md,\n                                min_samples_split=mss,\n                                min_samples_leaf=msl,\n                                random_state=42\n                            ))])\n\n    model_rf_test = model_rf_test.fit(X[cat+num], y)\n    cv = cross_val_score(model_rf_test, X, y, cv=5)\n    return [cv.mean(), cv.std(), [n, md, mss, msl]]\n"

In [9]:
# set up a basic list of best performers at different values of n, mf, mss, and msl
# grabs a filtered top-ish 20 cv means at 5 different values of n
# only ran once then saved to file

# 2 < optimal mss < 10
# 1 < optimal msl < 5

# md = 10, mss = 5, msl = 1 achieves the best mean at all values of n
# higher values for n tend to be higher accuracy, but n = 100 has the max score (probably fluke)

"""
rf_tests = []
for n in [100, 250, 500, 750, 1000]:
    md_tests = []
    print(n)
    for md in [5, 10, 20, 35, 50]:
        mss_tests = []
        for mss in [2, 5, 10, 20]:
            msl_tests = []
            
            for msl in [1, 2, 5, 10]:
                msl_tests.append(rf_arg_test(n, md, mss, msl))
            msl_tests.sort(key=lambda x: x[0], reverse=True)
            mss_tests.extend(msl_tests[0:int((len(msl_tests)/2))+1])
            
        mss_tests.sort(key=lambda x: x[0], reverse=True)
        md_tests.extend(mss_tests[0:int((len(mss_tests)/2))+1])
        print('md', md, md_tests)
        
    md_tests.sort(key=lambda x: x[0], reverse=True)
    rf_tests.extend(md_tests[0:int((len(md_tests)/2))])
    print('n', n, rf_tests)
    
rf_tests
"""


"""
rf_tests.sort(key=lambda x: x[0], reverse=True)
rf_tests
with open('rf_tests', 'w') as f:
    for idx in range(len(rf_tests)):
        f.write(str(idx)+'\n\t'+str(rf_tests[idx][0])+'\n\t'+str(rf_tests[idx][1])+'\n\t\t'+str(rf_tests[idx][2][0])+'\n\t\t'+str(rf_tests[idx][2][1])+'\n\t\t'+str(rf_tests[idx][2][2])+'\n\t\t'+str(rf_tests[idx][2][3])+'\n')
"""

"\nrf_tests.sort(key=lambda x: x[0], reverse=True)\nrf_tests\nwith open('rf_tests', 'w') as f:\n    for idx in range(len(rf_tests)):\n        f.write(str(idx)+'\n\t'+str(rf_tests[idx][0])+'\n\t'+str(rf_tests[idx][1])+'\n\t\t'+str(rf_tests[idx][2][0])+'\n\t\t'+str(rf_tests[idx][2][1])+'\n\t\t'+str(rf_tests[idx][2][2])+'\n\t\t'+str(rf_tests[idx][2][3])+'\n')\n"

In [10]:
test_set = test_set.fillna({
    'Pclass': 0,
    'Age': 0,
    'Parch': 0,
    'Fare': 0,
    'Sex': 'unknown',
    'Embarked': 'unknown'
})

NameError: name 'test_set' is not defined

In [None]:
"""
rf_tests_dict = {'cv_mean':[], 'cv_std':[], 'n':[], 'md':[], 'mss':[], 'msl':[] }
for test in rf_tests:
    rf_tests_dict['cv_mean'].append(test[0])
    rf_tests_dict['cv_std'].append(test[1])
    rf_tests_dict['n'].append(test[2][0])
    rf_tests_dict['md'].append(test[2][1])
    rf_tests_dict['mss'].append(test[2][2])
    rf_tests_dict['msl'].append(test[2][3])
rf_tests_dict
"""

In [None]:
# rf_tests_df.to_csv('rf_tests.csv')

In [None]:
"""
rf_tests_df = pd.DataFrame({'cv_mean': rf_tests_dict['cv_mean'], 
                           'cv_std': rf_tests_dict['cv_std'], 
                           'n': rf_tests_dict['n'], 
                           'md': rf_tests_dict['md'], 
                           'mss': rf_tests_dict['mss'], 
                           'msl': rf_tests_dict['msl']})
rf_tests_df
"""

In [None]:
"""
print(rf_tests_df.groupby('md').size())
print(rf_tests_df.groupby('mss').size())
print(rf_tests_df.groupby('msl').size())

adjusted_rf_df = rf_tests_df[(rf_tests_df['mss'] != 10) & (rf_tests_df['msl'] != 5)] 
adjusted_rf_df
"""

In [None]:
X_test = test_set[cat + num]
predictions = model.predict_proba(X_test)
predictions

In [None]:
pred_df = pd.DataFrame({
    'PassengerId':test_set['PassengerId'],
    'Survived':predictions
})

In [None]:
pred_df.to_csv('draft_submission_rf.csv', index=False) # .74401