In [2]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import r2_score, accuracy_score, roc_curve, auc
import pandas as pd
import os
import numpy as np
from matplotlib import pyplot as plt
import random
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from scipy.stats import randint

In [183]:
# Set up testing dataframes

"""                                    Data Dictionary
Variable	    Definition	                                        Key
Survived	    Survival	                                        0 = No, 1 = Yes
Pclass	        Ticket class	                                    1 = 1st, 2 = 2nd, 3 = 3rd
Name            Passenger Name
Sex	            Sex
Age	            Age in years	
SibSp	        # of siblings / spouses aboard the Titanic	
Parch	        # of parents / children aboard the Titanic	
Ticket	        Ticket number	
Fare	        Passenger fare	
Cabin	        Cabin number	
Embarked	    Port of Embarkation                                 C = Cherbourg, Q = Queenstown, S = Southampton
"""

train_set = pd.read_csv('train.csv')
train, test = train_test_split(train_set, random_state=42, test_size=0.25)

In [184]:
# Will deal with 'Ticket' and 'Cabin' columns later, because these columns will need to be classified. Possibly 'Name' as well
cat = ['Sex', 'Embarked']
num = ['Pclass', 'Age', 'Parch', 'Fare']

# Test vars in logistic regression model
y = train_set['Survived']
X = train_set[cat + num]
X = X.fillna({
    'Pclass': X['Pclass'].mean(),
    'Age': X['Age'].mean(),
    'Parch': X['Parch'].mean(),
    'Fare': X['Fare'].mean()
})

In [240]:
cat_trans = Pipeline(steps=[
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])
num_trans = Pipeline(steps=[
    ('scaler', StandardScaler())
])


preprocessor = ColumnTransformer(
    transformers=[
        ('num', num_trans, num),
        ('cat', cat_trans, cat)
    ]
)

model = Pipeline(steps=[('preprocessor', preprocessor),
                        ('classifier', RandomForestClassifier(
                            n_estimators=1000,
                            max_depth=150,
                            min_samples_split=50,
                            min_samples_leaf=25,
                            random_state=420
                        ))])


model = model.fit(X[cat+num], y)
cross_val_score(model, X, y, cv=5)

array([0.80446927, 0.80337079, 0.79213483, 0.75280899, 0.78651685])

In [230]:
"""
# test for most accurate arguments for this set
def rf_arg_test(n, md, mss, msl):
    model_rf_test = Pipeline(steps=[('preprocessor', preprocessor),
                            ('classifier', RandomForestClassifier(
                                n_estimators=n,
                                max_depth=md,
                                min_samples_split=mss,
                                min_samples_leaf=msl,
                                random_state=42
                            ))])

    model_rf_test = model_rf_test.fit(X[cat+num], y)
    cv = cross_val_score(model_rf_test, X, y, cv=5)
    return [cv.mean(), cv.std(), [n, md, mss, msl]]
"""

"\n# test for most accurate arguments for this set\ndef rf_arg_test(n, md, mss, msl):\n    model_rf_test = Pipeline(steps=[('preprocessor', preprocessor),\n                            ('classifier', RandomForestClassifier(\n                                n_estimators=n,\n                                max_depth=md,\n                                min_samples_split=mss,\n                                min_samples_leaf=msl,\n                                random_state=42\n                            ))])\n\n    model_rf_test = model_rf_test.fit(X[cat+num], y)\n    cv = cross_val_score(model_rf_test, X, y, cv=5)\n    return [cv.mean(), cv.std(), [n, md, mss, msl]]\n"

In [198]:
# set up a basic list of best performers at different values of n, mf, mss, and msl
# grabs a filtered top-ish 20 cv means at 5 different values of n
# only ran once then saved to file

# 2 < optimal mss < 10
# 1 < optimal msl < 5

# md = 10, mss = 5, msl = 1 achieves the best mean at all values of n
# higher values for n tend to be higher accuracy, but n = 100 has the max score (probably fluke)

"""
rf_tests = []
for n in [100, 250, 500, 750, 1000]:
    md_tests = []
    print(n)
    for md in [5, 10, 20, 35, 50]:
        mss_tests = []
        for mss in [2, 5, 10, 20]:
            msl_tests = []
            
            for msl in [1, 2, 5, 10]:
                msl_tests.append(rf_arg_test(n, md, mss, msl))
            msl_tests.sort(key=lambda x: x[0], reverse=True)
            mss_tests.extend(msl_tests[0:int((len(msl_tests)/2))+1])
            
        mss_tests.sort(key=lambda x: x[0], reverse=True)
        md_tests.extend(mss_tests[0:int((len(mss_tests)/2))+1])
        print('md', md, md_tests)
        
    md_tests.sort(key=lambda x: x[0], reverse=True)
    rf_tests.extend(md_tests[0:int((len(md_tests)/2))])
    print('n', n, rf_tests)
    
rf_tests
"""


"""
rf_tests.sort(key=lambda x: x[0], reverse=True)
rf_tests
with open('rf_tests', 'w') as f:
    for idx in range(len(rf_tests)):
        f.write(str(idx)+'\n\t'+str(rf_tests[idx][0])+'\n\t'+str(rf_tests[idx][1])+'\n\t\t'+str(rf_tests[idx][2][0])+'\n\t\t'+str(rf_tests[idx][2][1])+'\n\t\t'+str(rf_tests[idx][2][2])+'\n\t\t'+str(rf_tests[idx][2][3])+'\n')
"""

100
md 5 [[0.815931203314293, 0.014013808356509476, [100, 5, 2, 10]], [0.815931203314293, 0.014013808356509476, [100, 5, 5, 10]], [0.815931203314293, 0.014013808356509476, [100, 5, 10, 10]], [0.815931203314293, 0.014013808356509476, [100, 5, 20, 10]], [0.8159249262444291, 0.014089246518868186, [100, 5, 5, 2]], [0.8148138848785388, 0.012315342326766726, [100, 5, 2, 1]], [0.8148076078086749, 0.013385203525114573, [100, 5, 2, 2]]]
md 10 [[0.815931203314293, 0.014013808356509476, [100, 5, 2, 10]], [0.815931203314293, 0.014013808356509476, [100, 5, 5, 10]], [0.815931203314293, 0.014013808356509476, [100, 5, 10, 10]], [0.815931203314293, 0.014013808356509476, [100, 5, 20, 10]], [0.8159249262444291, 0.014089246518868186, [100, 5, 5, 2]], [0.8148138848785388, 0.012315342326766726, [100, 5, 2, 1]], [0.8148076078086749, 0.013385203525114573, [100, 5, 2, 2]], [0.8361559224154165, 0.022164249247972938, [100, 10, 5, 1]], [0.8305442219571904, 0.019427935836628597, [100, 10, 5, 2]], [0.8294331805913,

[[0.8361559224154165, 0.022164249247972938, [100, 10, 5, 1]],
 [0.8305442219571904, 0.019427935836628597, [100, 10, 5, 2]],
 [0.8294331805913, 0.027686052534189844, [100, 10, 2, 2]],
 [0.8271797125102003, 0.02482271702294467, [100, 35, 2, 2]],
 [0.8271797125102003, 0.02482271702294467, [100, 50, 2, 2]],
 [0.8260561170045821, 0.02368751432505222, [100, 20, 2, 2]],
 [0.8260561170045821, 0.02669446306125944, [100, 35, 5, 2]],
 [0.8260561170045821, 0.02669446306125944, [100, 50, 5, 2]],
 [0.8249513527085556, 0.027110617507397038, [100, 10, 2, 1]],
 [0.824938798568828, 0.027447089301043445, [100, 20, 5, 2]],
 [0.8238277572029377, 0.02646050685870535, [100, 20, 5, 1]],
 [0.8227041616973196, 0.024744851114454413, [100, 35, 5, 1]],
 [0.8227041616973196, 0.024744851114454413, [100, 50, 5, 1]],
 [0.822691607557592, 0.01911780119135707, [100, 20, 2, 5]],
 [0.822691607557592, 0.01911780119135707, [100, 20, 5, 5]],
 [0.822691607557592, 0.01911780119135707, [100, 20, 10, 5]],
 [0.822691607557592, 0.

In [None]:
test_set = test_set.fillna({
    'Pclass': 0,
    'Age': 0,
    'Parch': 0,
    'Fare': 0,
    'Sex': 'unknown',
    'Embarked': 'unknown'
})

In [219]:
"""
rf_tests_dict = {'cv_mean':[], 'cv_std':[], 'n':[], 'md':[], 'mss':[], 'msl':[] }
for test in rf_tests:
    rf_tests_dict['cv_mean'].append(test[0])
    rf_tests_dict['cv_std'].append(test[1])
    rf_tests_dict['n'].append(test[2][0])
    rf_tests_dict['md'].append(test[2][1])
    rf_tests_dict['mss'].append(test[2][2])
    rf_tests_dict['msl'].append(test[2][3])
rf_tests_dict
"""

{'cv_mean': [0.8361559224154165,
  0.8361559224154164,
  0.8327914129684263,
  0.8316678174628084,
  0.8316615403929445,
  0.8316615403929445,
  0.8316615403929445,
  0.8316615403929445,
  0.8305504990270542,
  0.8305504990270542,
  0.8305504990270542,
  0.8305504990270542,
  0.830550499027054,
  0.830550499027054,
  0.830550499027054,
  0.8305442219571904,
  0.8305442219571904,
  0.8305379448873266,
  0.8305379448873266,
  0.8294331805913,
  0.8294331805913,
  0.8294331805913,
  0.8294206264515724,
  0.8294206264515724,
  0.8294206264515724,
  0.8294206264515724,
  0.8294206264515724,
  0.8294206264515724,
  0.8294206264515724,
  0.8294206264515724,
  0.8294143493817085,
  0.8294143493817085,
  0.8294143493817085,
  0.8294143493817085,
  0.8294143493817085,
  0.8294143493817085,
  0.8294143493817085,
  0.8294143493817085,
  0.8283033080158182,
  0.8282970309459545,
  0.8282970309459545,
  0.8282970309459545,
  0.8282970309459545,
  0.8282970309459543,
  0.827185989580064,
  0.82718598

In [222]:
# rf_tests_df.to_csv('rf_tests.csv')

In [221]:
"""
rf_tests_df = pd.DataFrame({'cv_mean': rf_tests_dict['cv_mean'], 
                           'cv_std': rf_tests_dict['cv_std'], 
                           'n': rf_tests_dict['n'], 
                           'md': rf_tests_dict['md'], 
                           'mss': rf_tests_dict['mss'], 
                           'msl': rf_tests_dict['msl']})
rf_tests_df
"""

Unnamed: 0,cv_mean,cv_std,n,md,mss,msl
0,0.836156,0.022164,100,10,5,1
1,0.836156,0.024336,500,10,5,1
2,0.832791,0.024313,750,10,5,1
3,0.831668,0.025240,1000,10,5,1
4,0.831662,0.020917,250,10,5,1
...,...,...,...,...,...,...
80,0.822692,0.019118,100,35,2,5
81,0.821574,0.026724,250,10,10,1
82,0.821574,0.024763,250,10,10,2
83,0.821574,0.027424,250,20,10,1


In [227]:
"""
print(rf_tests_df.groupby('md').size())
print(rf_tests_df.groupby('mss').size())
print(rf_tests_df.groupby('msl').size())

adjusted_rf_df = rf_tests_df[(rf_tests_df['mss'] != 10) & (rf_tests_df['msl'] != 5)] 
adjusted_rf_df
"""

md
10    25
20    26
35    19
50    15
dtype: int64
mss
2     33
5     44
10     8
dtype: int64
msl
1    28
2    41
5    16
dtype: int64


Unnamed: 0,cv_mean,cv_std,n,md,mss,msl
0,0.836156,0.022164,100,10,5,1
1,0.836156,0.024336,500,10,5,1
2,0.832791,0.024313,750,10,5,1
3,0.831668,0.025240,1000,10,5,1
4,0.831662,0.020917,250,10,5,1
...,...,...,...,...,...,...
64,0.824939,0.027447,100,20,5,2
73,0.823828,0.026461,100,20,5,1
74,0.822704,0.024745,100,35,5,1
75,0.822704,0.024745,100,50,5,1


In [244]:
X_test = test_set[cat + num]
predictions = model.predict_proba(X_test)
predictions

array([[0.87628763, 0.12371237],
       [0.45808155, 0.54191845],
       [0.85802719, 0.14197281],
       [0.8672515 , 0.1327485 ],
       [0.48242504, 0.51757496],
       [0.79622486, 0.20377514],
       [0.37099909, 0.62900091],
       [0.72268069, 0.27731931],
       [0.35533773, 0.64466227],
       [0.86569869, 0.13430131],
       [0.75643349, 0.24356651],
       [0.75477323, 0.24522677],
       [0.09813758, 0.90186242],
       [0.85285751, 0.14714249],
       [0.09289379, 0.90710621],
       [0.13268099, 0.86731901],
       [0.85049631, 0.14950369],
       [0.82192221, 0.17807779],
       [0.47787061, 0.52212939],
       [0.36704515, 0.63295485],
       [0.62668749, 0.37331251],
       [0.73625162, 0.26374838],
       [0.14237851, 0.85762149],
       [0.59553521, 0.40446479],
       [0.12263376, 0.87736624],
       [0.87846186, 0.12153814],
       [0.09496962, 0.90503038],
       [0.8227022 , 0.1772978 ],
       [0.65299862, 0.34700138],
       [0.66707466, 0.33292534],
       [0.

In [242]:
pred_df = pd.DataFrame({
    'PassengerId':test_set['PassengerId'],
    'Survived':predictions
})

In [243]:
pred_df.to_csv('draft_submission_rf.csv', index=False)