In [1]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import r2_score, accuracy_score, roc_curve, auc
import pandas as pd
import os
import numpy as np
from matplotlib import pyplot as plt
import random
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from scipy.stats import randint
from input_sampling import InputSampler

In [2]:
# Column groups used throughout the notebook: `cat` holds the categorical
# features, `num` the numeric ones.
cat = ['Sex', 'Embarked']
num = ['Pclass', 'Age', 'Parch', 'Fare', 'SibSp']

# Read the training data and replace missing values in every numeric column
# with that column's own mean. Columns outside `num` are not imputed.
train_set = pd.read_csv('train.csv').pipe(
    lambda frame: frame.fillna({column: frame[column].mean() for column in num})
)

# Feature matrix (categorical columns first, then numeric) and target vector.
X = train_set[cat + num]
y = train_set['Survived']

In [17]:
# Preprocessing: standardise the numeric columns and one-hot encode the
# categorical ones. handle_unknown='ignore' makes a category that was not
# present at fit time encode as all zeros instead of raising inside predict().
transformers = [
    ('scaler', StandardScaler(), num),
    ('onehot', OneHotEncoder(handle_unknown='ignore'), cat),
]
preprocessor = ColumnTransformer(transformers=transformers)

# Random forest with fixed hyper-parameters; they equal the best parameter
# dict recorded in the archived RandomizedSearchCV log kept further down in
# this notebook. random_state is pinned so refitting is reproducible.
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(
        bootstrap=False,
        max_depth=10,
        max_features='log2',
        min_samples_split=10,
        n_estimators=600,
        random_state=42,
    )),
])

# Fit on the full training set; as the cell's last expression this also
# displays the fitted pipeline.
pipeline.fit(X, y)

In [30]:
# Load the test set and fill gaps in the numeric features with the TRAINING
# means, so the model sees the same imputation values at inference time that
# it saw while fitting. `train_set` was itself mean-imputed, which leaves each
# column mean unchanged, so these are the values used on the training data.
test_set = pd.read_csv('test.csv')
train_means = {column: train_set[column].mean() for column in num}
test_set = test_set.fillna(train_means)

# Same column order as the training feature matrix (categorical, then numeric).
X_test = test_set[cat + num]
pred = pipeline.predict(X_test)

# Two-column frame (PassengerId, Survived) that is later written to CSV.
pred_df = pd.DataFrame({'PassengerId': test_set['PassengerId'], 'Survived': pred})

In [31]:
# Write the submission file. The output folder is created first so the cell
# also works on a fresh checkout where 'submissions' does not exist yet.
submission_dir = 'submissions'
os.makedirs(submission_dir, exist_ok=True)
pred_df.to_csv(os.path.join(submission_dir, 'rf_optimized_1.csv'), index=False)

In [4]:
"""Best performer:
"Best performer:\n[[0.8324022346368715,\n  RandomizedSearchCV(cv=5,\n                     estimator=Pipeline(steps=[('preprocessor',\n                                                ColumnTransformer(transformers=[('scaler',\n                                                                                 StandardScaler(),\n                                                                                 ['Pclass',\n                                                                                  'Age',\n                                                                                  'Parch',\n                                                                                  'Fare']),\n                                                                                ('onehot',\n                                                                                 OneHotEncoder(),\n                                                                                 ['Sex',\n                                                                                  'Embarked'])])),\n                                               ('classifier',\n                                                RandomForestClassifier(random_state=42))]),\n                     n_iter=100, n_jobs=-1,\n                     param_distributions={'classifier__bootstrap': [True, False],\n                                          'classi...\n                                          'classifier__min_samples_leaf': <scipy.stats._distn_infrastructure.rv_discrete_frozen object at 0x0000015A3FF2AA10>,\n                                          'classifier__min_samples_split': <scipy.stats._distn_infrastructure.rv_discrete_frozen object at 0x0000015A3FF2A510>,\n                                          'classifier__n_estimators': <scipy.stats._distn_infrastructure.rv_discrete_frozen object at 0x0000015A3F883810>},\n                     random_state=42, verbose=2),\n  [['Pclass', 'Age', 'Parch', 'Fare'], ['Sex', 
'Embarked']]]\n"
Fitting 5 folds for each of 250 candidates, totalling 1250 fits
[0.8156424581005587, RandomizedSearchCV(cv=5,
                   estimator=Pipeline(steps=[('preprocessor',
                                              ColumnTransformer(transformers=[('scaler',
                                                                               StandardScaler(),
                                                                               ['Pclass',
                                                                                'Age',
                                                                                'Parch',
                                                                                'Fare']),
                                                                              ('onehot',
                                                                               OneHotEncoder(),
                                                                               ['Sex',
                                                                                'Embarked'])])),
                                             ('classifier',
                                              RandomForestClassifier(random_state=42))]),
                   n_iter=250, n_jobs=-1,
                   param_distributions={'classifier__bootstrap': [True, False],
                                        'classifier__criterion': ['gini',
                                                                  'entropy'],
                                        'classifier__max_depth': [10, 20, 30,
                                                                  40, 50, 60,
                                                                  70, 80, 90,
                                                                  100, None],
                                        'classifier__max_features': [None,
                                                                     'sqrt',
                                                                     'log2'],
                                        'classifier__min_samples_leaf': [1, 2,
                                                                         4],
                                        'classifier__min_samples_split': [2, 5,
                                                                          10],
                                        'classifier__n_estimators': [100, 200,
                                                                     300, 400,
                                                                     500, 600,
                                                                     700, 800,
                                                                     900, 1000,
                                                                     1100]},
                   random_state=42, verbose=2), [['Pclass', 'Age', 'Parch', 'Fare'], ['Sex', 'Embarked']], [{'classifier__n_estimators': 600, 'classifier__min_samples_split': 10, 'classifier__min_samples_leaf': 1, 'classifier__max_features': 'log2', 'classifier__max_depth': 10, 'classifier__criterion': 'gini', 'classifier__bootstrap': False}, 0.8258839751797499, Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('scaler', StandardScaler(),
                                                  ['Pclass', 'Age', 'Parch',
                                                   'Fare']),
                                                 ('onehot', OneHotEncoder(),
                                                  ['Sex', 'Embarked'])])),
                ('classifier',
                 RandomForestClassifier(bootstrap=False, max_depth=10,
                                        max_features='log2',
                                        min_samples_split=10, n_estimators=600,
                                        random_state=42))])]]
Highest = 0
Iteration = 1
"""

Fitting 5 folds for each of 100 candidates, totalling 500 fits
[0.7039106145251397, RandomizedSearchCV(cv=5,
                   estimator=Pipeline(steps=[('preprocessor',
                                              ColumnTransformer(transformers=[('scaler',
                                                                               StandardScaler(),
                                                                               ['Pclass'])])),
                                             ('classifier',
                                              RandomForestClassifier(random_state=42))]),
                   n_iter=100, n_jobs=-1,
                   param_distributions={'classifier__bootstrap': [True, False],
                                        'classifier__criterion': ['gini',
                                                                  'entropy'],
                                        'classifier__max_depth': <scipy....
                                        'classif

In [None]:
# type(sampler.var_combinations[3][1])

list

In [None]:
# models = sampler.sampleLogR()

In [None]:
# top_performers = models[0:5]

In [None]:
# test_set = pd.read_csv('test.csv')
# test_set = test_set.fillna({
#     'Pclass': test_set['Pclass'].mean(),
#     'Age': test_set['Age'].mean(),
#     'Parch': test_set['Parch'].mean(),
#     'Fare': test_set['Fare'].mean(),
#     'SibSp': test_set['SibSp'].mean()
# })

In [None]:
# predictions = []
# for logr in top_performers:
#     model = logr[1].fit(train_set[logr[2][0]+logr[2][1]], train_set['Survived'])
#     predictions.append((model.predict_proba(test_set[logr[2][0]+logr[2][1]])[:, 1] >= logr[3]).astype(int))

In [None]:
# prediction_dfs = []
# for y in predictions:
#     df = pd.DataFrame({'PassengerId': test_set['PassengerId'], 'Survived': list(y)})
#     prediction_dfs.append(df)

In [None]:
# prediction_dfs[0].to_csv('submission_df0_logr.csv', index=False) # .75119
# prediction_dfs[1].to_csv('submission_df1_logr.csv', index=False) # .74880
# prediction_dfs[2].to_csv('submission_df2_logr.csv', index=False) # .74641
# prediction_dfs[3].to_csv('submission_df3_logr.csv', index=False) # .74641
# prediction_dfs[4].to_csv('submission_df4_logr.csv', index=False) # .74401

In [None]:
# aggregate_prediction = []
# for idx in range(len(predictions[0])):
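#     ct = {0:0, 1:1}  # FIXME: class 1 starts with one free vote, so 2 of the 5 models voting 1 already wins; a fair majority vote needs {0:0, 1:0}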
#     for prediction in prediction_dfs:
#         if prediction['Survived'].iloc[idx] == 1:
#             ct[1] += 1
#         else:
#             ct[0] += 1
#     if ct[0] > ct[1]:
#         aggregate_prediction.append(0)
#     else:
#         aggregate_prediction.append(1)
# aggregate_prediction

[0,
 0,
 0,
 0,
 1,
 0,
 1,
 0,
 1,
 0,
 0,
 0,
 1,
 0,
 1,
 1,
 0,
 0,
 1,
 1,
 0,
 0,
 1,
 1,
 1,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 1,
 1,
 1,
 0,
 1,
 1,
 0,
 0,
 0,
 1,
 0,
 1,
 1,
 0,
 1,
 0,
 1,
 1,
 1,
 0,
 1,
 1,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 1,
 1,
 1,
 1,
 0,
 1,
 1,
 1,
 0,
 1,
 1,
 1,
 1,
 0,
 1,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 1,
 1,
 0,
 1,
 0,
 1,
 0,
 1,
 0,
 1,
 0,
 1,
 0,
 1,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 1,
 1,
 1,
 0,
 0,
 1,
 1,
 1,
 1,
 0,
 1,
 0,
 0,
 1,
 0,
 1,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 1,
 0,
 1,
 0,
 1,
 0,
 0,
 1,
 0,
 0,
 1,
 1,
 0,
 1,
 1,
 0,
 1,
 0,
 0,
 1,
 1,
 0,
 1,
 1,
 0,
 0,
 0,
 0,
 0,
 1,
 1,
 0,
 1,
 1,
 0,
 1,
 1,
 0,
 1,
 0,
 1,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 1,
 0,
 1,
 1,
 0,
 1,
 1,
 0,
 1,
 1,
 0,
 1,
 1,
 0,
 1,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 1,
 0,
 1,
 0,
 1,
 0,
 1,
 0,
 1,
 1,
 0,
 1,
 0,
 0,
 0,
 1,
 0,
 0,
 1,
 0,
 0,
 0,
 1,
 1,
 1,
 1,
 1,
 0,
 0,
 0,
 1,
 0,
 1,
 1,


In [None]:
# agg_df = pd.DataFrame({'PassengerId': test_set['PassengerId'], 'Survived': aggregate_prediction})
# agg_df.to_csv('submission_agg_0-5_logr.csv', index=False)