**Score to beat: 0.79425**

# Import statements

In [21]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

from collections import Counter

from sklearn.model_selection import train_test_split

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.model_selection import GridSearchCV

In [22]:
pd.set_option('display.max_columns', None)

# Load data

In [23]:
train = pd.read_csv('data/preproc_data/train.csv')
test = pd.read_csv('data/preproc_data/test.csv')

## Check Sizes

In [24]:
train.shape, test.shape

((881, 14), (418, 13))

In [25]:
X_train = train.drop(columns='Survived')
y_train = pd.DataFrame(train['Survived'])

X_train.head()

Unnamed: 0.1,Unnamed: 0,PassengerId,Pclass,Sex,Age,Fare,Cabin,Embarked,Title,Single,SmallF,MedF,LargeF
0,0,1,3,1,22.0,1.981001,0,0,2,0,1,0,0
1,1,2,1,0,38.0,4.266662,3,1,1,0,1,0,0
2,2,3,3,0,26.0,2.070022,0,0,1,1,0,0,0
3,3,4,1,0,35.0,3.972177,3,0,1,0,1,0,0
4,4,5,3,1,35.0,2.085672,0,0,2,1,0,0,0


In [26]:
param_grid = {
    'n_estimators': [100, 200, 300, 400, 500],
    'max_depth': [None, 10, 20, 30, 40, 50, 60],
    'min_samples_split': [2, 3, 5, 7, 10],
    'min_samples_leaf': [1, 2, 3, 4],
    'max_features': ['auto', 'sqrt', 'log2'],
    'oob_score': [True, False],
    'max_samples': [None, 0.5, 0.75]
}

In [27]:
y_train = y_train.values.ravel()

In [28]:
rf_clf = RandomForestClassifier(random_state=42)

grid_search = GridSearchCV(estimator=rf_clf, param_grid=param_grid, 
                           cv=3, n_jobs=-1, verbose=2)

grid_search.fit(X_train, y_train)

best_params = grid_search.best_params_

Fitting 3 folds for each of 12600 candidates, totalling 37800 fits


[CV] END max_depth=None, max_features=auto, max_samples=None, min_samples_leaf=1, min_samples_split=2, n_estimators=100, oob_score=True; total time=   0.0s
[CV] END max_depth=None, max_features=auto, max_samples=None, min_samples_leaf=1, min_samples_split=2, n_estimators=100, oob_score=True; total time=   0.0s
[CV] END max_depth=None, max_features=auto, max_samples=None, min_samples_leaf=1, min_samples_split=2, n_estimators=400, oob_score=True; total time=   0.0s
[CV] END max_depth=None, max_features=auto, max_samples=None, min_samples_leaf=1, min_samples_split=2, n_estimators=100, oob_score=True; total time=   0.0s
[CV] END max_depth=None, max_features=auto, max_samples=None, min_samples_leaf=1, min_samples_split=2, n_estimators=400, oob_score=False; total time=   0.0s
[CV] END max_depth=None, max_features=auto, max_samples=None, min_samples_leaf=1, min_samples_split=2, n_estimators=400, oob_score=False; total time=   0.0s
[CV] END max_depth=None, max_features=auto, max_samples=None, 

12600 fits failed out of a total of 37800.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
6851 fits failed with the following error:
Traceback (most recent call last):
  File "/root/.pyenv/versions/3.9.0/lib/python3.9/site-packages/sklearn/model_selection/_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/root/.pyenv/versions/3.9.0/lib/python3.9/site-packages/sklearn/base.py", line 1466, in wrapper
    estimator._validate_params()
  File "/root/.pyenv/versions/3.9.0/lib/python3.9/site-packages/sklearn/base.py", line 666, in _validate_params
    validate_parameter_constraints(
  File "/root/.pyenv/versions/3.9.0/lib/python3.9/site-packages/sklearn/utils/_param_validation.py", line 95, 

In [29]:
best_rf_clf = grid_search.best_estimator_

# Get feature importances
importances = best_rf_clf.feature_importances_

# Get feature names
feature_names = X_train.columns  # Assuming X is your DataFrame containing features

# Create a DataFrame to display feature importances
feature_importances = pd.DataFrame({'Feature': feature_names, 'Importance': importances})

# Sort features by importance (descending order)
feature_importances = feature_importances.sort_values(by='Importance', ascending=False)

# Display the top 10 most important features (adjust as needed)
print("Top 10 Most Important Features:")
print(feature_importances.head(10))

Top 10 Most Important Features:
        Feature  Importance
8         Title    0.233125
3           Sex    0.151802
5          Fare    0.130207
4           Age    0.090300
1   PassengerId    0.084985
2        Pclass    0.078365
0    Unnamed: 0    0.078069
6         Cabin    0.061002
12       LargeF    0.034767
7      Embarked    0.020598


In [30]:
predictions = best_rf_clf.predict(test)


In [33]:
predictions_int = predictions.astype(int)
predictions_int

array([0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1,
       1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1,
       1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1,
       1, 0, 1, 1, 1, 0, 0, 1, 1, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0,
       1, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0,
       0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1,
       1, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0,
       0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0,
       1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,
       0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,

In [35]:
# Create a submission DataFrame
submission = pd.DataFrame({
    'PassengerId': test['PassengerId'],
    'Survived': predictions_int
})

# Save the submission DataFrame to a CSV file
submission.to_csv('submissions/submission_2_0_model_1.csv', index=False)
display("Submission file generated successfully.")

'Submission file generated successfully.'