In [39]:
import pandas as pd
import csv
import numpy as np
from sklearn.neural_network import MLPRegressor
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, accuracy_score

def age_model_producer(dataset):
    """Fit a neural-network regressor that predicts ``Age`` from the other columns.

    Rows containing any missing value are discarded before training, so only
    fully observed rows contribute to the fit.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame with an ``"Age"`` column (the regression target). Every other
        column is passed to the model as an input feature.

    Returns
    -------
    MLPRegressor
        The fitted regressor.
    """
    complete_rows = dataset.dropna()
    target = complete_rows["Age"]
    features = complete_rows.drop(columns=["Age"])

    # random_state is pinned so repeated fits start from the same initialisation.
    regressor = MLPRegressor(
        hidden_layer_sizes=(15, 10),
        activation='logistic',
        solver='sgd',
        learning_rate_init=0.05,
        max_iter=1500,
        random_state=42,
    )
    regressor.fit(features, target)
    return regressor
    
def handle_missing_numerical_mean(dataset, column_name):
    """Return ``dataset[column_name]`` with missing entries replaced by the column mean.

    The mean is computed over the non-missing values only. The input frame is
    not modified; a new Series is returned.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame holding the column to impute.
    column_name : str
        Name of the numerical column whose gaps should be filled.

    Returns
    -------
    pandas.Series
        Copy of the column with every missing value set to the observed mean.
    """
    column = dataset[column_name]
    observed_values = np.array(column.dropna())
    return column.fillna(observed_values.mean())

def handle_missing_age(dataset, method = "mean"):
    """Return the ``Age`` column of ``dataset`` with its missing values imputed.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame with an ``"Age"`` column. For the model-based strategy the
        remaining columns are fed to the regressor as features.
    method : str, default "mean"
        ``"mean"`` fills gaps with the mean of the observed ages. Any other
        value (the notebook passes ``"predict"``) fills them with predictions
        from the regressor built by ``age_model_producer``.

    Returns
    -------
    pandas.Series
        The imputed ``Age`` column, aligned with ``dataset.index``. ``dataset``
        itself is left unmodified by both strategies.
    """
    if method == "mean":
        return handle_missing_numerical_mean(dataset, "Age")

    ages = dataset["Age"].copy()
    # Positional boolean mask, so the result does not depend on index labels
    # being exactly 0..n-1 (or even unique).
    missing = ages.isna().to_numpy()
    if not missing.any():
        # Nothing to impute: skip the expensive model fit entirely.
        return ages

    model = age_model_producer(dataset)
    # One batched prediction for all rows lacking an age; only Age is written,
    # so gaps in other columns are never filled with an age value.
    features = dataset.loc[missing].drop(columns=["Age"])
    ages.loc[missing] = model.predict(features)
    return ages

# Load the training data and drop the identifier / free-text columns that are
# not used as features.
dataset = pd.read_csv("train.csv")
X = dataset.drop(columns=['PassengerId', 'Name', 'Ticket', 'Cabin'])

# Relabel passenger class with string labels; these become the column names of
# the indicator frame produced by get_dummies below.
X["Pclass"] = X["Pclass"].replace({1: "1_pclass", 2: "2_pclass", 3: "3_pclass"})

# Indicator (one-hot) columns for the categorical features.
dummies_sex = pd.get_dummies(X["Sex"])
dummies_pclass = pd.get_dummies(X["Pclass"])
dummies_embarked = pd.get_dummies(X["Embarked"])

# Rescale the numeric features with a default MinMaxScaler.
nl = MinMaxScaler()
X[["Fare", "SibSp", "Parch"]] = nl.fit_transform(X[["Fare", "SibSp", "Parch"]])

# Feature frame: numeric columns followed by the indicator columns.
x = pd.concat(
    [X[["Age", "Fare", "SibSp", "Parch"]], dummies_sex, dummies_pclass, dummies_embarked],
    axis=1,
)

# Impute missing ages with the model-based strategy (calling handle_missing_age
# with its default method would use mean imputation instead), then rescale Age.
# NOTE(review): this re-fits `nl`, so afterwards the scaler only holds the Age
# fit and no longer the Fare/SibSp/Parch one.
x["Age"] = handle_missing_age(x, method="predict")
x[["Age"]] = nl.fit_transform(x[["Age"]])

# Indicator columns for the target, one per distinct Survived value.
y = pd.get_dummies(X["Survived"])

In [40]:
# Target vector: 1 = survived, 0 = did not survive. It is read straight from the
# raw Survived column rather than from column 0 of the one-hot frame, because
# that column is the "Survived == 0" indicator and would invert the labels.
# Reading from `dataset` also keeps this cell safe to re-run.
y = dataset["Survived"].to_numpy()
# Feature matrix as a plain array for scikit-learn.
X = x.to_numpy()

## RF with Grid Hyperparameter Search

In [54]:
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold
from sklearn.ensemble import RandomForestClassifier

# Hyperparameter grid explored exhaustively by the search below.
param_grid = {
    'n_estimators': (100, 300, 500, 1000),
    'criterion': ('gini', 'entropy'),
    'min_samples_split': [2, 11],
    'min_samples_leaf': [1, 9],
}

# Seed the forest so the search is reproducible between runs; unseeded, every
# fit is randomised differently and the reported "best" setting can change.
gridSearch = GridSearchCV(RandomForestClassifier(random_state=42), param_grid=param_grid, cv=10)
gridSearch.fit(X, y)

# Winning parameter combination and its mean cross-validated score.
params = gridSearch.best_params_
score = gridSearch.best_score_

print(f"Best Parameter: {params}")
print(f"Best parameters score: {score}")

Best Parameter: {'criterion': 'gini', 'min_samples_leaf': 1, 'min_samples_split': 11, 'n_estimators': 300}
Best parameters score: 0.8317103620474408
