In [None]:
import logging
from typing import Dict, Tuple
import os
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn import metrics

%config Completer.use_jedi = False

In [None]:
def get_score(name_metric: str, y_true: pd.Series, y_pred: np.ndarray) -> float:
    """Compute a metric from ``sklearn.metrics`` by name, rounded to 2 decimals.

    Parameters
    ----------
    name_metric : str
        Name of a callable attribute of ``sklearn.metrics``
        (e.g. ``"accuracy_score"``). Names starting with ``"__"`` are rejected.
    y_true : pd.Series
        Ground-truth values, passed to the metric as its first positional argument.
    y_pred : np.ndarray
        Predicted values, passed to the metric as its second positional argument.

    Returns
    -------
    float
        ``round(score, 2)``, or ``np.nan`` when ``name_metric`` does not name a
        usable metric (a warning is logged in that case so typos are visible).
    """
    # Direct attribute lookup instead of enumerating the whole namespace on
    # every call; non-string and dunder names are treated as "unknown".
    metric_fn = None
    if isinstance(name_metric, str) and not name_metric.startswith("__"):
        metric_fn = getattr(metrics, name_metric, None)

    if not callable(metric_fn):
        # Keep the NaN contract for callers, but do not fail silently.
        logging.getLogger(__name__).warning(
            "Unknown metric %r: no such callable in sklearn.metrics; returning NaN.",
            name_metric,
        )
        return np.nan

    score = metric_fn(y_true, y_pred)
    return round(score, 2)


def split_data(df: pd.DataFrame, param: Dict) -> Tuple:
    """Split a frame into train/test feature matrices and target vectors.

    Parameters
    ----------
    df : pd.DataFrame
        Input table containing the feature columns and the target column.
    param : Dict
        Settings for the split. Required keys: ``"features"`` (columns used
        as predictors), ``"test_size"`` and ``"random_state"`` (both forwarded
        to ``train_test_split``). Optional key: ``"target"`` (name of the
        label column; defaults to ``"Survived"``).

    Returns
    -------
    Tuple
        ``(X_train, X_test, y_train, y_test)``.
    """
    # The target column used to be hard-coded; keep "Survived" as the default
    # so existing parameter dicts continue to work unchanged.
    target = param.get("target", "Survived")
    X = df[param["features"]]
    y = df[target]
    X_train, X_test, y_train, y_test = train_test_split(
        X,
        y,
        test_size=param["test_size"],
        random_state=param["random_state"],
    )
    return X_train, X_test, y_train, y_test
    
    
def train_model(X_train: pd.DataFrame, y_train: pd.Series, param: Dict) -> RandomForestClassifier:
    """Fit a random-forest classifier on the training data.

    Parameters
    ----------
    X_train : pd.DataFrame
        Training feature matrix.
    y_train : pd.Series
        Training labels.
    param : Dict
        Model settings. Required key: ``"n_estimators"``. Optional key:
        ``"random_state"``; when present it seeds the forest so repeated runs
        give the same model, when absent the estimator stays unseeded.

    Returns
    -------
    RandomForestClassifier
        The fitted estimator.
    """
    rndm_forest = RandomForestClassifier(
        n_estimators=param["n_estimators"],
        # Without a seed the bootstrap/feature sampling differs on every run,
        # so scores are not reproducible; None keeps the library default.
        random_state=param.get("random_state"),
    )
    rndm_forest.fit(X_train, y_train)
    return rndm_forest

In [None]:
# Load the preprocessed input table; the relative path is resolved against
# the kernel's current working directory.
df = pd.read_csv("../data/02_intermediate/preprocessed_input_data.csv")

In [None]:
# Run configuration: split_data reads "features", "test_size" and
# "random_state"; train_model reads "n_estimators".
param = dict(
    test_size=0.2,
    random_state=17,
    n_estimators=100,
    metric="accuracy_score",
    features=["Pclass", "Sex", "SibSp", "Parch", "Embarked"],
)

In [None]:
# Partition the loaded frame into train/test features and labels using the
# settings in `param`.
X_train, X_test, y_train, y_test = split_data(df=df, param=param)

In [None]:
# Fit the random-forest classifier on the training partition.
mdl = train_model(X_train=X_train, y_train=y_train, param=param)

In [None]:
# Predict labels for the held-out test features with the fitted model.
y_pred = mdl.predict(X_test)

In [None]:
# Score the predictions with the metric named in the configuration instead of
# a hard-coded string, so `param` stays the single place to change it.
get_score(param["metric"], y_test, y_pred)