In [None]:
import os
import pandas as pd
import seaborn as sns
from matplotlib import rcParams
from sklearn.metrics import accuracy_score, recall_score, f1_score, precision_score

In [None]:
# Install the seaborn "darkgrid" style globally. `sns.axes_style(...)` merely
# returns the style parameters (or acts as a context manager), so calling it
# bare changes nothing; `sns.set_style` is the call that applies the style.
sns.set_style("darkgrid")
# Default (width, height) in inches for every matplotlib figure created below.
rcParams['figure.figsize'] = 12, 4

In [None]:
def collect_all_files(path, files):
    """Recursively add every regular file at or below ``path`` to ``files``.

    Parameters
    ----------
    path : str
        A file or directory path. Anything that is not a regular file is
        listed with ``os.listdir`` (which raises if it is not a directory).
    files : set
        Collection mutated in place; each file found is added as the path
        built by joining ``path`` with the entries below it.

    Returns
    -------
    None
    """
    if not os.path.isfile(path):
        # Descend into every entry of the directory.
        for entry in os.listdir(path):
            collect_all_files(os.path.join(path, entry), files)
        return
    files.add(path)

def all_files_with_text(path, text):
    """Return the files under ``path`` whose path contains ``text``.

    Parameters
    ----------
    path : str
        File or directory to search recursively.
    text : str
        Case-sensitive substring matched against the *full* path of each
        file (directory components included), not just the file name.

    Returns
    -------
    list of str
        Matching paths in sorted order. Sorting matters: the paths are
        gathered in a set, whose iteration order for strings can change from
        one interpreter session to the next, which would otherwise change
        the order in which tables are concatenated downstream.
    """
    files = set()
    collect_all_files(path, files)
    return sorted(filename for filename in files if text in filename)

def read_csv(path, text):
    """Load and stack every CSV under ``path`` whose path contains ``text``.

    Each file is read as latin-1, its column names are lower-cased and the
    byte-order-mark artefact on the ``tipodni`` header is removed *before*
    the tables are concatenated, so files whose headers differ only by case
    or by the BOM prefix line up in the same columns instead of producing
    duplicated column names.

    Parameters
    ----------
    path : str
        Directory (or file) searched recursively.
    text : str
        Case-sensitive substring that selects the files to load.

    Returns
    -------
    pandas.DataFrame
        The concatenated tables with exact duplicate rows removed and a
        fresh 0..n-1 index.

    Raises
    ------
    ValueError
        If no file matches ``text``.
    """
    tables = all_files_with_text(path, text)
    if not tables:
        raise ValueError(
            "No files containing '{text}' found under '{path}'".format(text=text, path=path)
        )

    def _read_table(file):
        # Read one file and normalise its header.
        table = pd.read_csv(file, low_memory=False, encoding='latin-1')
        table.columns = [column.lower() for column in table.columns]
        # A UTF-8 BOM decoded as latin-1 shows up as 'ï»¿' glued to the first header.
        return table.rename(columns={'ï»¿tipodni' : 'tipodni'})

    df = pd.concat([_read_table(file) for file in tables])
    return df.drop_duplicates().reset_index(drop=True)

def nan_len_and_proportion(df, column):
    """Return how many rows of ``df`` have a missing ``column`` and their share.

    Returns
    -------
    tuple
        ``(n_missing, n_missing / len(df))``. An empty ``df`` raises
        ``ZeroDivisionError``.
    """
    missing_rows = df.loc[df[column].isna()]
    missing_count = len(missing_rows)
    total_rows = len(df)
    return missing_count, missing_count / total_rows

def count_coulumns(df, columns):
    """Count how many rows share each distinct combination of ``columns``.

    Parameters
    ----------
    df : pandas.DataFrame
    columns : list of str
        Columns that form the combination to count.

    Returns
    -------
    pandas.DataFrame
        One row per distinct combination, with the key columns followed by
        an ``occurrences`` column, ordered from most to least frequent.
        Rows with a missing value in any key column are not counted
        (``value_counts`` drops them by default).

    Notes
    -----
    The count column is named explicitly via ``reset_index(name=...)``
    instead of renaming a column called ``count`` afterwards; this does not
    depend on how the installed pandas names the ``value_counts`` result and
    also works when one of the key columns is itself called ``count``.
    """
    return df[columns].value_counts(subset=columns).reset_index(name='occurrences')

def print_proportions(df):
    """Cross-tabulate the labelled outcome against the labelled PIM3 prediction.

    ``restrat == 5.0`` is labelled 'Muere' and every other value 'Vive';
    ``probpim3 < 0.5`` is labelled 'Vive' and every other value 'Muere'.
    Despite its name the function prints nothing: it returns the table of
    (reason, prediction) combinations with their ``occurrences``.
    """
    def _reason_label(value):
        if value == 5.0:
            return 'Muere'
        return 'Vive'

    def _prediction_label(value):
        if value < 0.5:
            return 'Vive'
        return 'Muere'

    labelled = pd.DataFrame()
    labelled['reason'] = df.restrat.map(_reason_label)
    labelled['prediction'] = df.probpim3.map(_prediction_label)
    return count_coulumns(labelled, ['reason', 'prediction'])

def check_duplicates(df, query):
    """Return the rows of ``df`` selected by the pandas ``query`` expression.

    The original row labels are kept in a new leading ``index`` column so
    the matching rows can later be dropped from ``df`` by label.
    """
    matching_rows = df.query(query)
    return matching_rows.reset_index()

def check_fivarapa_duplicates(df, dni, fecing='01/01/2020'):
    """Return the 'D.N.I.' rows of ``df`` for one patient id and admission date.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``tipodni``, ``dni`` and ``fecing`` columns.
    dni : str
        Patient identifier to look up (compared as text).
    fecing : str, optional
        Admission date exactly as stored in ``df.fecing``. Defaults to
        '01/01/2020', the only date previously supported.

    Returns
    -------
    pandas.DataFrame
        Matching rows with their original labels in a leading ``index``
        column.

    Notes
    -----
    Rows are selected with boolean masks rather than by interpolating
    ``dni`` into a ``DataFrame.query`` string, so identifiers containing
    quotes or other special characters cannot break or alter the filter.
    """
    matches = (
        (df['tipodni'] == 'D.N.I.')
        & (df['dni'] == str(dni))
        & (df['fecing'] == fecing)
    )
    return df[matches].reset_index()

# EDA

### Read CSVs

In [None]:
# Load the three source tables from the same directory; each one is selected by a
# case-sensitive fragment of its file path ('FiPim' does not match the 'FiPIM3' files).
df_fivarapa, df_pim_2, df_pim_3 = (
    read_csv('../resources/tables/SATIQ-PEDIATRICOS/', path_fragment)
    for path_fragment in ('FIVARAPA', 'FiPim', 'FiPIM3')
)

### FIVARAPA

In [None]:
df_fivarapa_occurrences = count_coulumns(df_fivarapa, ['tipodni', 'dni', 'fecing'])

In [None]:
df_fivarapa_occurrences[df_fivarapa_occurrences.occurrences > 1]

#### Remove duplicates so that (tipodni, dni, fecing) is a unique key

In [None]:
# Fetch the rows behind each of the five patient ids inspected below
# (all looked up on the helper's default admission date).
duplicates_1, duplicates_2, duplicates_3, duplicates_4, duplicates_5 = (
    check_fivarapa_duplicates(df_fivarapa, duplicated_dni)
    for duplicated_dni in (
        '1Q8QOC3m@aiI7h003005',
        'fOIyvEV@iUpqUd000028',
        'EcA`B8zh6BNX06001879',
        '2rXGIlg7Jgs9K5002598',
        'gj7eh3wDvW`tX9000218',
    )
)

In [None]:
duplicates_1

In [None]:
duplicates_2

In [None]:
duplicates_3

In [None]:
duplicates_4

In [None]:
duplicates_5

In [None]:
duplicates_1.isna().sum(axis=1)

In [None]:
duplicates_2.isna().sum(axis=1)

In [None]:
duplicates_3.isna().sum(axis=1)

In [None]:
duplicates_4.isna().sum(axis=1)

In [None]:
duplicates_5.isna().sum(axis=1)

In [None]:
# For each duplicated admission drop its second matching row; the 'index' column
# of the look-up frames holds the row's label in df_fivarapa.
df_fivarapa = df_fivarapa.drop([
    duplicate_rows.iloc[1]['index']
    for duplicate_rows in (duplicates_1, duplicates_2, duplicates_3, duplicates_4, duplicates_5)
])

In [None]:
count_coulumns(df_fivarapa, ['tipodni', 'dni', 'fecing'])

In [None]:
df_fivarapa.edad.min(), df_fivarapa.edad.max()

In [None]:
# Preview of edad clipped to the [0, 130] range.
# NOTE(review): the clipped Series is only displayed; it is never assigned back,
# so df_fivarapa.edad itself is left unchanged by this cell.
c = df_fivarapa.edad
c.clip(lower=0, upper=130)

In [None]:
f = sns.displot(df_fivarapa[(df_fivarapa.edad >= 0) & (df_fivarapa.edad <= 216)].edad, aspect=2.5, )
f.set_axis_labels(x_var='Age', y_var='Count')

In [None]:
len(df_fivarapa[df_fivarapa.edad > 120])

In [None]:
f = sns.displot(df_fivarapa[(df_fivarapa.edad >= 216) & (df_fivarapa.edad <= 500)].edad, aspect=2.5, )
f.set_axis_labels(x_var='Age', y_var='Count')

In [None]:
# Keep only rows with 0 <= edad <= 216; rows with a missing edad fail both
# comparisons and are dropped as well.
# NOTE(review): edad is presumably expressed in months (216 = 18 years) — confirm.
df_fivarapa = df_fivarapa[(df_fivarapa.edad >= 0) & (df_fivarapa.edad <= 216)].reset_index(drop=True)

In [None]:
df_fivarapa

In [None]:
df_fivarapa.isna().any()

In [None]:
df_fivarapa.restrat.unique()

In [None]:
len(df_fivarapa[df_fivarapa.restrat == 12.0])

In [None]:
df_fivarapa = df_fivarapa[df_fivarapa.restrat != 12.0]

In [None]:
sum(df_fivarapa.traq.isna())

In [None]:
sum(df_fivarapa.sexo.isna())

In [None]:
sum(df_fivarapa.sf.isna())

In [None]:
df_fivarapa = df_fivarapa[df_fivarapa.traq.notna()]

In [None]:
df_fivarapa = df_fivarapa[df_fivarapa.sexo.notna()]

In [None]:
df_fivarapa = df_fivarapa[df_fivarapa.sf.notna()]

In [None]:
# Remove columns that are not used further on; errors='ignore' silently skips
# any of them that is not present in the frame.
df_fivarapa = df_fivarapa.drop(columns=['traqfi', 'traqff', 'fecingh', 'fecegrh', 'resultadoegresoh', 'dependencia'], errors='ignore')

In [None]:
df_fivarapa = df_fivarapa.reset_index(drop=True)

In [None]:
len(df_fivarapa)

In [None]:
df_fivarapa.isna().any()

In [None]:
df_fivarapa.fecegr.min(), df_fivarapa.fecegr.max()

In [None]:
df_fivarapa.fecing.min(), df_fivarapa.fecing.max()

In [None]:
# Keep only the text before the first space of fecegr (dropping any time part),
# then parse it as a day-first date.
df_fivarapa.fecegr = df_fivarapa.fecegr.str.split(' ', expand=True)[0]
df_fivarapa.fecegr = pd.to_datetime(df_fivarapa.fecegr, dayfirst=True)

In [None]:
# Discard the admission dated year 1013, which cannot be represented as a pandas
# timestamp. The explicit copy makes the result an independent frame, so the column
# assignments in the following cells do not trigger SettingWithCopyWarning.
df_fivarapa = df_fivarapa[df_fivarapa.fecing != '23/01/1013'].copy()

In [None]:
# Keep only the text before the first space of fecing (dropping any time part),
# then parse it as a day-first date.
df_fivarapa.fecing = df_fivarapa.fecing.str.split(' ', expand=True)[0]
df_fivarapa.fecing = pd.to_datetime(df_fivarapa.fecing, dayfirst=True)

In [None]:
# Time between admission and discharge stored as an integer: Timedelta.value is the
# difference expressed in nanoseconds.
# NOTE(review): if either date is missing the difference is NaT, whose .value is the
# int64 minimum sentinel rather than NaN — confirm no NaT reaches this point.
df_fivarapa['duration'] = (df_fivarapa.fecegr - df_fivarapa.fecing).map(lambda value : value.value)

In [None]:
df_fivarapa

In [None]:
df_fivarapa.isna().any()

In [None]:
df_fivarapa.dtypes

### PIM 2 vs PIM 3

In [None]:
set(df_pim_2.columns) ^ set(df_pim_3.columns)

In [None]:
len(df_pim_3)

### PIM 3

In [None]:
df_pim_3

In [None]:
df_pim_3_occurrences = count_coulumns(df_pim_3, ['tipodni', 'dni', 'fecing'])

In [None]:
df_pim_3_occurrences

In [None]:
df_pim_3.isna().any()

In [None]:
sum(df_pim_3.admisionelectiva.isna())

In [None]:
df_pim_3 = df_pim_3[df_pim_3.admisionelectiva.notna()]

In [None]:
df_pim_3.bypass.unique()

In [None]:
df_pim_3 = df_pim_3.drop(columns=['bypass'])

In [None]:
df_pim_3 = df_pim_3.reset_index(drop=True)

In [None]:
# Plausibility filters on the PIM3 table:
#   - keep rows with a strictly positive probpim3;
#   - keep rows with presionarterial >= 30;
#   - drop rows whose fio2 lies strictly between 0 and 0.21 (exactly 0 is kept).
# Rows with NaN in probpim3 or presionarterial fail the comparisons and are dropped.
# NOTE(review): 0.21 is presumably room-air FiO2 and 0 a "not recorded" marker — confirm.
df_pim_3 = df_pim_3[df_pim_3.probpim3 > 0.0]
df_pim_3 = df_pim_3[df_pim_3.presionarterial >= 30]
df_pim_3 = df_pim_3[~((df_pim_3.fio2 < 0.21) & (df_pim_3.fio2 > 0.0))].reset_index(drop=True)

In [None]:
df_pim_3

In [None]:
df_pim_3.isna().any()

In [None]:
len(df_pim_3[df_pim_3.fecing == '25/04/0201'])

In [None]:
# Discard the admission dated year 0201, which cannot be represented as a pandas
# timestamp. The explicit copy makes the result an independent frame, so the column
# assignments in the following cell do not trigger SettingWithCopyWarning.
df_pim_3 = df_pim_3[df_pim_3.fecing != '25/04/0201'].copy()

In [None]:
# Keep only the text before the first space of fecing (dropping any time part),
# then parse it as a day-first date.
df_pim_3.fecing = df_pim_3.fecing.str.split(' ', expand=True)[0]
df_pim_3.fecing = pd.to_datetime(df_pim_3.fecing, dayfirst=True)

### MOTING

In [None]:
df_moting = read_csv('../resources/tables/SATIQ-PEDIATRICOS/', 'MotingP')

In [None]:
len(df_moting)

In [None]:
count = count_coulumns(df_moting, ['tipodni', 'dni', 'fecing'])

In [None]:
count = count[count.occurrences > 1]

In [None]:
# Remove every row whose (tipodni, dni, fecing) key occurs more than once, so the
# triple becomes a unique key (keep=False discards all copies of a repeated key).
# Testing the full key per row avoids two problems of independent `isin` checks on
# dni and fecing: rows that merely share a dni with one duplicate and a date with
# another were removed too, and duplicates of any tipodni other than 'D.N.I.' were kept.
df_moting = df_moting[
    ~df_moting.duplicated(subset=['tipodni', 'dni', 'fecing'], keep=False)
].reset_index(drop=True)

In [None]:
len(df_moting)

In [None]:
count_coulumns(df_moting, ['tipodni', 'dni', 'fecing'])

In [None]:
df_moting.pating = df_moting.pating.replace(to_replace='Otras', value='Otros')

In [None]:
len(df_moting)

In [None]:
# Consolidation table for df_moting.pating: raw category -> grouped category.
# Keys are the raw values exactly as they appear after read_csv decodes the files as
# latin-1, which is why accented letters show up garbled (e.g. 'Ã³'); they must stay
# byte-for-byte as they are for the lookup to succeed. Values are the grouped names.
# NOTE(review): the bare trailing '#' on some entries presumably flags the sepsis/shock-like
# categories listed in the commented-out check in the next cell — confirm.
pating_map = {
    "NeurolÃ³gico" : "Neurológico",
    "Convulsiones" : "Neurológico",
    "Coma" : "Neurológico",
    "DepresiÃ³n del sensorio" :  "Neurológico",
    "TÃ³rax" : "Externa",
    "Insuficiencia respiratoria" :  "Respiratorio",
    "Sepsis mÃ©dica" : "Otros", #
    "Politraumatismo con TEC" : "Externa",
    "Ortopedia" : "Externa",
    "Otros" : "Otros",
    "Abdomen" : "Postquirúrgico",
    "MonitorizaciÃ³n-Vigilancia" : "Otros",
    "Shock" : "Otros", #
    "PlÃ¡stica" : "Postquirúrgico",
    "NeurocirugÃ­a" : "Postquirúrgico",
    "Crisis hipertensiva" : "Cardiovascular",
    "Insuficiencia cardÃ­aca" : "Cardiovascular",
    "Respiratorio" : "Respiratorio",
    "PatologÃ­a infecciosa" : "Otros", #
    "Politraumatismo sin TEC" : "Externa",
    "Disnea" : "Respiratorio",
    "Accidente cerebro-vascular" : "Neurológico",
    "MÃ¡xilo facial" : "Postquirúrgico",
    "Inestabilidad hemodinÃ¡mica" : "Otros", #
    "Tranplante HepÃ¡tico" : "Postquirúrgico",
    "Insuficiencia renal" : "Médico",
    "Tranplante Pulmonar" : "Postquirúrgico",
    "Transplante de mÃ©dula Ã³sea" : "Oncológico",
    "Alteraciones metabÃ³licas" : "Otros",
    "Causa Externa" : "Externa",
    "Arritmias cardÃ­acas" : "Cardiovascular",
    "PostquirÃºrgico" : "Postquirúrgico",
    "UrologÃ­a" : "Postquirúrgico",
    "Paro cardio-respiratorio" : "Externa",
    "Sepsis quirÃºrgica" : "Otros", #
    "Politraumatismo" : "Externa",
    "Gineco-obstetricia" : "Postquirúrgico",
    "Tranplante CardÃ­aco" : "Postquirúrgico",
    "Hemorragia digestiva" : "Otros",
    "CirugÃ­a Cardiovascular" : "Postquirúrgico",
    "Sme. De lisis tumoral" : "Oncológico",
    "CardiolÃ³gico" : "Cardiovascular",
    "Insuficiencia hepÃ¡tica" : "Médico",
    "Vasculares" : "Otros",
    "Sepsis traumÃ¡tica" : "Otros",
    "otros" : "Otros",
    "Tranplante Hepato Renal" : "Postquirúrgico",
    "Pancreatitis" : "Médico",
    "MÃ©dico" : "Médico",
    "QuirÃºrgico" : "Postquirúrgico",
    "Oncologico" : "Oncológico",
}

In [None]:
# len(df_moting[df_moting.pating.isin(['Sepsis mÃ©dica', 'Shock', 'PatologÃ­a infecciosa', 'Inestabilidad hemodinÃ¡mica', 'Sepsis quirÃºrgica', 'Sepsis traumÃ¡tica'])])

In [None]:
# Replace each raw pating value by its grouped category. The dict is indexed
# directly, so any value missing from pating_map (including NaN) raises KeyError
# instead of silently becoming NaN.
df_moting.pating = df_moting.pating.map(lambda pating : pating_map[pating])

In [None]:
len(df_moting[df_moting.pating == 'Oncológico'])

In [None]:
df_moting

In [None]:
df_moting.fecing = df_moting.fecing.str.split(' ', expand=True)[0]
df_moting.fecing = pd.to_datetime(df_moting.fecing, dayfirst=True)

### Merge

In [None]:
# Join FIVARAPA with PIM3 on the admission key (pandas' default inner join: only
# admissions present in both tables survive). Non-key columns that exist in both
# tables get the '_fivarapa' / '_pim_3' suffixes.
df_merged = pd.merge(left=df_fivarapa, right=df_pim_3, on=['tipodni', 'dni' ,'fecing'], suffixes=['_fivarapa', '_pim_3']).reset_index(drop=True)

In [None]:
# Join the result with MOTING on the same admission key (inner join again).
# NOTE(review): the left frame here is the FIVARAPA+PIM3 merge, so any overlapping
# non-key column coming from either of those tables receives the '_fivarapa' suffix.
df_merged = pd.merge(left=df_merged, right=df_moting, on=['tipodni', 'dni' ,'fecing'], suffixes=['_fivarapa', '_moting']).reset_index(drop=True)

In [None]:
len(df_merged), len(df_fivarapa), len(df_pim_3)

In [None]:
df_merged_occurrences = count_coulumns(df_merged, ['tipodni', 'dni', 'fecing'])

In [None]:
df_merged_occurrences

In [None]:
# Observed outcome: restrat == 5.0 is labelled 'Muere'; any other value (NaN included) 'Vive'.
# PIM3 prediction: probpim3 < 0.5 is labelled 'Vive'; any other value (NaN included) 'Muere'.
df_merged['outcome'] = df_merged.restrat.map(lambda value : 'Muere' if value == 5.0 else 'Vive')
df_merged['prediction'] = df_merged.probpim3.map(lambda value : 'Vive' if value < 0.5 else 'Muere')

In [None]:
df_merged.outcome.unique()

In [None]:
len(df_merged)

In [None]:
df_merged.isna().any()

In [None]:
df_merged[['arm_fivarapa' , 'arm_pim_3']]

In [None]:
# Combined `arm` flag per row: 1 when arm_fivarapa is 'S'; otherwise the value of
# int(arm_pim_3) (the `or` only evaluates its right-hand side when the left side is 0,
# and int() raises on a missing arm_pim_3 in that case).
df_merged['arm'] = [(1 if arm_fiv == 'S' else 0) or int(arm_pim) for arm_fiv, arm_pim in zip(df_merged.arm_fivarapa, df_merged.arm_pim_3)]

In [None]:
df_merged.outcome.unique()

In [None]:
# Cast the merged columns to their final dtypes in a single pass. Every new value is
# computed from the current df_merged and written back under the same column name.
df_merged = df_merged.assign(
    # Numeric codes stored as text labels: cast to int64 first, then to str.
    **{
        code_column: df_merged[code_column].astype('int64').astype(str)
        for code_column in ('procedencia', 'tipo', 'recuperacion')
    },
    # Plain integer columns.
    **{
        integer_column: df_merged[integer_column].astype('int64')
        for integer_column in ('reingreso', 'admisionelectiva', 'pupila')
    },
    # 'S' flags become 1; every other value (missing included) becomes 0.
    **{
        flag_column: df_merged[flag_column].eq('S').astype('int64')
        for flag_column in (
            'traq', 'traqi', 'traqe',
            'sng', 'sngi', 'snge',
            'arm_fivarapa',
            'sf', 'sfi', 'sfe',
        )
    },
)

In [None]:
df_merged.outcome.unique()

In [None]:
df_merged.to_csv('clean_data.csv', index=False)

In [None]:
df_merged.dtypes

### Plots

In [None]:
len(df_merged.edad.unique())

In [None]:
def age_map(age):
    """Map a numeric age to the label of its age range.

    Parameters
    ----------
    age : int or float
        Age in the unit used by the ``edad`` column (presumably months —
        TODO confirm).

    Returns
    -------
    str
        '0 to 11' for 0 <= age <= 11, '12 to 59' for 11 < age <= 59,
        '60 to 119' for 59 < age <= 119 and 'over 120' for anything else.

    Notes
    -----
    Age 0 belongs to the first range. The lower bound used to be exclusive,
    so an age of exactly 0 (which the cleaning step keeps via ``edad >= 0``)
    fell through to 'over 120'. As before, values that match no range
    (negative or NaN) also end up in 'over 120'.
    """
    if 0 <= age <= 11:
        return '0 to 11'
    elif 11 < age <= 59:
        return '12 to 59'
    elif 59 < age <= 119:
        return '60 to 119'
    else:
        return 'over 120'

In [None]:
df_merged['age_range'] = df_merged.edad.map(age_map)

In [None]:
plot_order = sorted(list(df_merged.age_range.unique()))

In [None]:
# Share of all admissions that falls in each age range.
# groupby(...).count() counts non-null values per column; the tipodni count is used
# as the number of rows in each group.
data = df_merged[['age_range', 'tipodni', 'dni', 'fecing']].groupby('age_range').count().reset_index()
data['proportions'] = data.tipodni/data.tipodni.sum()
p = sns.barplot(x='age_range', y='proportions', data=data, order=plot_order)
p.set_xlabel('Rango etario')
p.set_ylabel('Casos')
p.set_title('Proporción de casos por rango etario')

In [None]:
df_merged.outcome.unique()

In [None]:
# Share of the rows with outcome 'Muere' that falls in each age range
# (proportions are relative to all 'Muere' rows, not to each age range).
data = df_merged[df_merged.outcome == 'Muere'][['age_range', 'tipodni', 'dni', 'fecing']].groupby('age_range').count().reset_index()
data['proportions'] = data.tipodni/data.tipodni.sum()
p = sns.barplot(x='age_range', y='proportions', data=data, order=plot_order)
p.set_xlabel('Rango etario')
p.set_ylabel('Defunciones')
p.set_title('Proporción de defunciones por rango etario')

In [None]:
df_merged.prediction.unique()

In [None]:
# False negatives (outcome 'Muere' but prediction 'Vive'): share of them in each age range.
data = df_merged[(df_merged.outcome == 'Muere') & (df_merged.prediction == 'Vive')][['age_range', 'tipodni', 'dni', 'fecing']].groupby('age_range').count().reset_index()
data['proportions'] = data.tipodni/data.tipodni.sum()
p = sns.barplot(x='age_range', y='proportions', data=data, order=plot_order)
p.set_xlabel('Rango etario')
p.set_ylabel('Falsos negativos')
p.set_title('Proporción de falsos negativos por rango etario')

In [None]:
# True positives (outcome and prediction both 'Muere') per age range. Unlike the
# neighbouring plots this one shows absolute counts (the tipodni count), not proportions.
p = sns.barplot(x='age_range', y='tipodni', data=df_merged[(df_merged.outcome == 'Muere') & (df_merged.prediction == 'Muere')][['age_range', 'tipodni', 'dni', 'fecing']].groupby('age_range').count().reset_index(), order=plot_order)
p.set_xlabel('Rango etario')
p.set_ylabel('Reales positivos')
p.set_title('Número de reales positivos por rango etario')

In [None]:
# False negatives split by age range and `arm` value; each bar is the share of
# ALL false negatives that falls in that (age range, arm) group.
data = df_merged[(df_merged.outcome == 'Muere') & (df_merged.prediction == 'Vive')][['age_range', 'arm', 'tipodni', 'dni', 'fecing']].groupby(['age_range', 'arm']).count().reset_index()
data['proportions'] = data.tipodni/data.tipodni.sum()
p = sns.barplot(x='age_range', hue='arm', y='proportions', data=data, order=plot_order)
p.set_xlabel('Rango etario')
p.set_ylabel('Falsos negativos con y sin ARM')
p.set_title('Proporción de falsos negativos considerando ARM por rango etario')

In [None]:
# True positives split by age range and `arm` value; each bar is the share of
# ALL true positives that falls in that (age range, arm) group.
data = df_merged[(df_merged.outcome == 'Muere') & (df_merged.prediction == 'Muere')][['age_range', 'arm', 'tipodni', 'dni', 'fecing']].groupby(['age_range', 'arm']).count().reset_index()
data['proportions'] = data.tipodni/data.tipodni.sum()
p = sns.barplot(x='age_range', hue='arm', y='proportions', data=data, order=plot_order)
p.set_xlabel('Rango etario')
p.set_ylabel('Reales positivos con y sin ARM')
p.set_title('Proporción de reales positivos considerando ARM por rango etario')

In [None]:
# False negatives split by age range and `reingreso` value; each bar is the share of
# ALL false negatives that falls in that (age range, reingreso) group.
data = df_merged[(df_merged.outcome == 'Muere') & (df_merged.prediction == 'Vive')][['age_range', 'reingreso', 'tipodni', 'dni', 'fecing']].groupby(['age_range', 'reingreso']).count().reset_index()
data['proportions'] = data.tipodni/data.tipodni.sum()
p = sns.barplot(x='age_range', hue='reingreso', y='proportions', data=data, order=plot_order)
p.set_xlabel('Rango etario')
p.set_ylabel('Falsos negativos con y sin reingresos')
p.set_title('Proporción de Falsos negativos considerando reingresos por rango etario')

In [None]:
# True positives split by age range and `reingreso` value; each bar is the share of
# ALL true positives that falls in that (age range, reingreso) group.
data = df_merged[(df_merged.outcome == 'Muere') & (df_merged.prediction == 'Muere')][['age_range', 'reingreso', 'tipodni', 'dni', 'fecing']].groupby(['age_range', 'reingreso']).count().reset_index()
data['proportions'] = data.tipodni/data.tipodni.sum()
p = sns.barplot(x='age_range', hue='reingreso', y='proportions', data=data, order=plot_order)
p.set_xlabel('Rango etario')
p.set_ylabel('Reales positivos con y sin reingresos')
p.set_title('Proporción de reales positivos considerando reingresos por rango etario')

In [None]:
len(df_merged[df_merged.outcome == 'Muere'])

### Tables

In [None]:
# Confusion table of observed outcome vs. PIM3 prediction, as proportions of all rows.
merged_occurrences = count_coulumns(df_merged, ['outcome', 'prediction'])
merged_occurrences['proportions'] =  merged_occurrences.occurrences / merged_occurrences.occurrences.sum()
merged_occurrences[['outcome', 'prediction', 'proportions']]

In [None]:
# Same table restricted to rows with outcome 'Muere': proportions are relative to
# those rows only.
merged_occurrences = count_coulumns(df_merged[df_merged.outcome == 'Muere'], ['outcome', 'prediction'])
merged_occurrences['proportions'] =  merged_occurrences.occurrences / merged_occurrences.occurrences.sum()
merged_occurrences[['outcome', 'prediction', 'proportions']]

In [None]:
# Rows with outcome 'Muere' broken down by age range and prediction; proportions are
# relative to all 'Muere' rows.
merged_occurrences = count_coulumns(df_merged[df_merged.outcome == 'Muere'], ['outcome', 'age_range', 'prediction'])
merged_occurrences['proportions'] =  merged_occurrences.occurrences / merged_occurrences.occurrences.sum()
merged_occurrences[['outcome', 'age_range', 'prediction', 'proportions']]

In [None]:
df_table_1 = df_merged[(df_merged.outcome == 'Muere')][['sexo', 'age_range', 'arm', 'probpim3', 'pating', 'dias']]

In [None]:
df_table_1[['sexo', 'probpim3', 'dias']].groupby(['sexo']).median().reset_index()

In [None]:
# Counts and proportions per age range among the rows of df_table_1.
# The 'count' column name is the one value_counts() gives its result in pandas >= 2.0.
df_age_rage_counts = pd.DataFrame(df_table_1[['age_range']].value_counts()).reset_index()
df_age_rage_counts['proportions'] = df_age_rage_counts['count'] / df_age_rage_counts['count'].sum()
df_age_rage_counts

In [None]:
# Counts and proportions per `arm` value among the rows of df_table_1.
# The 'count' column name is the one value_counts() gives its result in pandas >= 2.0.
df_arm_counts = pd.DataFrame(df_table_1[['arm']].value_counts()).reset_index()
df_arm_counts['proportions'] = df_arm_counts['count'] / df_arm_counts['count'].sum()
df_arm_counts

In [None]:
# Counts and proportions per grouped `pating` category among the rows of df_table_1.
# The 'count' column name is the one value_counts() gives its result in pandas >= 2.0.
df_pating_counts = pd.DataFrame(df_table_1[['pating']].value_counts()).reset_index()
df_pating_counts['proportions'] = df_pating_counts['count'] / df_pating_counts['count'].sum()
df_pating_counts

In [None]:
# Counts and proportions per `sexo` value among the rows of df_table_1.
# The 'count' column name is the one value_counts() gives its result in pandas >= 2.0.
df_sex_counts = pd.DataFrame(df_table_1[['sexo']].value_counts()).reset_index()
df_sex_counts['proportions'] = df_sex_counts['count'] / df_sex_counts['count'].sum()
df_sex_counts

In [None]:
df_merged.to_csv('clean_data.csv', index=False)

### Models

#### Random Forest

In [None]:
import h2o
from h2o.estimators import H2ORandomForestEstimator

In [None]:
h2o.init()

In [None]:
# Encode both label columns as integers: 'Muere' -> 1, anything else -> 0.
# NOTE(review): this overwrites the text labels, so running the cell a second time
# maps every (already numeric) value to 0 — run it exactly once per kernel.
df_merged.outcome = df_merged.outcome.map(lambda value : 1 if value == 'Muere' else 0).astype('int64')
df_merged.prediction = df_merged.prediction.map(lambda value : 1 if value == 'Muere' else 0).astype('int64')

In [None]:
df_merged.outcome.value_counts()

In [None]:
lives_df = df_merged[df_merged.outcome == 0].reset_index(drop=True)
dies_df = df_merged[df_merged.outcome == 1].reset_index(drop=True)

In [None]:
# Build a balanced 5000 + 5000 data set: survivors are sampled without replacement
# (this raises if there are fewer than 5000), deaths with replacement (oversampling).
# A fixed random_state makes the sample, and everything trained on it, reproducible.
# NOTE(review): oversampling happens before the train/validation split, so copies of
# the same death row can land on both sides of the split and inflate the validation
# metrics — consider splitting first and resampling only the training part.
df_sampled = pd.concat([
    lives_df.sample(5000, random_state=1),
    dies_df.sample(5000, replace=True, random_state=1),
]).reset_index(drop=True)

In [None]:
# index=True writes the row index as an extra, unnamed first CSV column.
# NOTE(review): presumably this is the column H2O exposes as 'C1', which is later
# removed from the predictors — confirm before changing the flag.
df_sampled.to_csv('df_sampled.csv', index=True)

In [None]:
h2o_df = h2o.import_file('df_sampled.csv')

In [None]:
h2o_df['outcome'] = h2o_df['outcome'].asfactor()

In [None]:
# Start from every column of the H2O frame and remove the ones that must not be used
# as model inputs; list.remove raises ValueError if a name is not present.
predictors = h2o_df.columns
# PIM3-derived label.
predictors.remove('prediction')
# Admission key and date columns.
predictors.remove('dni')
predictors.remove('tipodni')
predictors.remove('fecing')
predictors.remove('fechaing')
# Presumably the row index written by to_csv(index=True) — confirm.
predictors.remove('C1')
# Source columns already combined into 'arm'.
predictors.remove('arm_fivarapa')
predictors.remove('arm_pim_3')
# PIM3 score/probability and the column the outcome label was derived from.
predictors.remove('scorepim3')
predictors.remove('probpim3')
predictors.remove('restrat')
# NOTE(review): 'outcome' itself and columns such as 'fecegr', 'dias' and 'duration'
# (if present in the frame) are still in the list; the latter are presumably only known
# at discharge and may leak the outcome — confirm.

In [None]:
train, valid = h2o_df.split_frame(ratios=[.8], seed=1)

In [None]:
import numpy as np

In [None]:
optimal_split = int(np.sqrt(len(predictors)))
number_of_predictors = len(predictors)

In [None]:
import math

# Number of distinct predictor subsets of size `optimal_split`, i.e. the binomial
# coefficient C(number_of_predictors, optimal_split). math.comb computes it exactly
# with integers; the `np.math` alias used before no longer exists in NumPy 2.0 and
# the float division of factorials lost precision for large inputs.
# NOTE(review): this value is only displayed; the estimator cell hard-codes its own ntrees.
ntrees = math.comb(number_of_predictors, optimal_split)
ntrees

In [None]:
# Random forest configuration. A fixed seed (the same value used for the train/validation
# split) makes the tree construction repeatable between runs.
# NOTE(review): `valid` is used here as the calibration frame and is also passed as the
# validation frame when training, so the validation metrics of the calibrated
# probabilities are presumably optimistic — consider a separate calibration split.
random_forest = H2ORandomForestEstimator(
    ntrees=5000,
    max_depth=5,
    min_rows=10,
    calibrate_model=True,
    calibration_frame=valid,
    binomial_double_trees=True,
    balance_classes=True,
    seed=1,
)

In [None]:
random_forest.train(
    x=predictors,
    y='outcome',
    training_frame=train,
    validation_frame=valid
)

In [None]:
perf = random_forest.model_performance()

In [None]:
pred = random_forest.predict(valid)

In [None]:
y_pred = pred.as_data_frame()['predict']

In [None]:
y_pred

In [None]:
y_valid = valid.as_data_frame()['outcome']

In [None]:
y_valid

In [None]:
# scikit-learn metrics take (y_true, y_pred). Accuracy is symmetric, so the value is
# unchanged; the order is fixed for consistency with the other metric cells.
accuracy_score(y_valid, y_pred)

In [None]:
# scikit-learn metrics take (y_true, y_pred). With the arguments swapped this call
# actually reported precision; true labels must come first to obtain recall.
recall_score(y_valid, y_pred)

In [None]:
# scikit-learn metrics take (y_true, y_pred). Binary F1 is symmetric, so the value is
# unchanged; the order is fixed for consistency with the other metric cells.
f1_score(y_valid, y_pred)

In [None]:
# scikit-learn metrics take (y_true, y_pred). With the arguments swapped this call
# actually reported recall; true labels must come first to obtain precision.
precision_score(y_valid, y_pred)

In [None]:
# Two-column frame pairing each model prediction with the observed outcome, row by row.
df_predictions = pd.DataFrame(
    data=np.column_stack((y_pred, y_valid)),
    columns=['prediction', 'outcome'],
)

In [None]:
count_coulumns(df_predictions, ['prediction', 'outcome'])