In [None]:
def show_cross_validate(modelo, X, Y, k):
    """Cross-validate an estimator and print the mean of each test metric.

    Args:
        modelo: Estimator handed to ``cross_validate``.
        X: Feature data handed to ``cross_validate``.
        Y: Target data handed to ``cross_validate``.
        k: Forwarded as the ``cv`` argument.

    Returns:
        None. Accuracy, precision, recall and F1, each averaged over the
        folds, are printed in that order.
    """
    # Metric name -> label printed in front of its mean (insertion order matters).
    metric_labels = {
        'accuracy': 'Accuracy: ',
        'precision': '\nPrecision: ',
        'recall': '\nRecall: ',
        'f1': '\nF1: ',
    }
    cv_results = cross_validate(modelo, X, Y, cv=k, scoring=list(metric_labels))

    report = []
    for metric, label in metric_labels.items():
        report.append(label)
        report.append(cv_results['test_' + metric].mean())
    print(*report)
    
def show_roc_curve(y_true, y_prob):
    """Plot a ROC curve together with the diagonal reference line.

    Args:
        y_true: True labels handed to ``roc_curve``.
        y_prob: Scores/probabilities handed to ``roc_curve``.

    Returns:
        None. The figure is shown with ``plt.show()``.
    """
    # The thresholds returned by roc_curve are not needed for the plot.
    false_pos_rate, true_pos_rate, _ = roc_curve(y_true, y_prob)

    plt.figure(figsize=(10, 7))
    plt.plot(false_pos_rate, true_pos_rate, lw=2)
    plt.plot([0, 1], [0, 1], lw=2, linestyle='--')
    plt.title('ROC CURVE AUC')
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.show()

In [None]:
#!pip install plotly
#!conda install -c conda-forge jupyter_contrib_nbextensions
#!pip install https://s3-us-west-2.amazonaws.com/xgboost-nightly-builds/xgboost-[version]+[commithash]-py2.py3-none-manylinux1_x86_64.whl

In [None]:
import pandas as pd
import statistics as sta
import plotly.graph_objs as go
import seaborn as sns
%matplotlib inline
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_validate
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import f1_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import recall_score
from sklearn.metrics import roc_curve
from xgboost import XGBClassifier
import numpy as np

In [None]:
train_csv_path = "df_titanic/train.csv"
df_train = pd.read_csv(train_csv_path)

In [None]:
pd.options.display.max_rows = None

In [None]:
df_train[df_train['Age'] == 21]

In [None]:
#Entendimento dos dados

# Tratamento de dados

In [None]:
df_train.groupby(['Sex', 'Pclass', 'Survived'],as_index = False).count()

In [None]:
df_vazio = df_train[df_train['Age'].isnull()]

In [None]:
df_vazio = df_vazio.groupby(['PassengerId', 'Sex'], as_index=False).count()

In [None]:
# Number of passengers with a missing age, per sex. size() counts rows, so the
# result does not depend on whether the 'Age' column holds NaNs or counts here.
df_vazio = df_vazio.groupby('Sex').size().reset_index(name='Quantidade de idade vazia')

In [None]:
sns.barplot(x='Sex', y='Quantidade de idade vazia', data=df_vazio).set_title('Quantidade de NAs nos registros')

# Tratando os Nulos

In [None]:
# Fill missing ages of passengers whose name contains 'Miss.' with 21
# (hard-coded; described in the original note as the median age for that title).
miss_without_age = df_train['Name'].str.contains('Miss.', regex=False, na=False) & df_train['Age'].isnull()
df_train.loc[miss_without_age, 'Age'] = 21

In [None]:
# Fill missing ages of passengers whose name contains 'Master' with 3.5
# (hard-coded; described in the original note as the median age for that title).
master_without_age = df_train['Name'].str.contains('Master', regex=False, na=False) & df_train['Age'].isnull()
df_train.loc[master_without_age, 'Age'] = 3.5

In [None]:
# Fill missing ages of passengers whose name contains 'Mr.' with 30
# (hard-coded; described in the original note as the median age for that title).
# 'Mrs.' does not contain the substring 'Mr.', so married women are not matched here.
mr_without_age = df_train['Name'].str.contains('Mr.', regex=False, na=False) & df_train['Age'].isnull()
df_train.loc[mr_without_age, 'Age'] = 30

In [None]:
# Fill missing ages of passengers whose name contains 'Mrs.' with 35
# (hard-coded; described in the original note as the median age for that title).
mrs_without_age = df_train['Name'].str.contains('Mrs.', regex=False, na=False) & df_train['Age'].isnull()
df_train.loc[mrs_without_age, 'Age'] = 35

In [None]:
# Fill missing ages of passengers whose name contains 'Dr.' with 46.5
# (hard-coded; described in the original note as the median age for that title).
dr_without_age = df_train['Name'].str.contains('Dr.', regex=False, na=False) & df_train['Age'].isnull()
df_train.loc[dr_without_age, 'Age'] = 46.5

In [None]:
#df_train[(df_train['Name'].str.contains('Dr.', regex=False) & (df_train['Age'].isnull() == False))].Age.median()

In [None]:
#df_train[df_train.Age.isnull()].Age.count()

In [None]:
#df_train.loc[(df_train['Name'].str.contains('Miss', regex=False))]

In [None]:
#df_train[(df_train['Age'].isnull() == False) & (df_train['Name'].str.contains('Miss', regex=False))].median()

In [None]:
#df_train[(df_train['Age'].isnull() == False) & (df_train['Name'].str.contains('Miss', regex=False)  & (df_train['Age'] == 21))]

In [None]:
#df_train

In [None]:
df_full = df_train[['PassengerId', 'Survived', 'Pclass', 'Sex', 'Age']].copy()

In [None]:
df_full['Sex_d'] = df_full.Sex.map({'female':0, 'male':1})

In [None]:
df_full = df_full.drop(columns={'Sex'}).rename(columns={'Sex_d':'Sex_Male'})


In [None]:
df_full[df_full['Age'].isnull()]

# Data Understanding

In [None]:
df_age = df_train.groupby(['Sex', 'Pclass', 'Survived'], as_index = True)['PassengerId'].count()

In [None]:
# df_age is a Series (one selected column after groupby), so it has no `columns`
# axis to rename, and `inplace=True` would return None. Name the Series itself.
df_age = df_age.rename('Count')

In [None]:
 df_age = df_age.reset_index(name='Count')

In [None]:
df_female = df_age[df_age['Sex'] == 'female']

In [None]:
df_female

In [None]:
# Keep only the male rows, mirroring how df_female is built, so the
# 'Sobrevivência de Homens' plot does not mix both sexes.
df_male = df_age[df_age['Sex'] == 'male']

In [None]:
sns.barplot(x = "Pclass", y = "Count", data = df_female, hue = "Survived").set_title('Sobrevivência de Mulheres')

In [None]:
sns.barplot(x = 'Pclass', y = 'Count', data = df_male, hue= 'Survived', errwidth=0).set_title('Sobrevivência de Homens')

# Faixa Etaria

In [None]:
# Age-bin edges in steps of ten years: 0, 10, ..., 80.
binn = list(range(0, 90, 10))
binn

In [None]:
df_age = df_train.groupby(['Sex', 'Pclass', 'Survived', pd.cut(df_train['Age'], binn)], as_index = True)['Age'].count()

In [None]:
#Para resetar o indice e nomear a coluna count
df_age = df_age.reset_index(name='count')

In [None]:
df_age_f = df_age[df_age['Sex'] == 'female']

In [None]:
df_age_f.head()

In [None]:
f, ax = plt.subplots(figsize=(10,7))
sns.barplot(x='Age', y='count', data=df_age_f, hue='Survived', errwidth=0).set_title('Sobrevivência por idade mulheres')
plt.ylabel('Quantidade de mortes/sobrviventes')
plt.show()

In [None]:
df_age_m = df_age[df_age['Sex'] == 'male']

In [None]:
f, ax = plt.subplots(figsize=(10, 4))
sns.barplot(x='Age', y='count', data=df_age_m, hue='Survived', errwidth=0).set_title('Sobrevivência por idade homens')
plt.ylabel('Quantidade de mortes/sobrviventes')

plt.show()

In [None]:
f, ax = plt.subplots(figsize=(10, 4))
sns.barplot(x='Age', y='count', data=df_age_m, hue='Survived', errwidth=1).set_title('Sobrevivência por idade homens')
plt.ylabel('Quantidade de mortes/sobrviventes')

plt.show()

In [None]:
#Ver probabilidade das crianças sobreviverem,
#se somente recem nascido tem mais chance de sobreviver, 
#caso entrar idades nulas olhar Pclass para ver se encontra alguma coorelação
#df_name.head()

# Análise Diagnóstica

In [None]:
# Copy the modelling frame and flag passengers younger than five (1) versus
# everyone else (0). A missing Age compares False and therefore maps to 0.
df_child = df_full.copy()
df_child['LessThanFive'] = (df_full['Age'] < 5).astype(int)

In [None]:
df_child[df_child['Age'] < 5].head()

In [None]:
dict_tmp = {}
# Class and Survived: keep only passengers who are not older than 12, then count
# the deaths per passenger class, first for under-fives and then for the rest.
df_tmp = df_child.drop(df_child[df_child.Age > 12].index)

reports = []
for header, age_flag in (('menores de 5:', 1), ('maiores de 5:', 0)):
    pieces = [header]
    for pclass in (1, 2, 3):
        died_here = (df_tmp.Survived == 0) & (df_tmp.Pclass == pclass) & (df_tmp.LessThanFive == age_flag)
        pieces.append(f'\nClasse {pclass} mortos:')
        pieces.append(str(df_tmp[died_here].shape[0]))
    reports.append(pieces)

print(*reports[0])
print('\n')
print(*reports[1])

In [None]:
df_tmp.columns

In [None]:
# Girls in df_tmp: deaths and survivors, first for the five-and-over group
# (LessThanFive == 0), then for the under-five group.
is_girl = df_tmp.Sex_Male == 0
for age_flag, dead_label, alive_label in (
    (0, 'Meninas mais velhas mortas:', '\nMeninas mais velhas sobreviventes:'),
    (1, 'Meninas mais novas mortas:', '\nMeninas mais novas sobreviventes:'),
):
    mask = is_girl & (df_tmp.LessThanFive == age_flag)
    dead_count = df_tmp[mask & (df_tmp.Survived == 0)].shape[0]
    alive_count = df_tmp[mask & (df_tmp.Survived == 1)].shape[0]
    print(dead_label, str(dead_count), alive_label, str(alive_count))

In [None]:
dict_tmp = {'Age': ['0-5', '6-12']}


In [None]:
# Girls by outcome. Each entry is [count under five, count five and over];
# survivors are stored first, then non-survivors.
for key, outcome in (('Meninas Sobreviventes', 1), ('Meninas nao Sobreviventes', 0)):
    mask = (df_tmp.Survived == outcome) & (df_tmp.Sex_Male == 0)
    dict_tmp[key] = [
        df_tmp[mask & (df_tmp.LessThanFive == age_flag)].shape[0]
        for age_flag in (1, 0)
    ]


In [None]:
# Boys in df_tmp: deaths and survivors, first for the five-and-over group
# (LessThanFive == 0), then for the under-five group.
is_boy = df_tmp.Sex_Male == 1
for age_flag, dead_label, alive_label in (
    (0, 'Meninos mais velhos mortos:', '\nMeninos mais velhos sobreviventes:'),
    (1, 'Meninos mais novos mortos:', '\nMeninos mais novos sobreviventes:'),
):
    mask = is_boy & (df_tmp.LessThanFive == age_flag)
    dead_count = df_tmp[mask & (df_tmp.Survived == 0)].shape[0]
    alive_count = df_tmp[mask & (df_tmp.Survived == 1)].shape[0]
    print(dead_label, str(dead_count), alive_label, str(alive_count))

In [None]:
# Boys by outcome. Each entry is [count under five, count five and over];
# survivors are stored first, then non-survivors.
for key, outcome in (('Meninos Sobreviventes', 1), ('Meninos nao Sobreviventes', 0)):
    mask = (df_tmp.Survived == outcome) & (df_tmp.Sex_Male == 1)
    dict_tmp[key] = [
        df_tmp[mask & (df_tmp.LessThanFive == age_flag)].shape[0]
        for age_flag in (1, 0)
    ]


In [None]:
# Totals regardless of sex. Each entry is [count under five, count five and over];
# survivors are stored first, then non-survivors.
for key, outcome in (('Total de sobreviventes', 1), ('Total de não sobreviventes', 0)):
    with_outcome = df_tmp[df_tmp.Survived == outcome]
    dict_tmp[key] = [
        with_outcome[with_outcome.LessThanFive == age_flag].shape[0]
        for age_flag in (1, 0)
    ]


In [None]:
# Percentage of the survivors in df_tmp that fall in each age group,
# stored as [under five, five and over].
survivors = df_tmp[df_tmp.Survived == 1]
dict_tmp['% sobreviventes'] = [
    (survivors[survivors.LessThanFive == age_flag].shape[0] / survivors.shape[0]) * 100
    for age_flag in (1, 0)
]




In [None]:
# Boys in the five-and-over group (Sex_Male == 1, LessThanFive == 0): deaths and
# survivors. Both labels name that group; the second one previously said "Meninas"
# although the mask selects boys.
mask = (df_tmp.Sex_Male==1) & (df_tmp.LessThanFive==0)
print(
    'Meninos mais velhos não sobreviventes:',str(df_tmp[(df_tmp.Survived==0) & mask].shape[0]),
    '\nMeninos mais velhos sobreviventes:',str(df_tmp[(df_tmp.Survived==1) & mask].shape[0]))

In [None]:
#Homens não sobreviventes
mask2 = (df_full.Survived==0) & (df_full.Sex_Male == 1)
df_full[mask2].shape[0]

In [None]:
# Percentage of all survivors in df_tmp who travelled in class 1,
# split as [under five, five and over].
survivors = df_tmp[df_tmp.Survived == 1]
first_class = survivors[survivors.Pclass == 1]
dict_tmp['% Sobreviventes classe 1: '] = [
    (first_class[first_class.LessThanFive == age_flag].shape[0] / survivors.shape[0]) * 100
    for age_flag in (1, 0)
]



In [None]:
# Percentage of all survivors in df_tmp who travelled in class 2,
# split as [under five, five and over].
survivors = df_tmp[df_tmp.Survived == 1]
second_class = survivors[survivors.Pclass == 2]
dict_tmp['% Sobreviventes Classe 2:'] = [
    (second_class[second_class.LessThanFive == age_flag].shape[0] / survivors.shape[0]) * 100
    for age_flag in (1, 0)
]


In [None]:
# Percentage of all survivors in df_tmp who travelled in class 3,
# split as [under five, five and over].
survivors = df_tmp[df_tmp.Survived == 1]
third_class = survivors[survivors.Pclass == 3]
dict_tmp['% Sobreviventes Classe 3:'] = [
    (third_class[third_class.LessThanFive == age_flag].shape[0] / survivors.shape[0]) * 100
    for age_flag in (1, 0)
]


In [None]:
df_child = pd.DataFrame(dict_tmp)

In [None]:
df_child = df_child.set_index('Age')

In [None]:
df_teste = df_child.reset_index()

In [None]:
pl3 = plt.bar(df_teste['Age'], df_teste['Meninos nao Sobreviventes'])
pl4 = plt.bar(df_teste['Age'], df_teste['Meninas nao Sobreviventes'], bottom = df_teste['Meninos nao Sobreviventes'])
plt.legend((pl3[0], pl4[0]), ('Meninos nao Sobreviventes', 'Meninas nao Sobreviventes'))
plt.show()

In [None]:
pl1 = plt.bar(df_teste['Age'], df_teste['Meninos Sobreviventes'])
pl2 = plt.bar(df_teste['Age'], df_teste['Meninas Sobreviventes'], bottom = df_teste['Meninos Sobreviventes'])
plt.legend((pl1[0], pl2[0]), ('Meninos Sobreviventes', 'Meninas Sobreviventes'))
plt.show()
pl3 = plt.bar(df_teste['Age'], df_teste['Meninos nao Sobreviventes'])
pl4 = plt.bar(df_teste['Age'], df_teste['Meninas nao Sobreviventes'], bottom = df_teste['Meninos nao Sobreviventes'])
plt.legend((pl3[0], pl4[0]), ('Meninos nao Sobreviventes', 'Meninas nao Sobreviventes'))
plt.show()

In [None]:
df_child.head()

In [None]:
df_child[['Meninas Sobreviventes', 'Meninos Sobreviventes', 'Total de sobreviventes']].plot(kind='bar', title='Sobreviventes')
df_child[['Meninas nao Sobreviventes', 'Meninos nao Sobreviventes', 'Total de não sobreviventes']].plot(kind='bar', title='Nao sobreviventes')

In [None]:
df_full

In [None]:
sns.set(context='notebook', style='darkgrid')
sns.pairplot(df_full[['Age', 'Pclass', 'Sex_Male']], height=2.5)
plt.show()

# Correlation Graph

In [None]:
# Pairwise correlations between the columns of df_full.
correlation = df_full.corr()
# Only the last expression of a cell is displayed, so print this ranking
# explicitly; as a bare expression in mid-cell it was silently discarded.
print(correlation['Age'].sort_values(ascending = False).head(15))

f, ax = plt.subplots(figsize = (8, 6))
sns.heatmap(correlation)
plt.show()

# Machine Learning

In [None]:
from sklearn.model_selection import train_test_split

# Regressão Logística

In [None]:
df_train.head()

In [None]:
df_full.head()

In [None]:
X_train, X_test, y_train, y_test = train_test_split(df_full[['Pclass', 'Sex_Male', 'Age']], df_full.Survived, train_size = 0.7, random_state=1)

In [None]:
from sklearn.linear_model import LogisticRegression

In [None]:
model = LogisticRegression()

In [None]:
model.fit(X_train, y_train)

In [None]:
yhat = model.predict(X_test)

In [None]:
y_hat_prob = model.predict_proba(X_test)[:, 1]

In [None]:
plt.title('Histograma das probabilidades preditas')
plt.xlim(0, 1)
plt.hist(y_hat_prob, bins=10)

# Metrics

In [None]:
yhat

In [None]:
#y_test['Pred'] = yhat

In [None]:
#y_test.pop('Pred')

In [None]:
#y_test = pd.DataFrame(y_test)

In [None]:
#y_test['Pred'] = yhat

In [None]:
#y_test

In [None]:
# Accuracy: fraction of all test predictions that match the true label.
ac_logistic = accuracy_score(y_test, yhat)
ac_logistic

In [None]:
# Precision: of the samples predicted as positive, the fraction that really are positive.
precision_score(y_test, yhat)

In [None]:
# Recall: of the samples that really are positive, the fraction the model predicted as positive.
recall_score(y_test, yhat)

In [None]:
# F1 score: harmonic mean of precision and recall, a number between 0 and 1.
f1_score(y_test, yhat)

In [None]:
#Criando a comparacao

In [None]:
yhat2 = pd.DataFrame(yhat).rename(columns = {0:'Predito'})

In [None]:
yhat2 = yhat2.set_index(y_test.index)

In [None]:
y_test2 = pd.DataFrame(y_test)

In [None]:
y_test2['Predito'] = yhat2['Predito']

In [None]:
#pd.options.display.max_rows = None

In [None]:
y_test2

# Confusion Matrix - Logistic Regression

In [None]:
# Confusion matrix of the logistic-regression test predictions, drawn as a heatmap
# of overall proportions; each cell is annotated with its name and percentage.
fig = plt.figure(figsize=(7,5))
cm = confusion_matrix(y_test, yhat)
cm_share = cm / np.sum(cm)  # each cell as a fraction of all evaluated samples
kws = {"ha": 'left',"va": 'top', "size":15}
# Names are listed row by row to line up with cm.flatten() (rows = true label,
# columns = predicted label in scikit-learn's layout).
group_names = ['True Neg','False Pos','False Neg','True Pos']

labels = np.asarray(
    [f"{name}\n{share:.2%}" for name, share in zip(group_names, cm_share.flatten())]
).reshape(2,2)

heatmap = sns.heatmap(cm_share, annot=labels, annot_kws=kws, fmt='', cmap='Blues')
heatmap.set_title('Confusion Matrix')
heatmap.tick_params(labelsize=15)
# plt.show() renders the figure; a bare plt.plot() draws nothing and echoes `[]`.
plt.show()

# ROC CURVE AUC - Logistic Regression

In [None]:
fpr, tpr, threshold = roc_curve(y_test, y_hat_prob)

In [None]:
plt.figure(figsize=(10, 7))
plt.title('ROC CURVE AUC')
plt.xlabel('False Positives Rate')
plt.ylabel('True Positives Rate')
plt.plot(fpr, tpr, lw=2)
plt.plot([0, 1], [0, 1], lw=2, linestyle='--')
plt.show()

# Árvore de decisão

In [None]:
from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeClassifier

In [None]:
df_full[['Pclass', 'Age', 'Sex_Male']].head()

In [None]:
X_train2, X_test2, y_train2, y_test2 = train_test_split(df_full[['Pclass', 'Age', 'Sex_Male']], df_full.Survived, train_size=0.75, random_state=1)

In [None]:
# Seed the tree so tie-breaking between equally good splits is reproducible
# across runs; 1 matches the random_state used for the train/test splits.
model_tree = DecisionTreeClassifier(random_state=1)

In [None]:
model_tree.fit(X_train2, y_train2)

In [None]:
df_full

In [None]:
yhat = model_tree.predict(X_test2)

In [None]:
y_hat_prob = model_tree.predict_proba(X_test2)[:, 1]

In [None]:
plt.hist(y_hat_prob, bins=10)
plt.xlim(0, 1)
plt.title('Histogram de probabilidade das prdicoes')
plt.show()

# Metrics

In [None]:
# Accuracy: fraction of all test predictions that match the true label.
ac_tree = accuracy_score(y_test2, yhat)
ac_tree

In [None]:
# Precision: of the samples predicted as positive, the fraction that really are positive.
precision_score(y_test2, yhat)

In [None]:
# Recall: of the samples that really are positive, the fraction the model predicted as positive.
recall_score(y_test2, yhat)

In [None]:
# F1 score: harmonic mean of precision and recall, a number between 0 and 1.
f1_score(y_test2, yhat)

# Confusion Matrix - Decision Tree

In [None]:
# Confusion matrix of the decision-tree test predictions, drawn as a heatmap
# of overall proportions; each cell is annotated with its name and percentage.
fig = plt.figure(figsize=(7,5))
cm = confusion_matrix(y_test2, yhat)
cm_share = cm / np.sum(cm)  # each cell as a fraction of all evaluated samples
kws = {"ha": 'left',"va": 'top', "size":15}
# Names are listed row by row to line up with cm.flatten() (rows = true label,
# columns = predicted label in scikit-learn's layout).
group_names = ['True Neg','False Pos','False Neg','True Pos']

labels = np.asarray(
    [f"{name}\n{share:.2%}" for name, share in zip(group_names, cm_share.flatten())]
).reshape(2,2)

heatmap = sns.heatmap(cm_share, annot=labels, annot_kws=kws, fmt='', cmap='Blues')
heatmap.set_title('Confusion Matrix')
heatmap.tick_params(labelsize=15)
# plt.show() renders the figure; a bare plt.plot() draws nothing and echoes `[]`.
plt.show()

# ROC Curve AUC - Decision Tree

In [None]:
fpr, tpr, threshold = roc_curve(y_test2, y_hat_prob)

In [None]:
plt.figure(figsize=(10, 7))
plt.title('ROC CURVE AUC')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.plot(fpr,tpr)
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
plt.show()

In [None]:
#pd.options.display.max_rows = 60

In [None]:
#df_full.head()

In [None]:
yhat = pd.DataFrame(yhat).rename(columns = {0:'Predito'})

In [None]:
yhat = yhat.set_index(y_test2.index)

In [None]:
y_test2 = pd.DataFrame(y_test2)
y_test2['Predito'] = yhat['Predito']

In [None]:
#That's a comparation between the True values and the Predicted values
y_test2

# Random Forest Classification

In [None]:
X_train, X_test, y_train, y_test = train_test_split(df_full[['Pclass', 'Age', 'Sex_Male']], df_full.Survived, train_size=0.7, random_state=1)

In [None]:
# Seed the forest so the bootstrap samples and feature draws, and therefore the
# metrics and the submission built from this model, are reproducible across runs.
# 1 matches the random_state used for the train/test splits.
model_rfc = RandomForestClassifier(random_state=1)

In [None]:
#model.fit(X_train, y_train)
# NOTE(review): the forest is fitted on every row of df_full, and the following
# cells predict on those same rows, so the histogram, confusion matrix and ROC
# curve describe training data rather than held-out data. The X_train/X_test
# split created just above is not used by this section.
model_rfc.fit(df_full[['Pclass', 'Age', 'Sex_Male']], df_full.Survived)

In [None]:
yhat = model_rfc.predict(df_full[['Pclass', 'Age', 'Sex_Male']])
yhat

In [None]:
y_hat_prob = model_rfc.predict_proba(df_full[['Pclass', 'Age', 'Sex_Male']])[:,1]

In [None]:
plt.hist(y_hat_prob, bins=8)
plt.xlim(0, 1)
plt.title('Histograma das possibilidades preditas')
plt.show()

# Metrics

In [None]:
# Cross-validated metrics for the random forest. At this point `model` still
# refers to the logistic regression created earlier, so passing it here would
# report the wrong estimator's scores.
show_cross_validate(model_rfc, df_full[['Pclass', 'Age', 'Sex_Male']], df_full.Survived, k=5)

In [None]:
#ac_random = accuracy_score(y_test, yhat)
#ac_random

In [None]:
#precision_score(y_test, yhat)

In [None]:
#recall_score(y_test, yhat)

In [None]:
#f1_score(y_test, yhat)

# Confusion Matrix - Random Forest

In [None]:
# Confusion matrix of the random-forest predictions against df_full.Survived,
# drawn as a heatmap of overall proportions; each cell is annotated with its
# name and percentage.
fig = plt.figure(figsize=(7,5))
cm = confusion_matrix(df_full.Survived, yhat)
cm_share = cm / np.sum(cm)  # each cell as a fraction of all evaluated samples

kws = {"ha": 'left',"va": 'top', "size":15}
# Names are listed row by row to line up with cm.flatten() (rows = true label,
# columns = predicted label in scikit-learn's layout).
group_names = ['True Neg','False Pos','False Neg','True Pos']

labels = np.asarray(
    [f"{name}\n{share:.2%}" for name, share in zip(group_names, cm_share.flatten())]
).reshape(2,2)

heatmap = sns.heatmap(cm_share, annot=labels, annot_kws=kws, fmt='', cmap='Blues')
heatmap.set_title('Confusion Matrix')
heatmap.tick_params(labelsize=15)
# plt.show() renders the figure; a bare plt.plot() draws nothing and echoes `[]`.
plt.show()

# ROC CURVE AUC - Random Forest

In [None]:
show_roc_curve(df_full.Survived, y_hat_prob)

# XGBoost

In [None]:
model = XGBClassifier()

In [None]:
df_full.columns

In [None]:
show_cross_validate(model, df_full[['Pclass', 'Age', 'Sex_Male']], df_full.Survived, k=5)

In [None]:
import sklearn.metrics

# List the names accepted by the `scoring=` argument. The `SCORERS` dict was
# removed in scikit-learn 1.3; `get_scorer_names()` is its replacement. Fall
# back to `SCORERS` only on releases that do not have the function yet.
try:
    scorer_names = sklearn.metrics.get_scorer_names()
except AttributeError:
    scorer_names = list(sklearn.metrics.SCORERS.keys())
scorer_names

In [None]:
model.fit(df_full[['Pclass', 'Age', 'Sex_Male']], df_full.Survived)

In [None]:
df_full[['Pclass', 'Age', 'Sex_Male']]

In [None]:
yhat = model.predict(df_full[['Pclass', 'Age', 'Sex_Male']])

In [None]:
y_hat_prob = model.predict_proba(df_full[['Pclass', 'Age', 'Sex_Male']])[:,1]

In [None]:
plt.title('Histograma das probabilidade de predicao')
plt.xlim(0, 1)
plt.hist(y_hat_prob, bins=10)
plt.show()

In [None]:
# ROC curve for the XGBoost model, evaluated on the same df_full rows it was fitted on.
show_roc_curve(df_full.Survived, y_hat_prob)

In [None]:
df_full.columns

In [None]:
yhat = pd.DataFrame(yhat).rename(columns={0:'Predito'})

In [None]:
yhat = yhat.set_index(df_full[['Pclass', 'Age', 'Sex_Male']].index)

In [None]:
yhat['True'] = df_full.Survived

In [None]:
yhat

# Final Result of the Machine Learning Algorithms

In [None]:
# Columns = [
#     'Algorithm',
#     'Percent of Accuracy'
# ]

# df_metrics = pd.DataFrame({"Algorithm": ['Logistic Regression', 'Tree Decision', 'Random Forest'], 
#                            "Percent of Accuracy": [ac_logistic * 100, ac_tree * 100, ac_random * 100]})
# df_metrics.head()

In [None]:
# df_metrics.sort_values(['Percent of Accuracy'], ascending=False, inplace=True)

In [None]:
# f, ax = plt.subplots(figsize=(7,4))

# total = len(df_metrics['Percent of Accuracy'])
# barplot = sns.barplot(x='Percent of Accuracy', y='Algorithm', data=df_metrics, color='darkblue')
# plt.title('Porcentagem de Accuracy por algoritimo')

# plt.show()

# Preparing to Commit at Kaggle

In [None]:
test_csv_path = "df_titanic/test.csv"
df_final = pd.read_csv(test_csv_path)
df_final.head()

In [None]:
# A submission needs a prediction for every passenger in test.csv, so rows with a
# missing Age are imputed instead of dropped. The per-title ages are the same
# constants used for the training set, applied in the same order; a title that is
# not listed falls back to the median Age of the training frame.
title_age = {'Miss.': 21, 'Master': 3.5, 'Mr.': 30, 'Mrs.': 35, 'Dr.': 46.5}
for title, age in title_age.items():
    needs_age = df_final['Name'].str.contains(title, regex=False, na=False) & df_final['Age'].isnull()
    df_final.loc[needs_age, 'Age'] = age
df_final['Age'] = df_final['Age'].fillna(df_train['Age'].median())

# Name is only needed for the imputation above; drop it with the other unused columns.
df_final = df_final.drop(columns=['SibSp', 'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked', 'Name'])

In [None]:
df_final['Sex_Male'] = df_final.Sex.map({'female': 0, 'male': 1})

In [None]:
df_final.drop(columns={'Sex'}, inplace=True)

In [None]:
# pd.options.display.max_rows = 60

In [None]:
# df_testando = df_full.append(df_final[['PassengerId', 'Pclass', 'Age', 'Sex_Male']], sort=False)

In [None]:
# df_final = df_testando.reset_index().drop(columns={'index'})

In [None]:
df_final

In [None]:
yhat = model_rfc.predict(df_final[['Pclass', 'Age', 'Sex_Male']])
yhat

In [None]:
yhat = pd.DataFrame(yhat).rename(columns={0:'Predito'})

In [None]:
yhat = yhat.set_index(df_final[['Pclass', 'Age', 'Sex_Male']].index)

In [None]:
df_final['Predito'] = yhat
df_final

In [None]:
df_commit = df_final.drop(columns=['Pclass', 'Age', 'Sex_Male'])

In [None]:
df_commit = df_commit.rename(columns={"Predito":'Survived'})

In [None]:
# Write only the PassengerId and Survived columns; index=False keeps the
# DataFrame index from being emitted as an extra unnamed first column.
df_commit.to_csv('gender_submission.csv', index=False)