In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier

In [2]:
# Load the training and test sets from the local "Dados" folder (comma-separated files).
train = pd.read_csv(
    "Dados/train.csv",
    sep=",",
)
test = pd.read_csv(
    "Dados/test.csv",
    sep=",",
)

In [3]:
# Preview the first five rows of the training frame.
train.head(n=5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [4]:
# Random forest with 100 trees; n_jobs=-1 asks scikit-learn to use every
# available core, and the fixed random_state keeps repeated runs comparable.
modelo = RandomForestClassifier(
    n_estimators=100,
    n_jobs=-1,
    random_state=0,
)

In [5]:
# How many passengers fall into each category of the 'Sex' column?
train.loc[:, 'Sex'].value_counts()

Sex
male      577
female    314
Name: count, dtype: int64

In [6]:
# This model only works with numeric data, so the values of the 'Sex' column
# have to be replaced by numbers. Note: do this for both train and test.
def transformar_sexo(valor):
    """Encode a 'Sex' value as a binary flag.

    Parameters
    ----------
    valor : object
        A single value taken from the 'Sex' column.

    Returns
    -------
    int
        1 when ``valor`` equals the string 'female', 0 for every other value
        (including 'male', differently-cased spellings and missing values).
    """
    return 1 if valor == 'female' else 0
# Series.map takes the function given in parentheses and applies it row by row
# to the selected column; the result is stored in the new 'Sex_binario' column.
train['Sex_binario'] = train['Sex'].map(transformar_sexo)

In [7]:
# Preview the first five rows again to check the newly added 'Sex_binario' column.
train.head(n=5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Sex_binario
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,0
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,1
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,1
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S,1
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S,0


In [8]:
# Names of the columns used as model features.
variaveis = [
    'Sex_binario',
    'Age',
]

In [9]:
# Feature matrix (the selected columns) and target vector ('Survived').
X, y = train[variaveis], train['Survived']

In [10]:
# Preview the first five rows of the feature matrix.
X.head(n=5)

Unnamed: 0,Sex_binario,Age
0,0,22.0
1,1,38.0
2,1,26.0
3,1,35.0
4,0,35.0


In [11]:
# Preview the first five values of the target.
y.head(n=5)

0    0
1    1
2    1
3    1
4    0
Name: Survived, dtype: int64

In [12]:
# ML models usually do not work with NaN values (the original note also
# mentions very large values, e.g. floats).
# fillna replaces every missing value with -1 here. Note: do this for both
# train and test.
# -1 is chosen because, in both columns used, it reads as an out-of-the-ordinary value.
X = X.fillna(value=-1)

In [13]:
# Train the classifier on the filled feature matrix and the 'Survived' target.
modelo.fit(X=X, y=y)

In [14]:
# Apply the same 'Sex' encoding used on the training frame to the test frame.
test['Sex_binario'] = test['Sex'].map(transformar_sexo)

In [15]:
# Build the prediction matrix: same feature columns as in training, with
# missing values replaced by the same -1 sentinel.
X_prev = test[variaveis].fillna(-1)
X_prev.head(n=5)

Unnamed: 0,Sex_binario,Age
0,0,34.5
1,1,47.0
2,0,62.0
3,0,27.0
4,1,22.0


In [16]:
# Predict a label for every row of the test features and display the array.
p = modelo.predict(X=X_prev)
p

array([0, 1, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 1,
       1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1,
       0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1,
       1, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 1,
       1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0,
       1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1,
       1, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1,
       0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1,
       0, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0,
       1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0,
       0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1,
       0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0,

In [17]:
# Wrap the predictions in a Series named 'Survived', indexed by the test
# frame's 'PassengerId' column, then check its size.
sub = pd.Series(
    data=p,
    index=test['PassengerId'],
    name='Survived',
)
sub.shape

(418,)

In [18]:
# Write the predictions to a CSV file, including the header row.
sub.to_csv(path_or_buf='primeiro_modelo.csv', header=True)

In [None]:
# Criando ambiente para avaliar o modelo
# treino, validação e teste

In [20]:
from sklearn.model_selection import train_test_split

In [21]:
# Toy array holding the integers 0 through 9, used to illustrate splitting.
X_falso = np.arange(0, 10)
X_falso

array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])

In [27]:
# Split the numbers of the toy array into a train part and a test part
# (half of the elements each).
train_test_split(
    X_falso,
    test_size=0.5,
)

[array([5, 2, 1, 0, 8]), array([3, 9, 4, 6, 7])]

In [30]:
# Remove the randomness: seed NumPy's global generator so the split is the
# same every time this cell is executed.
np.random.seed(seed=0)
train_test_split(
    X_falso,
    test_size=0.5,
)

[array([6, 7, 3, 0, 5]), array([2, 8, 4, 9, 1])]

In [31]:
# Hold out half of the rows for validation.
# The seed is handed directly to train_test_split through random_state, so the
# partition no longer depends on the state of NumPy's global random generator
# (and this cell no longer reseeds that global generator as a side effect).
# NOTE(review): random_state=0 is expected to reproduce the partition that was
# previously obtained with np.random.seed(0) -- confirm against the rows shown
# by X_treino.head().
X_treino, X_valid, y_treino, y_valid = train_test_split(
    X,
    y,
    test_size=0.5,
    random_state=0,
)

In [32]:
# Preview the first five rows of the training half of the split.
X_treino.head(n=5)

Unnamed: 0,Sex_binario,Age
213,0,30.0
586,0,47.0
499,0,24.0
821,0,27.0
246,1,25.0


In [34]:
# Shapes of the four pieces produced by the split, in the order
# (X_treino, X_valid, y_treino, y_valid).
tuple(parte.shape for parte in (X_treino, X_valid, y_treino, y_valid))

((445, 2), (446, 2), (445,), (446,))