In [186]:
import numpy as np
import pandas as pd

from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import Perceptron

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix

# CARREGANDO DADOS

In [187]:
dados_treino = pd.read_csv("titanic_train.csv")
dados_teste = pd.read_csv("titanic_test.csv")
dados_treino.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


# ANLISANDO DADOS EM GERAL

In [188]:
dados_treino.dtypes

PassengerId      int64
Survived         int64
Pclass           int64
Name            object
Sex             object
Age            float64
SibSp            int64
Parch            int64
Ticket          object
Fare           float64
Cabin           object
Embarked        object
dtype: object

In [189]:
dados_treino.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


In [190]:
for column in dados_treino.select_dtypes(include=['object']):
    print('\n', dados_treino.groupby([column]).size())


 Name
Abbing, Mr. Anthony                      1
Abbott, Mr. Rossmore Edward              1
Abbott, Mrs. Stanton (Rosa Hunt)         1
Abelson, Mr. Samuel                      1
Abelson, Mrs. Samuel (Hannah Wizosky)    1
                                        ..
de Mulder, Mr. Theodore                  1
de Pelsmaeker, Mr. Alfons                1
del Carlo, Mr. Sebastiano                1
van Billiard, Mr. Austin Blyler          1
van Melkebeke, Mr. Philemon              1
Length: 891, dtype: int64

 Sex
female    314
male      577
dtype: int64

 Ticket
110152         3
110413         3
110465         2
110564         1
110813         1
              ..
W./C. 6608     4
W./C. 6609     1
W.E.P. 5734    1
W/C 14208      1
WE/P 5735      2
Length: 681, dtype: int64

 Cabin
A10    1
A14    1
A16    1
A19    1
A20    1
      ..
F33    3
F38    1
F4     2
G6     4
T      1
Length: 147, dtype: int64

 Embarked
C    168
Q     77
S    644
dtype: int64


# ANALISE E TRATANDO DADOS NULOS

In [191]:
print(dados_treino.isnull().sum())
print(dados_teste.isnull().sum())
dados_treino = dados_treino.dropna()
dados_teste = dados_teste.dropna()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64
PassengerId      0
Pclass           0
Name             0
Sex              0
Age             86
SibSp            0
Parch            0
Ticket           0
Fare             1
Cabin          327
Embarked         0
dtype: int64


# REMOVENDO COLUNAS INUTEIS

In [192]:
dados_treino.drop(['PassengerId', 'Name', 'Ticket', 'Cabin'], axis=1, inplace=True)
dados_teste.drop(['PassengerId', 'Name', 'Ticket', 'Cabin'], axis=1, inplace=True)
dados_treino

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
1,1,1,female,38.0,1,0,71.2833,C
3,1,1,female,35.0,1,0,53.1000,S
6,0,1,male,54.0,0,0,51.8625,S
10,1,3,female,4.0,1,1,16.7000,S
11,1,1,female,58.0,0,0,26.5500,S
...,...,...,...,...,...,...,...,...
871,1,1,female,47.0,1,1,52.5542,S
872,0,1,male,33.0,0,0,5.0000,S
879,1,1,female,56.0,0,1,83.1583,C
887,1,1,female,19.0,0,0,30.0000,S


# AJUSTANDO DADOS CATEGORICOS

In [193]:
for coluna in dados_treino.columns:
    if dados_treino[coluna].dtype == 'object':
        tipos = dados_treino[coluna].unique()
        print(f"\nColuna '{coluna}':\nTipos possíveis: {tipos}")


Coluna 'Sex':
Tipos possíveis: ['female' 'male']

Coluna 'Embarked':
Tipos possíveis: ['C' 'S' 'Q']


In [194]:
# categoricos binarios -> um ou outro
label = LabelEncoder()
dados_treino["Sex"] = label.fit_transform(dados_treino["Sex"])
dados_teste["Sex"] = label.fit_transform(dados_teste["Sex"])
dados_treino

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
1,1,1,0,38.0,1,0,71.2833,C
3,1,1,0,35.0,1,0,53.1000,S
6,0,1,1,54.0,0,0,51.8625,S
10,1,3,0,4.0,1,1,16.7000,S
11,1,1,0,58.0,0,0,26.5500,S
...,...,...,...,...,...,...,...,...
871,1,1,0,47.0,1,1,52.5542,S
872,0,1,1,33.0,0,0,5.0000,S
879,1,1,0,56.0,0,1,83.1583,C
887,1,1,0,19.0,0,0,30.0000,S


In [195]:
# categoricos com mais de duas opçoes
encoder = OneHotEncoder(handle_unknown='ignore')

one_hot_encond_temp = pd.DataFrame(encoder.fit_transform(dados_treino[['Embarked']]).toarray())
dados_treino = dados_treino.iloc[:, :-1] # remove coluna usada
pd.concat([dados_treino, one_hot_encond_temp], axis=1, ignore_index=False) #concatena as colunas

one_hot_encond_temp = pd.DataFrame(encoder.fit_transform(dados_teste[['Embarked']]).toarray())
dados_teste = dados_teste.iloc[:, :-1] # remove coluna usada
pd.concat([dados_teste, one_hot_encond_temp], axis=1, ignore_index=False)

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,0,1,2
12,1.0,0.0,23.0,1.0,0.0,82.2667,1.0,0.0,0.0
14,1.0,0.0,47.0,1.0,0.0,61.1750,1.0,0.0,0.0
24,1.0,0.0,48.0,1.0,3.0,262.3750,0.0,0.0,1.0
26,1.0,0.0,22.0,0.0,1.0,61.9792,0.0,0.0,1.0
28,1.0,1.0,41.0,0.0,0.0,30.5000,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...
82,,,,,,,1.0,0.0,0.0
83,,,,,,,1.0,0.0,0.0
84,,,,,,,1.0,0.0,0.0
85,,,,,,,0.0,1.0,0.0


# REMOVENDO COLUNAS NULASGERADAS PELO ONE_HOT

In [196]:
dados_treino.dropna(inplace=True)
dados_teste.dropna(inplace=True)
dados_treino

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare
1,1,1,0,38.0,1,0,71.2833
3,1,1,0,35.0,1,0,53.1000
6,0,1,1,54.0,0,0,51.8625
10,1,3,0,4.0,1,1,16.7000
11,1,1,0,58.0,0,0,26.5500
...,...,...,...,...,...,...,...
871,1,1,0,47.0,1,1,52.5542
872,0,1,1,33.0,0,0,5.0000
879,1,1,0,56.0,0,1,83.1583
887,1,1,0,19.0,0,0,30.0000


In [199]:
dados_teste

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare
12,1,0,23.0,1,0,82.2667
14,1,0,47.0,1,0,61.1750
24,1,0,48.0,1,3,262.3750
26,1,0,22.0,0,1,61.9792
28,1,1,41.0,0,0,30.5000
...,...,...,...,...,...,...
404,1,1,43.0,1,0,27.7208
405,2,1,20.0,0,0,13.8625
407,1,1,50.0,1,1,211.5000
411,1,0,37.0,1,0,90.0000


# SEPARANDO OS DADOS PARA TREINO E TESTE

In [200]:
XTrain = dados_treino.iloc[:, 1:]
yTrain = dados_treino.iloc[:, 0]

XTest = dados_teste
XTrain

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare
1,1,0,38.0,1,0,71.2833
3,1,0,35.0,1,0,53.1000
6,1,1,54.0,0,0,51.8625
10,3,0,4.0,1,1,16.7000
11,1,0,58.0,0,0,26.5500
...,...,...,...,...,...,...
871,1,0,47.0,1,1,52.5542
872,1,1,33.0,0,0,5.0000
879,1,0,56.0,0,1,83.1583
887,1,0,19.0,0,0,30.0000


# AJUSTANOD INTEVALO DOS DADOS NUMERICOS

In [201]:
sc = StandardScaler()
sc.fit(XTrain)

XTrain = sc.transform(XTrain.values)
XTest = sc.transform(XTest.values)

XTrain



array([[-0.37225618, -1.03901177,  0.14906507,  0.83362754, -0.63172982,
        -0.0971798 ],
       [-0.37225618, -1.03901177, -0.0432295 ,  0.83362754, -0.63172982,
        -0.3359971 ],
       [-0.37225618,  0.96245301,  1.17463611, -0.7230443 , -0.63172982,
        -0.35225028],
       ...,
       [-0.37225618, -1.03901177,  1.30283248, -0.7230443 ,  0.69708118,
         0.05878503],
       [-0.37225618, -1.03901177, -1.06880054, -0.7230443 , -0.63172982,
        -0.63938976],
       [-0.37225618,  0.96245301, -0.62011321, -0.7230443 , -0.63172982,
        -0.63938976]])

# CRIANDO MODELO DA REDE NEURAL

In [202]:
# max_iter = numero de interações
# eta= taxa de aprendizado inicial
# define a semente para garantir a reprodutibilidade
modelo = Perceptron(max_iter=1000, eta0=0.1, random_state=0) 
modelo.fit(XTrain, yTrain)

# ANALISANDO RESULTADOS

In [204]:
yPred = modelo.predict(XTest)
yPred

array([1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1,
       1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 1, 1, 1, 0,
       1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 1, 0, 1,
       0, 0, 1, 1, 0, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1],
      dtype=int64)

In [206]:
confusion_matrix(teste_backup, yPred)

TypeError: '<' not supported between instances of 'str' and 'int'

In [None]:
accuracy_score(y_test, yPred)

0.5770392749244713

In [None]:
modelo.score(XTest, y_test)

0.5770392749244713