In [499]:
# Importando pandas
import pandas as pd

In [500]:
# Load the training set from CSV and preview its first five rows
train = pd.read_csv(filepath_or_buffer="Files/train.csv")
train.head(n=5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [501]:
# Load the test set from CSV and preview its first five rows
test = pd.read_csv(filepath_or_buffer="Files/test.csv")
test.head(n=5)

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


### Realizando os mesmos tratamentos iniciais feitos na primeira análise 

In [502]:
# Cardinality of the data: number of distinct values per training column, highest first
(
    train
    .nunique()
    .sort_values(ascending=False)
)

PassengerId    891
Name           891
Ticket         681
Fare           248
Cabin          147
Age             88
SibSp            7
Parch            7
Pclass           3
Embarked         3
Survived         2
Sex              2
dtype: int64

In [503]:
# Remove the Name, Ticket, Cabin and Embarked columns from the training set.
# `drop` returns a new frame, which is rebound to `train`.
train = train.drop(columns=["Name", "Ticket", "Cabin", "Embarked"])

In [504]:
# Remove the same four columns (Name, Ticket, Cabin, Embarked) from the test set
test = test.drop(columns=["Name", "Ticket", "Cabin", "Embarked"])

### Tratando colunas com valores nulos

In [505]:
# Mean of the Age column in the training set
train.Age.mean()

29.69911764705882

In [506]:
# Replace missing Age values in the training set with the training-set mean age
train["Age"] = train["Age"].fillna(train["Age"].mean())

In [507]:
# Replace missing Age values in the test set with the TRAINING-set mean age.
# The fill value is taken from the training data so that train and test go
# through the same transformation and no statistic is estimated from the
# test set itself.
test.loc[test.Age.isnull(), "Age"] = train["Age"].mean()

In [508]:
# Count the missing values left in each training column, highest first
train.isna().sum().sort_values(ascending=False)

PassengerId    0
Survived       0
Pclass         0
Sex            0
Age            0
SibSp          0
Parch          0
Fare           0
dtype: int64

In [509]:
# Count the missing values left in each test column, highest first
test.isna().sum().sort_values(ascending=False)

Fare           1
PassengerId    0
Pclass         0
Sex            0
Age            0
SibSp          0
Parch          0
dtype: int64

In [510]:
# Replace the missing Fare value(s) in the test set with the TRAINING-set mean fare.
# The fill value is taken from the training data so that the test set is
# transformed with statistics learned from the training data only.
test.loc[test.Fare.isnull(), "Fare"] = train["Fare"].mean()

In [511]:
# Re-check missing values per test column after filling Fare
test.isna().sum().sort_values(ascending=False)

PassengerId    0
Pclass         0
Sex            0
Age            0
SibSp          0
Parch          0
Fare           0
dtype: int64

----

 # Entendendo coluna de texto

- Vamos agora fazer o tratamento da coluna de texto

In [512]:
# Preview the first five rows of the training set in its current state
train.head(n=5)

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Fare
0,1,0,3,male,22.0,1,0,7.25
1,2,1,1,female,38.0,1,0,71.2833
2,3,1,3,female,26.0,0,0,7.925
3,4,1,1,female,35.0,1,0,53.1
4,5,0,3,male,35.0,0,0,8.05


- Temos uma coluna de texto, **Sex**

In [513]:
# Frequency of each distinct value in the Sex column
train.Sex.value_counts()

male      577
female    314
Name: Sex, dtype: int64

- Para a coluna **Sex**, podemos converter para binário, por existirem apenas 2 valores diferentes

In [514]:
# Encode Sex as a binary indicator column: 1 where the value is "female", 0 otherwise
train["Sex_binary"] = train["Sex"].eq("female").astype("int64")

In [515]:
# Apply the same encoding to the test set: 1 where Sex is "female", 0 otherwise
test["Sex_binary"] = test["Sex"].eq("female").astype("int64")

In [516]:
# Remove the original Sex column from both sets now that Sex_binary exists
train = train.drop(columns="Sex")
test = test.drop(columns="Sex")

# Criando Modelos

- Separando a base de treino entre Treino e Validação

In [535]:
# Separate the target (y = Survived) from the features (X = every column
# except Survived and PassengerId)
y = train["Survived"]
X = train.drop(columns=["Survived", "PassengerId"])

X

Unnamed: 0,Pclass,Age,SibSp,Parch,Fare,Sex_binary
0,3,22.000000,1,0,7.2500,0
1,1,38.000000,1,0,71.2833,1
2,3,26.000000,0,0,7.9250,1
3,1,35.000000,1,0,53.1000,1
4,3,35.000000,0,0,8.0500,0
...,...,...,...,...,...,...
886,2,27.000000,0,0,13.0000,0
887,1,19.000000,0,0,30.0000,1
888,3,29.699118,1,2,23.4500,1
889,1,26.000000,0,0,30.0000,0


In [536]:
# Importando train_test_split
from sklearn.model_selection import train_test_split

In [537]:
# Hold out part of the training data for validation.
# test_size=0.25 is scikit-learn's default, spelled out here; random_state fixes the shuffle.
treino_X, valid_X, treino_y, valid_y = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=1,
)

- Modelo Random Forest

In [538]:
# Importando modelo e acurácia
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

In [539]:
# Fit a Random Forest classifier (fixed seed) on the training split
modelo_rf = RandomForestClassifier(random_state=1)
modelo_rf.fit(X=treino_X, y=treino_y)

In [540]:
# Predict the validation split with the fitted Random Forest
predicao_rf = modelo_rf.predict(X=valid_X)

In [541]:
# Accuracy of the Random Forest on the validation split.
# Arguments follow the accuracy_score(y_true, y_pred) order, matching the
# other accuracy cells in this notebook.
accuracy_score(valid_y, predicao_rf)

0.7757847533632287

- Modelo KNeighborsClassifier

In [542]:
# Importando modelo
from sklearn.neighbors import KNeighborsClassifier

In [543]:
# Fit a 3-nearest-neighbours classifier on the training split
modelo_kn = KNeighborsClassifier(n_neighbors=3)
modelo_kn.fit(X=treino_X, y=treino_y)

In [544]:
# Predict the validation split with the fitted k-NN model
predicao_kn = modelo_kn.predict(X=valid_X)

In [545]:
# Accuracy of the k-NN model on the validation split
accuracy_score(y_true=valid_y, y_pred=predicao_kn)

0.7085201793721974

- Modelo Regressão Logística

In [546]:
# Importando modelo
from sklearn.linear_model import LogisticRegression

In [568]:
# Fit a Logistic Regression (fixed seed, up to 1000 solver iterations) on the training split
modelo_lr = LogisticRegression(max_iter=1000, random_state=1)
modelo_lr.fit(X=treino_X, y=treino_y)

In [566]:
# Predict the validation split with the fitted Logistic Regression
predicao_lr = modelo_lr.predict(X=valid_X)

In [567]:
# Accuracy of the Logistic Regression on the validation split
accuracy_score(y_true=valid_y, y_pred=predicao_lr)

0.8026905829596412

---

#  Predizendo os dados de Teste com o modelo Regressão Logística

In [553]:
# Use the whole training set for the final fit: target is Survived,
# features are every column except PassengerId and Survived
y_train = train["Survived"]
X_train = train.drop(columns=["PassengerId", "Survived"])

In [559]:
# Test features: every test column except the PassengerId identifier
X_test = test.drop(columns="PassengerId")

In [555]:
# Fit the final Logistic Regression (same settings as the validated one) on the full training set
modelo_final = LogisticRegression(max_iter=1000, random_state=1)
modelo_final.fit(X=X_train, y=y_train)

In [560]:
# Predict the test set with the final model
predicao_final = modelo_final.predict(X=X_test)

In [563]:
# Build the submission as a Series named "Survived", indexed by the test
# PassengerId values, and write it to CSV with a header row
sub = pd.Series(
    data=predicao_final,
    name="Survived",
    index=test["PassengerId"],
)
sub.to_csv("Files/Segunda_predicao.csv", header=True)