In [34]:
!pip install --upgrade xlrd

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [35]:
!pip install --upgrade pandas

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


**Bibliotecas**

In [36]:
import pandas as pd 
import numpy as np
import plotly.express as px
from sklearn.model_selection import train_test_split
from sklearn.dummy import DummyClassifier 
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.svm import LinearSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn import metrics

**Carregamento e leitura dos dados**

In [37]:
url = "https://hbiostat.org/data/repo/titanic3.xls"

In [38]:
df = pd.read_excel(url)

In [39]:
df.head()

Unnamed: 0,pclass,survived,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,boat,body,home.dest
0,1,1,"Allen, Miss. Elisabeth Walton",female,29.0,0,0,24160,211.3375,B5,S,2.0,,"St Louis, MO"
1,1,1,"Allison, Master. Hudson Trevor",male,0.9167,1,2,113781,151.55,C22 C26,S,11.0,,"Montreal, PQ / Chesterville, ON"
2,1,0,"Allison, Miss. Helen Loraine",female,2.0,1,2,113781,151.55,C22 C26,S,,,"Montreal, PQ / Chesterville, ON"
3,1,0,"Allison, Mr. Hudson Joshua Creighton",male,30.0,1,2,113781,151.55,C22 C26,S,,135.0,"Montreal, PQ / Chesterville, ON"
4,1,0,"Allison, Mrs. Hudson J C (Bessie Waldo Daniels)",female,25.0,1,2,113781,151.55,C22 C26,S,,,"Montreal, PQ / Chesterville, ON"


In [40]:
df.shape

(1309, 14)

**Entendimento dos dados**

In [41]:
df.dtypes

pclass         int64
survived       int64
name          object
sex           object
age          float64
sibsp          int64
parch          int64
ticket        object
fare         float64
cabin         object
embarked      object
boat          object
body         float64
home.dest     object
dtype: object

In [42]:
df.describe()

Unnamed: 0,pclass,survived,age,sibsp,parch,fare,body
count,1309.0,1309.0,1046.0,1309.0,1309.0,1308.0,121.0
mean,2.294882,0.381971,29.881135,0.498854,0.385027,33.295479,160.809917
std,0.837836,0.486055,14.4135,1.041658,0.86556,51.758668,97.696922
min,1.0,0.0,0.1667,0.0,0.0,0.0,1.0
25%,2.0,0.0,21.0,0.0,0.0,7.8958,72.0
50%,3.0,0.0,28.0,0.0,0.0,14.4542,155.0
75%,3.0,1.0,39.0,1.0,0.0,31.275,256.0
max,3.0,1.0,80.0,8.0,9.0,512.3292,328.0


In [43]:
df.isnull().sum()

pclass          0
survived        0
name            0
sex             0
age           263
sibsp           0
parch           0
ticket          0
fare            1
cabin        1014
embarked        2
boat          823
body         1188
home.dest     564
dtype: int64

In [44]:
(df.isnull().sum().sort_values(ascending=False)*100)/df.shape[0]

body         90.756303
cabin        77.463713
boat         62.872422
home.dest    43.086325
age          20.091673
embarked      0.152788
fare          0.076394
pclass        0.000000
survived      0.000000
name          0.000000
sex           0.000000
sibsp         0.000000
parch         0.000000
ticket        0.000000
dtype: float64

In [45]:
df.sex.value_counts(dropna=False)

male      843
female    466
Name: sex, dtype: int64

In [46]:
df.embarked.value_counts(dropna=False)

S      914
C      270
Q      123
NaN      2
Name: embarked, dtype: int64

**Limpeza dos dados**

In [47]:
df.columns

Index(['pclass', 'survived', 'name', 'sex', 'age', 'sibsp', 'parch', 'ticket',
       'fare', 'cabin', 'embarked', 'boat', 'body', 'home.dest'],
      dtype='object')

In [48]:
df.age = df.age.fillna(df.age.median()) #Adicionando mediana da idade nos valores missing

In [49]:
df = df.fillna({"embarked": "S"}) #Adicionando local de embarque S nos valores missing

In [50]:
df = df.drop(columns = ["name", "ticket", "boat", "cabin", "home.dest", "body", "fare"])

In [51]:
df.columns

Index(['pclass', 'survived', 'sex', 'age', 'sibsp', 'parch', 'embarked'], dtype='object')

**Visualização dos dados**

In [52]:
fig = px.bar(df, x = 'sex', 
             title = 'Passageiros por sexo',
             width=600, 
             height=400, 
             template="simple_white") 
fig.show()

In [53]:
fig = px.bar(df, x = 'embarked', 
             title = 'Passageiros por lugar de embarque',
             width=600, 
             height=400, 
             template="simple_white") 
fig.show()

In [54]:
fig = px.bar(df, x = 'sex',  
             y = 'survived', 
             title = 'Sobreviventes por sexo',
             width=600, 
             height=400, 
             template="simple_white") 
fig.show()

**Ajustes nos dados**

In [55]:
#Mudando o sexo para valor numérico
mapeamento_sexo = {"male": 0, "female": 1}
df['sex'] = df['sex'].map(mapeamento_sexo)

In [56]:
#Mudando local de embarque para valor numérico
mapeamento_embarque = {"S": 1, "C": 2, "Q": 3}
df['embarked'] = df['embarked'].map(mapeamento_embarque)

**Separação dos dados (treino/teste)**

In [57]:
y = df.survived #rotulos
X = df.drop(columns = "survived") #atributos

In [58]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 42)

**Baseline (dummy)**

In [59]:
#Stratified
model_dummy_stratified = DummyClassifier(strategy = 'stratified')
model_dummy_stratified.fit(X_train, y_train)
acuracia_dummy_stratified = model_dummy_stratified.score(X_test, y_test) * 100
print("Acuracia do Dummy Stratified: %.2f" % acuracia_dummy_stratified)

#Most frequent
model_dummy_most_frequent = DummyClassifier(strategy = 'most_frequent')
model_dummy_most_frequent.fit(X_train, y_train)
acuracia_dummy_most_frequent = model_dummy_most_frequent.score(X_test, y_test) * 100
print("Acuracia do Dummy most_frequent: %.2f" % acuracia_dummy_most_frequent)

#Prior
model_dummy_prior = DummyClassifier(strategy = 'prior')
model_dummy_prior.fit(X_train, y_train)
acuracia_dummy_prior = model_dummy_prior.score(X_test, y_test) * 100
print("Acuracia do Dummy prior: %.2f" % acuracia_dummy_prior)

#Uniform
model_dummy_uniform = DummyClassifier(strategy = 'uniform')
model_dummy_uniform.fit(X_train, y_train)
acuracia_dummy_uniform = model_dummy_uniform.score(X_test, y_test) * 100
print("Acuracia do Dummy uniform: %.2f" % acuracia_dummy_uniform)


Acuracia do Dummy Stratified: 51.15
Acuracia do Dummy most_frequent: 57.00
Acuracia do Dummy prior: 57.00
Acuracia do Dummy uniform: 50.13


**Gaussian Naive Bayes**

In [60]:
gaussian = GaussianNB()
gaussian.fit(X_train, y_train)
y_previsao_gaussian = gaussian.predict(X_test)
acuracia_gaussian = accuracy_score(y_previsao_gaussian, y_test) * 100
print("Acuracia do Gaussian NB: %.2f" % acuracia_gaussian)

Acuracia do Gaussian NB: 75.06


**Support Vector Machines (SVM)**

In [61]:
svc = SVC()
svc.fit(X_train, y_train)
y_previsao_svm = svc.predict(X_test)
acuracia_svm = accuracy_score(y_previsao_svm, y_test) * 100
print("Acuracia do SVM: %.2f" % acuracia_svm)

Acuracia do SVM: 59.54


**Decision Tree Classifier**

In [63]:
decisiontree = DecisionTreeClassifier()
decisiontree.fit(X_train, y_train)
y_previsao_decisiontree = decisiontree.predict(X_test)
acuracia_decisiontree = accuracy_score(y_previsao_decisiontree, y_test) * 100
print("Acuracia do Decision Tree Classifier: %.2f" % acuracia_decisiontree)

Acuracia do Decision Tree Classifier: 76.08


**Random Forest Classifier**

In [65]:
randomforest = RandomForestClassifier()
randomforest.fit(X_train, y_train)
y_previsao_randomforest = randomforest.predict(X_test)
acuracia_randomforest = accuracy_score(y_previsao_randomforest, y_test) * 100
print("Acuracia do Random Forest Classifier: %.2f" % acuracia_randomforest)

Acuracia do Random Forest Classifier: 76.59


**k-nearest neighbors (KNN)**

In [66]:
knn = KNeighborsClassifier()
knn.fit(X_train, y_train)
y_previsao_knn = knn.predict(X_test)
acuracia_knn = accuracy_score(y_previsao_knn, y_test) * 100
print("Acuracia do KNN: %.2f" % acuracia_knn)

Acuracia do KNN: 75.06


**Regressão logistica**

In [67]:
logreg = LogisticRegression()
logreg.fit(X_train, y_train)
y_previsao_logreg = logreg.predict(X_test)
acuracia_logreg = accuracy_score(y_previsao_logreg, y_test) * 100
print("Acuracia da Regressao Logistica: %.2f" % acuracia_logreg)

Acuracia da Regressao Logistica: 80.15


**Comparando modelos**

In [68]:
modelos = pd.DataFrame({
    'Modelo': ['Gaussian Naive Bayes', 'Support Vector Machines (SVM)','Decision Tree','Random Forest',
               'KNN', 'Regressao Logistica', 'Dummy Stratified', 'Dummy Most frequent', 'Dummy Prior', 'Dummy Uniform', 
              ],
    'Acuracia': [acuracia_gaussian, acuracia_svm, acuracia_decisiontree, acuracia_randomforest, 
                 acuracia_knn, acuracia_logreg, acuracia_dummy_stratified, acuracia_dummy_most_frequent, acuracia_dummy_prior, acuracia_dummy_uniform]})
modelos.sort_values(by='Acuracia', ascending=False)

Unnamed: 0,Modelo,Acuracia
5,Regressao Logistica,80.152672
3,Random Forest,76.590331
2,Decision Tree,76.081425
0,Gaussian Naive Bayes,75.063613
4,KNN,75.063613
1,Support Vector Machines (SVM),59.541985
7,Dummy Most frequent,56.997455
8,Dummy Prior,56.997455
6,Dummy Stratified,51.145038
9,Dummy Uniform,50.127226
