In [93]:
import plotly.graph_objs as go
from plotly.offline import download_plotlyjs, init_notebook_mode, iplot
import plotly.figure_factory as ff
import numpy as np
import pandas as pd
from pandas import Series, DataFrame
import cufflinks as cf
import matplotlib
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
import seaborn as sns

from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier

from impyute.imputation.cs import mice

# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Use the 'ggplot' style sheet for matplotlib figures.
matplotlib.style.use('ggplot') 

# Configure cufflinks / plotly for use inside the notebook.
# NOTE(review): go_offline() is immediately followed by
# set_config_file(offline=False, world_readable=True); the two settings look
# contradictory — confirm which mode is actually intended.
cf.go_offline()
cf.set_config_file(offline=False, world_readable=True)
init_notebook_mode(connected=True)

In [2]:
# Start of the preliminary data analysis: load the training set from train.csv
# (path is relative to the notebook's working directory).
df = pd.read_csv("train.csv") 

In [3]:
# Display the loaded frame as the cell output.
df

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


In [4]:
# Remove the Ticket and Cabin columns, then discard every row that still
# contains a missing value. The result rebinds `df`.
df = df.drop(columns=['Ticket', 'Cabin']).dropna()

In [5]:
# Summary statistics restricted to the object-dtype columns (include=['O']).
df.describe(include=['O'])

Unnamed: 0,Name,Sex,Embarked
count,712,712,712
unique,712,2,3
top,"Dowdell, Miss. Elizabeth",male,S
freq,1,453,554


In [6]:
# Mean of Survived per Pclass value, sorted from highest to lowest.
(
    df.groupby('Pclass', as_index=False)['Survived']
    .mean()
    .sort_values('Survived', ascending=False)
)

Unnamed: 0,Pclass,Survived
0,1,0.652174
1,2,0.479769
2,3,0.239437


In [7]:
# Mean of Survived per Sex value, sorted from highest to lowest.
(
    df.groupby('Sex', as_index=False)['Survived']
    .mean()
    .sort_values('Survived', ascending=False)
)

Unnamed: 0,Sex,Survived
0,female,0.752896
1,male,0.205298


In [8]:
# Mean of Survived per SibSp value, sorted from highest to lowest.
(
    df.groupby('SibSp', as_index=False)['Survived']
    .mean()
    .sort_values('Survived', ascending=False)
)

Unnamed: 0,SibSp,Survived
1,1,0.530055
2,2,0.44
0,0,0.36887
3,3,0.333333
4,4,0.166667
5,5,0.0


In [9]:
# Mean of Survived per Parch value, sorted from highest to lowest.
(
    df.groupby('Parch', as_index=False)['Survived']
    .mean()
    .sort_values('Survived', ascending=False)
)

Unnamed: 0,Parch,Survived
3,3,0.6
2,2,0.573529
1,1,0.554545
0,0,0.354528
5,5,0.2
4,4,0.0
6,6,0.0


In [10]:
# Pie chart of passenger counts per Survived value.
#
# value_counts() orders its result by frequency, not by the value itself, so
# the slice labels are looked up from the value_counts() index instead of being
# hard-coded in an assumed [died, survived] order. With a fixed list the labels
# would be silently swapped whenever survivors outnumber deaths.
survived_counts = df["Survived"].value_counts()
val = survived_counts.values.tolist()

# Survived code -> display label; unknown codes fall back to their own text.
outcome_names = {0: "Morreu", 1: "Sobreviveu"}
labels = [outcome_names.get(code, str(code)) for code in survived_counts.index]

trace = go.Pie(
    values=val,
    marker=dict(colors=['red']),
    labels=labels,
    hoverinfo="value"
)
data = [trace]

layout = go.Layout(title="Destribuição de Sobreviventes")

fig = go.Figure(data=data, layout=layout)

fig.show()

In [11]:
# Histogram of the Age column: 40 bins requested, bar heights normalised to
# percent (histnorm='percent').
layout = go.Layout(title="Destribuição de Idade")
trace = go.Histogram(
    x=df['Age'],
    histnorm='percent',
    nbinsx=40,
)
data = [trace]
fig1 = go.Figure(layout=layout, data=data)

fig1.show()

In [12]:
# Scatter of Fare (y) against Age (x); each marker carries the passenger's
# Survived value as its text.
layout = go.Layout(
    title='Custo da Passagem Vs Idade',
    xaxis=dict(title='Idade'),
    yaxis=dict(title='Custo da Passagem'),
    hovermode='closest',
)
trace = go.Scatter(
    x=df['Age'],
    y=df['Fare'],
    text=df['Survived'],
    mode='markers',
)
data = [trace]
figure = go.Figure(layout=layout, data=data)

figure.show()

In [13]:
# Bar chart of the mean Age for each Pclass value.
#
# The chart is titled as a mean, so the bar heights are computed with a
# groupby. Previously `y` held every passenger's age while `x` held only the
# three unique Pclass values, so the bars could not represent per-class means.
# `marker.color` is also set to the plotted values: without a numeric colour
# array the 'Viridis' colorscale / showscale settings have nothing to map.
mean_age_by_class = df.groupby('Pclass')['Age'].mean()
mean_ages = mean_age_by_class.tolist()

trace = go.Bar(
    x=mean_age_by_class.index.tolist(),
    y=mean_ages,
    marker=dict(color=mean_ages, colorscale='Viridis', showscale=True),
)
data = [trace]
layout = go.Layout(
    title='Média de Idade vs Pclass',
    xaxis=dict(title='Pclass'),
    yaxis=dict(title='Idade'),
    hovermode='closest',
)
figure1 = go.Figure(data=data, layout=layout)

figure1.show()

In [14]:
# Fare distribution per passenger class, drawn as one distplot with a bin size
# of 1 for every group and the density curve disabled.
group_labels = ['1', '2', '3']
a, b, c = (df.loc[df['Pclass'] == pclass, 'Fare'] for pclass in (1, 2, 3))
hist_data = [a, b, c]

fig2 = ff.create_distplot(
    hist_data,
    group_labels,
    bin_size=[1, 1, 1],
    show_curve=False,
)
fig2.update_layout(title_text='Destribuição por Custo de Passagem separado por classe do Passageiro')

fig2.show()

In [15]:
# Survival rate by sex: normalised value counts of Survived for each sex, from
# which the share with Survived == 1 is plotted as a two-bar chart.
sHomem = df.loc[df["Sex"] == 'male', "Survived"].value_counts(normalize=True)
sMulher = df.loc[df["Sex"] == 'female', "Survived"].value_counts(normalize=True)

x0 = ['Homem', 'Mulher']
y0 = [rates[1] for rates in (sHomem, sMulher)]

data = [go.Bar(x=x0, y=y0)]
layout = go.Layout(
    title='Sobrevivência por Sexo',
    yaxis=dict(title='Taxa de Sobrevivência'),
    autosize=False,
    width=300,
    height=400,
)
fig3 = go.Figure(data=data, layout=layout)

fig3.show()

In [16]:
# Check the number of NaN values per column.
df.isna().sum()

PassengerId    0
Survived       0
Pclass         0
Name           0
Sex            0
Age            0
SibSp          0
Parch          0
Fare           0
Embarked       0
dtype: int64

In [17]:
# Start normalising the data for modelling.
# Reload the raw CSV so this cell does not depend on the rows/columns removed
# by the exploratory cells.
df = pd.read_csv("train.csv")

# Drop Cabin, Ticket and Name: Name and Ticket are identifier-like categorical
# values, and Cabin has too many NaN entries to fill in.
df = df.drop(['Ticket', 'Cabin', 'Name'], axis=1)

# Encode Sex as 0 = female, 1 = male.
# The result is assigned back to the column instead of calling
# `df['Sex'].replace(..., inplace=True)`: an in-place call on a column selection
# is chained assignment, which pandas warns about and which does not update
# `df` when copy-on-write is enabled.
df['Sex'] = df['Sex'].map({'female': 0, 'male': 1})

# Min-max scale each numeric column independently. The same scaler object is
# re-fitted per column, so after the loop it holds the fit of the last column.
# NOTE(review): the scaler is fitted on the full data before the train/test
# split, and PassengerId (an identifier) is kept as a feature — confirm both
# are intended.
scaler = MinMaxScaler()
for column in ['PassengerId', 'Pclass', 'Age', 'SibSp', 'Parch', 'Fare']:
    df[column] = scaler.fit_transform(df[column].values.reshape(-1, 1)).ravel()

# Encode Embarked as S = 0, C = 0.5, Q = 1 (missing values stay NaN).
df['Embarked'] = df['Embarked'].map({'S': 0, 'C': 0.5, 'Q': 1})

df

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0.000000,0,1.0,1,0.271174,0.125,0.000000,0.014151,0.0
1,0.001124,1,0.0,0,0.472229,0.125,0.000000,0.139136,0.5
2,0.002247,1,1.0,0,0.321438,0.000,0.000000,0.015469,0.0
3,0.003371,1,0.0,0,0.434531,0.125,0.000000,0.103644,0.0
4,0.004494,0,1.0,1,0.434531,0.000,0.000000,0.015713,0.0
...,...,...,...,...,...,...,...,...,...
886,0.995506,0,0.5,1,0.334004,0.000,0.000000,0.025374,0.0
887,0.996629,1,0.0,0,0.233476,0.000,0.000000,0.058556,0.0
888,0.997753,0,1.0,0,,0.125,0.333333,0.045771,0.0
889,0.998876,1,0.0,1,0.321438,0.000,0.000000,0.058556,0.5


In [18]:
# Fill the missing values using Multivariate Imputation by Chained Equations
# (MICE). mice() receives the frame's underlying array, so the result is wrapped
# back into a DataFrame carrying the original column labels and index.
df = pd.DataFrame(mice(df.to_numpy()), index=df.index, columns=df.columns)

df

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0.000000,0.0,1.0,1.0,0.271174,0.125,0.000000,0.014151,0.0
1,0.001124,1.0,0.0,0.0,0.472229,0.125,0.000000,0.139136,0.5
2,0.002247,1.0,1.0,0.0,0.321438,0.000,0.000000,0.015469,0.0
3,0.003371,1.0,0.0,0.0,0.434531,0.125,0.000000,0.103644,0.0
4,0.004494,0.0,1.0,1.0,0.434531,0.000,0.000000,0.015713,0.0
...,...,...,...,...,...,...,...,...,...
886,0.995506,0.0,0.5,1.0,0.334004,0.000,0.000000,0.025374,0.0
887,0.996629,1.0,0.0,0.0,0.233476,0.000,0.000000,0.058556,0.0
888,0.997753,0.0,1.0,0.0,0.292313,0.125,0.333333,0.045771,0.0
889,0.998876,1.0,0.0,1.0,0.321438,0.000,0.000000,0.058556,0.5


In [19]:
# Check the number of NaN values per column again to confirm the imputation
# filled every gap.
df.isna().sum()

PassengerId    0
Survived       0
Pclass         0
Sex            0
Age            0
SibSp          0
Parch          0
Fare           0
Embarked       0
dtype: int64

In [20]:
# Correlation matrix of the frame's columns, drawn as a heatmap.
df.corr().iplot(
    title="Matriz de Correlação",
    colorscale="Blues",
    kind='heatmap',
)

In [21]:
# Bubble chart of Age (y) against Fare (x), grouped by the Survived column and
# with bubble size taken from the Pclass column.
df.iplot(
    kind="bubble",
    x="Fare",
    y="Age",
    size='Pclass',
    categories="Survived",
    xTitle='Fare',
    yTitle='Age',
)


The pandas.np module is deprecated and will be removed from pandas in a future version. Import numpy directly instead


The pandas.np module is deprecated and will be removed from pandas in a future version. Import numpy directly instead



In [53]:
# Split the frame into features (every column except Survived) and target.
# `y` is kept as a one-column DataFrame, which is why `.values.ravel()` is used
# below to flatten it into the 1-D array passed to fit().
X = df.loc[:, df.columns != 'Survived']
y = df.loc[:, df.columns == 'Survived']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=42)

# Fit and score one uniform-weight KNN classifier for every k from 1 to 20.
# A single loop replaces twenty copy-pasted constructor / fit / print lines;
# the fitted models stay available in `knn_models`, keyed by k, and the printed
# lines keep the original "k: score" format.
knn_models = {}
for k in range(1, 21):
    model = KNeighborsClassifier(n_neighbors=k, weights='uniform')
    model.fit(X_train, y_train.values.ravel())
    knn_models[k] = model
    print(f"{k}: {model.score(X_test, y_test)!s}")

# Across the tested values (1 to 20), k=17 and k=19 returned the highest score;
# 19 is used for the final model.
# NOTE(review): k is selected on the same test split that is used to report the
# score, so that score is optimistic — consider a validation split or
# cross-validation.
knn = KNeighborsClassifier(n_neighbors=19, weights='uniform')
knn.fit(X_train, y_train.values.ravel())

1: 0.7835820895522388
2: 0.7910447761194029
3: 0.8022388059701493
4: 0.8022388059701493
5: 0.8059701492537313
6: 0.7910447761194029
7: 0.7985074626865671
8: 0.7985074626865671
9: 0.8097014925373134
10: 0.8059701492537313
11: 0.8097014925373134
12: 0.8059701492537313
13: 0.8059701492537313
14: 0.8022388059701493
15: 0.8059701492537313
16: 0.8097014925373134
17: 0.8134328358208955
18: 0.8097014925373134
19: 0.8134328358208955
20: 0.8022388059701493


In [72]:
# Plot the test-set score against the number of neighbours k (1 to 20).
#
# The loop uses its own variable (`candidate`) rather than `knn`: rebinding
# `knn` here would leave the k=20 model in place of the previously selected
# classifier once this cell has run.
k_values = list(range(1, 21))
score = []

for k in k_values:
    candidate = KNeighborsClassifier(n_neighbors=k)
    candidate.fit(X_train, y_train.values.ravel())
    score.append(candidate.score(X_test, y_test))

fig = go.Figure(data=go.Scatter(x=k_values, y=score, mode='lines+markers'))
fig.update_layout(title='Score vs Valor de K',
                   xaxis_title='Valor de K',
                   yaxis_title='Score')

fig.show()