In [292]:
import warnings
warnings.filterwarnings(action='ignore')
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.cluster import KMeans

In [384]:
df = pd.read_csv('data/titanic.csv')

In [385]:
df.head()
# df.columns
# df.shape
# df.SibSp.value_counts()
data = df[['Pclass', 'Sex', 'SibSp', 'Parch','Fare', 'Cabin', 'Embarked', 'Age', 'Survived', 'Name']]
feature_cols = ['Pclass', 'Sex', 'SibSp', 'Parch','Fare', 'Cabin', 'Embarked', 'Age']
target_col = ['Survived']

In [386]:
# data.dtypes
data.head()

data.isna().sum() # null, na, Nan, None
## Embarked의 빈 row 제거
data = data.loc[~data.Embarked.isna(),:]

## Cabin은 column을 제거
# data = data.loc[:,['Pclass', 'Sex', 'SibSp', 'Parch','Fare', 'Embarked', 'Age']]
data.drop('Cabin', axis=1, inplace=True)

## Age는 데이터를 imputation
# data.loc[data.Age.isna(),'Age'] = data.Age.mean()
data.Age.fillna(data.Age.mean(),inplace=True)


# 생존자 나이 평균을 생존자 나이에 넣고
new_data = df.groupby(['Survived','Sex']).mean().Age.reset_index()

# 사망자 나이 평균을 사망자 나이에
# df.loc[df.Age.isna(),:]
data.loc[(data.Survived==0) & (data.Sex=='female') & (data.Age.isna()), 'Age'] = new_data.loc[(new_data.Survived==0) & (new_data.Sex=='female'), 'Age'].values[0]
data.loc[(data.Survived==1) & (data.Sex=='female') & (data.Age.isna()), 'Age'] = new_data.loc[(new_data.Survived==1) & (new_data.Sex=='female'), 'Age'].values[0]
data.loc[(data.Survived==0) & (data.Sex=='male') & (data.Age.isna()), 'Age'] = new_data.loc[(new_data.Survived==0) & (new_data.Sex=='male'), 'Age'].values[0]
data.loc[(data.Survived==1) & (data.Sex=='male') & (data.Age.isna()), 'Age'] = new_data.loc[(new_data.Survived==1) & (new_data.Sex=='male'), 'Age'].values[0]
data.reset_index(drop=True, inplace=True)


In [387]:
# plt.boxplot(data.Age)
# plt.hist(data.Age)

# np.percentile(data.Age, 50)

In [388]:
# pd.cut(data.Age, [0, np.percentile(data.Age, 25), np.percentile(data.Age, 75), np.percentile(data.Age, 90), data.Age.max()])
## percentile을 사용해서 Age를 그룹화

newAge = np.where(data.Age > np.percentile(data.Age, 90), 0,
                np.where(data.Age > np.percentile(data.Age, 75), 1,
                        np.where(data.Age > np.percentile(data.Age, 25), 2,
                                                         np.where(data.Age > np.percentile(data.Age, 10), 3, 4))))
# ## clustering을 사용해서 Age를 그룹화
# km = KMeans(4)
# km.fit(data[['Age']])
# ageCluster = km.labels_

# ageClusterOh = np.zeros((ageCluster.size, np.unique(ageCluster).size))
# for k,v in enumerate(ageCluster):
#     ageClusterOh[k,v] = 1


newAgeOh = np.zeros((newAge.size, np.unique(newAge).size))
for k,v in enumerate(newAge):
    newAgeOh[k,v] = 1
    

    
newPclass = np.zeros((data.Pclass.size, data.Pclass.unique().size))
for k, v in enumerate(data.Pclass):
    newPclass[k, v-1] = 1
    
newSibSp = np.where(data.SibSp == 0, 0, 1)

mmScaledFare = (data.Fare - data.Fare.min()) / (data.Fare.max() - data.Fare.min())

newEmbarked = np.where(data.Embarked == data.Embarked.unique()[0],0,
                        np.where(data.Embarked == data.Embarked.unique()[1],1,2))
newEmbarkedOh = np.zeros((newEmbarked.size, np.unique(newEmbarked).size))
for k,v in enumerate(newEmbarked):
    newEmbarkedOh[k, v] = 1

newSex = np.where(data.Sex == data.Sex.unique()[0],0,1)

## 결혼 유무
married = np.where((data.Name.str.contains('Mrs.')) | (data.Name.str.contains('Mr.') & (data.SibSp > 0)), 1, 0)

newCol = np.c_[newPclass, newSibSp, mmScaledFare, newAgeOh, newEmbarkedOh, newSex, married]
newData = pd.concat([pd.DataFrame(newCol), data], axis=1)
newData.drop(['Age', 'Sex', 'Embarked', 'Fare', 'Pclass', 'Name'], axis=1, inplace=True)


In [390]:
newData

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,SibSp,Parch,Survived
0,0.0,0.0,1.0,1.0,0.014151,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,1,0,0
1,1.0,0.0,0.0,1.0,0.139136,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,1,0,1
2,0.0,0.0,1.0,0.0,0.015469,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0,0,1
3,1.0,0.0,0.0,1.0,0.103644,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,1,0,1
4,0.0,0.0,1.0,0.0,0.015713,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
884,0.0,1.0,0.0,0.0,0.025374,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0,0,0
885,1.0,0.0,0.0,0.0,0.058556,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0,0,1
886,0.0,0.0,1.0,1.0,0.045771,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1,2,0
887,1.0,0.0,0.0,0.0,0.058556,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0,0,1


In [299]:
X = newData.iloc[:,:-1]
y = newData.iloc[:,-1]

In [419]:
## train test 분리
# 8:2 (5000건 이상이면 7:3도 가능)

new_index = np.random.choice(X.index, X.index.size, replace=False)

In [420]:
tr_idx = new_index[:int(new_index.size*0.8)]
te_idx = new_index[int(new_index.size*0.8):]
X_train = X.loc[tr_idx,:]
X_test = X.loc[te_idx,:]
y_train = y[tr_idx]
y_test = y[te_idx]

In [391]:
logreg = LogisticRegression()
rf_model = RandomForestClassifier()
svm_model = SVC(probability=True)

In [392]:
cross_val_score(logreg,X,y,cv=5).mean()

0.8020504030978225

In [398]:
cross_val_score(rf_model,X,y,cv=5).mean()

0.7953024820669079

In [305]:
logreg = LogisticRegression()

In [395]:
logreg.fit(X,y)

LogisticRegression()

In [399]:
rf_model.fit(X,y)

RandomForestClassifier()

In [308]:
svm_model.fit(X,y)

SVC(probability=True)

In [396]:
logreg.predict_proba(X)[2,:]
# logreg.predict(X)[2]

array([0.39075193, 0.60924807])

In [397]:
y[2]

1

In [400]:
rf_model.predict_proba(X.loc[[2],:])

array([[0.08988095, 0.91011905]])

In [401]:
rf_model.predict_proba(X)[2,:]

array([0.08988095, 0.91011905])

In [311]:
logreg.predict_proba(X.loc[[2],:])

array([[0.39075193, 0.60924807]])

In [312]:
svm_model.predict_proba(X.loc[[2],:])

array([[0.68342208, 0.31657792]])

In [313]:
## 나이와 성별

In [314]:
logreg.predict(X)[np.abs(logreg.predict_proba(X)[:,0] - logreg.predict_proba(X)[:,1]) <= 0.1]

array([1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 1,
       1, 1, 0, 0, 0, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1])

In [315]:
rf_model.feature_importances_[np.argmax(rf_model.feature_importances_)]

0.2660285245361509

In [316]:
X

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,11,12,13,14,15,16,17,SibSp,Parch,Fare
0,0.0,0.0,1.0,1.0,0.014151,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1,0,7.2500
1,1.0,0.0,0.0,1.0,0.139136,0.0,1.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,1.0,0.0,1.0,1,0,71.2833
2,0.0,0.0,1.0,0.0,0.015469,0.0,0.0,1.0,0.0,0.0,...,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0,0,7.9250
3,1.0,0.0,0.0,1.0,0.103644,0.0,0.0,1.0,0.0,0.0,...,0.0,1.0,0.0,1.0,0.0,0.0,1.0,1,0,53.1000
4,0.0,0.0,1.0,0.0,0.015713,0.0,0.0,1.0,0.0,0.0,...,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0,0,8.0500
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
884,0.0,1.0,0.0,0.0,0.025374,0.0,0.0,1.0,0.0,0.0,...,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0,0,13.0000
885,1.0,0.0,0.0,0.0,0.058556,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0,0,30.0000
886,0.0,0.0,1.0,1.0,0.045771,0.0,0.0,1.0,0.0,0.0,...,0.0,1.0,0.0,1.0,0.0,0.0,1.0,1,2,23.4500
887,1.0,0.0,0.0,0.0,0.058556,0.0,0.0,1.0,0.0,0.0,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0,0,30.0000


In [317]:
## Accuracy로 평가
## 예측값과 실제값의 차이 두 값이 같은가(1) 다른가(0)
y_hat = logreg.predict(X_test) # 예측값
y #실제값

sum(y_hat == y_test)/len(y_test)

0.8033707865168539

In [321]:
## Fare 기준으로
## 아이, 0인 것 제외
## 상관성 0,1
## 선형회귀 사용


In [323]:
new_df = df.loc[df.Fare > 0, ['Fare', 'Survived']]

In [327]:
lm = LinearRegression()

In [330]:
lm.fit(new_df[['Fare']], new_df.Survived)

LinearRegression()

In [337]:
sum(np.where(lm.predict(new_df[['Fare']]) >= 0.5, 1, 0) == new_df.Survived) / new_df.Survived.size

0.6655251141552512

In [341]:
df[df.Name.str.contains('Mrs')]

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
8,9,1,3,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",female,27.0,0,2,347742,11.1333,,S
9,10,1,2,"Nasser, Mrs. Nicholas (Adele Achem)",female,14.0,1,0,237736,30.0708,,C
15,16,1,2,"Hewlett, Mrs. (Mary D Kingcome)",female,55.0,0,0,248706,16.0000,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
871,872,1,1,"Beckwith, Mrs. Richard Leonard (Sallie Monypeny)",female,47.0,1,1,11751,52.5542,D35,S
874,875,1,2,"Abelson, Mrs. Samuel (Hannah Wizosky)",female,28.0,1,0,P/PP 3381,24.0000,,C
879,880,1,1,"Potter, Mrs. Thomas Jr (Lily Alexenia Wilson)",female,56.0,0,1,11767,83.1583,C50,C
880,881,1,2,"Shelley, Mrs. William (Imanita Parrish Hall)",female,25.0,0,1,230433,26.0000,,S


In [367]:
for i in range(len(df.Name)):
    x = np.array(df.Name.str.split(' ')[i])[np.where(np.array(df.Name.str.split(' ')[i]) == 'Mr.')[0]+1]

In [432]:
rf_model = RandomForestClassifier()
svm_model = SVC(probability=True)
lr_model = LogisticRegression()

In [433]:
rf_model.fit(X,y)
svm_model.fit(X,y)
lr_model.fit(X,y)

LogisticRegression()

In [437]:
rf_pr = rf_model.predict(X)
svm_pr = svm_model.predict(X)
lr_pr = lr_model.predict(X)

In [438]:
rf_prb = rf_model.predict_proba(X)
svm_prb = svm_model.predict_proba(X)
lr_prb = lr_model.predict_proba(X)

In [441]:
soft_vote_result = (rf_prb + svm_prb + lr_prb)/3
soft_vote = np.array((soft_vote_result[:,0] < soft_vote_result[:,1]), dtype='int')

In [428]:
hard_vote = np.where((rf_pr + svm_pr + lr_pr) > 1, 1, 0)

In [445]:
hard_vote_acc = sum(hard_vote == y)/y.size
soft_vote_acc = sum(soft_vote == y)/y.size

In [446]:
rf_acc = sum(rf_pr==y)/y.size
svm_acc = sum(svm_pr==y)/y.size
lr_acc = sum(lr_pr==y)/y.size

In [447]:
print('rf: ', rf_acc)
print('svm: ', svm_acc)
print('lr: ', lr_acc)
print('hard_vote: ', hard_vote_acc)
print('soft_vote: ', soft_vote_acc)

rf:  0.9448818897637795
svm:  0.6771653543307087
lr:  0.81214848143982
hard_vote:  0.8830146231721034
soft_vote:  0.905511811023622


In [422]:
rf_model.fit(X_train, y_train)

RandomForestClassifier()

In [423]:
rf_model.score(X_test, y_test)

0.8202247191011236