In [46]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.svm import SVC
from sklearn.ensemble import VotingClassifier
from sklearn.model_selection import StratifiedKFold 

In [47]:
import xgboost as xgb
import lightgbm as lgb

In [48]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

print(len(train))
print(len(test))

891
418


1. 명목형 변수인 성별을 dummy 변수로 만들기

In [49]:
train= train.join(pd.get_dummies(train['Sex']).add_prefix('Sex_'))
test= test.join(pd.get_dummies(test['Sex']).add_prefix('Sex_'))

In [50]:
# drop axis의 경우 column이면 1, row이면 0이다.
train.drop('Sex', axis=1, inplace=True)
test.drop('Sex', axis=1, inplace=True)

In [51]:
train.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
Sex_female       0
Sex_male         0
dtype: int64

In [52]:
test.isnull().sum()

PassengerId      0
Pclass           0
Name             0
Age             86
SibSp            0
Parch            0
Ticket           0
Fare             1
Cabin          327
Embarked         0
Sex_female       0
Sex_male         0
dtype: int64

2. PassengerId 필요 없으니 Column 삭제

3. 결측값이 있는 Embarked가 포함된 데이터 행 삭제 
<br>Embarked : 승선한 항

In [53]:
train['Embarked'].unique()

array(['S', 'C', 'Q', nan], dtype=object)

In [54]:
train['Embarked'].fillna('S',inplace=True)

In [55]:
train.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         0
Sex_female       0
Sex_male         0
dtype: int64

In [56]:
train= train.join(pd.get_dummies(train['Embarked']).add_prefix('Embarked_'))
test= test.join(pd.get_dummies(test['Embarked']).add_prefix('Embarked_'))

In [57]:
train.drop('Embarked', axis=1, inplace=True)
test.drop('Embarked', axis=1, inplace=True)

In [58]:
test.head()

Unnamed: 0,PassengerId,Pclass,Name,Age,SibSp,Parch,Ticket,Fare,Cabin,Sex_female,Sex_male,Embarked_C,Embarked_Q,Embarked_S
0,892,3,"Kelly, Mr. James",34.5,0,0,330911,7.8292,,0,1,0,1,0
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",47.0,1,0,363272,7.0,,1,0,0,0,1
2,894,2,"Myles, Mr. Thomas Francis",62.0,0,0,240276,9.6875,,0,1,0,1,0
3,895,3,"Wirz, Mr. Albert",27.0,0,0,315154,8.6625,,0,1,0,0,1
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",22.0,1,1,3101298,12.2875,,1,0,0,0,1


4. Name에서 Mr, Miss, Mrs 등등을 추출하자 - 어차피 이름 자체는 큰 의미가 없으니까..

In [59]:
train['Name']=train.Name.str.extract('([A-Za-z]+)\.') 
test['Name']=test.Name.str.extract('([A-Za-z]+)\.') 

In [60]:
train['Name'].unique() # 종류별로 확인
# MIIe - Miss
# Mme - Mrs
# Jonkheer - 귀족 직함

array(['Mr', 'Mrs', 'Miss', 'Master', 'Don', 'Rev', 'Dr', 'Mme', 'Ms',
       'Major', 'Lady', 'Sir', 'Mlle', 'Col', 'Capt', 'Countess',
       'Jonkheer'], dtype=object)

In [61]:
train['Name'].replace(['Mlle','Mme'],['Miss','Mrs'],inplace=True)
test['Name'].replace(['Mlle','Mme'],['Miss','Mrs'],inplace=True)

In [62]:
train[train['Name']=='Mrs']

Unnamed: 0,PassengerId,Survived,Pclass,Name,Age,SibSp,Parch,Ticket,Fare,Cabin,Sex_female,Sex_male,Embarked_C,Embarked_Q,Embarked_S
1,2,1,1,Mrs,38.0,1,0,PC 17599,71.2833,C85,1,0,1,0,0
3,4,1,1,Mrs,35.0,1,0,113803,53.1000,C123,1,0,0,0,1
8,9,1,3,Mrs,27.0,0,2,347742,11.1333,,1,0,0,0,1
9,10,1,2,Mrs,14.0,1,0,237736,30.0708,,1,0,1,0,0
15,16,1,2,Mrs,55.0,0,0,248706,16.0000,,1,0,0,0,1
18,19,0,3,Mrs,31.0,1,0,345763,18.0000,,1,0,0,0,1
19,20,1,3,Mrs,,0,0,2649,7.2250,,1,0,1,0,0
25,26,1,3,Mrs,38.0,1,5,347077,31.3875,,1,0,0,0,1
31,32,1,1,Mrs,,1,0,PC 17569,146.5208,B78,1,0,1,0,0
40,41,0,3,Mrs,40.0,1,0,7546,9.4750,,1,0,0,0,1


In [63]:
train['Name'].replace(['Capt','Col','Countess','Don','Dr','Jonkheer','Lady','Major','Ms','Rev','Sir'],['Other', 'Other', 'Other', 'Other', 'Other', 'Other', 'Other', 'Other', 'Other', 'Other', 'Other'],inplace=True)
test['Name'].replace(['Capt','Col','Countess','Don','Dr','Jonkheer','Lady','Major','Ms','Rev','Sir', 'Dona'],['Other', 'Other', 'Other', 'Other', 'Other', 'Other', 'Other', 'Other', 'Other', 'Other', 'Other', 'Other'],inplace=True)

In [64]:
test['Name'].unique()

array(['Mr', 'Mrs', 'Miss', 'Master', 'Other'], dtype=object)

In [65]:
train.groupby('Name')['Age'].mean()

Name
Master     4.574167
Miss      21.804054
Mr        32.368090
Mrs       35.788991
Other     44.782609
Name: Age, dtype: float64

In [66]:
train.groupby('Pclass')['Fare'].mean()

Pclass
1    84.154687
2    20.662183
3    13.675550
Name: Fare, dtype: float64

In [67]:
train.loc[(train.Age.isnull())&(train.Name=='Mr'),'Age']=33
train.loc[(train.Age.isnull())&(train.Name=='Mrs'),'Age']=36
train.loc[(train.Age.isnull())&(train.Name=='Master'),'Age']=5
train.loc[(train.Age.isnull())&(train.Name=='Miss'),'Age']=22
train.loc[(train.Age.isnull())&(train.Name=='Other'),'Age']=45

test.loc[(test.Age.isnull())&(test.Name=='Mr'),'Age']=33
test.loc[(test.Age.isnull())&(test.Name=='Mrs'),'Age']=36
test.loc[(test.Age.isnull())&(test.Name=='Master'),'Age']=5
test.loc[(test.Age.isnull())&(test.Name=='Miss'),'Age']=22
test.loc[(test.Age.isnull())&(test.Name=='Other'),'Age']=45

test.loc[(test.Fare.isnull()),'Fare']= 14 # 아까 결측값 하나의 Pclass가 3이었음

In [68]:
train=train.join(pd.get_dummies(train['Name']))
test=test.join(pd.get_dummies(test['Name']))

In [69]:
test.isnull().sum()

PassengerId      0
Pclass           0
Name             0
Age              0
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          327
Sex_female       0
Sex_male         0
Embarked_C       0
Embarked_Q       0
Embarked_S       0
Master           0
Miss             0
Mr               0
Mrs              0
Other            0
dtype: int64

In [70]:
train.drop('Name',axis=1, inplace=True)
test.drop('Name',axis=1, inplace=True)

In [71]:
test.head()

Unnamed: 0,PassengerId,Pclass,Age,SibSp,Parch,Ticket,Fare,Cabin,Sex_female,Sex_male,Embarked_C,Embarked_Q,Embarked_S,Master,Miss,Mr,Mrs,Other
0,892,3,34.5,0,0,330911,7.8292,,0,1,0,1,0,0,0,1,0,0
1,893,3,47.0,1,0,363272,7.0,,1,0,0,0,1,0,0,0,1,0
2,894,2,62.0,0,0,240276,9.6875,,0,1,0,1,0,0,0,1,0,0
3,895,3,27.0,0,0,315154,8.6625,,0,1,0,0,1,0,0,1,0,0
4,896,3,22.0,1,1,3101298,12.2875,,1,0,0,0,1,0,0,0,1,0


5. Ticket: 티켓 번호

In [72]:
len(train.columns)

19

In [73]:
train.drop('Ticket', axis=1, inplace=True)
test.drop('Ticket', axis=1, inplace=True)

In [74]:
test.head()

Unnamed: 0,PassengerId,Pclass,Age,SibSp,Parch,Fare,Cabin,Sex_female,Sex_male,Embarked_C,Embarked_Q,Embarked_S,Master,Miss,Mr,Mrs,Other
0,892,3,34.5,0,0,7.8292,,0,1,0,1,0,0,0,1,0,0
1,893,3,47.0,1,0,7.0,,1,0,0,0,1,0,0,0,1,0
2,894,2,62.0,0,0,9.6875,,0,1,0,1,0,0,0,1,0,0
3,895,3,27.0,0,0,8.6625,,0,1,0,0,1,0,0,1,0,0
4,896,3,22.0,1,1,12.2875,,1,0,0,0,1,0,0,0,1,0


6. Cabin

In [75]:
train.drop('Cabin', axis=1, inplace=True)
test.drop('Cabin', axis=1, inplace=True)

In [76]:
test.head(30)

Unnamed: 0,PassengerId,Pclass,Age,SibSp,Parch,Fare,Sex_female,Sex_male,Embarked_C,Embarked_Q,Embarked_S,Master,Miss,Mr,Mrs,Other
0,892,3,34.5,0,0,7.8292,0,1,0,1,0,0,0,1,0,0
1,893,3,47.0,1,0,7.0,1,0,0,0,1,0,0,0,1,0
2,894,2,62.0,0,0,9.6875,0,1,0,1,0,0,0,1,0,0
3,895,3,27.0,0,0,8.6625,0,1,0,0,1,0,0,1,0,0
4,896,3,22.0,1,1,12.2875,1,0,0,0,1,0,0,0,1,0
5,897,3,14.0,0,0,9.225,0,1,0,0,1,0,0,1,0,0
6,898,3,30.0,0,0,7.6292,1,0,0,1,0,0,1,0,0,0
7,899,2,26.0,1,1,29.0,0,1,0,0,1,0,0,1,0,0
8,900,3,18.0,0,0,7.2292,1,0,1,0,0,0,0,0,1,0
9,901,3,21.0,2,0,24.15,0,1,0,0,1,0,0,1,0,0


In [77]:
train['Family']=(train['SibSp']+train['Parch']+1)
test['Family']=(test['SibSp']+test['Parch']+1)

In [78]:
train.drop('SibSp', axis=1, inplace=True)
train.drop('Parch', axis=1, inplace=True)

test.drop('SibSp', axis=1, inplace=True)
test.drop('Parch', axis=1, inplace=True)

In [79]:
train['Fare']=train['Fare'].astype('int')
train['Age']=train['Age'].astype('int')

In [80]:
test['Fare']=test['Fare'].astype('int')
test['Age']=test['Age'].astype('int')

In [81]:
test.head()

Unnamed: 0,PassengerId,Pclass,Age,Fare,Sex_female,Sex_male,Embarked_C,Embarked_Q,Embarked_S,Master,Miss,Mr,Mrs,Other,Family
0,892,3,34,7,0,1,0,1,0,0,0,1,0,0,1
1,893,3,47,7,1,0,0,0,1,0,0,0,1,0,2
2,894,2,62,9,0,1,0,1,0,0,0,1,0,0,1
3,895,3,27,8,0,1,0,0,1,0,0,1,0,0,1
4,896,3,22,12,1,0,0,0,1,0,0,0,1,0,3


In [82]:
train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Age,Fare,Sex_female,Sex_male,Embarked_C,Embarked_Q,Embarked_S,Master,Miss,Mr,Mrs,Other,Family
0,1,0,3,22,7,0,1,0,0,1,0,0,1,0,0,2
1,2,1,1,38,71,1,0,1,0,0,0,0,0,1,0,2
2,3,1,3,26,7,1,0,0,0,1,0,1,0,0,0,1
3,4,1,1,35,53,1,0,0,0,1,0,0,0,1,0,2
4,5,0,3,35,8,0,1,0,0,1,0,0,1,0,0,1


In [83]:
t = test['PassengerId']
train.drop('PassengerId', axis=1, inplace=True)
test.drop('PassengerId', axis=1, inplace=True)

In [84]:
y = train['Survived']
x = train.drop('Survived', axis=1)

In [85]:
X_train, X_test, Y_train, Y_test = train_test_split(x, y, test_size=0.3, random_state=100)

In [86]:
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV

In [87]:
estimator = lgb.LGBMClassifier(learning_rate = 0.125, metric = 'l1', 
                        n_estimators = 20, num_leaves = 38)

param_grid = {
    'n_estimators': [x for x in range(20, 40, 2)],
    'learning_rate': [0.05, 0.10, 0.125, 0.15, 0.175, 0.2],
    'max_depth': [7, 8, 9, 10, 11, 12, 13, 14, 15, 16] }
gridsearch = GridSearchCV(estimator, param_grid)

gridsearch.fit(X_train, Y_train)
y_pred = gridsearch.predict(X_test)
print(accuracy_score(Y_test, y_pred))



0.8246268656716418


In [92]:
kfold = StratifiedKFold(n_splits=10) # n_splits=10, kernel = without poly

SVMC = SVC(probability=True)
svc_param_grid = {'kernel': ['rbf'], 
                  'gamma': [ 0.001, 0.01, 0.1, 1],
                  'C': [1, 10, 50, 100,200,300, 1000]}

gsSVMC = GridSearchCV(SVMC,param_grid = svc_param_grid, cv=kfold, scoring="accuracy", n_jobs= 4, verbose = 1)

gsSVMC.fit(X_train,Y_train)

SVMC_best = gsSVMC.best_estimator_

# Best score
print(gsSVMC.best_score_)

y_pred = SVMC_best.predict(X_test)
print(accuracy_score(Y_test, y_pred))

Fitting 10 folds for each of 28 candidates, totalling 280 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done 144 tasks      | elapsed:    5.1s
[Parallel(n_jobs=4)]: Done 273 out of 280 | elapsed:   15.3s remaining:    0.3s
[Parallel(n_jobs=4)]: Done 280 out of 280 | elapsed:   15.6s finished


0.797752808988764
0.7985074626865671


In [99]:
xgb_clf = XGBClassifier()
param_grid = {
    'n_estimators': [x for x in range(20, 40, 2)],
    'learning_rate': [0.05, 0.10, 0.125, 0.15, 0.175, 0.2],
    'max_depth': [7, 8, 9, 10, 11, 12, 13, 14, 15, 16] }
gridsearch = GridSearchCV(xgb_clf, param_grid)
gridsearch.fit(X_train,Y_train)
xgb_clf = gridsearch.best_estimator_
y_pred=xgb_clf.predict(X_test)
print(accuracy_score(Y_test,y_pred))



0.8097014925373134


In [111]:
rf_clf = RandomForestClassifier(n_estimators=200, max_depth=10, min_samples_leaf=1, random_state=0)
#dt_clf = DecisionTreeClassifier(max_depth=20)
#svm_clf = SVC(random_state=42)

In [112]:
rf_clf2

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=11, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=7, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=150,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [113]:
rf_clf = RandomForestClassifier()
param_grid = {
    'n_estimators': [x for x in range(100, 200, 10)],
    'max_depth': [7, 8, 9, 10, 11, 12, 13, 14, 15],
    'min_samples_leaf': [3, 4, 5, 6, 7, 8, 9, 10]}
gridsearch = GridSearchCV(rf_clf, param_grid)
gridsearch.fit(X_train,Y_train)
rf_clf2 = gridsearch.best_estimator_
y_pred=rf_clf2.predict(X_test)
print(accuracy_score(Y_test,y_pred))



0.8246268656716418


In [122]:
voting_clf = VotingClassifier(
    estimators=[('rf', rf_clf2), ('lgbm',  gridsearch)],
    voting='soft')

In [123]:
voting_clf.fit(X_train, Y_train)



VotingClassifier(estimators=[('rf',
                              RandomForestClassifier(bootstrap=True,
                                                     class_weight=None,
                                                     criterion='gini',
                                                     max_depth=8,
                                                     max_features='auto',
                                                     max_leaf_nodes=None,
                                                     min_impurity_decrease=0.0,
                                                     min_impurity_split=None,
                                                     min_samples_leaf=9,
                                                     min_samples_split=2,
                                                     min_weight_fraction_leaf=0.0,
                                                     n_estimators=100,
                                                     n_jobs=None,
             

In [119]:
prediction = voting_clf.predict(test)

In [116]:
prediction = rf_clf2.predict(test)

In [120]:
submission = pd.DataFrame({
    'PassengerId': t,
    'Survived' : prediction
})

In [121]:
submission.to_csv('submission_rf_clf.csv',index=False)

In [None]:
rf_clf.fit(X_train, Y_train)

In [None]:
dt_clf.fit(X_train, Y_train)

In [None]:
y_pred_rf= rf_clf.predict(X_test)

In [None]:
y_pred_dt= dt_clf.predict(X_test)

In [None]:
rf_accuracy = accuracy_score(Y_test, y_pred_rf)
dt_accuracy = accuracy_score(Y_test, y_pred_dt)
#print(rf_accuracy)
print(f'Mean accuracy score: {rf_accuracy:.3}')
print(f'Mean accuracy score: {dt_accuracy:.3}')