In [1]:
import pandas as pd
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = 'all'
import numpy as np
from tqdm import tqdm
import warnings
warnings.filterwarnings('ignore')
import tensorflow as tf
import random

In [2]:
# Read the Titanic training and test splits from the local data folder.
train_path, test_path = '타이타닉 data/train.csv', '타이타닉 data/test.csv'
train, test = pd.read_csv(train_path), pd.read_csv(test_path)

In [3]:
from datetime import datetime
# 1. Logistic regression
from sklearn.linear_model import LogisticRegression
# 2. Decision tree
from sklearn.tree import DecisionTreeClassifier
# 3. Support vector machine
from sklearn.svm import SVC
# 4. Gaussian naive bayes
from sklearn.naive_bayes import GaussianNB
# 5. K nearest neighbor
from sklearn.neighbors import KNeighborsClassifier
# 6. Random forest
from sklearn.ensemble import RandomForestClassifier
# 7. Gradient boosting
from sklearn.ensemble import GradientBoostingClassifier
# 8. Neural network
from sklearn.neural_network import MLPClassifier
import lightgbm as lgb
from xgboost import XGBClassifier # decision tree 앙상블 모델, 부스팅
# [light gbm 장점, 하이퍼파라미터 튜닝 방안](https://ariz1623.tistory.com/209)
# LightGBM도 XGBoost와 동일하게 조기 중단 수행 가능.
# [XGBoost와 LightGBM 하이퍼파라미터 튜닝 가이드](https://psystat.tistory.com/131)


from sklearn.model_selection import GridSearchCV

from sklearn.dummy import DummyClassifier

# 모델 평가 지표 scoring metrics
from sklearn.model_selection import cross_val_score # model 검증

from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import f1_score
from sklearn.metrics import jaccard_score

import ast # convert string to function

# [scikit-learn classifier metrics](https://scikit-learn.org/stable/modules/model_evaluation.html)
# [classification metrics 1](https://hong-yp-ml-records.tistory.com/29)
# [classification metrics 2](https://sw-data.tistory.com/23) -> regression metrics 참고

In [10]:
# Compare the column sets of both splits; per the output below, the test split
# has every training column except the 'Survived' target.
train.columns
test.columns

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')

Index(['PassengerId', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch',
       'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')

In [26]:
# Dtypes and non-null counts of the raw training frame (reveals which columns have missing values).
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [4]:
# Impute missing numeric values with the *training* column means.
# numeric_only=True keeps this working on pandas >= 2.0, where DataFrame.mean()
# raises TypeError on object columns (Name, Sex, Ticket, ...) instead of skipping them.
numeric_means = train.mean(numeric_only=True)
train = train.fillna(numeric_means)
# Fill any missing numeric values in the test split with the same training means so
# both splits are preprocessed identically (columns absent from `test` are ignored).
test = test.fillna(numeric_means)

In [35]:
# Category frequencies of the two object columns that still contain missing values.
train['Cabin'].value_counts()
train['Embarked'].value_counts()

B96 B98        4
C23 C25 C27    4
G6             4
E101           3
F33            3
              ..
B50            1
C50            1
A19            1
D15            1
D45            1
Name: Cabin, Length: 147, dtype: int64

S    644
C    168
Q     77
Name: Embarked, dtype: int64

In [5]:
# Replace missing embarkation ports with 'S', the most frequent value in the counts above.
train = train.fillna({'Embarked': 'S'})

In [7]:
# Mark unknown cabins with the literal string 'NaN' so the column contains no nulls.
train = train.fillna({'Cabin': 'NaN'})

In [8]:
# Per-column count of remaining missing values in the training frame.
train.isna().sum()

PassengerId    0
Survived       0
Pclass         0
Name           0
Sex            0
Age            0
SibSp          0
Parch          0
Ticket         0
Fare           0
Cabin          0
Embarked       0
dtype: int64

In [9]:
# Sex ratio: passenger counts per sex (values are still the string labels here).
train['Sex'].value_counts()

male      577
female    314
Name: Sex, dtype: int64

In [10]:
# Encode Sex as integers (male -> 0, female -> 1) in both splits.
# The dtype guard makes the cell idempotent: calling .map() again on an
# already-encoded column would turn every value into NaN.
sex_codes = {'male': 0, 'female': 1}
for frame in (train, test):
    if not pd.api.types.is_numeric_dtype(frame['Sex']):
        frame['Sex'] = frame['Sex'].map(sex_codes)

In [13]:
# Encode Embarked as integers (S -> 1, C -> 2, Q -> 3) in both splits.
# The dtype guard makes the cell idempotent: calling .map() again on an
# already-encoded column would turn every value into NaN.
embarked_codes = {'S': 1, 'C': 2, 'Q': 3}
for frame in (train, test):
    if not pd.api.types.is_numeric_dtype(frame['Embarked']):
        frame['Embarked'] = frame['Embarked'].map(embarked_codes)

In [15]:
# Confirm the encoding left only the three expected integer codes.
train['Embarked'].unique()
# train['Ticket'].unique()
# train['Cabin'].unique()

array([1, 2, 3], dtype=int64)

In [11]:
# Re-check dtypes and non-null counts after the cleaning steps.
# NOTE(review): the saved output below still lists Embarked as object, i.e. this cell
# was last executed (In [11]) before the Embarked encoding cell (In [13]).
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    int64  
 5   Age          891 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        891 non-null    object 
 11  Embarked     891 non-null    object 
dtypes: float64(2), int64(6), object(4)
memory usage: 83.7+ KB


In [16]:
# Model inputs: one shared feature list so the train and test matrices cannot drift apart.
# 'Ticket' and 'Cabin' are left out.
feature_cols = ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked']
X_train = train[feature_cols]
X_test = test[feature_cols]
# 1-D target: scikit-learn style estimators expect y of shape (n_samples,); a
# single-column DataFrame triggers a column-vector conversion warning.
y_train = train['Survived']


In [17]:
# Fix the seed so the run is reproducible.
user_seed = 0
random.seed(user_seed)
LGBM = lgb.LGBMClassifier(random_state=user_seed)

# The test split has no 'Survived' labels, so the model is fitted without an
# eval_set / early stopping (candidate eval_metric values: logloss, auc, error).
model = LGBM.fit(X_train, y_train)
# Predict with all boosting rounds: no early stopping was run, so there is no
# meaningful best iteration to pass as num_iteration.
y_pred_test = model.predict(X_test)

# Note: the classifier only accepts numeric input. Passing the raw frame fails with
# "DataFrame.dtypes for data must be int, float or bool.
#  Did not expect the data types in the following fields: Sex, Ticket, Cabin, Embarked"

In [18]:
# Put the predicted labels next to the passenger ids (concat aligns both on the row index).
predicted = pd.DataFrame(y_pred_test)
findata = pd.concat([test['PassengerId'], predicted], axis=1)

In [19]:
# Give the submission frame the column names expected for the output file.
findata = findata.set_axis(['PassengerId', 'Survived'], axis=1)


In [20]:
# Preview the submission frame (PassengerId + predicted Survived).
findata

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,1
4,896,0
...,...,...
413,1305,1
414,1306,1
415,1307,0
416,1308,1


In [21]:
# Write the submission file without the row index column.
findata.to_csv('2nd_submission.csv', index = False)

In [None]:
# Module-level accumulator: my_classifier() appends to it and returns it, so rows
# from earlier calls are kept unless this cell is re-run.
results = []


def _build_models(seed, n_jobs=64):
    """Create the name -> unfitted estimator registry that my_classifier picks from.

    Keys are the names used in the model list; looking estimators up here replaces
    the former ``eval(model_nm)`` on local variable names.
    """
    return {
        # Reference for model training: https://taek98.tistory.com/15
        'Logistic_Regression': LogisticRegression(n_jobs=n_jobs,
                                                  random_state=seed,
                                                  max_iter=10),
        'DecisionTree': DecisionTreeClassifier(max_depth=10,
                                               random_state=seed,
                                               max_leaf_nodes=4),
        'RandomForest': RandomForestClassifier(n_estimators=10,
                                               max_depth=10,
                                               random_state=seed,
                                               max_leaf_nodes=4,
                                               n_jobs=n_jobs),
        'GradientBoosting': GradientBoostingClassifier(n_estimators=10,
                                                       max_depth=10,
                                                       random_state=seed,
                                                       max_leaf_nodes=4,
                                                       learning_rate=0.05),
        'SVM': SVC(max_iter=1000,
                   random_state=seed),
        'Gaussian_NB': GaussianNB(),
        'KNeighbors': KNeighborsClassifier(n_jobs=n_jobs),
        # early_stopping is on, hence the generous max_iter.
        'MLP': MLPClassifier(max_iter=1000,
                             batch_size=10000,
                             shuffle=True,
                             random_state=seed,
                             early_stopping=True),
        # XGBoost hyper-parameters: https://xgboost.readthedocs.io/en/stable/python/python_api.html
        'XGBoost': XGBClassifier(n_jobs=n_jobs,
                                 max_depth=10,
                                 n_estimators=10,
                                 learning_rate=0.05,
                                 random_state=seed),
        # max_depth limits tree depth against overfitting; a small learning_rate with a
        # larger n_estimators is the basic boosting tuning recipe.
        # https://lightgbm.readthedocs.io/en/latest/pythonapi/lightgbm.LGBMClassifier.html
        'LGBM': lgb.LGBMClassifier(num_leaves=15,
                                   learning_rate=0.05,
                                   n_estimators=10,
                                   max_depth=10,
                                   random_state=seed),
        # strategy options: 'stratified', 'uniform', 'most_frequent'
        'dummy': DummyClassifier(strategy='stratified', random_state=seed),
    }


def _split_fold(X, y, start, stop, n_test):
    """Slice rows [start, stop) out of X / y and split them into shuffled train / test folds.

    The last ``n_test`` rows of the slice form the test fold, the remaining leading rows
    the train fold. Row order inside each fold is shuffled with the global ``random``
    state (train indices first, then test indices); X and y use the same index order so
    they stay aligned. Targets are returned as 1-D numpy arrays.
    """
    fold_X = X.iloc[start:stop].reset_index(drop=True)
    fold_y = y.iloc[start:stop].reset_index(drop=True)

    # Index labels after reset_index: train rows are 0..n_train-1, test rows follow.
    n_train = stop - (n_test + start)
    train_idx = list(range(n_train))
    random.shuffle(train_idx)
    test_idx = list(range(n_train, stop - start))
    random.shuffle(test_idx)

    X_train = fold_X.iloc[:-n_test].loc[train_idx]
    X_test = fold_X.iloc[-n_test:].loc[test_idx]
    # 1-D targets: estimators expect y of shape (n_samples,), not a column vector.
    y_train = np.asarray(fold_y.iloc[:-n_test].loc[train_idx]).ravel()
    y_test = np.asarray(fold_y.iloc[-n_test:].loc[test_idx]).ravel()
    return X_train, X_test, y_train, y_test


def my_classifier():
    """Fit each selected classifier on every fold of ``slicing_dic`` and collect test metrics.

    Reads two module-level names that are not defined in the visible notebook:
    ``d1221`` (a DataFrame; columns 2..-2 are used as features, the last column as the
    target) and ``slicing_dic`` (each value is indexed as [0] = start row, [1] = stop row,
    [2] = number of trailing rows used as the test fold).
    NOTE(review): presumably each fold is a window of consecutive years with the last
    year held out -- confirm against the cell that builds ``slicing_dic``.

    Returns:
        The module-level ``results`` list, extended with one
        ``[run_index, model_name, accuracy, roc_auc]`` row per (model, fold).
    """
    X = d1221.iloc[:, 2:-1]
    y = d1221.iloc[:, -1]

    # Fix the seed; the fold shuffles below draw from the global `random` state.
    user_seed = 0
    random.seed(user_seed)

    models = _build_models(user_seed)

    # 'LGBM' and 'KNeighbors' are excluded because they kept crashing the server.
    my_model_list = ['MLP', 'DecisionTree', 'RandomForest', 'GradientBoosting',
                     'XGBoost', 'dummy', 'Logistic_Regression', 'SVM', 'Gaussian_NB']

    i = 0

    for model_nm in tqdm(my_model_list):
        print(f'<<<---{model_nm} start--->>>')
        print(datetime.now().strftime('%H:%M:%S'))
        estimator = models[model_nm]
        cntcnt = 0
        # One fit per fold; the number of folds follows from how slicing_dic was built.
        for slicer in slicing_dic:
            print(f'<<<---{slicer} start--->>>')
            print(datetime.now().strftime('%H:%M:%S'))
            bounds = slicing_dic[slicer]
            X_train, X_test, y_train, y_test = _split_fold(X, y, bounds[0], bounds[1], bounds[2])

            # NOTE(review): passing eval_metric / early_stopping_rounds to fit() is the
            # legacy API; newer LightGBM / XGBoost releases expect callbacks or
            # constructor arguments instead -- confirm against the installed versions.
            if model_nm == 'LGBM':
                model_ = estimator.fit(X_train, y_train,
                                       eval_set=[(X_test, y_test)],
                                       eval_metric='auc',
                                       early_stopping_rounds=5)
                # Early stopping was requested, so predict with the best iteration.
                y_pred_test = model_.predict(X_test, num_iteration=model_.best_iteration_)
            elif model_nm == 'XGBoost':
                cntcnt += 1
                print(f'$$$$$---{cntcnt}')
                model_ = estimator.fit(X_train, y_train,
                                       eval_set=[(X_test, y_test)],
                                       eval_metric='auc',
                                       early_stopping_rounds=5)
                y_pred_test = model_.predict(X_test)
            else:
                cntcnt += 1
                print(f'$$$$$---{cntcnt}')
                model_ = estimator.fit(X_train, y_train)
                y_pred_test = model_.predict(X_test)

            accuracy = accuracy_score(y_test, y_pred_test)
            # NOTE(review): AUC is computed from hard class labels, not scores or
            # probabilities; kept as-is so reported numbers do not change.
            roc_auc = roc_auc_score(y_test, y_pred_test)

            results.append([i, f'{model_nm}', accuracy, roc_auc])
            i += 1

        print(results[-7:])
    return results