In [1]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline

In [2]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
from matplotlib import rcParams
import seaborn as sns

import warnings

In [3]:
pd.set_option('max_columns', 100)
pd.set_option('display.precision', 4)

rcParams['figure.figsize'] = (16, 8)
plt.style.use('fivethirtyeight')
sns.set(font_scale=2.5)

warnings.filterwarnings('ignore')

# 데이터 로드

In [4]:
df_train = pd.read_csv('../input/dankook/train.csv', index_col=0)
df_test = pd.read_csv('../input/dankook/test.csv', index_col=0)

## 학습/테스트 데이터 결합

In [5]:
dataset = pd.concat([df_train,df_test], axis=0)
df_train_len = len(df_train)

dataset.fillna(-1, inplace=True)

In [6]:
dataset['nObserve'] = dataset['nObserve'].apply(np.log1p)

In [7]:
dataset['d_dered_u'] = dataset['dered_u'] - dataset['u']
dataset['d_dered_g'] = dataset['dered_g'] - dataset['g']
dataset['d_dered_r'] = dataset['dered_r'] - dataset['r']
dataset['d_dered_i'] = dataset['dered_i'] - dataset['i']
dataset['d_dered_z'] = dataset['dered_z'] - dataset['z']
dataset['d_dered_rg'] = dataset['dered_r'] - dataset['dered_g']
dataset['d_dered_ig'] = dataset['dered_i'] - dataset['dered_g']
dataset['d_dered_zg'] = dataset['dered_z'] - dataset['dered_g']
dataset['d_dered_ri'] = dataset['dered_r'] - dataset['dered_i']
dataset['d_dered_rz'] = dataset['dered_r'] - dataset['dered_z']
dataset['d_dered_iz'] = dataset['dered_i'] - dataset['dered_z']
dataset['d_obs_det'] = dataset['nObserve'] - dataset['nDetect']

In [8]:
dataset.drop(['airmass_z', 'airmass_i', 'airmass_r', 'airmass_g', 'u', 'g', 'r', 'i', 'nDetect'], 
        axis=1, inplace=True)

## 데이터셋 분리

In [9]:
from sklearn.model_selection import train_test_split 

# train set
X = dataset.loc[ dataset.index < df_train_len , :]
X.drop(columns='class',inplace=True,axis=1)
y = dataset.loc[ dataset.index < df_train_len, 'class']
y.astype(int)

# test set
test = dataset.loc[ dataset.index >= df_train_len, :]
test.drop(columns='class', inplace=True,axis=1)

# train set split
SEED = 2020
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state = SEED)

## 모델 학습

In [10]:
from sklearn.metrics import accuracy_score 
from sklearn.metrics import mean_squared_error 


from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline

### sklearn.tree

In [11]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import ExtraTreeClassifier

In [12]:
df_parmas = {
    'max_features': 'sqrt',
    'random_state': 2020
}

et_parmas = {
    'max_features': 'sqrt',
    'random_state': 2020
}

In [13]:
dt_clf = DecisionTreeClassifier(**df_parmas)
dt_clf.fit(X_train,y_train)
p = dt_clf.predict(X_test)
print(accuracy_score(p,y_test))

0.8964583333333334


In [14]:
et_clf = ExtraTreeClassifier(**et_parmas)
et_clf.fit(X_train,y_train)
p = et_clf.predict(X_test)
print(accuracy_score(p,y_test))

0.8528958333333333


### sklearn.neighbors

In [15]:
from sklearn.neighbors import KNeighborsClassifier

In [16]:
knn_params = {
    'n_neighbors': 5,
    'n_jobs': -1
}

knn_clf=KNeighborsClassifier(**knn_params)
knn_clf.fit(X_train,y_train)
p = knn_clf.predict(X_test)
print(accuracy_score(p,y_test))

0.87128125


### sklearn.svm

In [17]:
from sklearn.svm import SVC

In [18]:
# SVC
svc_params = {
    'random_state': 2020
}

svc_clf = make_pipeline(StandardScaler(), SVC(**svc_params))
svc_clf.fit(X_train,y_train)
p = svc_clf.predict(X_test)
print(accuracy_score(p,y_test))

0.8991875


### sklearn.ensemble

In [19]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import GradientBoostingClassifier

In [20]:
# Random Forest parameters
rf_params = {
    'n_jobs': -1,
    'n_estimators': 100,
    'max_features' : 'sqrt',
    'verbose' : 0,
    'random_state': 2020
}

rf_clf = RandomForestClassifier(**rf_params)
rf_clf.fit(X_train,y_train)
p = rf_clf.predict(X_test)
print(accuracy_score(p,y_test))

0.93065625


In [21]:
# Extra Trees Parameters
ets_params = {
    'n_jobs' : -1,
    'n_estimators': 100,
    'max_features' : 'sqrt',
    'verbose' : 0,
    'random_state': 2020
}

ets_clf = ExtraTreesClassifier(**ets_params)
ets_clf.fit(X_train,y_train)
p = ets_clf.predict(X_test)
print(accuracy_score(p,y_test))

0.9194583333333334


In [22]:
# AdaBoost Parameters
ada_params = {
    'n_estimators': 100,
    'learning_rate': 0.1,
    'algorithm': 'SAMME.R',
    'random_state': 2020
}

ada_clf = AdaBoostClassifier(**ada_params)
ada_clf.fit(X_train,y_train)
p = ada_clf.predict(X_test)
print(accuracy_score(p,y_test))

0.87384375


In [23]:
# Gradient Bossting parameters
gb_params = {
    'n_estimators': 100,
    'learning_rate': 0.1,
    'max_features': 'sqrt',
    'verbose': 0,
    'random_state': 2020
}

gb_clf = GradientBoostingClassifier(**gb_params)
gb_clf.fit(X_train,y_train)
p = ada_clf.predict(X_test)
print(accuracy_score(p,y_test))

0.87384375


### xgboost

In [24]:
import xgboost as xg

In [25]:
# Xgboost
xgb_params = {
    'n_jobs' : -1, 
    'n_estimators': 100,
    'eta': 0.3, # learning_rate
    'booster': 'gbtree',
    'gamma': 0,
    'tree_method': 'auto',
    'objective': 'reg:squarederror',
    'random_state': 2020
}

xgb_clf = xg.XGBClassifier(**xgb_params)
xgb_clf.fit(X_train,y_train)
p = xgb_clf.predict(X_test)
print(accuracy_score(p,y_test))

0.9315625


## 앙상블

In [26]:
from sklearn.ensemble import StackingClassifier

### version 1

In [27]:
layer_one_estimators = [
    ('df_1', DecisionTreeClassifier(**df_parmas)),
    ('et_1', ExtraTreeClassifier(**et_parmas)),
    ('knn_1', KNeighborsClassifier(**knn_params)),
    ('svc_1', make_pipeline(StandardScaler(), SVC(**svc_params)))
]

layer_two_estimators = [
    ('rf_2', RandomForestClassifier(**rf_params)),
    ('ets_2', ExtraTreesClassifier(**ets_params)),
    ('ada_2', AdaBoostClassifier(**ada_params)),
    ('gb_2', GradientBoostingClassifier(**gb_params))
]

layer_final_estimator = xg.XGBClassifier(**xgb_params)

layer_two = StackingClassifier(estimators=layer_two_estimators, final_estimator=layer_final_estimator, n_jobs=-1)

stacking_clf = StackingClassifier(estimators=layer_one_estimators, final_estimator=layer_two, n_jobs=-1)

In [28]:
stacking_clf.fit(X_train,y_train)

StackingClassifier(estimators=[('df_1',
                                DecisionTreeClassifier(max_features='sqrt',
                                                       random_state=2020)),
                               ('et_1',
                                ExtraTreeClassifier(max_features='sqrt',
                                                    random_state=2020)),
                               ('knn_1', KNeighborsClassifier(n_jobs=-1)),
                               ('svc_1',
                                Pipeline(steps=[('standardscaler',
                                                 StandardScaler()),
                                                ('svc',
                                                 SVC(random_state=2020))]))],
                   final_estimator=StackingClassifier(estimat...
                                                                                    learning_rate=None,
                                                                    

In [29]:
p = stacking_clf.predict(X_test)
print('accuracy for stacking :', accuracy_score(y_test,p))

accuracy for stacking : 0.9203958333333333


### verison2

In [30]:
layer_one_estimators = [
    ('df_1', DecisionTreeClassifier(**df_parmas)),
    ('svc_1', make_pipeline(StandardScaler(), SVC(**svc_params)))
]

layer_two_estimators = [
    ('rf_2', RandomForestClassifier(**rf_params)),
    ('ets_2', ExtraTreesClassifier(**ets_params)),
]

layer_final_estimator = xg.XGBClassifier(**xgb_params)

layer_two = StackingClassifier(estimators=layer_two_estimators, final_estimator=layer_final_estimator, n_jobs=-1)

stacking_clf = StackingClassifier(estimators=layer_one_estimators, final_estimator=layer_two, n_jobs=-1)

In [31]:
stacking_clf.fit(X_train,y_train)

StackingClassifier(estimators=[('df_1',
                                DecisionTreeClassifier(max_features='sqrt',
                                                       random_state=2020)),
                               ('svc_1',
                                Pipeline(steps=[('standardscaler',
                                                 StandardScaler()),
                                                ('svc',
                                                 SVC(random_state=2020))]))],
                   final_estimator=StackingClassifier(estimators=[('rf_2',
                                                                   RandomForestClassifier(max_features='sqrt',
                                                                                          n_jobs=-1,
                                                                                          random_state=2020)),
                                                                  ('ets_2',
                        

In [32]:
p = stacking_clf.predict(X_test)
print('accuracy for stacking :', accuracy_score(y_test,p))

accuracy for stacking : 0.9084479166666667


### version3

In [33]:
layer_one_estimators = [
    ('ada',AdaBoostClassifier(**ada_params)),
    ('gb',GradientBoostingClassifier(**gb_params)),
    ('svc_1', make_pipeline(StandardScaler(), SVC(**svc_params)))
]

layer_two_estimators = [
    ('rf_2', RandomForestClassifier(**rf_params)),
    ('ets_2', ExtraTreesClassifier(**ets_params)),
]

layer_final_estimator = xg.XGBClassifier(**xgb_params)

layer_two = StackingClassifier(estimators=layer_two_estimators, final_estimator=layer_final_estimator, n_jobs=-1)

stacking_clf = StackingClassifier(estimators=layer_one_estimators, final_estimator=layer_two, n_jobs=-1)

In [34]:
stacking_clf.fit(X_train,y_train)

StackingClassifier(estimators=[('ada',
                                AdaBoostClassifier(learning_rate=0.1,
                                                   n_estimators=100,
                                                   random_state=2020)),
                               ('gb',
                                GradientBoostingClassifier(max_features='sqrt',
                                                           random_state=2020)),
                               ('svc_1',
                                Pipeline(steps=[('standardscaler',
                                                 StandardScaler()),
                                                ('svc',
                                                 SVC(random_state=2020))]))],
                   final_estimator=StackingClassifier(estimators=[('rf_2',
                                                                   RandomForestCl...
                                                                                  

In [35]:
p = stacking_clf.predict(X_test)
print('accuracy for stacking :', accuracy_score(y_test,p))

accuracy for stacking : 0.9245729166666666


### version4

In [39]:
estimators = [
    ('df', DecisionTreeClassifier(**df_parmas)),
    ('et', ExtraTreeClassifier(**et_parmas)),
    ('knn', KNeighborsClassifier(**knn_params)),
    ('rf', RandomForestClassifier(**rf_params)),
    ('ets',ExtraTreesClassifier(**ets_params)),
    ('ada',AdaBoostClassifier(**ada_params)),
    ('gb',GradientBoostingClassifier(**gb_params)),
    ('svc', make_pipeline(StandardScaler(), SVC(random_state=2020)))
]
final_estimator = xg.XGBClassifier(**xgb_params)

stacking_clf = StackingClassifier(estimators=estimators, final_estimator=final_estimator, n_jobs=-1)

In [40]:
stacking_clf.fit(X_train,y_train)

StackingClassifier(estimators=[('df',
                                DecisionTreeClassifier(max_features='sqrt',
                                                       random_state=2020)),
                               ('et',
                                ExtraTreeClassifier(max_features='sqrt',
                                                    random_state=2020)),
                               ('knn', KNeighborsClassifier(n_jobs=-1)),
                               ('rf',
                                RandomForestClassifier(max_features='sqrt',
                                                       n_jobs=-1,
                                                       random_state=2020)),
                               ('ets',
                                ExtraTreesClassifier(max_features='sqrt',
                                                     n_jobs=-1,
                                                     random_...
                                                 interac

In [41]:
p = stacking_clf.predict(X_test)
print('accuracy for stacking :', accuracy_score(y_test,p))

accuracy for stacking : 0.9327604166666666


In [42]:
# 0.93562 제출파일

stacking_clf.fit(X,y)
p = stacking_clf.predict(test)
submission = pd.read_csv('../input/dankook/sample_submission.csv')
submission['class'] = p
submission.to_csv('submission.csv',index=False, encoding='utf-8-sig')