In [1]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline

In [2]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
from matplotlib import rcParams
import seaborn as sns

import warnings

In [3]:
pd.set_option('max_columns', 100)
pd.set_option('display.precision', 4)

rcParams['figure.figsize'] = (16, 8)
plt.style.use('fivethirtyeight')
sns.set(font_scale=2.5)

warnings.filterwarnings('ignore')

# 데이터 로드

In [4]:
df_train = pd.read_csv('../input/dankook/train.csv', index_col=0)
df_test = pd.read_csv('../input/dankook/test.csv', index_col=0)

## 학습/테스트 데이터 결합

In [5]:
dataset = pd.concat([df_train,df_test], axis=0)
df_train_len = len(df_train)

dataset.fillna(-1, inplace=True)

In [6]:
dataset['nObserve'] = dataset['nObserve'].apply(np.log1p)

In [7]:
dataset['d_dered_u'] = dataset['dered_u'] - dataset['u']
dataset['d_dered_g'] = dataset['dered_g'] - dataset['g']
dataset['d_dered_r'] = dataset['dered_r'] - dataset['r']
dataset['d_dered_i'] = dataset['dered_i'] - dataset['i']
dataset['d_dered_z'] = dataset['dered_z'] - dataset['z']
dataset['d_dered_rg'] = dataset['dered_r'] - dataset['dered_g']
dataset['d_dered_ig'] = dataset['dered_i'] - dataset['dered_g']
dataset['d_dered_zg'] = dataset['dered_z'] - dataset['dered_g']
dataset['d_dered_ri'] = dataset['dered_r'] - dataset['dered_i']
dataset['d_dered_rz'] = dataset['dered_r'] - dataset['dered_z']
dataset['d_dered_iz'] = dataset['dered_i'] - dataset['dered_z']
dataset['d_obs_det'] = dataset['nObserve'] - dataset['nDetect']

In [8]:
dataset.drop(['airmass_z', 'airmass_i', 'airmass_r', 'airmass_g', 'u', 'g', 'r', 'i', 'nDetect'], 
        axis=1, inplace=True)

## 데이터셋 분리

In [9]:
from sklearn.model_selection import train_test_split 

# train set
X = dataset.loc[ dataset.index < df_train_len , :]
X.drop(columns='class',inplace=True,axis=1)
y = dataset.loc[ dataset.index < df_train_len, 'class']

# test set
test = dataset.loc[ dataset.index >= df_train_len, :]
test.drop(columns='class', inplace=True,axis=1)

# train set split
SEED = 2020
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state = SEED)

## 모델 학습

In [10]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.ensemble import StackingClassifier
import xgboost as xg

from sklearn.metrics import accuracy_score 
from sklearn.metrics import mean_squared_error 


from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline

In [11]:
# Put in our parameters for said classifiers

# Random Forest parameters
rf_params = {
    'n_jobs': -1,
    'n_estimators': 100,
    'max_features' : 'sqrt',
    'verbose' : 0,
    'random_state': 2020
}

# Extra Trees Parameters
et_params = {
    'n_jobs' : -1,
    'n_estimators': 100,
    'max_features' : 'sqrt',
    'verbose' : 0,
    'random_state': 2020
}

# AdaBoost Parameters
ada_params = {
    'n_estimators': 100,
    'learning_rate': 0.1,
    'algorithm': 'SAMME.R',
    'random_state': 2020
}

# Gradient Bossting parameters
gb_params = {
    'n_estimators': 100,
    'learning_rate': 0.1,
    'max_features': 'sqrt',
    'verbose': 0,
    'random_state': 2020
}

# Xgboost
xgb_params = {
    'n_jobs' : -1, 
    'n_estimators': 100,
    'eta': 0.3, # learning_rate
    'booster': 'gbtree',
    'gamma': 0,
    'tree_method': 'auto',
    'objective': 'reg:squarederror',
    'random_state': 2020
}

# SVC
svc_params = {
    'random_state': 2020
}

In [12]:
estimators = [
    ('rf', RandomForestClassifier(**rf_params)),
    ('et',ExtraTreesClassifier(**et_params)),
    ('ada',AdaBoostClassifier(**ada_params)),
    ('gb',GradientBoostingClassifier(**gb_params)),
    ('svc', make_pipeline(StandardScaler(), SVC(random_state=2020)))
]

model = StackingClassifier(estimators=estimators, final_estimator=xg.XGBClassifier(**xgb_params), n_jobs=-1)

In [13]:
model.fit(X_train,y_train)

StackingClassifier(estimators=[('rf',
                                RandomForestClassifier(max_features='sqrt',
                                                       n_jobs=-1,
                                                       random_state=2020)),
                               ('et',
                                ExtraTreesClassifier(max_features='sqrt',
                                                     n_jobs=-1,
                                                     random_state=2020)),
                               ('ada',
                                AdaBoostClassifier(learning_rate=0.1,
                                                   n_estimators=100,
                                                   random_state=2020)),
                               ('gb',
                                GradientBoostingClassifier(max_features='sqrt',
                                                           random_state=2020)),
                               ('svc',
       

In [14]:
p = model.predict(X_test)
print('accuracy for stacking :', accuracy_score(y_test,p))

accuracy for stacking : 0.932875


In [20]:
p = model.predict(test)
submission = pd.read_csv('../input/dankook/sample_submission.csv')
submission['class'] = p
submission.head()

Unnamed: 0,id,class
0,320000,2.0
1,320001,0.0
2,320002,2.0
3,320003,0.0
4,320004,2.0


In [22]:
# 0.934525 제출파일

submission.to_csv('submission.csv',index=False, encoding='utf-8-sig')

In [23]:
model.fit(X,y)
p = model.predict(test)
submission = pd.read_csv('../input/dankook/sample_submission.csv')
submission['class'] = p
submission.head()

Unnamed: 0,id,class
0,320000,2.0
1,320001,0.0
2,320002,2.0
3,320003,0.0
4,320004,2.0


In [24]:
# 0.93555 제출파일

submission.to_csv('submission.csv',index=False, encoding='utf-8-sig')