# XGBoost

In [1]:
import pandas as pd
from xgboost import XGBClassifier

In [2]:
# Load the training and test sets from the local data directory.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in ('./data/train.csv', './data/test.csv')
)

In [3]:
# Preview the first rows of the training data.
train.head()

Unnamed: 0,index,quality,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,type
0,0,5,5.6,0.695,0.06,6.8,0.042,9.0,84.0,0.99432,3.44,0.44,10.2,white
1,1,5,8.8,0.61,0.14,2.4,0.067,10.0,42.0,0.9969,3.19,0.59,9.5,red
2,2,5,7.9,0.21,0.39,2.0,0.057,21.0,138.0,0.99176,3.05,0.52,10.9,white
3,3,6,7.0,0.21,0.31,6.0,0.046,29.0,108.0,0.9939,3.26,0.5,10.8,white
4,4,6,7.8,0.4,0.26,9.5,0.059,32.0,178.0,0.9955,3.04,0.43,10.9,white


In [4]:
# One-hot encoding of the non-numeric columns (the `type` column in the preview).
# NOTE(review): the `index` column is kept as a feature here; it looks like a
# row identifier -- confirm whether it should be dropped before training.
train_one = pd.get_dummies(train)

# Encode the test set, then align it to the training feature columns (every
# training column except the `quality` target). get_dummies only emits dummy
# columns for categories present in the frame it is given, so without this
# alignment a category missing from one split would leave the two frames with
# different columns and prediction would fail. Columns absent from the test
# set are filled with 0; columns unknown to the training set are discarded.
test_one = pd.get_dummies(test).reindex(
    columns=train_one.columns.drop('quality'),
    fill_value=0,
)

train_one.head()

Unnamed: 0,index,quality,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,type_red,type_white
0,0,5,5.6,0.695,0.06,6.8,0.042,9.0,84.0,0.99432,3.44,0.44,10.2,0,1
1,1,5,8.8,0.61,0.14,2.4,0.067,10.0,42.0,0.9969,3.19,0.59,9.5,1,0
2,2,5,7.9,0.21,0.39,2.0,0.057,21.0,138.0,0.99176,3.05,0.52,10.9,0,1
3,3,6,7.0,0.21,0.31,6.0,0.046,29.0,108.0,0.9939,3.26,0.5,10.8,0,1
4,4,6,7.8,0.4,0.26,9.5,0.059,32.0,178.0,0.9955,3.04,0.43,10.9,0,1


In [5]:
# Create the model object: an XGBClassifier built with no constructor arguments.
model = XGBClassifier()

In [6]:
# Target vector: the `quality` column.
y = train_one['quality']
# Feature matrix: every remaining column of the encoded training frame.
x = train_one.drop(columns='quality')

In [7]:
# Train the model on the feature matrix `x` and the target `y`.
# NOTE(review): newer XGBoost releases reportedly require class labels encoded
# as 0..n_classes-1; the raw `quality` values are passed as-is here -- confirm
# against the installed version.
model.fit(x, y)





XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.300000012, max_delta_step=0, max_depth=6,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=100, n_jobs=16, num_parallel_tree=1,
              objective='multi:softprob', random_state=0, reg_alpha=0,
              reg_lambda=1, scale_pos_weight=None, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None)

In [8]:
# Predict quality for the encoded test data.
pred = model.predict(test_one)

In [9]:
# Fill the sample submission's `quality` column with the predictions,
# save it as xgb_pred.csv (without the row index) and preview the result.
submission = pd.read_csv('data/sample_submission.csv').assign(quality=pred)
submission.to_csv('xgb_pred.csv', index=False)
submission.head()

Unnamed: 0,index,quality
0,0,6
1,1,5
2,2,5
3,3,5
4,4,7


# LightGBM (LGBM)

In [10]:
from lightgbm import LGBMClassifier

# Define the model: `model` is rebound to an LGBMClassifier built with no
# constructor arguments.
model = LGBMClassifier()

In [11]:
# Train the model on the feature matrix `x` and the target `y`.
model.fit(x, y)

LGBMClassifier()

In [12]:
# Predict quality for the encoded test data.
pred = model.predict(test_one)

In [14]:
# Wrap the predictions in a single-column frame named `quality` and preview it.
result = pd.DataFrame(pred, columns=['quality'])
result.head()

Unnamed: 0,quality
0,5
1,5
2,5
3,5
4,7


# Stratified k-fold

In [23]:
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score

# Split the training data into train/validation folds with stratified k-fold.
# No shuffle argument is passed, so the splitter's default fold assignment is used.
skf = StratifiedKFold(n_splits=5)

# Validation accuracy of each fold, in fold order.
fold_scores = []

for cnt, (train_idx, valid_idx) in enumerate(skf.split(x, y), start=1):
    train_data = train_one.iloc[train_idx]
    valid_data = train_one.iloc[valid_idx]

    # Training-fold features and target.
    train_x = train_data.drop('quality', axis=1)
    train_y = train_data['quality']

    # Validation-fold features and target.
    valid_x = valid_data.drop('quality', axis=1)
    valid_y = valid_data['quality']

    # A fresh LGBM model per fold, so no state carries over between folds.
    model = LGBMClassifier()
    model.fit(train_x, train_y)

    # Predict quality on the validation fold.
    pred = model.predict(valid_x)

    # Score the fold once (ground truth first, predictions second) and reuse
    # the value for both the report line and the running list.
    fold_acc = accuracy_score(valid_y, pred)
    fold_scores.append(fold_acc)
    print(cnt, ' 번째 모델 정확도: ', fold_acc)

# Average over the folds that actually ran instead of a hard-coded 5, so the
# mean stays correct if n_splits is changed.
acc = sum(fold_scores)
print('모델 정확도 평균: ', acc / len(fold_scores))

1  번째 모델 정확도:  0.4536363636363636
2  번째 모델 정확도:  0.5790909090909091
3  번째 모델 정확도:  0.6196542311191993
4  번째 모델 정확도:  0.6105550500454959
5  번째 모델 정확도:  0.5959963603275705
모델 정확도 평균:  0.5717865828439076


# Voting Classifier

In [24]:
from sklearn.ensemble import VotingClassifier
from sklearn.ensemble import RandomForestClassifier

In [27]:
# Create the base estimators for the ensemble. The LightGBM and XGBoost models
# are built with no constructor arguments; the random forest gets a fixed seed
# so that re-running the notebook builds the same forest (the value 42 itself
# is arbitrary).
LGBM = LGBMClassifier()
XGB = XGBClassifier()
RF = RandomForestClassifier(random_state=42)

# Combine the three estimators; voting='soft' asks scikit-learn to combine the
# estimators' predicted class probabilities rather than their hard label votes.
VC = VotingClassifier(
    estimators=[('rf', RF), ('xgb', XGB), ('lgbm', LGBM)],
    voting='soft',
)

In [33]:
# Train the voting ensemble on the feature matrix `x` and the target `y`.
VC.fit(x, y)





VotingClassifier(estimators=[('rf', RandomForestClassifier()),
                             ('xgb',
                              XGBClassifier(base_score=None, booster=None,
                                            colsample_bylevel=None,
                                            colsample_bynode=None,
                                            colsample_bytree=None, gamma=None,
                                            gpu_id=None, importance_type='gain',
                                            interaction_constraints=None,
                                            learning_rate=None,
                                            max_delta_step=None, max_depth=None,
                                            min_child_weight=None, missing=nan,
                                            monotone_constraints=None,
                                            n_estimators=100, n_jobs=None,
                                            num_parallel_tree=None,
                 

In [34]:
# Predict test-set quality with the voting ensemble, then show the first
# predictions as a frame.
vc_labels = VC.predict(test_one)
pred = pd.DataFrame(vc_labels)
pred.head()

Unnamed: 0,0
0,6
1,5
2,5
3,5
4,7
