<a href="https://colab.research.google.com/github/BaekKyunShin/Kaggle-Competition-Including-Dacon/blob/master/Novel_Writer_Classification/LightGBM.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# LightGBM

In [73]:
from google.colab import auth
auth.authenticate_user()

from google.colab import drive
drive.mount('/content/gdrive', force_remount=True)

Mounted at /content/gdrive


In [74]:
import pandas as pd
import numpy as np

# nltk
import nltk
from nltk.corpus import stopwords 

# 모델링
import lightgbm as gbm
from sklearn.feature_extraction.text import TfidfVectorizer,CountVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.metrics import log_loss

# 기타
import os
import re
import easydict
import gc

In [75]:
# 전역변수 설정
args = easydict.EasyDict({ 'chdir': '/content/gdrive/My Drive/colab/Dacon_Novel_Writer_Classification/',
                          'train_dir': 'open/train.csv', 
                          'test_dir': 'open/test_x.csv',
                          'submission_dir': 'open/sample_submission.csv',
                          'fianl_submission_dir': 'open/submission.csv',
                          'max_len': 100,
                          'epochs': 1,
                          'learning_rate': 0.01,
                          'wd': 1e-5,
                          'batch_size': 64,
                          'train': True})

# device 설정
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(device)

cuda


In [77]:
# 경로 설정
os.chdir(args.chdir)

# 파일 불러오기
train = pd.read_csv(args.train_dir, encoding='utf-8')
test = pd.read_csv(args.test_dir, encoding='utf-8')
sample_submission = pd.read_csv(args.submission_dir, encoding='utf-8')

train.drop('index', axis=1, inplace=True)
test.drop('index', axis=1, inplace=True)

In [78]:
train.head(2)

Unnamed: 0,text,author
0,"He was almost choking. There was so much, so m...",3
1,"“Your sister asked for it, I suppose?”",2


In [79]:
test.head(2)

Unnamed: 0,text
0,“Not at all. I think she is one of the most ch...
1,"""No,"" replied he, with sudden consciousness, ""..."


In [80]:
sample_submission.head(2)

Unnamed: 0,index,0,1,2,3,4
0,0,0,0,0,0,0
1,1,0,0,0,0,0


# Preprocessing

In [81]:
# 불용어
basic_stopwords = { "a", "about", "above", "after", "again", "against", "all", "am", "an", "and", "any", "are", "as", 
             "at", "be", "because", "been", "before", "being", "below", "between", "both", "but", "by", "could", 
             "did", "do", "does", "doing", "down", "during", "each", "few", "for", "from", "further", "had", "has", 
             "have", "having", "he", "he'd", "he'll", "he's", "her", "here", "here's", "hers", "herself", "him", "himself", 
             "his", "how", "how's", "i", "i'd", "i'll", "i'm", "i've", "if", "in", "into", "is", "it", "it's", "its", "itself", 
             "let's", "me", "more", "most", "my", "myself", "nor", "of", "on", "once", "only", "or", "other", "ought", "our", "ours", 
             "ourselves", "out", "over", "own", "same", "she", "she'd", "she'll", "she's", "should", "so", "some", "such", "than", "that", 
             "that's", "the", "their", "theirs", "them", "themselves", "then", "there", "there's", "these", "they", "they'd", "they'll", 
             "they're", "they've", "this", "those", "through", "to", "too", "under", "until", "up", "very", "was", "we", "we'd", "we'll", 
             "we're", "we've", "were", "what", "what's", "when", "when's", "where", "where's", "which", "while", "who", "who's", "whom", 
             "why", "why's", "with", "would", "you", "you'd", "you'll", "you're", "you've", "your", "yours", "yourself", "yourselves" }

nltk.download('stopwords')
nltk_stopwords = set(stopwords.words('english'))

final_stopwords = nltk_stopwords.union(basic_stopwords)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [82]:
# 부호를 제거해주는 함수
def alpha_num(text):
    return re.sub(r'[^A-Za-z0-9 ]', '', text)

# 불용어 제거해주는 함수
def remove_stopwords(text: str):
    final_text = []
    for i in text.split():
        if i.strip().lower() not in final_stopwords:
            final_text.append(i.strip())
    return " ".join(final_text)
    
# 전처리 적용
train['text']=train['text'].apply(alpha_num)
test['text']=test['text'].apply(alpha_num)
train['text'] = train['text'].str.lower()
test['text'] = test['text'].str.lower()
train['text'] = train['text'].apply(alpha_num).apply(remove_stopwords)
test['text'] = test['text'].apply(alpha_num).apply(remove_stopwords)

# Vectorizing

In [83]:
# Count Vectorizer
count_vec=CountVectorizer(ngram_range=(1,2), min_df=5, binary=True, max_features=3000) 
train_cv=count_vec.fit_transform(train['text'])
print("CountVectorized train dataset shape: ", train_cv.toarray().shape)

# CSR 형식으로 변경
X_cv = train_cv.tocsr().toarray()

# Test Data에도 동일하게 적용
test_cv = count_vec.transform(test['text'])
test_cv = test_cv.tocsr().toarray()

(54879, 3000)

In [87]:
# TF-IDF Vectorizer
tfidf_vec=TfidfVectorizer(ngram_range=(1,2), min_df=3, max_df=0.4, sublinear_tf=True, norm='l2', max_features=5500) 
train_tfidf=tfidf_vec.fit_transform(train['text'])
print("TF-IDF Vectorized train dataset shape: ", train_tfidf.toarray().shape)

# CSR 형식으로 변경
X_tfidf = train_tfidf.tocsr().toarray()

# Test Data에도 동일하게 적용
test_tfidf = tfidf_vec.transform(test['text'])
test_tfidf = test_tfidf.tocsr().toarray()



(54879, 5500)

In [89]:
y = train['author']
dtest = test['text']

In [90]:
Dparam = {'objective' : 'multiclass',
          'boosting_type': 'gbdt',
          'num_class': 5,
          'metric' : 'multi_logloss',
          #'max_bin':350,
          'max_depth':20,
          'min_child_weight': 8,
          'bagging_fraction':0.75,
          'feature_fraction':0.75,
          'lambda_l1':0.1,
          'lambda_l2':0.1,
          'num_leaves':31} 

In [91]:
print("Training Model")

folds = KFold(n_splits=5, shuffle=True, random_state=50001)
oof_preds = np.zeros((train.shape[0], 5))
sub_preds = np.zeros((test.shape[0], 5))

for n_fold, (trn_idx, val_idx) in enumerate(folds.split(train1)):
    dtrain =gbm.Dataset(train1[trn_idx], y[trn_idx])
    dval =gbm.Dataset(train1[val_idx], y[val_idx])
    m_gbm=gbm.train(params=Dparam,train_set=dtrain,num_boost_round=300,verbose_eval=50,valid_sets=[dtrain,dval],valid_names=['train','valid'])
    oof_preds[val_idx] = m_gbm.predict(train1[val_idx])
    sub_preds += m_gbm.predict(test1) / folds.n_splits
    print('Fold %2d log_loss : %.6f' % (n_fold + 1, log_loss(y.iloc[val_idx],oof_preds[val_idx])))
    del dtrain,dval
    gc.collect()
    
print('Full log_loss score %.6f' % log_loss(y, oof_preds))   


Training Model
[50]	train's multi_logloss: 1.08529	valid's multi_logloss: 1.10801
[100]	train's multi_logloss: 0.964401	valid's multi_logloss: 1.01106
[150]	train's multi_logloss: 0.895627	valid's multi_logloss: 0.959538
[200]	train's multi_logloss: 0.847641	valid's multi_logloss: 0.926687
[250]	train's multi_logloss: 0.810251	valid's multi_logloss: 0.903244
[300]	train's multi_logloss: 0.780516	valid's multi_logloss: 0.886156
Fold  1 log_loss : 0.886156
[50]	train's multi_logloss: 1.08003	valid's multi_logloss: 1.12726
[100]	train's multi_logloss: 0.958278	valid's multi_logloss: 1.03483
[150]	train's multi_logloss: 0.889107	valid's multi_logloss: 0.985649
[200]	train's multi_logloss: 0.840695	valid's multi_logloss: 0.953136
[250]	train's multi_logloss: 0.803812	valid's multi_logloss: 0.9304
[300]	train's multi_logloss: 0.773727	valid's multi_logloss: 0.91317
Fold  2 log_loss : 0.913170
[50]	train's multi_logloss: 1.08129	valid's multi_logloss: 1.1227
[100]	train's multi_logloss: 0.960

In [98]:
sample_submission[['0','1','2','3','4']] = sub_preds
sample_submission

Unnamed: 0,index,0,1,2,3,4
0,0,0.087913,0.743220,0.087455,0.065654,0.015758
1,1,0.267783,0.257381,0.036826,0.081835,0.356175
2,2,0.831855,0.044657,0.026703,0.011769,0.085016
3,3,0.043759,0.003249,0.537496,0.006582,0.408914
4,4,0.235752,0.193583,0.104647,0.209781,0.256237
...,...,...,...,...,...,...
19612,19612,0.006282,0.990532,0.000435,0.002100,0.000651
19613,19613,0.542395,0.011797,0.018213,0.005852,0.421744
19614,19614,0.034763,0.955440,0.001644,0.006246,0.001907
19615,19615,0.075968,0.738185,0.095894,0.073060,0.016893


In [99]:
sample_submission.to_csv(args.fianl_submission_dir, index=False, encoding = 'utf-8')