<a href="https://colab.research.google.com/github/BaekKyunShin/Kaggle-Competition-Including-Dacon/blob/master/Novel_Writer_Classification/LightGBM.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# LightGBM

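Browser-side JavaScript snippet that clicks Colab's connect button every 10 minutes (`1000*600` ms) so the session is automatically reconnected:

```javascript
function ClickConnect() { var buttons = document.querySelectorAll("colab-dialog.yes-no-dialog paper-button#cancel"); buttons.forEach(function(btn) { btn.click(); }); console.log("10분마다 자동 재연결"); document.querySelector("#top-toolbar > colab-connect-button").click(); } setInterval(ClickConnect,1000*600)
```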

In [None]:
# Authenticate the Colab user, then mount Google Drive under /content/gdrive
# (force_remount=True remounts even if the drive is already mounted).
from google.colab import auth, drive

auth.authenticate_user()
drive.mount('/content/gdrive', force_remount=True)

Mounted at /content/gdrive


In [None]:
import pandas as pd
import numpy as np

# nltk
import nltk
from nltk.corpus import stopwords 

# 모델링
import lightgbm as gbm
from sklearn.feature_extraction.text import TfidfVectorizer,CountVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.metrics import log_loss

# 기타
import os
import re
import easydict
import gc

In [None]:
# Global configuration shared by every cell below.
# NOTE: the key 'fianl_submission_dir' is misspelled, but the submission cell
# reads it under exactly this name, so it is kept as-is.
args = easydict.EasyDict(dict(
    # Paths: `chdir` is the working directory; the others are relative to it.
    chdir='/content/gdrive/My Drive/colab/Dacon_Novel_Writer_Classification/',
    train_dir='open/train.csv',
    test_dir='open/test_x.csv',
    submission_dir='open/sample_submission.csv',
    fianl_submission_dir='open/submission.csv',

    # Feature extraction / training settings.
    vectorizer='count',
    num_boost_round=20000,
    early_stopping_rounds=50,
    max_len=100,
    epochs=1,
    learning_rate=0.01,
    max_features=3000,
    wd=1e-5,
    batch_size=64,
    # Cross-validation splitter (5 shuffled folds, fixed seed).
    folds=KFold(n_splits=5, shuffle=True, random_state=1991),
))

In [None]:
# Switch to the project directory so the relative paths in `args` resolve.
os.chdir(args.chdir)

# Load the files; the 'index' column is dropped from train and test right away.
train = pd.read_csv(args.train_dir, encoding='utf-8').drop(columns='index')
test = pd.read_csv(args.test_dir, encoding='utf-8').drop(columns='index')
sample_submission = pd.read_csv(args.submission_dir, encoding='utf-8')

In [None]:
# Preview the first two rows of the training frame.
train.head(2)

Unnamed: 0,text,author
0,"He was almost choking. There was so much, so m...",3
1,"“Your sister asked for it, I suppose?”",2


In [None]:
# Preview the first two rows of the test frame.
test.head(2)

Unnamed: 0,text
0,“Not at all. I think she is one of the most ch...
1,"""No,"" replied he, with sudden consciousness, ""..."


In [None]:
# Preview the first two rows of the submission template.
sample_submission.head(2)

Unnamed: 0,index,0,1,2,3,4
0,0,0,0,0,0,0
1,1,0,0,0,0,0


# Preprocessing

In [None]:
# Stop words: NLTK's English list combined with a hand-written list.
nltk.download('stopwords')
nltk_stopwords = set(stopwords.words('english'))

basic_stopwords = {
    "a", "about", "above", "after", "again", "against", "all", "am", "an", "and", "any", "are", "as",
    "at", "be", "because", "been", "before", "being", "below", "between", "both", "but", "by", "could",
    "did", "do", "does", "doing", "down", "during", "each", "few", "for", "from", "further", "had", "has",
    "have", "having", "he", "he'd", "he'll", "he's", "her", "here", "here's", "hers", "herself", "him", "himself",
    "his", "how", "how's", "i", "i'd", "i'll", "i'm", "i've", "if", "in", "into", "is", "it", "it's", "its", "itself",
    "let's", "me", "more", "most", "my", "myself", "nor", "of", "on", "once", "only", "or", "other", "ought", "our", "ours",
    "ourselves", "out", "over", "own", "same", "she", "she'd", "she'll", "she's", "should", "so", "some", "such", "than", "that",
    "that's", "the", "their", "theirs", "them", "themselves", "then", "there", "there's", "these", "they", "they'd", "they'll",
    "they're", "they've", "this", "those", "through", "to", "too", "under", "until", "up", "very", "was", "we", "we'd", "we'll",
    "we're", "we've", "were", "what", "what's", "when", "when's", "where", "where's", "which", "while", "who", "who's", "whom",
    "why", "why's", "with", "would", "you", "you'd", "you'll", "you're", "you've", "your", "yours", "yourself", "yourselves",
}

# Union of both sources; this is the set consulted when filtering tokens.
final_stopwords = nltk_stopwords | basic_stopwords

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
# Strip punctuation and other symbols.
def alpha_num(text):
    """Return `text` with every character except ASCII letters, digits and spaces removed."""
    disallowed = re.compile(r'[^A-Za-z0-9 ]')
    return disallowed.sub('', text)

# Remove stop words.
def remove_stopwords(text: str):
    """Drop every whitespace-separated token whose lowercase form is in `final_stopwords`.

    The surviving tokens are returned joined by single spaces.
    """
    kept_tokens = [
        token.strip()
        for token in text.split()
        if token.strip().lower() not in final_stopwords
    ]
    return " ".join(kept_tokens)
    
# Apply preprocessing: strip symbols, lowercase, then drop stop words.
# alpha_num is applied once per frame; a second pass over text that is already
# letters/digits/spaces changes nothing, so the result is the same as before.
# NOTE: because alpha_num removes apostrophes before stop words are filtered,
# contraction entries in the stop-word set (e.g. "he'd") can never match here.
train['text'] = train['text'].apply(alpha_num).str.lower().apply(remove_stopwords)
test['text'] = test['text'].apply(alpha_num).str.lower().apply(remove_stopwords)

In [None]:
# Add a word-count feature: the number of whitespace-separated tokens in each
# (already preprocessed) text. Computed with vectorised string methods instead
# of a Python-level iterrows() loop; the resulting values are the same.
train['num_words'] = train['text'].str.split().str.len()
test['num_words'] = test['text'].str.split().str.len()

# Vectorizing

In [None]:
# Count Vectorizer: binary uni-/bi-gram indicators, vocabulary capped at args.max_features.
count_vec = CountVectorizer(ngram_range=(1,2), min_df=5, binary=True, max_features=args.max_features)

# Fit on the training text, densify into a DataFrame and append the word-count column.
train_counts = count_vec.fit_transform(train['text'])
train_cv = pd.DataFrame(train_counts.tocsr().toarray())
train_cv['num_words'] = train['num_words']
X_cv = train_cv
print("CountVectorized train dataset shape: ", X_cv.shape)

# Apply the already-fitted vectorizer to the test text in the same way.
test_counts = count_vec.transform(test['text'])
test_cv = pd.DataFrame(test_counts.tocsr().toarray())
test_cv['num_words'] = test['num_words']
print("CountVectorized test dataset shape: ", test_cv.shape)

CountVectorized train dataset shape:  (54879, 3001)
CountVectorized test dataset shape:  (19617, 3001)


In [None]:
# TF-IDF Vectorizer: uni-/bi-grams, sublinear tf, L2-normalised rows,
# vocabulary capped at args.max_features.
tfidf_vec=TfidfVectorizer(ngram_range=(1,2), min_df=3, max_df=0.4, sublinear_tf=True, norm='l2', max_features=args.max_features) 
train_tfidf=tfidf_vec.fit_transform(train['text'])
# Read the shape straight from the sparse matrix; densifying the whole matrix
# just to print its shape allocates a full dense copy for nothing.
print("TF-IDF Vectorized train dataset shape: ", train_tfidf.shape)

# Convert to a dense array (via CSR).
X_tfidf = train_tfidf.tocsr().toarray()

# Apply the already-fitted vectorizer to the test data in the same way.
test_tfidf = tfidf_vec.transform(test['text'])
test_tfidf = test_tfidf.tocsr().toarray()

TF-IDF Vectorized train dataset shape:  (54879, 3000)


In [None]:
# Target labels, and the test matrix that matches the configured vectorizer.
y = train['author']
if args.vectorizer == 'tf-idf':
    dtest = test_tfidf
elif args.vectorizer == 'count':
    dtest = test_cv
else:
    # Fail here with a clear message instead of leaving `dtest` undefined,
    # which would only surface later as a NameError at prediction time.
    raise ValueError("Unknown vectorizer: %r (expected 'count' or 'tf-idf')" % (args.vectorizer,))

# Modeling

In [None]:
# LightGBM parameters for the multiclass model.
# No learning_rate is set here and args.learning_rate is not passed in, so the
# library's default learning rate applies.
Dparam = {'objective' : 'multiclass',
          'boosting_type': 'gbdt',
          'num_class': len(np.unique(y)),  # one output column per distinct label in y
          'metric' : 'multi_logloss',
          #'max_bin':350,
          'max_depth':25,
          'min_child_weight': 8,
          'bagging_fraction':0.75,
          # LightGBM only applies bagging_fraction when bagging_freq is non-zero;
          # without it the 0.75 row subsampling above is silently ignored.
          'bagging_freq': 1,
          'feature_fraction':0.75,
          'lambda_l1':0.3,
          'lambda_l2':0.7,
          'num_leaves':31} 

In [None]:
print("Training Model...")

# Train on the matrix produced by the configured vectorizer so that the
# training features and `dtest` come from the same feature space. (Always
# training on X_cv would mismatch the TF-IDF test matrix.)
if args.vectorizer == 'tf-idf':
    X_train = pd.DataFrame(X_tfidf)  # wrap the ndarray so .iloc row selection works below
elif args.vectorizer == 'count':
    X_train = X_cv
else:
    raise ValueError("Unknown vectorizer: %r (expected 'count' or 'tf-idf')" % (args.vectorizer,))

# Width of the prediction arrays follows the model's class count instead of a literal 5.
n_classes = Dparam['num_class']

FOLDS = args.folds.split(X_train)
oof_preds = np.zeros((train.shape[0], n_classes))  # out-of-fold predictions for every training row
preds = np.zeros((test.shape[0], n_classes))       # test predictions averaged over the folds

for n_fold, (trn_idx, val_idx) in enumerate(FOLDS):
    # The splitter yields positional indices, so rows and labels are both selected with .iloc.
    dtrain = gbm.Dataset(X_train.iloc[trn_idx], y.iloc[trn_idx])
    dval = gbm.Dataset(X_train.iloc[val_idx], y.iloc[val_idx])
    # NOTE(review): LightGBM 4.0 removed the early_stopping_rounds / verbose_eval
    # arguments of train() in favour of callbacks - confirm the installed version.
    m_gbm = gbm.train(params=Dparam, train_set=dtrain, num_boost_round=args.num_boost_round, valid_sets=[dtrain, dval], valid_names=['train', 'valid'], early_stopping_rounds=args.early_stopping_rounds, verbose_eval=50)
    oof_preds[val_idx] = m_gbm.predict(X_train.iloc[val_idx])
    # Each fold contributes an equal share to the final test prediction.
    preds += m_gbm.predict(dtest) / args.folds.n_splits
    print('Fold %2d log_loss : %.6f' % (n_fold + 1, log_loss(y.iloc[val_idx], oof_preds[val_idx])))
    # Free the per-fold datasets before building the next ones.
    del dtrain, dval
    gc.collect()
    
print('Full log_loss score %.6f' % log_loss(y, oof_preds))   


Training Model...
Training until validation scores don't improve for 50 rounds.
[50]	train's multi_logloss: 1.06711	valid's multi_logloss: 1.10508
[100]	train's multi_logloss: 0.942118	valid's multi_logloss: 1.00429
[150]	train's multi_logloss: 0.871029	valid's multi_logloss: 0.952655
[200]	train's multi_logloss: 0.821347	valid's multi_logloss: 0.919895
[250]	train's multi_logloss: 0.783282	valid's multi_logloss: 0.896911
[300]	train's multi_logloss: 0.752728	valid's multi_logloss: 0.879723
[350]	train's multi_logloss: 0.727708	valid's multi_logloss: 0.867085
[400]	train's multi_logloss: 0.706204	valid's multi_logloss: 0.856929
[450]	train's multi_logloss: 0.687668	valid's multi_logloss: 0.849185
[500]	train's multi_logloss: 0.67136	valid's multi_logloss: 0.84289
[550]	train's multi_logloss: 0.657109	valid's multi_logloss: 0.838286
[600]	train's multi_logloss: 0.643933	valid's multi_logloss: 0.834457
[650]	train's multi_logloss: 0.632136	valid's multi_logloss: 0.831422
[700]	train's mu

# Submission

In [None]:
# Write the test predictions into the five per-class columns ('0'..'4') of the
# submission template, then display the frame.
class_columns = [str(label) for label in range(5)]
sample_submission[class_columns] = preds
sample_submission

Unnamed: 0,index,0,1,2,3,4
0,0,0.017600,0.913231,0.037650,0.028476,0.003043
1,1,0.125917,0.173701,0.026870,0.053761,0.619751
2,2,0.859690,0.010200,0.008929,0.001569,0.119612
3,3,0.025565,0.000393,0.834284,0.001721,0.138037
4,4,0.340852,0.111684,0.013465,0.114802,0.419197
...,...,...,...,...,...,...
19612,19612,0.003693,0.995977,0.000015,0.000269,0.000046
19613,19613,0.394639,0.006953,0.004664,0.008440,0.585304
19614,19614,0.024630,0.971721,0.000435,0.002954,0.000260
19615,19615,0.008659,0.964093,0.017759,0.007966,0.001523


In [None]:
# Save the filled-in submission as UTF-8 CSV without the DataFrame index.
# (The config key really is spelled 'fianl_submission_dir'.)
sample_submission.to_csv(
    args.fianl_submission_dir,
    encoding='utf-8',
    index=False,
)