<a href="https://colab.research.google.com/github/BaekKyunShin/Kaggle-Competition-Including-Dacon/blob/master/Novel_Writer_Classification/LightGBM.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# LightGBM

In [19]:
# Colab-only setup: authenticate the session, then mount Google Drive
# at /content/gdrive (force_remount=True is passed on every run).
from google.colab import auth, drive

auth.authenticate_user()
drive.mount('/content/gdrive', force_remount=True)

Mounted at /content/gdrive


In [20]:
import pandas as pd
import numpy as np

# nltk
import nltk
from nltk.corpus import stopwords 

# Modeling
import lightgbm as gbm
from sklearn.feature_extraction.text import TfidfVectorizer,CountVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.metrics import log_loss

# Miscellaneous
import os
import re
import easydict
import gc

In [21]:
# Global settings.
# Paths other than 'chdir' are relative and resolve against 'chdir'
# once the working directory has been switched to it.
args = easydict.EasyDict({
    'chdir': '/content/gdrive/My Drive/colab/Dacon_Novel_Writer_Classification/',
    'train_dir': 'open/train.csv',
    'test_dir': 'open/test_x.csv',
    'submission_dir': 'open/sample_submission.csv',
    # NOTE: the key is misspelled ("fianl"), but the cell that writes the
    # submission reads it under this exact spelling; rename both together.
    'fianl_submission_dir': 'open/submission.csv',
    # The next five entries and 'train' are not read by any cell shown in
    # this notebook.
    'max_len': 100,
    'epochs': 1,
    'learning_rate': 0.01,
    'wd': 1e-5,
    'batch_size': 64,
    # Cross-validation splitter shared by the training loop (5 shuffled folds, fixed seed).
    'folds': KFold(n_splits=5, shuffle=True, random_state=1991),
    'train': True,
})

In [22]:
# Switch to the project directory so the relative paths in `args` resolve
os.chdir(args.chdir)

# Load the CSV files; the 'index' column of train/test is dropped right after reading
train = pd.read_csv(args.train_dir, encoding='utf-8').drop('index', axis=1)
test = pd.read_csv(args.test_dir, encoding='utf-8').drop('index', axis=1)
sample_submission = pd.read_csv(args.submission_dir, encoding='utf-8')

In [23]:
# Preview the first two training rows
train.head(2)

Unnamed: 0,text,author
0,"He was almost choking. There was so much, so m...",3
1,"“Your sister asked for it, I suppose?”",2


In [24]:
# Preview the first two test rows
test.head(2)

Unnamed: 0,text
0,“Not at all. I think she is one of the most ch...
1,"""No,"" replied he, with sudden consciousness, ""..."


In [25]:
# Preview the first two rows of the submission template
sample_submission.head(2)

Unnamed: 0,index,0,1,2,3,4
0,0,0,0,0,0,0
1,1,0,0,0,0,0


# Preprocessing

In [26]:
# Stop words: a hand-written English list (lowercase, contractions included),
# merged further down with NLTK's English stop-word list.
basic_stopwords = { "a", "about", "above", "after", "again", "against", "all", "am", "an", "and", "any", "are", "as", 
             "at", "be", "because", "been", "before", "being", "below", "between", "both", "but", "by", "could", 
             "did", "do", "does", "doing", "down", "during", "each", "few", "for", "from", "further", "had", "has", 
             "have", "having", "he", "he'd", "he'll", "he's", "her", "here", "here's", "hers", "herself", "him", "himself", 
             "his", "how", "how's", "i", "i'd", "i'll", "i'm", "i've", "if", "in", "into", "is", "it", "it's", "its", "itself", 
             "let's", "me", "more", "most", "my", "myself", "nor", "of", "on", "once", "only", "or", "other", "ought", "our", "ours", 
             "ourselves", "out", "over", "own", "same", "she", "she'd", "she'll", "she's", "should", "so", "some", "such", "than", "that", 
             "that's", "the", "their", "theirs", "them", "themselves", "then", "there", "there's", "these", "they", "they'd", "they'll", 
             "they're", "they've", "this", "those", "through", "to", "too", "under", "until", "up", "very", "was", "we", "we'd", "we'll", 
             "we're", "we've", "were", "what", "what's", "when", "when's", "where", "where's", "which", "while", "who", "who's", "whom", 
             "why", "why's", "with", "would", "you", "you'd", "you'll", "you're", "you've", "your", "yours", "yourself", "yourselves" }

# Download the NLTK stop-word corpus, then load its English list as a set
nltk.download('stopwords')
nltk_stopwords = set(stopwords.words('english'))

# Union of both lists; this is the set the stop-word filter checks against
final_stopwords = nltk_stopwords.union(basic_stopwords)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [27]:
# Strip punctuation and other symbols
def alpha_num(text):
    """Return `text` with every character except ASCII letters, digits and spaces removed."""
    disallowed = r'[^A-Za-z0-9 ]'
    return re.sub(disallowed, '', text)

# Remove stop words
def remove_stopwords(text: str):
    """Drop whitespace-separated tokens whose lowercase form is in `final_stopwords`.

    Tokens that survive keep their original casing and are re-joined with
    single spaces.
    """
    kept_tokens = [
        token.strip()
        for token in text.split()
        if token.strip().lower() not in final_stopwords
    ]
    return " ".join(kept_tokens)
    
# Apply preprocessing: strip non-alphanumerics, lowercase, then drop stop words.
# alpha_num is idempotent (once the symbols are gone a second pass changes
# nothing), so one pass per column gives the same text as running it twice.
# NOTE(review): apostrophes are stripped before stop words are matched, so
# contraction entries such as "he'd" in the stop-word set cannot match the
# resulting "hed" tokens. Left as-is to keep the features unchanged.
train['text'] = train['text'].apply(alpha_num).str.lower().apply(remove_stopwords)
test['text'] = test['text'].apply(alpha_num).str.lower().apply(remove_stopwords)

# Vectorizing

In [28]:
# Count Vectorizer: binary presence features over unigrams and bigrams,
# keeping at most 3000 terms that occur in at least 5 documents.
count_vec = CountVectorizer(ngram_range=(1, 2), min_df=5, binary=True, max_features=3000)
train_cv = count_vec.fit_transform(train['text'])
# The sparse matrix already exposes .shape; no need to build a dense copy just to print it.
print("CountVectorized train dataset shape: ", train_cv.shape)

# Convert to a dense ndarray (the training loop indexes X_cv with fold index arrays)
X_cv = train_cv.toarray()

# Apply the same fitted vocabulary to the test data
test_cv = count_vec.transform(test['text']).toarray()

CountVectorized train dataset shape:  (54879, 3000)


In [29]:
# TF-IDF Vectorizer: unigrams and bigrams, sublinear tf, L2-normalised rows,
# keeping at most 5500 terms with document frequency between 3 docs and 40%.
tfidf_vec = TfidfVectorizer(ngram_range=(1, 2), min_df=3, max_df=0.4, sublinear_tf=True, norm='l2', max_features=5500)
train_tfidf = tfidf_vec.fit_transform(train['text'])
# The sparse matrix already exposes .shape; no need to build a dense copy just to print it.
print("TF-IDF Vectorized train dataset shape: ", train_tfidf.shape)

# Convert to a dense ndarray
X_tfidf = train_tfidf.toarray()

# Apply the same fitted vocabulary and IDF weights to the test data
test_tfidf = tfidf_vec.transform(test['text']).toarray()

TF-IDF Vectorized train dataset shape:  (54879, 5500)


In [30]:
# y: author labels used as the training target.
# dtest: the cleaned test text (not referenced by the cells shown in this notebook).
y, dtest = train['author'], test['text']

# Modeling

In [31]:
# LightGBM parameters for the multiclass model.
# NOTE(review): per the LightGBM parameter docs, bagging_fraction only takes
# effect when bagging_freq is non-zero; no bagging_freq is set here, so confirm
# whether row subsampling was intended.
Dparam = dict(
    objective='multiclass',
    boosting_type='gbdt',
    num_class=len(np.unique(y)),  # one class per distinct label in y
    metric='multi_logloss',
    # max_bin=350,
    max_depth=20,
    min_child_weight=8,
    bagging_fraction=0.75,
    feature_fraction=0.75,
    lambda_l1=0.1,
    lambda_l2=0.1,
    num_leaves=31,
)

In [32]:
print("Training Model")

# K-fold training on the count-vectorised features.
# oof_preds collects out-of-fold class probabilities for every training row;
# preds accumulates the test probabilities averaged over the folds.
FOLDS = args.folds.split(X_cv)
n_classes = Dparam['num_class']  # keep array widths in sync with the model output
oof_preds = np.zeros((train.shape[0], n_classes))
preds = np.zeros((test.shape[0], n_classes))

for n_fold, (trn_idx, val_idx) in enumerate(FOLDS):
    # The fold indices are positional, so select labels with .iloc
    dtrain = gbm.Dataset(X_cv[trn_idx], y.iloc[trn_idx])
    dval = gbm.Dataset(X_cv[val_idx], y.iloc[val_idx])
    m_gbm = gbm.train(params=Dparam, train_set=dtrain, num_boost_round=500,
                      valid_sets=[dtrain, dval], valid_names=['train', 'valid'],
                      early_stopping_rounds=50, verbose_eval=50)
    oof_preds[val_idx] = m_gbm.predict(X_cv[val_idx])
    # Average over folds; the splitter lives in args.folds (no standalone `folds` name exists)
    preds += m_gbm.predict(test_cv) / args.folds.n_splits
    print('Fold %2d log_loss : %.6f' % (n_fold + 1, log_loss(y.iloc[val_idx], oof_preds[val_idx])))
    # Release the per-fold datasets before the next iteration
    del dtrain, dval
    gc.collect()

print('Full log_loss score %.6f' % log_loss(y, oof_preds))


Training Model
[50]	train's multi_logloss: 1.08529	valid's multi_logloss: 1.10801
[100]	train's multi_logloss: 0.964401	valid's multi_logloss: 1.01106
[150]	train's multi_logloss: 0.895627	valid's multi_logloss: 0.959538
[200]	train's multi_logloss: 0.847641	valid's multi_logloss: 0.926687
[250]	train's multi_logloss: 0.810251	valid's multi_logloss: 0.903244
[300]	train's multi_logloss: 0.780516	valid's multi_logloss: 0.886156
[350]	train's multi_logloss: 0.755455	valid's multi_logloss: 0.872705
[400]	train's multi_logloss: 0.734003	valid's multi_logloss: 0.861897
[450]	train's multi_logloss: 0.7152	valid's multi_logloss: 0.853349
[500]	train's multi_logloss: 0.698816	valid's multi_logloss: 0.846342
Fold  1 log_loss : 0.846342
[50]	train's multi_logloss: 1.08003	valid's multi_logloss: 1.12726
[100]	train's multi_logloss: 0.958278	valid's multi_logloss: 1.03483
[150]	train's multi_logloss: 0.889107	valid's multi_logloss: 0.985649
[200]	train's multi_logloss: 0.840695	valid's multi_loglo

# Submission

In [33]:
# Write the fold-averaged test probabilities into the class columns '0'..'4'
sample_submission[[str(label) for label in range(5)]] = preds
sample_submission

Unnamed: 0,index,0,1,2,3,4
0,0,0.038539,0.884476,0.038047,0.032532,0.006406
1,1,0.198288,0.237068,0.032878,0.075781,0.455985
2,2,0.856151,0.028874,0.019032,0.006149,0.089794
3,3,0.049358,0.000762,0.724488,0.001588,0.223805
4,4,0.257951,0.186101,0.051453,0.188975,0.315519
...,...,...,...,...,...,...
19612,19612,0.003735,0.995309,0.000091,0.000670,0.000195
19613,19613,0.505713,0.003807,0.008349,0.002868,0.479263
19614,19614,0.025916,0.968958,0.000621,0.003894,0.000611
19615,19615,0.034191,0.874834,0.053031,0.030922,0.007023


In [34]:
# Save the submission file (the config key really is spelled 'fianl_submission_dir')
sample_submission.to_csv(
    args.fianl_submission_dir,
    index=False,
    encoding='utf-8',
)