# 데모

## 라이브러리 import 및 설정

In [None]:
# Load (or reload) the autoreload extension and use mode 2, so edited modules
# are re-imported before each cell runs.
%reload_ext autoreload
%autoreload 2
# Show matplotlib figures inline in the notebook output.
%matplotlib inline

In [None]:
from matplotlib import pyplot as plt
from matplotlib import rcParams
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import numpy as np
from pathlib import Path
import pandas as pd
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, log_loss
from sklearn.model_selection import StratifiedKFold
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, HashingVectorizer
import seaborn as sns
import warnings
import nltk
import xgboost as xgb

In [None]:
# Default figure size and plotting theme for the charts in this notebook.
rcParams['figure.figsize'] = (16, 8)
plt.style.use('fivethirtyeight')
# Use the fully qualified option key: the bare 'max_columns' pattern also
# matches 'styler.render.max_columns' on pandas >= 1.4 and raises OptionError.
pd.set_option('display.max_columns', 100)
pd.set_option("display.precision", 4)
# Silence all warnings to keep the notebook output readable.
warnings.simplefilter('ignore')

## 학습데이터 로드

In [None]:
# Experiment-wide settings: target column, fold count, number of prediction
# columns and the random seed shared by the splitter and the model.
seed = 42
n_class = 5
n_fold = 5
target_col = 'author'

# Competition input files.
data_dir = Path('../data') / 'dacon-novel-author-classification'
trn_file, tst_file, sample_file = (
    data_dir / csv_name
    for csv_name in ('train.csv', 'test_x.csv', 'sample_submission.csv')
)

# Output folders for features, validation predictions, test predictions
# and submissions, all under ../build.
feature_dir, val_dir, tst_dir, sub_dir = (
    Path('../build') / stage for stage in ('feature', 'val', 'tst', 'sub')
)

In [None]:
# The experiment is identified as <algorithm>_<feature set>; that name
# prefixes every prediction / submission file written below.
algo_name, feature_name = 'lgbm', 'tfidf'
model_name = f'{algo_name}_{feature_name}'

# Output locations: submission, test predictions, out-of-fold predictions
# and the feature file.
sub_file = sub_dir / f'{model_name}_submission_ver1.csv'
p_tst_file = tst_dir / f'{model_name}_test_pred_ver1.csv'
p_val_file = val_dir / f'{model_name}_oof_pred_ver1.csv'
feature_file = feature_dir / f'{feature_name}.csv'

In [None]:
# Read the training CSV, using its first column as the row index, then show
# the shape and the first rows as a sanity check.
trn = pd.read_csv(trn_file, index_col=0)
print(trn.shape)
trn.head()

In [None]:
# Read the test CSV the same way (first column as index) and preview it.
tst = pd.read_csv(tst_file, index_col=0)
print(tst.shape)
tst.head()

## NLTK 예시

In [None]:
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer 
from nltk.stem.snowball import SnowballStemmer

In [None]:
# Take one training text as a running example for the NLTK cells below.
# NOTE(review): `[4]` is presumably a label lookup on the index read from the
# CSV's first column — confirm the index is integer-valued.
s = trn.text[4]
print(s)

In [None]:
# Fetch the NLTK resources used in this notebook: tokenizer models, WordNet
# and the stop-word lists.
# 'punkt_tab' is the tokenizer resource that recent NLTK releases look up in
# word_tokenize, and 'omw-1.4' is required by WordNet on some releases.
# NOTE(review): on older NLTK versions these extra ids should be harmless —
# confirm against the pinned NLTK version.
for resource in ('punkt', 'punkt_tab', 'wordnet', 'omw-1.4', 'stopwords'):
    nltk.download(resource)

In [None]:
# Split the example sentence into word tokens with NLTK and show them.
tokens = word_tokenize(s)
print(tokens)

In [None]:
# Lemmatize every token with the WordNet lemmatizer; the resulting list is
# the cell's displayed output.
lemmatizer = WordNetLemmatizer()
[lemmatizer.lemmatize(t) for t in tokens]

In [None]:
# Stem every token with the English Snowball stemmer, for comparison with the
# lemmatizer output above.
stemmer = SnowballStemmer("english")
[stemmer.stem(t) for t in tokens]

## Bag-of-Words 피처 생성

In [None]:
# Bag-of-words counts: NLTK tokenisation, English stop words removed,
# unigrams and bigrams, keeping only terms found in at least 100 documents.
vec = CountVectorizer(
    tokenizer=word_tokenize,
    stop_words=stopwords.words('english'),
    ngram_range=(1, 2),
    min_df=100,
)
X_cnt = vec.fit_transform(trn['text'])
print(X_cnt.shape)

In [None]:
# Densify the first 50 columns of the first row to inspect the raw counts.
X_cnt[0, :50].todense()

In [None]:
# TF-IDF features used by the model: NLTK tokenisation, English stop words
# removed, 1- to 3-grams, keeping terms found in at least 50 documents.
# The vocabulary is fitted on the training text only and reused for the test
# text.
vec = TfidfVectorizer(
    tokenizer=word_tokenize,
    stop_words=stopwords.words('english'),
    ngram_range=(1, 3),
    min_df=50,
)
X = vec.fit_transform(trn['text'])
X_tst = vec.transform(tst['text'])
print(X.shape, X_tst.shape)

In [None]:
# Densify the first 50 columns of the first row to inspect the TF-IDF weights.
X[0, :50].todense()

## LGBM 모델 학습

In [None]:
# Stratified K-fold splitter with `n_fold` splits, shuffled with a fixed seed
# so the folds are reproducible.
folds = StratifiedKFold(n_splits=n_fold, shuffle=True, random_state=seed)

In [None]:
# Target labels as a NumPy array. The column is read through `target_col` so
# the setting defined with the other constants is the single source of truth.
y = trn[target_col].values
y.shape

In [None]:
# Imported here so the cell is self-contained. These callbacks replace the
# `verbose` / `early_stopping_rounds` arguments that LightGBM 4.0 removed
# from the scikit-learn API's fit().
from lightgbm import early_stopping, log_evaluation

# p     : out-of-fold class probabilities for every training row.
# p_tst : test-set class probabilities, averaged over the folds.
p = np.zeros((X.shape[0], n_class))
p_tst = np.zeros((X_tst.shape[0], n_class))

# The loop index is `i_fold`, not `n_fold`, so it does not overwrite the
# global fold-count setting (re-running the splitter cell would otherwise
# silently use a different number of splits).
for i_fold, (trn_idx, val_idx) in enumerate(folds.split(X, y)):

    print('## cv : ', i_fold)

    # Split the sparse feature matrix and labels into this fold's train / validation parts.
    trn_x, trn_y = X[trn_idx], y[trn_idx]
    val_x, val_y = X[val_idx], y[val_idx]

    print('모델 생성')
    # NOTE(review): per the LightGBM parameter docs, `bagging_fraction` only
    # takes effect when `bagging_freq` > 0; it is left as-is here so the model
    # (and its scores) stay unchanged — confirm whether bagging was intended.
    lr_clf = LGBMClassifier(
        learning_rate=0.005,
        max_depth=40,
        n_estimators=15000,
        num_leaves=512,
        feature_fraction=0.9,
        bagging_fraction=0.7,
        seed=seed,
    )

    print('모델 생성 후 학습 시작')
    # Stop when the validation metric has not improved for 30 rounds and log
    # the metric every 100 rounds.
    lr_clf.fit(
        trn_x, trn_y,
        eval_set=[(val_x, val_y)],
        eval_metric='logloss',
        callbacks=[
            early_stopping(stopping_rounds=30),
            log_evaluation(period=100),
        ],
    )

    # Each fold contributes 1 / n_splits of the final test prediction; the
    # validation rows receive this fold's out-of-fold prediction.
    p_tst += lr_clf.predict_proba(X_tst) / folds.n_splits
    p[val_idx, :] = lr_clf.predict_proba(val_x)
    

In [None]:
# Score the out-of-fold predictions: accuracy of the arg-max class, and
# log loss against the one-hot encoded labels.
# NOTE(review): accuracy compares column indices with `y`, so it assumes the
# labels are the integers 0..n_class-1 — confirm.
cv_accuracy = accuracy_score(y, p.argmax(axis=1))
print(f'Accuracy (CV): {cv_accuracy * 100:8.4f}%')
cv_logloss = log_loss(pd.get_dummies(y), p)
print(f'Log Loss (CV): {cv_logloss:8.4f}')

In [None]:
# Make sure the output folders exist: np.savetxt does not create missing
# directories and would fail on a fresh checkout.
for out_file in (p_val_file, p_tst_file):
    out_file.parent.mkdir(parents=True, exist_ok=True)

# Save the out-of-fold and test probabilities as comma-separated text with
# six decimals.
np.savetxt(p_val_file, p, fmt='%.6f', delimiter=',')
np.savetxt(p_tst_file, p_tst, fmt='%.6f', delimiter=',')

## 제출 파일 생성

In [None]:
# Load the sample submission (first column as index) to reuse its index and
# column layout for the real submission.
sub = pd.read_csv(sample_file, index_col=0)
print(sub.shape)
sub.head()

In [None]:
# Overwrite every column of the sample submission with the fold-averaged test
# probabilities.
# NOTE(review): assumes the submission columns are in the same order as the
# classifier's predict_proba columns — confirm.
sub[sub.columns] = p_tst
sub.head()

In [None]:
# Create the submission folder if it is missing (to_csv does not create
# parent directories), then write the submission with its index column.
sub_file.parent.mkdir(parents=True, exist_ok=True)
sub.to_csv(sub_file)