# 데모

## 라이브러리 import 및 설정

In [6]:
# Reload the autoreload extension and set mode 2, which re-imports modules
# before each cell is executed.
%reload_ext autoreload
%autoreload 2
# Render matplotlib figures inline in the cell output.
%matplotlib inline

In [24]:
from matplotlib import pyplot as plt
from matplotlib import rcParams
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import numpy as np
from pathlib import Path
import pandas as pd
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, log_loss
from sklearn.model_selection import StratifiedKFold
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, HashingVectorizer
import seaborn as sns
import warnings
import nltk
import xgboost as xgb
from sklearn.decomposition import IncrementalPCA

In [8]:
# Notebook-wide plotting and display settings.
rcParams['figure.figsize'] = (16, 8)
plt.style.use('fivethirtyeight')
# Use the fully qualified option name: the bare 'max_columns' pattern matches
# more than one option in recent pandas releases and raises OptionError.
pd.set_option('display.max_columns', 100)
pd.set_option("display.precision", 4)
# Silence every warning category for the rest of the session.
warnings.simplefilter('ignore')

## 학습데이터 로드

In [9]:
# Input data and build-output directories, relative to the notebook.
data_dir = Path('..', 'data', 'dacon-novel-author-classification')
feature_dir = Path('..', 'build', 'feature')
val_dir = Path('..', 'build', 'val')
tst_dir = Path('..', 'build', 'tst')
sub_dir = Path('..', 'build', 'sub')

# Competition files.
trn_file = data_dir.joinpath('train.csv')
tst_file = data_dir.joinpath('test_x.csv')
sample_file = data_dir.joinpath('sample_submission.csv')

# Experiment settings.
target_col = 'author'
n_fold = 5
n_class = 5
seed = 42

In [10]:
# The model identifier combines the algorithm and the feature set.
algo_name, feature_name = 'lgbm', 'tfidf-pca'
model_name = '_'.join((algo_name, feature_name))

# Output artefacts derived from the identifiers above.
feature_file = feature_dir.joinpath(feature_name + '.csv')
p_val_file = val_dir.joinpath(model_name + '_oof_pred_ver1.csv')
p_tst_file = tst_dir.joinpath(model_name + '_test_pred_ver1.csv')
sub_file = sub_dir.joinpath(model_name + '_submission_ver1.csv')

In [11]:
# Read the training CSV, using its first column as the row index.
trn = pd.read_csv(filepath_or_buffer=trn_file, index_col=0)
print(trn.shape)
trn.head(n=5)

(54879, 2)


Unnamed: 0_level_0,text,author
index,Unnamed: 1_level_1,Unnamed: 2_level_1
0,"He was almost choking. There was so much, so m...",3
1,"“Your sister asked for it, I suppose?”",2
2,"She was engaged one day as she walked, in per...",1
3,"The captain was in the porch, keeping himself ...",4
4,"“Have mercy, gentlemen!” odin flung up his han...",3


In [12]:
# Read the test CSV, using its first column as the row index.
tst = pd.read_csv(filepath_or_buffer=tst_file, index_col=0)
print(tst.shape)
tst.head(n=5)

(19617, 1)


Unnamed: 0_level_0,text
index,Unnamed: 1_level_1
0,“Not at all. I think she is one of the most ch...
1,"""No,"" replied he, with sudden consciousness, ""..."
2,As the lady had stated her intention of scream...
3,“And then suddenly in the silence I heard a so...
4,His conviction remained unchanged. So far as I...


## NLTK 예시

In [13]:
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer 
from nltk.stem.snowball import SnowballStemmer

In [14]:
# Pick the training text stored under index label 4 as the running example.
s = trn['text'].loc[4]
print(s)

“Have mercy, gentlemen!” odin flung up his hands. “Don’t write that, anyway; have some shame. Here I’ve torn my heart asunder before you, and you seize the opportunity and are fingering the wounds in both halves.... Oh, my God!”


In [15]:
# Fetch the NLTK resources this notebook relies on (a no-op when already present).
nltk.download('punkt')      # tokenizer models — presumably what word_tokenize needs
nltk.download('wordnet')    # WordNet data — presumably what WordNetLemmatizer needs
nltk.download('stopwords')  # stop-word lists read via stopwords.words('english')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\dku\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\dku\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\dku\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [16]:
# Split the sample sentence into word and punctuation tokens.
tokens = word_tokenize(s, language='english')
print(tokens)

['“', 'Have', 'mercy', ',', 'gentlemen', '!', '”', 'odin', 'flung', 'up', 'his', 'hands', '.', '“', 'Don', '’', 't', 'write', 'that', ',', 'anyway', ';', 'have', 'some', 'shame', '.', 'Here', 'I', '’', 've', 'torn', 'my', 'heart', 'asunder', 'before', 'you', ',', 'and', 'you', 'seize', 'the', 'opportunity', 'and', 'are', 'fingering', 'the', 'wounds', 'in', 'both', 'halves', '....', 'Oh', ',', 'my', 'God', '!', '”']


In [17]:
# Lemmatize every token of the sample sentence.
lemmatizer = WordNetLemmatizer()
list(map(lemmatizer.lemmatize, tokens))

['“',
 'Have',
 'mercy',
 ',',
 'gentleman',
 '!',
 '”',
 'odin',
 'flung',
 'up',
 'his',
 'hand',
 '.',
 '“',
 'Don',
 '’',
 't',
 'write',
 'that',
 ',',
 'anyway',
 ';',
 'have',
 'some',
 'shame',
 '.',
 'Here',
 'I',
 '’',
 've',
 'torn',
 'my',
 'heart',
 'asunder',
 'before',
 'you',
 ',',
 'and',
 'you',
 'seize',
 'the',
 'opportunity',
 'and',
 'are',
 'fingering',
 'the',
 'wound',
 'in',
 'both',
 'half',
 '....',
 'Oh',
 ',',
 'my',
 'God',
 '!',
 '”']

In [18]:
# Stem every token of the sample sentence with the English Snowball stemmer.
stemmer = SnowballStemmer("english")
list(map(stemmer.stem, tokens))

['“',
 'have',
 'merci',
 ',',
 'gentlemen',
 '!',
 '”',
 'odin',
 'flung',
 'up',
 'his',
 'hand',
 '.',
 '“',
 'don',
 '’',
 't',
 'write',
 'that',
 ',',
 'anyway',
 ';',
 'have',
 'some',
 'shame',
 '.',
 'here',
 'i',
 '’',
 've',
 'torn',
 'my',
 'heart',
 'asund',
 'befor',
 'you',
 ',',
 'and',
 'you',
 'seiz',
 'the',
 'opportun',
 'and',
 'are',
 'finger',
 'the',
 'wound',
 'in',
 'both',
 'halv',
 '....',
 'oh',
 ',',
 'my',
 'god',
 '!',
 '”']

## Bag-of-Words 피처 생성

In [19]:
# Bag-of-words counts over uni- and bi-grams that occur in at least 100 documents.
vec = CountVectorizer(
    tokenizer=word_tokenize,
    stop_words=stopwords.words('english'),
    ngram_range=(1, 2),
    min_df=100,
)
X_cnt = vec.fit_transform(trn['text'])
print(X_cnt.shape)

(54879, 2685)


In [20]:
# Show the first 50 count features of the first document as a dense row.
X_cnt.getrow(0)[:, :50].todense()

matrix([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0]], dtype=int64)

In [21]:
# TF-IDF over 1- to 3-grams that occur in at least 50 training documents.
vec = TfidfVectorizer(
    tokenizer=word_tokenize,
    stop_words=stopwords.words('english'),
    ngram_range=(1, 3),
    min_df=50,
)
# The vocabulary is learned from the training texts and reused for the test texts.
X = vec.fit_transform(trn['text'])
X_tst = vec.transform(tst['text'])
print(X.shape, X_tst.shape)

(54879, 5897) (19617, 5897)


In [40]:
# Show the first 50 TF-IDF features of the first document as a dense row.
X.getrow(0)[:, :50].todense()

numpy.matrix

## PCA

In [28]:
# np.array_split(ary, n) splits `ary` into n nearly equal chunks along axis 0.
# (The interactive `?` help lookup was removed so the notebook runs cleanly top to bottom.)

In [26]:
# Display the shape of the TF-IDF training matrix X.
X.shape

(54879, 5897)

In [None]:
n_batches = 39
inc_pca = IncrementalPCA(n_components=500)


def _dense_row_batches(sparse_mat, n_batches):
    """Yield consecutive row blocks of ``sparse_mat`` as dense ndarrays.

    Only one block is densified at a time, so the full dense matrix is never
    held in memory. ``toarray()`` returns a plain ndarray instead of the
    ``np.matrix`` that ``todense()`` produces.
    """
    for rows in np.array_split(np.arange(sparse_mat.shape[0]), n_batches):
        if rows.size:  # array_split yields empty chunks when n_batches > n_rows
            yield sparse_mat[rows[0]:rows[-1] + 1].toarray()


# Fit the projection on the TRAINING matrix only, one batch at a time.
for batch_x in _dense_row_batches(X, n_batches):
    print(".", end="")
    inc_pca.partial_fit(batch_x)

# Project train and test with the SAME fitted PCA so both sets share one
# feature space; fitting a second PCA on the test set would yield components
# that are unrelated to the ones the model is trained on.
X_inc = np.vstack(
    [inc_pca.transform(batch_x) for batch_x in _dense_row_batches(X, n_batches)]
)
X_tst_inc = np.vstack(
    [inc_pca.transform(batch_x) for batch_x in _dense_row_batches(X_tst, n_batches)]
)

.......................

In [None]:
# Display the shapes of the PCA-projected train and test matrices.
X_inc.shape, X_tst_inc.shape

## LGBM 모델 학습

In [18]:
# Stratified, shuffled cross-validation splitter with a fixed seed.
folds = StratifiedKFold(
    n_splits=n_fold,
    random_state=seed,
    shuffle=True,
)

In [19]:
# Author labels of the training rows as a NumPy array.
y = trn['author'].values
y.shape

(54879,)

In [None]:
# p     : out-of-fold class probabilities for every training row.
# p_tst : class probabilities for every test row, averaged over the folds.
p = np.zeros((X_inc.shape[0], n_class))
p_tst = np.zeros((X_tst_inc.shape[0], n_class))

# The loop counter is `i_cv` so it does not overwrite the `n_fold` setting.
for i_cv, (trn_idx, val_idx) in enumerate(folds.split(X_inc, y)):

    print('## cv : ', i_cv)

    # Split features and labels into this fold's training / validation parts.
    trn_x, trn_y = X_inc[trn_idx], y[trn_idx]
    val_x, val_y = X_inc[val_idx], y[val_idx]

    print('모델 생성')
    # NOTE(review): per the LightGBM parameter docs, bagging_fraction only takes
    # effect when bagging_freq > 0 — confirm whether bagging was intended.
    lr_clf = LGBMClassifier(learning_rate=0.005, max_depth=40, n_estimators=15000,
                            num_leaves=512,
                            feature_fraction=0.9,
                            bagging_fraction=0.7,
                            seed=seed)

    print('모델 생성 후 학습 시작')
    # NOTE(review): `verbose` / `early_stopping_rounds` as fit() arguments are
    # the pre-4.0 LightGBM API; newer releases expect callbacks instead.
    lr_clf.fit(
        trn_x, trn_y,
        eval_set=[(val_x, val_y)],
        eval_metric='logloss',
        verbose=100,
        early_stopping_rounds=30
    )

    # Out-of-fold predictions go to the validation rows of the train-sized array.
    p[val_idx, :] = lr_clf.predict_proba(val_x)
    # Test predictions are averaged over the folds.
    p_tst += lr_clf.predict_proba(X_tst_inc) / folds.n_splits
    

## cv :  0
모델 생성
모델 생성 후 학습 시작
Training until validation scores don't improve for 30 rounds
[100]	valid_0's multi_logloss: 1.15777
[200]	valid_0's multi_logloss: 0.982617
[300]	valid_0's multi_logloss: 0.885796
[400]	valid_0's multi_logloss: 0.82425
[500]	valid_0's multi_logloss: 0.784289
[600]	valid_0's multi_logloss: 0.757582
[700]	valid_0's multi_logloss: 0.738715
[800]	valid_0's multi_logloss: 0.724303
[900]	valid_0's multi_logloss: 0.71298
[1000]	valid_0's multi_logloss: 0.704111
[1100]	valid_0's multi_logloss: 0.69674
[1200]	valid_0's multi_logloss: 0.690636
[1300]	valid_0's multi_logloss: 0.685434
[1400]	valid_0's multi_logloss: 0.680977
[1500]	valid_0's multi_logloss: 0.676954
[1600]	valid_0's multi_logloss: 0.673344
[1700]	valid_0's multi_logloss: 0.67006


In [None]:
# Accuracy of the arg-max class over the prediction matrix `p`.
cv_accuracy = accuracy_score(y, p.argmax(axis=1)) * 100
print(f'Accuracy (CV): {cv_accuracy:8.4f}%')
# Multi-class log loss against the one-hot encoded labels.
cv_log_loss = log_loss(pd.get_dummies(y), p)
print(f'Log Loss (CV): {cv_log_loss:8.4f}')

In [None]:
# Write both prediction matrices as comma-separated text with 6 decimals.
np.savetxt(fname=p_val_file, X=p, fmt='%.6f', delimiter=',')
np.savetxt(fname=p_tst_file, X=p_tst, fmt='%.6f', delimiter=',')

## 제출 파일 생성

In [None]:
# Read the sample submission, using its first column as the row index.
sub = pd.read_csv(filepath_or_buffer=sample_file, index_col=0)
print(sub.shape)
sub.head(n=5)

In [None]:
# Overwrite every submission column with the values held in `p_tst`.
sub[sub.columns.tolist()] = p_tst
sub.head(n=5)

In [None]:
# Write the submission frame to the submission CSV path.
sub.to_csv(path_or_buf=sub_file)