## 라이브러리 import 및 설정

In [1]:
# Enable the autoreload extension in mode 2 so edited modules are re-imported
# before each cell executes.
%reload_ext autoreload
%autoreload 2
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

In [2]:
import numpy as np
import pandas as pd
import re

from pathlib import Path
import warnings

from nltk.corpus import stopwords
from nltk.corpus import wordnet
from nltk import pos_tag

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, log_loss
from sklearn.model_selection import StratifiedKFold
from sklearn.feature_extraction.text import TfidfVectorizer

from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

In [3]:
# Display up to 100 columns. The fully qualified option key is required:
# the abbreviated 'max_columns' pattern is ambiguous in pandas >= 1.4
# (it also matches 'styler.render.max_columns') and raises OptionError.
pd.set_option('display.max_columns', 100)
# Show floats with 4 digits after the decimal point.
pd.set_option("display.precision", 4)
# Silence every warning for the rest of the session (this also hides
# warnings emitted by model fitting further down).
warnings.simplefilter('ignore')

## 학습데이터 로드

In [4]:
# Input data directory and the build-artifact directories (all relative paths).
data_dir = Path('../data/dacon-novel-author-classification')
feature_dir, val_dir, tst_dir, sub_dir = map(
    Path,
    ('../build/feature', '../build/val', '../build/tst', '../build/sub'),
)

# Competition input files located inside data_dir.
trn_file = data_dir.joinpath('train.csv')
tst_file = data_dir.joinpath('test_x.csv')
sample_file = data_dir.joinpath('sample_submission.csv')

# Modelling constants: label column, number of CV folds, number of classes, RNG seed.
target_col = 'author'
n_fold = n_class = 5
seed = 2020

In [5]:
# Identifiers used to derive every output file name for this experiment.
algo_name, feature_name = 'lr', 'lemmatization-tfidf'
model_name = f'{algo_name}_{feature_name}'

# Feature dump, out-of-fold predictions, test predictions and submission paths.
feature_file = feature_dir.joinpath(f'{feature_name}.csv')
p_val_file = val_dir.joinpath(f'{model_name}.val.csv')
p_tst_file = tst_dir.joinpath(f'{model_name}.tst.csv')
sub_file = sub_dir.joinpath(f'{model_name}.csv')

In [6]:
# Load the training CSV, using its first column as the row index.
trn = pd.read_csv(trn_file, index_col=0)
# Print (rows, columns), then display the first rows as the cell output.
print(trn.shape)
trn.head()

(54879, 2)


Unnamed: 0_level_0,text,author
index,Unnamed: 1_level_1,Unnamed: 2_level_1
0,"He was almost choking. There was so much, so m...",3
1,"“Your sister asked for it, I suppose?”",2
2,"She was engaged one day as she walked, in per...",1
3,"The captain was in the porch, keeping himself ...",4
4,"“Have mercy, gentlemen!” odin flung up his han...",3


In [7]:
# Load the test CSV, using its first column as the row index.
tst = pd.read_csv(tst_file, index_col=0)
# Print (rows, columns), then display the first rows as the cell output.
print(tst.shape)
tst.head()

(19617, 1)


Unnamed: 0_level_0,text
index,Unnamed: 1_level_1
0,“Not at all. I think she is one of the most ch...
1,"""No,"" replied he, with sudden consciousness, ""..."
2,As the lady had stated her intention of scream...
3,“And then suddenly in the silence I heard a so...
4,His conviction remained unchanged. So far as I...


## 데이터 전처리

In [8]:
# English stop-word list (lower-case words, including common contractions).
# NOTE(review): this assignment rebinds the name `stopwords`, shadowing the
# `nltk.corpus.stopwords` object imported in the import cell.
# NOTE(review): in the cells visible here the list is not passed to the
# vectorizer (no `stop_words=` argument) - confirm whether that is intentional.
stopwords = [ "a", "about", "above", "after", "again", "against", "all", "am", "an", "and", "any", "are", "as", 
             "at", "be", "because", "been", "before", "being", "below", "between", "both", "but", "by", "could", 
             "did", "do", "does", "doing", "down", "during", "each", "few", "for", "from", "further", "had", "has", 
             "have", "having", "he", "he'd", "he'll", "he's", "her", "here", "here's", "hers", "herself", "him", "himself", 
             "his", "how", "how's", "i", "i'd", "i'll", "i'm", "i've", "if", "in", "into", "is", "it", "it's", "its", "itself", 
             "let's", "me", "more", "most", "my", "myself", "nor", "of", "on", "once", "only", "or", "other", "ought", "our", "ours", 
             "ourselves", "out", "over", "own", "same", "she", "she'd", "she'll", "she's", "should", "so", "some", "such", "than", "that", 
             "that's", "the", "their", "theirs", "them", "themselves", "then", "there", "there's", "these", "they", "they'd", "they'll", 
             "they're", "they've", "this", "those", "through", "to", "too", "under", "until", "up", "very", "was", "we", "we'd", "we'll", 
             "we're", "we've", "were", "what", "what's", "when", "when's", "where", "where's", "which", "while", "who", "who's", "whom", 
             "why", "why's", "with", "would", "you", "you'd", "you'll", "you're", "you've", "your", "yours", "yourself", "yourselves" ]

In [9]:
# class CustomTfidVectorier(TfidfVectorizer):
    
#     def build_analyzer(self):
    
#         stop_words = self.get_stop_words()
       
#         # 단어의 품사 정보 얻는 함수
#         def get_wordnet_pos(word):
#             tag = pos_tag([word])[0][1][0].upper()
#             tag_dict = {
#                 "J" : wordnet.ADJ,
#                 "N" : wordnet.NOUN,
#                 "V" : wordnet.VERB,
#                 "R" : wordnet.ADV
#             }
            
#             return tag_dict.get(tag, wordnet.NOUN)
       
#         def analyser(doc):
#             # 부호를 제거
#             doc_clean = re.sub(r'[^A-Za-z0-9 ]', '', doc)
            
#             # 토큰화
#             doc_clean = word_tokenize(doc_clean)
        
#             # 품사 정보를 이용해서 표제어 추출
#             lemmatizer = WordNetLemmatizer()
#             tokenized_word = [lemmatizer.lemmatize(t, get_wordnet_pos(t)) for t in doc_clean] 
       
#             # 내장된 _word_ngrams 사용
#             # 불용어 까지 같이 처리
#             return self._word_ngrams(tokenized_word, stop_words)
#         return(analyser)

In [12]:
class CustomTfidVectorier(TfidfVectorizer):
    """TF-IDF vectorizer whose analyzer lemmatizes tokens using their POS tags.

    ``build_analyzer`` is overridden, so the analyzer defined here replaces the
    stock preprocessing/tokenizing pipeline: text is NOT lower-cased and the
    ``token_pattern`` / ``tokenizer`` / ``preprocessor`` settings are not used.
    ``ngram_range`` and ``stop_words`` are applied through ``_word_ngrams``.
    """

    def build_analyzer(self):
        """Return a callable that turns one raw document into a list of n-grams.

        Returns
        -------
        callable
            ``analyser(doc: str) -> list[str]``.
        """
        # Resolved once per analyzer instead of once per document.
        # get_stop_words() returns None when stop_words is not configured, in
        # which case _word_ngrams performs no stop-word filtering.
        stop_words = self.get_stop_words()
        lemmatizer = WordNetLemmatizer()

        # First letter of the tagger's POS tag -> WordNet part-of-speech constant.
        tag_prefix_to_wordnet = {
            'J': wordnet.ADJ,
            'V': wordnet.VERB,
            'N': wordnet.NOUN,
            'R': wordnet.ADV,
        }

        def get_wordnet_pos(pos_tag_name):
            """Map a POS tag string to a WordNet POS, or None if it has no mapping."""
            return tag_prefix_to_wordnet.get(pos_tag_name[:1])

        def analyser(doc):
            """Clean, tokenize, POS-tag and lemmatize ``doc``; return its n-grams."""
            # Delete every character that is not an ASCII letter, digit or space.
            # Characters are removed, not replaced, so words separated only by
            # punctuation or a newline end up joined together.
            doc_clean = re.sub(r'[^A-Za-z0-9 ]', '', doc)

            # Tokenize, then tag the whole sentence at once so the tagger sees context.
            tokens = word_tokenize(doc_clean)

            # Lemmatize with the mapped POS; tokens whose tag has no WordNet
            # equivalent are kept unchanged.
            lemmas = []
            for word, tag in pos_tag(tokens):
                wordnet_pos = get_wordnet_pos(tag)
                if wordnet_pos is None:
                    lemmas.append(word)
                else:
                    lemmas.append(lemmatizer.lemmatize(word, wordnet_pos))

            # Reuse the built-in n-gram builder. Stop-word matching is
            # case-sensitive and runs on the lemmatized tokens.
            return self._word_ngrams(lemmas, stop_words)

        return analyser

In [13]:
# Build TF-IDF features with the lemmatizing vectorizer (word_tokenize based):
# 1- to 3-grams, min_df=50. The vocabulary is fit on the training text and then
# applied unchanged to the test text.
vec = CustomTfidVectorier(min_df=50, ngram_range=(1, 3))
X_1, X_tst_1 = vec.fit_transform(trn['text']), vec.transform(tst['text'])
print(X_1.shape, X_tst_1.shape)

(54879, 9278) (19617, 9278)


## 로지스틱회귀 모델 학습

In [14]:
# Shuffled, class-stratified K-fold splitter; seeded so the folds are reproducible.
cv = StratifiedKFold(
    n_splits=n_fold,
    random_state=seed,
    shuffle=True,
)

In [15]:
# Label vector, selected through the configured target column name rather
# than a hard-coded attribute so it stays in sync with `target_col`.
y = trn[target_col].values
y.shape

(54879,)

In [17]:
# p_val: out-of-fold class probabilities for every training row.
# p_tst: class probabilities for the test rows, averaged over the fold models.
p_val = np.zeros((X_1.shape[0], n_class))
p_tst = np.zeros((X_tst_1.shape[0], n_class))

for i_cv, (i_trn, i_val) in enumerate(cv.split(X_1, y), 1):
    print(f'Training model for CV #{i_cv}')
    X_train, X_val = X_1[i_trn], X_1[i_val]
    y_train, y_val = y[i_trn], y[i_val]

    lr_clf = LogisticRegression()
    lr_clf.fit(X_train, y_train)

    # Each training row is predicted only by the model that did not see it.
    p_val[i_val, :] = lr_clf.predict_proba(X_val)
    # One model is fitted per fold, so the average divides by n_fold
    # (not n_class, which only happened to hold the same value).
    p_tst += lr_clf.predict_proba(X_tst_1) / n_fold

print("Training has finished")
print("*"*100)

# NOTE(review): argmax over probability columns is compared directly with y,
# which assumes the labels are the integers 0..n_class-1 - confirm.
print(f'lr Accuracy (CV): {accuracy_score(y, np.argmax(p_val, axis=1)) * 100:8.4f}%')
print(f'lr Log Loss (CV): {log_loss(pd.get_dummies(y), p_val):8.4f}')

Training model for CV #1
Training model for CV #2
Training model for CV #3
Training model for CV #4
Training model for CV #5
Training has finished
****************************************************************************************************
lr Accuracy (CV):  73.1974%
lr Log Loss (CV):   0.7740
