# 라이브러리 로드

In [1]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline

In [2]:
from matplotlib import rcParams, pyplot as plt

import numpy as np
import pandas as pd
import random as rn
import re

from pathlib import Path
import warnings

import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.text import text_to_word_sequence

In [3]:
# Show up to 100 columns and 4 decimal places when displaying DataFrames.
# The fully-qualified key is used on purpose: on pandas >= 1.4 the bare
# 'max_columns' pattern also matches 'styler.render.max_columns', and
# set_option raises OptionError when a pattern matches more than one key.
pd.set_option('display.max_columns', 100)
pd.set_option('display.precision', 4)

# Default figure size and plot style for the notebook.
rcParams['figure.figsize'] = (16, 8)
plt.style.use('fivethirtyeight')

# Silence every warning for the rest of the session.
warnings.filterwarnings('ignore')

# 학습데이터 로드

In [4]:
# Random seed constant. The calls below that would apply it to NumPy,
# `random` and TensorFlow are commented out, so this cell seeds nothing.
seed = 2020
# np.random.seed(seed)
# rn.seed(seed)
# tf.random.set_seed(seed)
# session_conf = tf.compat.v1.ConfigProto(intra_op_parallelism_threads=1,
#                              inter_op_parallelism_threads=1)
# sess = tf.compat.v1.Session(graph=tf.compat.v1.get_default_graph(), config=session_conf)
# tf.compat.v1.keras.backend.set_session(sess)

In [5]:
# Input and output directories, relative to the notebook's working directory.
data_dir = Path('../data/dacon-novel-author-classification')
feature_dir = Path('../build/feature')
val_dir = Path('../build/val')
tst_dir = Path('../build/tst')
sub_dir = Path('../build/sub')

# Raw competition files.
trn_file = data_dir / 'train.csv'
tst_file = data_dir / 'test_x.csv'
sample_file = data_dir / 'sample_submission.csv'

# Name of the label column in the training data.
target_col = 'author'
# NOTE(review): n_fold and n_class are not used in the visible cells;
# presumably CV fold count and number of author classes -- confirm.
n_fold = 5
n_class = 5

In [6]:
# Common prefix for every feature file written by this notebook.
feature_name = 'feature'

# One output CSV per text-preprocessing variant (Ver1..Ver5).
feature_Ver1_file = feature_dir / f'{feature_name}_Ver1.csv'
feature_Ver2_file = feature_dir / f'{feature_name}_Ver2.csv'
feature_Ver3_file = feature_dir / f'{feature_name}_Ver3.csv'
feature_Ver4_file = feature_dir / f'{feature_name}_Ver4.csv'
feature_Ver5_file = feature_dir / f'{feature_name}_Ver5.csv'

# Output CSV holding the training labels.
feature_target_file = feature_dir / f'{feature_name}_target.csv'

# 변수 생성

- stopwords 제거 X -> Ver1
- stopwords 제거 O -> Ver2
- 중복 Top 100 모두 제거 -> Ver3
- 중복 Top 100중 min*2 >= max 라면 제거 -> Ver4
- tf-idf PCA -> Ver5

In [7]:
# Collect the first N words of the tokenizer vocabulary built from `data`.
def get_top_n_vocab(data, top=None):
    """Fit a Keras ``Tokenizer`` on *data* and return part of its vocabulary.

    Args:
        data: Iterable of texts passed to ``Tokenizer.fit_on_texts``.
        top: Maximum number of vocabulary entries to keep. ``None`` keeps
            the whole vocabulary; a value larger than the vocabulary is
            clamped to the vocabulary size.

    Returns:
        dict mapping each kept word to ``(index, count)``, where ``index``
        comes from ``tokenizer.word_index`` and ``count`` from
        ``tokenizer.word_counts``.

    Note:
        Keras documents ``word_index`` as ordered by descending frequency
        starting at 1, so the kept entries are presumably the most frequent
        words -- confirm against the Keras version in use.
    """
    tok = Tokenizer()
    tok.fit_on_texts(data)

    ranks = tok.word_index
    counts = tok.word_counts

    limit = len(ranks) if top is None else min(top, len(ranks))
    # Iteration stops at the first entry whose index is one past the limit.
    stop_at = limit + 1

    vocab = {}
    for word, rank in ranks.items():
        if rank == stop_at:
            break
        vocab[word] = (rank, counts[word])

    return vocab

In [8]:
# Among the words of the overall vocabulary, find those that appear in every
# author's vocabulary and record each author's frequency for them.
def check_vocab_in_author(vocab, vocab_authors):
    """Return the words of *vocab* that occur in every author vocabulary.

    Args:
        vocab: Mapping whose keys are the candidate words.
        vocab_authors: Sequence of per-author mappings
            ``word -> (index, count)``.

    Returns:
        dict mapping each word found in all author vocabularies to a tuple
        of its per-author counts, in the order of *vocab_authors*. An empty
        *vocab_authors* yields an empty dict.
    """
    vocab_authors = list(vocab_authors)
    # With no authors "found in every author" is vacuous; return nothing
    # rather than every word with an empty tuple.
    if not vocab_authors:
        return {}

    words = {}
    for word in vocab:
        # The number of authors comes from the input instead of a fixed 5.
        if all(word in author_vocab for author_vocab in vocab_authors):
            words[word] = tuple(
                author_vocab[word][1] for author_vocab in vocab_authors
            )

    return words

In [9]:
# Strip punctuation and other symbols from a string.
def alpha_num(text):
    """Return *text* keeping only ASCII letters, digits, apostrophes and spaces."""
    disallowed = re.compile(r"[^A-Za-z0-9' ]")
    return disallowed.sub('', text)

# Remove stop words from a whitespace-delimited string.
def remove_stopwords(text, stop_words=None):
    """Return *text* without its stop words, tokens joined by single spaces.

    Args:
        text: String to filter; it is split on whitespace.
        stop_words: Optional collection of lower-case words to drop. When
            omitted, the module-level ``stopwords`` is read at call time,
            so ``Series.apply(remove_stopwords)`` keeps working unchanged.

    Returns:
        The remaining tokens joined with ``" "``. Tokens are compared
        case-insensitively but returned with their original casing.
    """
    if stop_words is None:
        stop_words = stopwords
    # Set membership avoids scanning the whole list once per token.
    if not isinstance(stop_words, (set, frozenset)):
        stop_words = frozenset(stop_words)
    # str.split() already yields whitespace-free tokens, so no strip is needed.
    return " ".join(
        token for token in text.split() if token.lower() not in stop_words
    )

In [10]:
# Load the raw train and test tables, using the first CSV column as the index.
df_train = pd.read_csv(trn_file, index_col=0)
df_test = pd.read_csv(tst_file, index_col=0)

In [11]:
# Apply preprocessing: lower-case each text, then drop every character
# that alpha_num does not keep.
df_train['text'] = df_train['text'].str.lower().apply(alpha_num)
df_test['text'] = df_test['text'].str.lower().apply(alpha_num)

## Ver 1 생성

In [12]:
# Ver1 works on copies so the shared preprocessed frames stay untouched.
df_train_ver1 = df_train.copy()
df_test_ver1 = df_test.copy()

In [13]:
# Stack train rows above test rows and replace every missing value with -1
# (presumably the target column that the test rows lack -- confirm).
dataset = pd.concat([df_train_ver1, df_test_ver1], axis=0).fillna(-1)

In [14]:
# Write the combined Ver1 table (train + test) to its feature file.
feature = pd.DataFrame(dataset)
feature.to_csv(feature_Ver1_file)

## Ver 2 생성

In [15]:
# Ver2 works on copies so the shared preprocessed frames stay untouched.
df_train_ver2 = df_train.copy()
df_test_ver2 = df_test.copy()

In [16]:
# Stop words for Ver2: a hand-written list of common English words and contractions.
# This binds the global `stopwords` that remove_stopwords reads at call time.
stopwords = [ "a", "about", "above", "after", "again", "against", "all", "am", "an", "and", "any", "are", "as", 
             "at", "be", "because", "been", "before", "being", "below", "between", "both", "but", "by", "could", 
             "did", "do", "does", "doing", "down", "during", "each", "few", "for", "from", "further", "had", "has", 
             "have", "having", "he", "he'd", "he'll", "he's", "her", "here", "here's", "hers", "herself", "him", "himself", 
             "his", "how", "how's", "i", "i'd", "i'll", "i'm", "i've", "if", "in", "into", "is", "it", "it's", "its", "itself", 
             "let's", "me", "more", "most", "my", "myself", "nor", "of", "on", "once", "only", "or", "other", "ought", "our", "ours", 
             "ourselves", "out", "over", "own", "same", "she", "she'd", "she'll", "she's", "should", "so", "some", "such", "than", "that", 
             "that's", "the", "their", "theirs", "them", "themselves", "then", "there", "there's", "these", "they", "they'd", "they'll", 
             "they're", "they've", "this", "those", "through", "to", "too", "under", "until", "up", "very", "was", "we", "we'd", "we'll", 
             "we're", "we've", "were", "what", "what's", "when", "when's", "where", "where's", "which", "while", "who", "who's", "whom", 
             "why", "why's", "with", "would", "you", "you'd", "you'll", "you're", "you've", "your", "yours", "yourself", "yourselves" ]

# Drop the stop words from both the train and the test texts.
df_train_ver2['text'] = df_train_ver2['text'].apply(remove_stopwords)
df_test_ver2['text'] = df_test_ver2['text'].apply(remove_stopwords)

In [17]:
# Removing stop words can leave a sentence completely empty.
# Dropping such rows is undesirable because the rows are needed for stacking,
# so empty texts get the '!!!!' placeholder, which the training code is
# expected to remove again.
# The placeholder is applied to the text column of BOTH splits: an empty
# string written by to_csv is read back by read_csv as NaN, so an empty test
# text needs the same treatment as an empty train text.
df_train_ver2['text'] = df_train_ver2['text'].replace('', '!!!!')
df_test_ver2['text'] = df_test_ver2['text'].replace('', '!!!!')

In [18]:
# Stack train rows above test rows and replace every missing value with -1
# (presumably the target column that the test rows lack -- confirm).
dataset = pd.concat([df_train_ver2, df_test_ver2], axis=0).fillna(-1)

In [19]:
# Write the combined Ver2 table (train + test) to its feature file.
feature = pd.DataFrame(dataset)
feature.to_csv(feature_Ver2_file)

## Ver3 생성

In [20]:
# Ver3 works on copies so the shared preprocessed frames stay untouched.
df_train_ver3 = df_train.copy()
df_test_ver3 = df_test.copy()

In [21]:
# Pull out raw arrays: train texts, test texts and train labels,
# then print their shapes as a quick size check.
X_train = df_train_ver3['text'].values
X_test = df_test_ver3['text'].values
y_train = df_train_ver3[target_col].values
print(X_train.shape, X_test.shape, y_train.shape)

(54879,) (19617,) (54879,)


In [22]:
# Vocabulary of the whole training corpus (no size cap).
all_vocab = get_top_n_vocab(X_train)

# Training texts grouped by author label; labels are taken to be the
# integers 0..k-1, where k is the number of distinct labels.
author_data = [
    df_train_ver3.loc[df_train_ver3[target_col] == label, 'text']
    for label in range(len(df_train_ver3[target_col].unique()))
]

# Top-100 vocabulary of each author.
author_vocab = [get_top_n_vocab(texts, 100) for texts in author_data]

# Words that appear in every author's top-100 vocabulary.
check_author_vocab = check_vocab_in_author(all_vocab, author_vocab)

# Stop words for Ver3: all of those shared words.
stopwords = list(check_author_vocab)

df_train_ver3['text'] = df_train_ver3['text'].apply(remove_stopwords)
df_test_ver3['text'] = df_test_ver3['text'].apply(remove_stopwords)

In [35]:
# Sanity check: print the position and value of any training text that is a
# float (e.g. NaN) rather than a string. isinstance also catches float
# subclasses such as numpy.float64, which an exact type comparison misses.
for i, j in enumerate(df_train_ver3['text'].values):
    if isinstance(j, float):
        print(i, j)

In [None]:
# NOTE(review): unfinished scratch cell. The original statement
# `for i,j in enumerate()` had no iterable, colon or body and could not run.

In [23]:
# Stack train rows above test rows and replace every missing value with -1
# (presumably the target column that the test rows lack -- confirm).
dataset = pd.concat([df_train_ver3, df_test_ver3], axis=0).fillna(-1)

In [24]:
# Write the combined Ver3 table (train + test) to its feature file.
feature = pd.DataFrame(dataset)
feature.to_csv(feature_Ver3_file)

## Ver4 생성

In [25]:
# Ver4 works on copies so the shared preprocessed frames stay untouched.
df_train_ver4 = df_train.copy()
df_test_ver4 = df_test.copy()

In [26]:
# Pull out raw arrays: train texts, test texts and train labels,
# then print their shapes as a quick size check.
X_train = df_train_ver4['text'].values
X_test = df_test_ver4['text'].values
y_train = df_train_ver4[target_col].values
print(X_train.shape, X_test.shape, y_train.shape)

(54879,) (19617,) (54879,)


In [27]:
# Vocabulary of the whole training corpus (no size cap).
all_vocab = get_top_n_vocab(X_train)

# Training texts grouped by author label; labels are taken to be the
# integers 0..k-1, where k is the number of distinct labels.
author_data = [
    df_train_ver4.loc[df_train_ver4[target_col] == label, 'text']
    for label in range(len(df_train_ver4[target_col].unique()))
]

# Top-100 vocabulary of each author.
author_vocab = [get_top_n_vocab(texts, 100) for texts in author_data]

# Words that appear in every author's top-100 vocabulary, with per-author counts.
check_author_vocab = check_vocab_in_author(all_vocab, author_vocab)

# Stop words for Ver4: only the shared words whose smallest per-author count,
# doubled, is at least the largest per-author count.
stopwords = [
    word
    for word, freqs in check_author_vocab.items()
    if min(freqs) * 2 >= max(freqs)
]

df_train_ver4['text'] = df_train_ver4['text'].apply(remove_stopwords)
df_test_ver4['text'] = df_test_ver4['text'].apply(remove_stopwords)

In [28]:
# Stack train rows above test rows and replace every missing value with -1
# (presumably the target column that the test rows lack -- confirm).
dataset = pd.concat([df_train_ver4, df_test_ver4], axis=0).fillna(-1)

In [29]:
# Write the combined Ver4 table (train + test) to its feature file.
feature = pd.DataFrame(dataset)
feature.to_csv(feature_Ver4_file)

# 실제 타겟 값 생성

In [30]:
# Save the training labels (the target column of the train table) to their own CSV.
y = df_train.loc[:, target_col]
y.to_csv(feature_target_file)

In [14]:
# Create the submission files: load the sample submission as a template,
# overwrite all of its columns with each set of test predictions, and save.
# NOTE(review): `stacking_submission_files` and `lgbm_test_preds` are not
# defined in the visible cells -- presumably carried over from another notebook.
sub = pd.read_csv(sample_file,index_col=0)

for filename, test_pred in zip(stacking_submission_files, lgbm_test_preds):
    sub[sub.columns] = test_pred
    sub.to_csv(filename)

In [15]:
# Create the stacking_oof_pred files: one comma-separated text file per
# prediction array, written with 18 decimal places.
# NOTE(review): `stacking_oof_pred_files` and `lgbm_oof_preds` are not
# defined in the visible cells -- confirm where they come from.

for filename, oof_pred in zip(stacking_oof_pred_files, lgbm_oof_preds):
    np.savetxt(filename, oof_pred, fmt='%.18f', delimiter=',')

In [16]:
# Create the stacking_test_pred files: one comma-separated text file per
# prediction array, written with 18 decimal places.
# NOTE(review): `stacking_test_pred_files` and `lgbm_test_preds` are not
# defined in the visible cells -- confirm where they come from.

for filename, test_pred in zip(stacking_test_pred_files, lgbm_test_preds):
    np.savetxt(filename, test_pred, fmt='%.18f', delimiter=',')

In [None]:
# Create the submission files: load the sample submission as a template,
# overwrite all of its columns with each set of test predictions, and save.
# NOTE(review): this unexecuted cell repeats the three cells before it, and
# the `stacking_*_files` / `lgbm_*_preds` names are not defined in the
# visible cells -- confirm whether it belongs in this notebook.
sub = pd.read_csv(sample_file,index_col=0)

for filename, test_pred in zip(stacking_submission_files, lgbm_test_preds):
    sub[sub.columns] = test_pred
    sub.to_csv(filename)

# Create the stacking_oof_pred files (comma-separated, 18 decimal places).

for filename, oof_pred in zip(stacking_oof_pred_files, lgbm_oof_preds):
    np.savetxt(filename, oof_pred, fmt='%.18f', delimiter=',')

# Create the stacking_test_pred files (comma-separated, 18 decimal places).

for filename, test_pred in zip(stacking_test_pred_files, lgbm_test_preds):
    np.savetxt(filename, test_pred, fmt='%.18f', delimiter=',')