# Install Package

# Import Module

In [2]:
# 기본 패키지
import os
import numpy as np
import pandas as pd
from tqdm import tqdm

# 시각화 패키지
import matplotlib.pyplot as plt
import seaborn as sns

# 정규화 패키지
import re

# 모델링 패키지
import tensorflow as tf
from sklearn.model_selection import train_test_split
from tensorflow.keras.layers import Embedding, Dense, LSTM, Bidirectional
from tensorflow.keras.models import Sequential
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping

# NLP 패키지
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# 경고무시
import warnings
warnings.filterwarnings('ignore')

# Load Data

In [3]:
df = pd.read_csv('Petition_data_english.csv')
df = df.loc[:, ['content_eng', 'label']]
df.head()

Unnamed: 0,content_eng,label
0,The development of petition AI image generator...,1
1,A petition for severe punishment and disclosur...,1
2,Do you know that the petition government on th...,1
3,KEPCO's petition against privatization of publ...,1
4,I entered OO Elementary School in 2003 and tra...,1


# Data Preprocessing


In [16]:
import re
import string
import nltk
from nltk.corpus import stopwords

nltk.download('stopwords')

stop_words = set(stopwords.words('english'))

def preprocess_text(text):
    # 소문자 변환
    text = text.lower()
    
    # 특수문자 제거
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    
    # 숫자 제거
    text = re.sub('\w*\d\w*', '', text)
    
    # 불용어 처리
    words = text.split()
    words = [word for word in words if not word in stop_words]
    text = " ".join(words)
    
    return text

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [17]:
df['tokenized'] = df['content_eng'].apply(preprocess_text)
df.head()

Unnamed: 0,content_eng,label,tokenized
0,The development of petition AI image generator...,1,development petition ai image generators legal...
1,A petition for severe punishment and disclosur...,1,petition severe punishment disclosure perpetra...
2,Do you know that the petition government on th...,1,know petition government reduction additional ...
3,KEPCO's petition against privatization of publ...,1,kepcos petition privatization public corporati...
4,I entered OO Elementary School in 2003 and tra...,1,entered oo elementary school transferred oo el...


#### Train, Test 분리

In [18]:
X_train, X_test, y_train, y_test = train_test_split(df['tokenized'], df['label'], stratify = df['label'], random_state = 42, test_size=0.1)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((485,), (54,), (485,), (54,))

#### 토크나이징 및 패딩작업 진행

In [19]:
def preprocess(df):
    
    # 전처리된 문장에서 토큰 생성
    tokenizer = Tokenizer(oov_token='<UNK>')
    tokenizer.fit_on_texts(df)
    tokens = tokenizer.texts_to_sequences(df)

    df_wordcount = pd.DataFrame(list(tokenizer.word_counts.items()), columns=['word', 'count'])
    wc = df_wordcount[df_wordcount['count']<2].shape[0]
    vocab_size = len(tokenizer.word_index) - wc + 2
    print('단어 집합의 크기:', vocab_size)
    
    # 패딩
    max_seq_len = max([len(token_list) for token_list in tokens])
    padded_tokens = pad_sequences(tokens, maxlen=max_seq_len, padding='post')

    return padded_tokens

In [20]:
X_train = preprocess(X_train)
X_test = preprocess(X_test)

단어 집합의 크기: 6266
단어 집합의 크기: 1678


# Modeling

In [21]:
vocab_size, embedding_dim = 6270, 256
model = tf.keras.Sequential([
    Embedding(vocab_size, embedding_dim),
    LSTM(units=128),
    Dense(units=64, activation='relu'),
    Dense(units=32, activation='relu'),
    Dense(units=1, activation='sigmoid')
])

model.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, None, 256)         1605120   
                                                                 
 lstm_1 (LSTM)               (None, 128)               197120    
                                                                 
 dense_3 (Dense)             (None, 64)                8256      
                                                                 
 dense_4 (Dense)             (None, 32)                2080      
                                                                 
 dense_5 (Dense)             (None, 1)                 33        
                                                                 
Total params: 1,812,609
Trainable params: 1,812,609
Non-trainable params: 0
_________________________________________________________________


In [22]:
earlystopping = EarlyStopping(monitor="val_loss", patience = 5)

checkpoint = ModelCheckpoint('lstm_eng.h5',
                             save_best_only=True,
                             save_weights_only=True,
                             monitor='val_loss',
                             mode='min',
                             verbose=False)

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy', tf.keras.metrics.Recall(name='recall')])
history = model.fit(X_train, y_train, epochs=50, callbacks=[checkpoint, earlystopping], batch_size=32, validation_data=(X_test, y_test))

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50


In [23]:
# 저장된 checkpoint 로드 후 정확도 측정
model.load_weights('lstm_eng.h5')
model.evaluate(X_test, y_test)



[0.4639734923839569, 0.8148148059844971]

In [24]:
model.predict(X_test)[0]



array([0.18209743], dtype=float32)