- code written by ChatGPT
- kernel: tf_env (Python 3.10.16)

In [3]:
import pandas as pd
import numpy as np
import re
import tensorflow as tf
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [4]:
# 데이터 로드
url = "https://raw.githubusercontent.com/e9t/nsmc/master/ratings_train.txt"
df = pd.read_csv(url, sep='\t').dropna()

In [5]:
# 데이터 전처리
def clean_text(text):
    text = re.sub("[^가-힣0-9a-zA-Z ]", "", str(text))
    return text

df['document'] = df['document'].apply(clean_text)

In [6]:
# 데이터 분할
X_train, X_test, y_train, y_test = train_test_split(df['document'], df['label'], test_size=0.2, random_state=42)

In [None]:
# 토큰화 및 패딩
tokenizer = Tokenizer(num_words=20000, oov_token="<OOV>")
tokenizer.fit_on_texts(X_train)

X_train_seq = tokenizer.texts_to_sequences(X_train)
X_test_seq = tokenizer.texts_to_sequences(X_test)

max_len = 100
X_train_pad = pad_sequences(X_train_seq, maxlen=max_len, padding='post')
X_test_pad = pad_sequences(X_test_seq, maxlen=max_len, padding='post')

In [8]:
# 모델 생성
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(input_dim=20000, output_dim=128, input_length=max_len),
    tf.keras.layers.LSTM(64, return_sequences=True),
    tf.keras.layers.LSTM(32),
    tf.keras.layers.Dense(16, activation='relu'),
    tf.keras.layers.Dense(1, activation='sigmoid')
])

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])



In [9]:
# 모델 학습
model.fit(X_train_pad, y_train, epochs=5, batch_size=64, validation_data=(X_test_pad, y_test))

Epoch 1/5
[1m1875/1875[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m155s[0m 81ms/step - accuracy: 0.5025 - loss: 0.6934 - val_accuracy: 0.4985 - val_loss: 0.6932
Epoch 2/5
[1m1875/1875[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m164s[0m 87ms/step - accuracy: 0.4975 - loss: 0.6932 - val_accuracy: 0.5015 - val_loss: 0.6932
Epoch 3/5
[1m1875/1875[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m151s[0m 81ms/step - accuracy: 0.4979 - loss: 0.6932 - val_accuracy: 0.5015 - val_loss: 0.6931
Epoch 4/5
[1m1875/1875[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m147s[0m 78ms/step - accuracy: 0.5039 - loss: 0.6931 - val_accuracy: 0.4985 - val_loss: 0.6932
Epoch 5/5
[1m1875/1875[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m149s[0m 79ms/step - accuracy: 0.4998 - loss: 0.6932 - val_accuracy: 0.4985 - val_loss: 0.6932


<keras.src.callbacks.history.History at 0x29585c595d0>

In [10]:
# 모델 평가
loss, accuracy = model.evaluate(X_test_pad, y_test)
print(f"Test Accuracy: {accuracy:.4f}")

[1m938/938[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 18ms/step - accuracy: 0.4963 - loss: 0.6932
Test Accuracy: 0.4985
