In [5]:
import nltk # 텍스트 데이터를 처리
import numpy as np # 말뭉치를 배열로 표현
import random
import operator
import re

from sklearn.metrics.pairwise import cosine_similarity # 이를 나중에 사용하여 두 개의 문장이 얼마나 비슷한지를 결정합니다.
from sklearn.feature_extraction.text import TfidfVectorizer # Experience 2에서 단어 가방을 만드는 함수를 만들었던 것을 기억하십니까? 이 함수는 같은 일을 합니다!
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from tensorflow.keras.regularizers import l1_l2

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences


from datetime import datetime
import matplotlib.pyplot as plt

## 2. 라벨링

In [6]:
import os
import numpy as np

def loadfile(path):
    """Load the labelled ``.txt`` corpus stored under ``path``.

    ``path`` must contain one sub-directory per score ('0', '25', '50', '75',
    '100'). Every file whose name ends with ``.txt`` is read as UTF-8 and
    labelled with the class index of its directory (0-4).

    Args:
        path: Root directory holding the five score sub-directories.

    Returns:
        Tuple ``(X, Y)`` of NumPy arrays: ``X`` holds the raw file contents,
        ``Y`` the integer class index of each document, in the same order.

    Raises:
        FileNotFoundError: If one of the score sub-directories is missing.
    """
    label_map = {'0': 0, '25': 1, '50': 2, '75': 3, '100': 4}
    X = []
    Y = []
    for label, class_index in label_map.items():
        label_path = os.path.join(path, label)
        # os.listdir() returns entries in arbitrary, platform-dependent order;
        # sorting keeps the sample order (and therefore any seeded split that
        # follows) reproducible across machines.
        for filename in sorted(os.listdir(label_path)):
            if not filename.endswith('.txt'):
                continue
            with open(os.path.join(label_path, filename), 'r', encoding='utf-8') as file:
                X.append(file.read())
            Y.append(class_index)
    return np.array(X), np.array(Y)

# Corpus location. The original local path stays the default; set the
# LIKEABILITY_DATA_DIR environment variable to run the notebook on another
# machine without editing this cell.
directory_path = os.environ.get(
    'LIKEABILITY_DATA_DIR',
    r'C:\Users\MyoengHo Shin\Desktop\likeability_son',
)

# Read every labelled document into memory.
X, Y = loadfile(directory_path)

In [7]:
# Sanity check: both arrays should hold one entry per document.
for caption, array in (("X shape:", X), ("Y shape:", Y)):
    print(caption, array.shape)

X shape: (500,)
Y shape: (500,)


In [8]:
## 파일 개수 확인

def count_txt_files(path):
    """Count the ``.txt`` files in each score sub-directory of ``path``.

    Args:
        path: Root directory holding the '0', '25', '50', '75' and '100'
            sub-directories.

    Returns:
        Dict mapping each label directory name to the number of entries in
        it whose name ends with ``.txt``.
    """
    def _txt_total(label):
        # Number of directory entries for this label that end in '.txt'.
        entries = os.listdir(os.path.join(path, label))
        return sum(1 for entry in entries if entry.endswith('.txt'))

    return {label: _txt_total(label) for label in ('0', '25', '50', '75', '100')}

# Tally the .txt files found in each label directory.
file_counts = count_txt_files(directory_path)

# Report one line per label.
for label in file_counts:
    print(f"Number of .txt files in label {label}: {file_counts[label]}")

Number of .txt files in label 0: 100
Number of .txt files in label 25: 100
Number of .txt files in label 50: 100
Number of .txt files in label 75: 100
Number of .txt files in label 100: 100


## 3. 데이터 전처리

In [9]:
# 텍스트 전처리
def clean_text(text):
    """Normalise one raw document before tokenisation.

    Steps, in order:
      1. Replace every character that is not a Hangul syllable, an ASCII
         letter, a digit, or one of the kept punctuation marks (parentheses,
         comma, '!', '?', apostrophe, backtick) with a space.
      2. Lower-case the text.
      3. Split English contraction suffixes off their stem
         ("it's" -> "it 's", "isn't" -> "is n't").
      4. Surround ',', '!', '(', ')' and '?' with spaces so each becomes its
         own token.
      5. Collapse whitespace runs into one space and strip both ends.

    Args:
        text: Raw document string.

    Returns:
        The cleaned, lower-cased string.
    """
    # The Hangul syllable block ends at U+D7A3. The earlier upper bound
    # (U+D79D) silently dropped the last six syllables of the block.
    text = re.sub(r"[^가-힣A-Za-z0-9(),!?'`]", " ", text)
    text = text.lower()

    # Detach contraction suffixes from the preceding word (order preserved).
    for suffix in ("'s", "'ve", "n't", "'re", "'d", "'ll"):
        text = text.replace(suffix, " " + suffix)

    # Pad punctuation with spaces. The replacement must not contain an escaped
    # "(" / ")" / "?": re.sub leaves such escapes alone and would write a
    # literal backslash into the text.
    text = re.sub(r"([,!()?])", r" \1 ", text)

    # ':' never survives the first filter, so no separate colon removal is needed.
    text = re.sub(r"\s{2,}", " ", text)
    return text.strip()


In [10]:
#이모지 사용

# def count_emojis(text):
#     emoji_pattern = re.compile('['
#         u'\U0001F600-\U0001F64F'  # emoticons
#         u'\U0001F300-\U0001F5FF'  # symbols & pictographs
#         ']', flags=re.UNICODE)
#     return len(emoji_pattern.findall(text))


In [11]:
#데이터 로드 및 전처리
import re

# Reload the raw corpus so this cell gives the same result however many times
# it is run, then normalise every document. `directory_path` is the value set
# in the loading cell; it is reused here rather than hard-coded a second time
# so the corpus location lives in one place.
X, Y = loadfile(directory_path)
X = [clean_text(text) for text in X]

In [12]:
# 데이터 분할
from sklearn.model_selection import train_test_split

# Hold out 20% of the documents for testing. `stratify=Y` keeps the five
# classes equally represented in both partitions; a purely random split of
# this small corpus can leave the held-out set unbalanced across classes.
X_train, X_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.2, random_state=42, stratify=Y
)


## 4. 토크나이징, 임베딩, 시퀀싱

In [13]:
# 토큰화 및 시퀀스 패딩
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Keras word-index pipeline. The BERT cells further down build their own
# inputs from the raw strings and do not read these padded sequences; only
# `max_sequence_length` is shared with them.
max_sequence_length = 100  # length every document is padded/truncated to

# Named `keras_tokenizer` so it cannot be confused with the BERT `tokenizer`
# that is created later in the notebook under the plain name.
keras_tokenizer = Tokenizer(num_words=10000)
keras_tokenizer.fit_on_texts(X_train)  # vocabulary comes from the training split only
X_train_seq = keras_tokenizer.texts_to_sequences(X_train)
X_test_seq = keras_tokenizer.texts_to_sequences(X_test)
X_train_padded = pad_sequences(X_train_seq, maxlen=max_sequence_length)
X_test_padded = pad_sequences(X_test_seq, maxlen=max_sequence_length)

In [10]:
#pip install tensorflow
#pip install --upgrade transformers
#pip uninstall tokenizers
#pip install tokenizers==0.13.0

In [9]:
# import tensorflow as tf
# print(tf.__version__)

In [15]:
import tensorflow as tf
from transformers import BertTokenizer, TFBertModel

# Load the pretrained BERT tokenizer and encoder.
# NOTE(review): 'bert-base-uncased' is an English checkpoint, while
# clean_text() deliberately keeps Hangul, so the corpus appears to be Korean;
# a multilingual or Korean checkpoint may fit better — confirm before changing.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
# Kept under its own name so the encoder is not shadowed by the classifier
# that is assigned to `model` below.
bert_model = TFBertModel.from_pretrained('bert-base-uncased')

# Symbolic inputs: token ids and the matching attention mask.
input_ids = tf.keras.layers.Input(shape=(max_sequence_length,), dtype=tf.int32, name='input_ids')
attention_mask = tf.keras.layers.Input(shape=(max_sequence_length,), dtype=tf.int32, name='attention_mask')

# Run the encoder and take its pooled sequence-level output.
outputs = bert_model(input_ids, attention_mask=attention_mask)
pooled_output = outputs.pooler_output

# 5-way softmax head, one unit per label class (0-4).
predictions = tf.keras.layers.Dense(units=5, activation='softmax')(pooled_output)

# Assemble and compile the classifier. Fine-tuning a pretrained transformer
# needs a small learning rate; Adam's default of 1e-3 is far too large here
# and typically wipes out the pretrained weights.
model = tf.keras.Model(inputs=[input_ids, attention_mask], outputs=predictions)
model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=2e-5),
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)


  from .autonotebook import tqdm as notebook_tqdm
tokenizer.json: 100%|██████████| 466k/466k [00:00<00:00, 12.3MB/s]
To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
model.safetensors: 100%|██████████| 440M/440M [00:37<00:00, 11.6MB/s] 





Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertModel: ['cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing TFBertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFBertModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions 




In [None]:
# import tensorflow as tf
# print("GPU Available: ", tf.test.is_gpu_available())

In [16]:
# 데이터 전처리 함수
def preprocess_data(tokenizer, texts, max_len):
    """Encode ``texts`` into fixed-length model inputs.

    Args:
        tokenizer: Object exposing ``batch_encode_plus`` (the BERT tokenizer
            in this notebook).
        texts: Batch of strings to encode.
        max_len: Length every sequence is padded or truncated to.

    Returns:
        Tuple ``(input_ids, attention_mask)`` of NumPy arrays built from the
        ``'input_ids'`` and ``'attention_mask'`` entries of the encoding.
    """
    encoding_options = {
        'max_length': max_len,
        'padding': 'max_length',
        'truncation': True,
    }
    encoded = tokenizer.batch_encode_plus(texts, **encoding_options)
    ids = np.array(encoded['input_ids'])
    mask = np.array(encoded['attention_mask'])
    return ids, mask

# Encode both splits into BERT inputs (token ids + attention masks).
X_train_ids, X_train_attention = preprocess_data(tokenizer, X_train, max_sequence_length)
X_test_ids, X_test_attention = preprocess_data(tokenizer, X_test, max_sequence_length)

# Stop once the validation loss stops improving and roll back to the best
# weights, so the 100-epoch budget is an upper bound rather than something
# that has to be interrupted by hand.
early_stopping = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss',
    patience=5,
    restore_best_weights=True,
)

# Fine-tune the classifier; 20% of the training arrays are held out by Keras
# for validation.
history = model.fit(
    [X_train_ids, X_train_attention],
    y_train,
    epochs=100,
    batch_size=8,
    validation_split=0.2,
    callbacks=[early_stopping],
)

Epoch 1/100


Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100

KeyboardInterrupt: 

In [None]:
# Score the trained classifier on the held-out test split.
test_inputs = [X_test_ids, X_test_attention]
test_loss, test_accuracy = model.evaluate(test_inputs, y_test)
print(f"Test Loss: {test_loss}, Test Accuracy: {test_accuracy}")