In [3]:
import nltk # 텍스트 데이터를 처리
import numpy as np # 말뭉치를 배열로 표현
import random
import operator
import re

from sklearn.metrics.pairwise import cosine_similarity # 이를 나중에 사용하여 두 개의 문장이 얼마나 비슷한지를 결정합니다.
from sklearn.feature_extraction.text import TfidfVectorizer # Experience 2에서 단어 가방을 만드는 함수를 만들었던 것을 기억하십니까? 이 함수는 같은 일을 합니다!
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from tensorflow.keras.regularizers import l1_l2

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences


from datetime import datetime
import matplotlib.pyplot as plt

from google.colab import drive

## 2. 라벨링

In [4]:
import os
import numpy as np

drive.mount('/content/drive')


def loadfile(path):
    X = []
    Y = []
    label_map = {'0': 0, '25': 1, '50': 2, '75': 3, '100': 4}
    for label in label_map.keys():
        label_path = os.path.join(path, label)
        for filename in os.listdir(label_path):
            if filename.endswith('.txt'):
                with open(os.path.join(label_path, filename), 'r', encoding='utf-8') as file:
                    text = file.read()
                Y.append(label_map[label])
                X.append(text)
    return np.array(X), np.array(Y)

# 경로를 구글 드라이브 경로로 변경
directory_path = '/content/drive/MyDrive/likeability_Son/'

# loadfile 함수 호출
X, Y = loadfile(directory_path)

Mounted at /content/drive


In [None]:
# X,Y 차원 확인
print("X shape:", X.shape)
print("Y shape:", Y.shape)

X shape: (500,)
Y shape: (500,)


In [5]:
## 파일 갯수 확인

def count_txt_files(path):
    file_counts = {}

    for label in ('0', '25', '50', '75', '100'):
        label_path = os.path.join(path, label)
        txt_files = [filename for filename in os.listdir(label_path) if filename.endswith('.txt')]
        file_counts[label] = len(txt_files)

    return file_counts

# .txt 파일 수 확인
file_counts = count_txt_files(directory_path)

# 결과 출력
for label, count in file_counts.items():
    print(f"Number of .txt files in label {label}: {count}")

Number of .txt files in label 0: 100
Number of .txt files in label 25: 100
Number of .txt files in label 50: 100
Number of .txt files in label 75: 100
Number of .txt files in label 100: 100


## 3. 데이터 전처리

In [6]:
# 텍스트 전처리
def clean_text(text):
    text = re.sub(r"[^가-힝A-Za-z0-9(),!?\'\`]", " ", text)
    text = text.lower()
    text = re.sub(r"\'s", " \'s", text)
    text = re.sub(r"\'ve", " \'ve", text)
    text = re.sub(r"n\'t", " n\'t", text)
    text = re.sub(r"\'re", " \'re", text)
    text = re.sub(r"\'d", " \'d", text)
    text = re.sub(r"\'ll", " \'ll", text)
    text = re.sub(r",", " , ", text)
    text = re.sub(r"!", " ! ", text)
    text = re.sub(r"\(", " \( ", text)
    text = re.sub(r"\)", " \) ", text)
    text = re.sub(r"\?", " \? ", text)
    text = re.sub(r":", "", text)
    text = re.sub(r"\s{2,}", " ", text)
    return text.strip()


In [None]:
#이모지 사용

# def count_emojis(text):
#     emoji_pattern = re.compile('['
#         u'\U0001F600-\U0001F64F'  # emoticons
#         u'\U0001F300-\U0001F5FF'  # symbols & pictographs
#         ']', flags=re.UNICODE)
#     return len(emoji_pattern.findall(text))


In [7]:
#데이터 로드 및 전처리
import re

directory_path = '/content/drive/MyDrive/likeability_Son/'
X, Y = loadfile(directory_path)
X = [clean_text(text) for text in X]

In [8]:
# 데이터 분할
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=42)


## 4.토크나이징, 임베딩, 시퀀싱

In [9]:
# 토큰화 및 시퀀스 패딩
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

tokenizer = Tokenizer(num_words=10000)
tokenizer.fit_on_texts(X_train)
X_train_seq = tokenizer.texts_to_sequences(X_train)
X_test_seq = tokenizer.texts_to_sequences(X_test)
max_sequence_length = 100
X_train_padded = pad_sequences(X_train_seq, maxlen=max_sequence_length)
X_test_padded = pad_sequences(X_test_seq, maxlen=max_sequence_length)

In [10]:
from transformers import BertTokenizer

# BERT 토크나이저 초기화
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')

# 예시 텍스트
texts = ["안녕하세요", "BERT 모델을 사용합니다"]

# 토큰 ID, 어텐션 마스크, 토큰 타입 ID 변환
encoded_inputs = tokenizer(texts, padding=True, truncation=True, return_tensors="np")

input_ids = encoded_inputs['input_ids']
attention_masks = encoded_inputs['attention_mask']
token_type_ids = encoded_inputs['token_type_ids']  # 필요한 경우

tokenizer_config.json:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/996k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.96M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/625 [00:00<?, ?B/s]

In [None]:
#pip install tensorflow
#pip install --upgrade transformers
#pip uninstall tokenizers
#pip install tokenizers==0.13.0

In [None]:
# import tensorflow as tf
# print(tf.__version__)

In [11]:
import tensorflow as tf
from transformers import BertTokenizer, TFBertModel

# BERT 토크나이저와 모델 로드
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = TFBertModel.from_pretrained('bert-base-uncased')

# 입력 특성 정의
input_ids = tf.keras.layers.Input(shape=(max_sequence_length,), dtype=tf.int32, name='input_ids')
attention_mask = tf.keras.layers.Input(shape=(max_sequence_length,), dtype=tf.int32, name='attention_mask')

# BERT 모델에 전달
outputs = model(input_ids, attention_mask=attention_mask)

# 출력 레이어 추가
pooled_output = outputs.pooler_output
predictions = tf.keras.layers.Dense(units=5, activation='softmax')(pooled_output)

# 모델 컴파일
model = tf.keras.Model(inputs=[input_ids, attention_mask], outputs=predictions)
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])


tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertModel: ['cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing TFBertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFBertModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions w

In [None]:
# import tensorflow as tf
# print("GPU Available: ", tf.test.is_gpu_available())

In [12]:
# 데이터 전처리 함수
def preprocess_data(tokenizer, texts, max_len):
    tokenized = tokenizer.batch_encode_plus(texts, max_length=max_len, padding='max_length', truncation=True)
    return np.array(tokenized['input_ids']), np.array(tokenized['attention_mask'])

# 전처리된 데이터 생성
X_train_ids, X_train_attention = preprocess_data(tokenizer, X_train, max_sequence_length)
X_test_ids, X_test_attention = preprocess_data(tokenizer, X_test, max_sequence_length)

# 모델 학습
history = model.fit(
    [X_train_ids, X_train_attention],
    y_train,
    epochs=20,
    batch_size=8,
    validation_split=0.2
)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [13]:
# 평가 (테스트 및 검증 데이터셋)
test_loss, test_accuracy = model.evaluate(X_test, y_test)
print(f"Test Loss: {test_loss}, Test Accuracy: {test_accuracy}")

val_loss, val_accuracy = model.evaluate(X_val, y_val)
print(f"Validation Loss: {val_loss}, Validation Accuracy: {val_accuracy}")

ValueError: ignored

In [None]:
import matplotlib.pyplot as plt

# 훈련 및 검증 데이터에 대한 정확도와 손실 추출
acc = history.history['accuracy']
val_acc = history.history['val_accuracy']
loss = history.history['loss']
val_loss = history.history['val_loss']

epochs = range(1, len(acc) + 1)

# 정확도 그래프
plt.figure(figsize=(12, 4))
plt.subplot(1, 2, 1)
plt.plot(epochs, acc, 'bo', label='Training acc')
plt.plot(epochs, val_acc, 'b', label='Validation acc')
plt.title('Training and Validation Accuracy')
plt.xlabel('Epochs')
plt.ylabel('Accuracy')
plt.legend()

# 손실 그래프
plt.subplot(1, 2, 2)
plt.plot(epochs, loss, 'bo', label='Training loss')
plt.plot(epochs, val_loss, 'b', label='Validation loss')
plt.title('Training and Validation Loss')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.legend()

plt.show()


In [None]:
from sklearn.metrics import confusion_matrix, classification_report
import matplotlib.pyplot as plt
import seaborn as sns

# 모델 예측
y_pred = model.predict(X_test)
y_pred_classes = np.argmax(y_pred, axis=1)

# 혼동 행렬 생성
cm = confusion_matrix(y_test, y_pred_classes)

# 혼동 행렬 시각화
plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
plt.xlabel('Predicted')
plt.ylabel('True')
plt.title('Confusion Matrix')
plt.show()

# 분류 보고서 출력 (Precision, Recall, F1-Score)
print(classification_report(y_test, y_pred_classes))

