### 推特留言情緒分析：比較 Word2Vec 和 BERT 的表現

資料集名稱：Sentiment140 dataset with 1.6 million tweets

資料集來源：https://www.kaggle.com/datasets/kazanova/sentiment140/data

In [None]:
from google.colab import drive
import pandas as pd
from sklearn.model_selection import train_test_split
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import string, nltk, re
from gensim.models import Word2Vec
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report
from transformers import BertTokenizer, TFBertForSequenceClassification
import tensorflow as tf
import tensorflow_hub as hub
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
# Mount Google Drive at /content/drive, requesting a forced remount.
drive.mount('/content/drive', force_remount=True)

# Download the NLTK resources this notebook asks for, one call per resource.
for nltk_resource in ('punkt', 'stopwords', 'wordnet'):
    nltk.download(nltk_resource)

# English stop words held in a set for fast membership tests.
stop_words = set(stopwords.words('english'))

In [None]:
# The CSV must first be saved to Google Drive; then update the path below.
# Only column 0 (sentiment label) and column 5 (tweet text) are used, so the
# other columns are never loaded into memory.
df = pd.read_csv(
    'training.1600000.processed.noemoticon.csv',
    encoding='latin-1',
    header=None,
    usecols=[0, 5],
)

# Put the text first and give both columns readable names. ``rename`` returns
# a fresh DataFrame, so the assignment below does not write into a slice of
# the frame that was read from disk.
df = df[[5, 0]].rename(columns={5: 'tweet', 0: 'Sentiment'})

# Map label 4 to 1 so the labels become 0/1.
# NOTE(review): presumably 0 = negative and 4 = positive in this dataset;
# confirm against the dataset description.
df['Sentiment'] = df['Sentiment'].replace(4, 1)

In [None]:
# Translation table that deletes every character of string.punctuation in a
# single pass; built once instead of re-scanning the punctuation per character.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)


def preprocess(text):
    """Normalize one tweet into a space-joined string of cleaned tokens.

    Steps, in order: lowercase the text, delete every ``string.punctuation``
    character, tokenize with ``word_tokenize``, drop tokens found in
    ``stop_words``, and lemmatize each remaining token with ``pos='a'``.

    Args:
        text: The tweet text as a ``str``.

    Returns:
        The processed tokens joined by single spaces (an empty string when no
        token survives).
    """
    text = text.lower().translate(_PUNCTUATION_TABLE)
    lemmatizer = WordNetLemmatizer()
    # NOTE(review): punctuation is stripped before stop-word filtering, so a
    # contraction such as "don't" becomes "dont" and will only be removed if
    # that exact form is in stop_words -- confirm this is intended.
    # NOTE(review): pos='a' lemmatizes every token as an adjective; nouns and
    # verbs are presumably left unchanged -- confirm this is intended.
    return ' '.join(
        lemmatizer.lemmatize(token, pos='a')
        for token in word_tokenize(text)
        if token not in stop_words
    )

# Overwrite the tweet column with its preprocessed form; preprocess is called once per row.
df["tweet"] = df["tweet"].apply(preprocess)

### Word2Vec ( Using Gensim Package )

In [None]:
# Hold out 30% of the tweets for testing. No random_state is passed, so the
# split is different on every run.
X_train, X_test, y_train, y_test = train_test_split(
    df["tweet"],
    df["Sentiment"],
    test_size=0.3,
)

# Word2Vec is trained on the training split only: one whitespace-split token
# list per tweet.
sentences = list(map(str.split, X_train))
w2v_model = Word2Vec(
    sentences=sentences,
    vector_size=100,
    window=5,
    min_count=5,
    workers=4,
)

In [None]:
def vectorize(sentence, model=None):
    """Return the mean word vector of a whitespace-separated sentence.

    Words that are not in the model's vocabulary are skipped. When no word of
    the sentence is known (including an empty sentence), a zero vector is
    returned whose length is taken from the model instead of being hard-coded,
    so it always matches the length of the real embeddings.

    Args:
        sentence: Text whose tokens are separated by whitespace.
        model: Object exposing a ``wv`` attribute that supports ``in``,
            ``[]`` lookup and ``vector_size``. Defaults to the module-level
            ``w2v_model``.

    Returns:
        A 1-D NumPy array of length ``model.wv.vector_size``.
    """
    keyed_vectors = (w2v_model if model is None else model).wv
    known_vectors = [
        keyed_vectors[word] for word in sentence.split() if word in keyed_vectors
    ]
    if not known_vectors:
        # No known word: fall back to an all-zero embedding of the right size.
        return np.zeros(keyed_vectors.vector_size)
    return np.array(known_vectors).mean(axis=0)

# Replace the sentences of each split with their averaged embeddings, one row
# per tweet. Note that X_train / X_test are rebound from text to arrays here.
X_train = np.array(list(map(vectorize, X_train)))
X_test = np.array(list(map(vectorize, X_test)))

In [None]:
# Fit a logistic-regression classifier (default hyper-parameters) on the
# averaged Word2Vec features. ``history`` holds whatever ``fit`` returns.
clf = LogisticRegression()
history = clf.fit(X=X_train, y=y_train)

# Score the held-out split and print per-class precision / recall / F1.
prediction = clf.predict(X_test)
report = classification_report(y_true=y_test, y_pred=prediction)
print(report)

### BERT ( Using Transformers )

In [None]:
# Tokenizer and classifier are loaded from the same pretrained checkpoint;
# the classifier is configured with a single output label.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = TFBertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels=1,
)

# Tokenize every tweet in one call, with padding and truncation enabled, and
# get TensorFlow tensors back. df['tweet'] already holds the preprocessed text.
encoded_data = tokenizer(
    df['tweet'].tolist(),
    padding=True,
    truncation=True,
    return_tensors='tf',
)

# Only the token ids and the sentiment labels are kept, as NumPy arrays.
input_ids = np.array(encoded_data['input_ids'])
labels = np.array(df['Sentiment'])

In [None]:
def _make_dataset(ids, targets, batch_size=32):
    """Build a batched dataset of ({'input_ids', 'attention_mask'}, targets).

    The attention mask is rebuilt per batch from the token ids: it is 1 where
    the id differs from the tokenizer's pad token id and 0 on padding.
    Without it the model would attend to padding positions as if they were
    real tokens.
    """
    pad_id = tokenizer.pad_token_id

    def _add_attention_mask(batch_ids, batch_targets):
        mask = tf.cast(tf.not_equal(batch_ids, pad_id), batch_ids.dtype)
        return {'input_ids': batch_ids, 'attention_mask': mask}, batch_targets

    return (
        tf.data.Dataset.from_tensor_slices((ids, targets))
        .batch(batch_size)
        .map(_add_attention_mask)
    )


# First split: 30% of all rows become the validation set.
X_train, X_valid, y_train, y_valid = train_test_split(input_ids, labels, test_size=0.3)
# Second split: the remaining 70% is halved into train and test (35% each).
X_train, X_test, y_train, y_test = train_test_split(X_train, y_train, test_size=0.5)

train_dataset = _make_dataset(X_train, y_train)
test_dataset = _make_dataset(X_test, y_test)
valid_dataset = _make_dataset(X_valid, y_valid)

In [None]:
optimizer = tf.keras.optimizers.Adam(learning_rate=1e-5)
# The model was created with num_labels=1, so it emits one raw logit per
# tweet; from_logits=True makes the loss apply the sigmoid itself.
loss = tf.keras.losses.BinaryCrossentropy(from_logits=True)
# The predictions are logits, not probabilities, so the decision boundary is
# 0.0 (sigmoid(0) == 0.5). The plain 'accuracy' string would threshold the
# logits at 0.5 and misreport accuracy. The metric keeps the name 'accuracy'
# so the training logs use the same key as before.
accuracy = tf.keras.metrics.BinaryAccuracy(name='accuracy', threshold=0.0)
model.compile(optimizer=optimizer, loss=loss, metrics=[accuracy])
# Fine-tune for a single epoch, validating on valid_dataset after the epoch.
model.fit(train_dataset, epochs=1, validation_data=valid_dataset)

In [None]:
# Evaluate the fine-tuned model on the test batches using the compiled loss and metrics.
model.evaluate(test_dataset)