# 1.1 Load data

### 將所有欄位列入分析列

In [1]:
import json
import pandas as pd

# 存儲提取數據的列表
tweets_data = []

# 讀取 JSON 文件
with open('dm-2024-isa-5810-lab-2-homework/tweets_DM.json', 'r') as file:
    for line in file:
        try:
            # 解析 JSON 每行
            tweet = json.loads(line)
            # 提取所需的欄位
            tweet_id = tweet["_source"]["tweet"].get("tweet_id")
            text = tweet["_source"]["tweet"].get("text")
            hashtags = tweet["_source"]["tweet"].get("hashtags", [])
            crawldate = tweet.get("_crawldate")

            # 將提取的欄位存入字典，並添加到列表中
            tweets_data.append({
                "tweet_id": tweet_id,
                "text": text,
                "hashtags": hashtags,
                "crawldate": crawldate
            })
        except json.JSONDecodeError as e:
            print("JSON decode error:", e)

# 將提取的數據轉換為 DataFrame
df = pd.DataFrame(tweets_data)

### 簡化first entry的處理方式

In [2]:
import pandas as pd

# 將提取的 tweets_data 轉換為 DataFrame
df = pd.DataFrame(tweets_data)

# 加載其他數據集
classify = pd.read_csv('dm-2024-isa-5810-lab-2-homework/data_identification.csv')
emotion = pd.read_csv('dm-2024-isa-5810-lab-2-homework/emotion.csv')

# 合併 data_identification 和 emotion 到 df 中
data = df.merge(classify, on='tweet_id', how='outer').merge(emotion, on='tweet_id', how='outer')

# 根據 'identification' 欄位分割訓練和測試集
train_data = data[data['identification'] == 'train']
test_data = data[data['identification'] == 'test']

# 提取所有唯一的情感標籤
emotions = train_data['emotion'].unique()

# 使用字典存儲按情感劃分的訓練集子 DataFrame
train_data_by_emotion = {emotion: train_data[train_data['emotion'] == emotion] for emotion in emotions}

# chceck the shape of testing data
test_data

Unnamed: 0,tweet_id,text,hashtags,crawldate,identification,emotion
2,0x28b412,"Confident of your obedience, I write to you, k...",[bibleverse],2017-12-25 04:39:20,test,
4,0x2de201,"""Trust is not the same as faith. A friend is s...",[],2016-01-08 17:18:59,test,
9,0x218443,When do you have enough ? When are you satisfi...,"[materialism, money, possessions]",2015-09-09 09:22:55,test,
30,0x2939d5,"God woke you up, now chase the day #GodsPlan #...","[GodsPlan, GodsWork]",2015-10-10 14:33:26,test,
33,0x26289a,"In these tough times, who do YOU turn to as yo...",[],2016-10-23 08:49:50,test,
...,...,...,...,...,...,...
1867525,0x2913b4,"""For this is the message that ye heard from th...",[],2016-12-10 18:01:00,test,
1867529,0x2a980e,"""There is a lad here, which hath five barley l...",[],2015-01-04 14:40:55,test,
1867530,0x316b80,When you buy the last 2 tickets remaining for ...,"[mixedfeeling, butimTHATperson]",2015-05-12 12:51:52,test,
1867531,0x29d0cb,I swear all this hard work gone pay off one da...,[],2017-10-02 17:54:04,test,


### 將有hashtags的推文另外處理，將所有hashtags合併成字串加入欄位

In [4]:
# 篩選包含 hashtags 的推文
import pandas as pd
from collections import Counter
# 將含有 hashtags 的資料篩選出來
train_with_hashtags = train_data[train_data['hashtags'].notnull() & (train_data['hashtags'] != "")]
train_without_hashtags = train_data[train_data['hashtags'].isnull() | (train_data['hashtags'] == "")]

test_with_hashtags = test_data[test_data['hashtags'].notnull() & (test_data['hashtags'] != "")]
test_without_hashtags = test_data[test_data['hashtags'].isnull() | (test_data['hashtags'] == "")]

test_with_hashtags
#train_with_hashtags

# Here, the hashtags column is already tokenized

Unnamed: 0,tweet_id,text,hashtags,crawldate,identification,emotion
2,0x28b412,"Confident of your obedience, I write to you, k...",[bibleverse],2017-12-25 04:39:20,test,
4,0x2de201,"""Trust is not the same as faith. A friend is s...",[],2016-01-08 17:18:59,test,
9,0x218443,When do you have enough ? When are you satisfi...,"[materialism, money, possessions]",2015-09-09 09:22:55,test,
30,0x2939d5,"God woke you up, now chase the day #GodsPlan #...","[GodsPlan, GodsWork]",2015-10-10 14:33:26,test,
33,0x26289a,"In these tough times, who do YOU turn to as yo...",[],2016-10-23 08:49:50,test,
...,...,...,...,...,...,...
1867525,0x2913b4,"""For this is the message that ye heard from th...",[],2016-12-10 18:01:00,test,
1867529,0x2a980e,"""There is a lad here, which hath five barley l...",[],2015-01-04 14:40:55,test,
1867530,0x316b80,When you buy the last 2 tickets remaining for ...,"[mixedfeeling, butimTHATperson]",2015-05-12 12:51:52,test,
1867531,0x29d0cb,I swear all this hard work gone pay off one da...,[],2017-10-02 17:54:04,test,


---
# 1.2 Save data  

In [5]:
import pandas as pd
import os

# 指定輸出資料夾
output_folder = 'output_2'

# 檢查資料夾是否存在，如果不存在則創建
os.makedirs(output_folder, exist_ok=True)

# 指定輸出檔案的完整路徑
train_data_path = os.path.join(output_folder, "train_data.pkl")
test_data_path = os.path.join(output_folder, "test_data.pkl")
train_hashtags_path = os.path.join(output_folder, "train_data_with_hashtags.pkl")
test_hashtags_path = os.path.join(output_folder, "test_data_with_hashtags.pkl")

# 將 DataFrame 儲存到指定路徑的 pickle 文件
train_data.to_pickle(train_data_path)
test_data.to_pickle(test_data_path)
train_with_hashtags.to_pickle(train_hashtags_path)
test_with_hashtags.to_pickle(test_hashtags_path)

# 載入保存的 pickle 文件進行驗證
train_data = pd.read_pickle(train_data_path)
test_data = pd.read_pickle(test_data_path)

train_hashtags = pd.read_pickle(train_hashtags_path)
test_hashtags = pd.read_pickle(test_hashtags_path)
train_hashtags


Unnamed: 0,tweet_id,text,hashtags,crawldate,identification,emotion
0,0x376b20,"People who post ""add me on #Snapchat"" must be ...",[Snapchat],2015-05-23 11:42:47,train,anticipation
1,0x2d5350,"@brianklaas As we see, Trump is dangerous to #...","[freepress, TrumpLegacy, CNN]",2016-01-28 04:52:09,train,sadness
3,0x1cd5b0,Now ISSA is stalking Tasha 😂😂😂 <LH>,[],2016-01-24 23:53:05,train,fear
5,0x1d755c,@RISKshow @TheKevinAllison Thx for the BEST TI...,"[authentic, LaughOutLoud]",2015-06-11 04:44:05,train,joy
6,0x2c91a8,Still waiting on those supplies Liscus. <LH>,[],2015-08-18 02:30:07,train,anticipation
...,...,...,...,...,...,...
1867526,0x321566,I'm SO HAPPY!!! #NoWonder the name of this sho...,"[NoWonder, Happy]",2016-12-26 02:44:07,train,joy
1867527,0x38959e,In every circumtance I'd like to be thankful t...,[],2015-04-01 08:14:56,train,joy
1867528,0x2cbca6,there's currently two girls walking around the...,[blessyou],2016-11-17 23:46:22,train,joy
1867533,0x24faed,"Ah, corporate life, where you can date <LH> us...",[],2016-09-02 14:25:06,train,joy


---
# 1.3 Exploratory data analysis (EDA)

### explore data with hashtags
推測hashtag欄位應該也有很多特殊符號（類似emoji），因此決定使用遍歷每個推文，尋找是否有含表情符號之標記(hashtags)  
**經過查證，標記欄位並未包含任何表情符號emoji**

In [5]:
import pandas as pd
import re

# 表情符號的正則表達式模式
emoji_pattern = re.compile("[\U0001F600-\U0001F64F"  # 表情符號
                           "\U0001F300-\U0001F5FF"  # 符號和圖形
                           "\U0001F680-\U0001F6FF"  # 運輸和地圖符號
                           "\U0001F1E0-\U0001F1FF"  # 國旗
                           "\U00002700-\U000027BF"  # 其他符號
                           "\U00002600-\U000026FF"  # 其他圖形符號
                           "]+", flags=re.UNICODE)

# 載入 train_hashtags.pkl 文件
train_with_hashtags = pd.read_pickle("output_2/train_data_with_hashtags.pkl")

# 檢查 hashtags 欄位是否包含表情符號，並篩選含有表情符號的行
train_with_hashtags['contains_emoji'] = train_with_hashtags['hashtags'].astype(str).apply(
    lambda x: bool(emoji_pattern.search(x)) if pd.notnull(x) else False
)

# 篩選含有表情符號的部分
emoji_rows = train_with_hashtags[train_with_hashtags['contains_emoji']]

# 查看結果
print(emoji_rows)

Empty DataFrame
Columns: [tweet_id, text, hashtags, crawldate, identification, emotion, contains_emoji]
Index: []


## 2. Feature engineering
    Vectorizer

#### 預處理，因為單一推文可能有多個hashtags 
***list to string***

In [6]:
# 將每個 hashtags 列表轉為單一字符串
#train_hashtags['hashtags'] = train_hashtags['hashtags'].apply(lambda x: ' '.join(x) if isinstance(x, list) else x)
#train_hashtags['hashtags']

### TF-IDF

#### for hashtags
評估標記是否有對應特定情緒


### Word2Vec

In [None]:
from gensim.models import Word2Vec

## setting
vector_dim = 1455563
window_size = 5
min_count = 1
training_epochs = 20

## model
word2vec_model = Word2Vec(sentences=training_corpus, 
                          vector_size=vector_dim, window=window_size, 
                          min_count=min_count, epochs=training_epochs)

# get the corresponding vector of a word
word_vec = word2vec_model.wv['emotion']
word_vec.shape

In [None]:
# 如果 X_train 是一維數組，請重新調整為二維
if X_train.ndim == 1:
    X_train = X_train.reshape(-1, 1)

In [None]:
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

# 假設 word_vec 是 hashtags 的向量表示，train_hashtags['emotion'] 是情緒標籤
X_train = np.array(word_vec)  # 將 word_vec 轉換為數組

# 如果 X_train 是一維數組，請重新調整為二維
if X_train.ndim == 1:
    X_train = X_train.reshape(-1, 1)

y_train = np.array(train_hashtags['emotion'])  # 將情緒標籤轉換為數組
print("X_train shape:", X_train.shape)
print("y_train shape:", y_train.shape)


X_train shape: (100, 1)
y_train shape: (1455563,)


#### prepare training corpus

In [None]:
import gensim
from gensim.models import Word2Vec
import numpy as np
import pandas as pd

# 假設 train_hashtags['hashtags'] 包含列表中的列表
# 確保 hashtags 欄位是列表
train_hashtags['token_hashtags'] = train_hashtags['hashtags'].apply(lambda x: nltk.word_tokenize(x))
#train_hashtags[['tweet_id','hashtags','token_hashtags']]
## create the training corpus
training_corpus = train_hashtags['token_hashtags'].values
#training_corpus.shape


(1455563,)

---
# 3.Model

In [None]:
## ignore warnings
import warnings
warnings.filterwarnings('ignore')

### BERT

### LogisticRegression

In [None]:
# 初始化並訓練 Logistic Regression 模型
model = LogisticRegression(max_iter=1000, random_state=42)
model.fit(X_train, y_train)

# 初始化並訓練 Logistic Regression 模型
model = LogisticRegression(max_iter=1000, random_state=42)
model.fit(X_train, y_train)

# 使用測試集進行預測
y_pred = model.predict(X_train)

# 評估模型
accuracy = accuracy_score(y_train, y_pred)
report = classification_report(y_train, y_pred)

print(f"Accuracy: {accuracy}")
print("Classification Report:\n", report)

### Naive Bays

In [None]:
# Naive Bayes for smaller dataset
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

# Step 1: 特徵提取 (BOW)
vectorizer = CountVectorizer(max_features=500)  # 設置最大特徵數為 500
X_train = vectorizer.fit_transform(train_hashtags["hashtags"])
y_train = train_hashtags["emotion"]

train_hashtags


Unnamed: 0,tweet_id,text,hashtags,crawldate,identification,emotion
0,0x376b20,"People who post ""add me on #Snapchat"" must be ...",Snapchat,2015-05-23 11:42:47,train,anticipation
1,0x2d5350,"@brianklaas As we see, Trump is dangerous to #...",freepress TrumpLegacy CNN,2016-01-28 04:52:09,train,sadness
3,0x1cd5b0,Now ISSA is stalking Tasha 😂😂😂 <LH>,,2016-01-24 23:53:05,train,fear
5,0x1d755c,@RISKshow @TheKevinAllison Thx for the BEST TI...,authentic LaughOutLoud,2015-06-11 04:44:05,train,joy
6,0x2c91a8,Still waiting on those supplies Liscus. <LH>,,2015-08-18 02:30:07,train,anticipation
...,...,...,...,...,...,...
1867526,0x321566,I'm SO HAPPY!!! #NoWonder the name of this sho...,NoWonder Happy,2016-12-26 02:44:07,train,joy
1867527,0x38959e,In every circumtance I'd like to be thankful t...,,2015-04-01 08:14:56,train,joy
1867528,0x2cbca6,there's currently two girls walking around the...,blessyou,2016-11-17 23:46:22,train,joy
1867533,0x24faed,"Ah, corporate life, where you can date <LH> us...",,2016-09-02 14:25:06,train,joy


In [None]:
# Step 2: 建立和訓練 Naive Bayes 模型
nb_model = MultinomialNB()
nb_model.fit(X_train, y_train)

# Step 3: 模型預測
y_train_pred = nb_model.predict(X_train)

# Step 4: 模型性能評估

# 準確率
acc_train = accuracy_score(y_train, y_train_pred)

print(f"訓練集準確率: {acc_train:.2f}")

訓練集準確率: 0.41


---
# 4.Evaluate and sort

### (with Hashtags) TF-IDF + LogisticRegression

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# 假設 train_hashtags['hashtags'] 包含列表中的列表
train_hashtags['hashtags'] = train_hashtags['hashtags'].apply(lambda x: ' '.join(x) if isinstance(x, list) else x)
test_hashtags['hashtags'] = test_hashtags['hashtags'].apply(lambda x: ' '.join(x) if isinstance(x, list) else x)

# 初始化 TfidfVectorizer，并限制特征数为 1000
tfidf_vect = TfidfVectorizer(max_features=1000)

# 使用 TF-IDF 向量化器进行训练集和测试集的向量化
train_tfidf = tfidf_vect.fit_transform(train_hashtags["hashtags"])
test_tfidf = tfidf_vect.fit_transform(test_hashtags["hashtags"])

# 获取特征名称列表
feature_names_tf = tfidf_vect.get_feature_names_out()
train_tfidf


<1455563x1000 sparse matrix of type '<class 'numpy.float64'>'
	with 536183 stored elements in Compressed Sparse Row format>

In [8]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

# 假設 X 是 hashtags 的向量表示，y 是情緒標籤
X_train = train_tfidf  # 請確保 word_vec 是一個 2D 數組
y_train = train_hashtags['emotion']  # 假設情緒標籤在此欄位

# 初始化並訓練 Logistic Regression 模型
model = LogisticRegression(max_iter=1000, random_state=42)
model.fit(X_train, y_train)

# 使用訓練集進行預測
y_pred = model.predict(X_train)

# 評估模型
accuracy = accuracy_score(y_train, y_pred)
report = classification_report(y_train, y_pred)

print(f"Accuracy: {accuracy}")
print("Classification Report:\n", report)

Accuracy: 0.4243780585244335
Classification Report:
               precision    recall  f1-score   support

       anger       0.65      0.05      0.10     39867
anticipation       0.68      0.23      0.34    248935
     disgust       0.57      0.12      0.20    139101
        fear       0.65      0.08      0.13     63999
         joy       0.39      0.95      0.55    516017
     sadness       0.61      0.11      0.19    193437
    surprise       0.83      0.08      0.15     48729
       trust       0.68      0.11      0.19    205478

    accuracy                           0.42   1455563
   macro avg       0.63      0.22      0.23   1455563
weighted avg       0.56      0.42      0.34   1455563



In [None]:
# prediction of tf-idf & LogisticRegression
y_pred_tfidfLR = model.predict(test_tfidf)
y_pred_tfidfLR


import pandas as pd
import os
# 確保資料夾存在，若不存在則創建
output_folder = 'output_2'
os.makedirs(output_folder, exist_ok=True)

result_df = pd.DataFrame({
    'id': test_hashtags['tweet_id'],
    'emotion': y_pred_tfidfLR
})

# 將 DataFrame 保存為 Parquet 文件，不包含索引列
file_path = os.path.join(output_folder, "prediction_hashtags.parquet")
result_df.to_parquet("prediction_hashtags.parquet", index=False)

### (all data) Bag of Words + Decision Tree

#### Bag of Words (BOW)
nltk_tokenize

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
import nltk

# build analyzers (bag-of-words)
BOW_500 = CountVectorizer(max_features=500, tokenizer=nltk.word_tokenize) 
train_data_BOW_features_500 = BOW_500.fit_transform(train_data['text'])

## check dimension
train_data_BOW_features_500.shape





(1455563, 500)

#### Decision Tree

In [None]:
from sklearn.tree import DecisionTreeClassifier

# for a classificaiton problem, you need to provide both training & testing data
X_train = BOW_500.transform(train_data['text'])
y_train = train_data['emotion']
X_test = BOW_500.transform(test_data['text'])

print('X_train.shape: ', X_train.shape)
print('y_train.shape: ', y_train.shape)

## build DecisionTree model
DT_model = DecisionTreeClassifier(random_state=1)

## training!
DT_model = DT_model.fit(X_train, y_train)

## predict!
y_train_pred = DT_model.predict(X_train)
## accuracy
from sklearn.metrics import accuracy_score

acc_train = accuracy_score(y_true=y_train, y_pred=y_train_pred)
print('training accuracy: {}'.format(round(acc_train, 2)))

X_train.shape:  (1455563, 500)
y_train.shape:  (1455563,)
training accuracy: 0.97


In [21]:
# prediction of BOW & DecisionTree
y_pred_BOWDT = DT_model.predict(X_test)

In [23]:
# output to parquet
import pandas as pd
import os
# 確保資料夾存在，若不存在則創建
output_folder = 'output_2'
os.makedirs(output_folder, exist_ok=True)

result_df = pd.DataFrame({
    'id': test_hashtags['tweet_id'],
    'emotion': y_pred_BOWDT
})

# 將 DataFrame 保存為 Parquet 文件，不包含索引列
file_path = os.path.join(output_folder, "prediction_all.parquet")
result_df.to_parquet("prediction_all.parquet", index=False)

---
# 5.combined & output

In [None]:
import pandas as pd

# 匯入兩個預測結果
all_data_predictions = pd.read_parquet('prediction_all.parquet')  # 包含所有資料的預測
hashtag_data_predictions = pd.read_parquet('prediction_hashtags.parquet')  # 包含有 hashtags 的預測

# 合併兩者，假設兩者都基於相同的 `id`
combined_predictions = all_data_predictions.copy()


# 使用有 hashtags 的數據部分覆蓋所有數據中的對應部分
# 假設兩個 DataFrame 中的索引一致
combined_predictions.update(hashtag_data_predictions)

print(combined_predictions.head())






# 指定輸出資料夾
output_folder = 'output_2'
# 檢查資料夾是否存在，如果不存在則創建
os.makedirs(output_folder, exist_ok=True)

# 指定輸出檔案的完整路徑
file_path = os.path.join(output_folder, "final_predictions.parquet")

# 將 DataFrame 儲存到指定路徑的 pickle 文件
combined_predictions.to_parquet(file_path)

         id  emotion
0  0x28b412  disgust
1  0x2de201      joy
2  0x218443      joy
3  0x2939d5      joy
4  0x26289a      joy


In [None]:

# 假設您想用 70% 來自有 hashtag 的預測，30% 來自所有資料的預測
weight_hashtag = 0.3
weight_all_data = 0.7
all_data_predictions['emotion'] = pd.to_numeric(all_data_predictions['emotion'], errors='coerce')
hashtag_data_predictions['emotion'] = pd.to_numeric(hashtag_data_predictions['emotion'], errors='coerce')

# 對於有 hashtag 的數據，根據比例合併
for idx in hashtag_data_predictions.index:
    combined_predictions.loc[idx, 'emotion'] = (
        weight_hashtag * hashtag_data_predictions.loc[idx, 'emotion'] +
        weight_all_data * all_data_predictions.loc[idx, 'emotion']
    )
output_folder = 'output_2'
file_path = os.path.join(output_folder, "final_predictions.parquet")
combined_predictions.to_parquet('final_predictions.parquet', index=False)

In [32]:
pd = pd.read_parquet('final_predictions.parquet')
pd

Unnamed: 0,id,emotion
0,0x28b412,
1,0x2de201,
2,0x218443,
3,0x2939d5,
4,0x26289a,
...,...,...
411967,0x2913b4,
411968,0x2a980e,
411969,0x316b80,
411970,0x29d0cb,


In [None]:
# 假設情緒標籤對應的數值
emotion_map = {
    'happy': 0,
    'sad': 1,
    'angry': 2,
    'surprised': 3,
    'neutral': 4
}
# 反向映射，將數值轉回情緒標籤
reverse_emotion_map = {v: k for k, v in emotion_map.items()}