# Kaggle 競賽程式碼 DM2024 ISA5810 Lab2 Homework
### 安裝和載入必要的的套件

運行環境：Kaggle GPU T4 \times 2

In [None]:
# 安裝必要的庫
!pip install transformers torch xgboost scikit-learn

# 載入套件
import pandas as pd
import numpy as np
import torch
from transformers import BertTokenizer, BertModel
from sklearn.model_selection import train_test_split
import xgboost as xgb
from sklearn.metrics import classification_report
import json
import pandas as pd
import numpy as np
from tqdm import tqdm

---
## 資料前處理

### 讀入資料競賽資料

In [2]:
data = []
with open('/kaggle/input/dm-2024-isa-5810-lab-2-homework/tweets_DM.json', 'r') as f:
    for line in f:
        data.append(json.loads(line))
f.close()

emotion = pd.read_csv('/kaggle/input/dm-2024-isa-5810-lab-2-homework/emotion.csv')
data_identification = pd.read_csv('/kaggle/input/dm-2024-isa-5810-lab-2-homework/data_identification.csv')

### 選擇有用的資料並且建立成 dataframe


In [3]:
df = pd.DataFrame(data)
_source = df['_source'].apply(lambda x: x['tweet'])
df = pd.DataFrame({
    'tweet_id': _source.apply(lambda x: x['tweet_id']),
    'hashtags': _source.apply(lambda x: x['hashtags']),
    'text': _source.apply(lambda x: x['text']),
})
df = df.merge(data_identification, on='tweet_id', how='left')
train_data = df[df['identification'] == 'train']

In [4]:
train_data

Unnamed: 0,tweet_id,hashtags,text,identification
0,0x376b20,[Snapchat],"People who post ""add me on #Snapchat"" must be ...",train
1,0x2d5350,"[freepress, TrumpLegacy, CNN]","@brianklaas As we see, Trump is dangerous to #...",train
3,0x1cd5b0,[],Now ISSA is stalking Tasha 😂😂😂 <LH>,train
5,0x1d755c,"[authentic, LaughOutLoud]",@RISKshow @TheKevinAllison Thx for the BEST TI...,train
6,0x2c91a8,[],Still waiting on those supplies Liscus. <LH>,train
...,...,...,...,...
1867526,0x321566,"[NoWonder, Happy]",I'm SO HAPPY!!! #NoWonder the name of this sho...,train
1867527,0x38959e,[],In every circumtance I'd like to be thankful t...,train
1867528,0x2cbca6,[blessyou],there's currently two girls walking around the...,train
1867533,0x24faed,[],"Ah, corporate life, where you can date <LH> us...",train


### 處理資料合併問題

In [5]:
train_data = train_data.merge(emotion, on='tweet_id', how='left') # Merge emotion for corresponding tweet_id
train_data.drop_duplicates(subset=['text'], keep=False, inplace=True) # Remove duplication 
train_data_sample = train_data.sample(frac=0.1) # Get sample
y_train_data = train_data_sample['emotion']
X_train_data = train_data_sample.drop(['tweet_id', 'emotion', 'identification'], axis=1)
X_train_data = X_train_data['text'] + ' ' + X_train_data['hashtags'].apply(lambda x: ' '.join(x)) # Combine text and hashtags 

---
## 準備訓練節、驗證集、測試集

### 載入 BERT 作為 embeddings models

In [None]:
# 設置 BERT 模型和 tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')
model = model.to('cuda')  # 移動到 GPU

# 將文字轉為 bert embeddings 
def get_bert_embeddings(texts):
    embeddings = []
    for text in tqdm(texts):
        inputs = tokenizer(text, return_tensors='pt', padding=True, truncation=True, max_length=512)
        inputs = {key: value.to('cuda') for key, value in inputs.items()}  # 移動到 GPU
        with torch.no_grad():
            outputs = model(**inputs)
        cls_embedding = outputs.last_hidden_state[:, 0, :].cpu().numpy()  # 獲取 [CLS] token 的嵌入
        embeddings.append(cls_embedding)
    return np.vstack(embeddings)

### 將非結構資料由 BERT 轉乘結構型資料

In [None]:
# 建立 X, Y
X = get_bert_embeddings(X_train_data.values)
category_to_symbol = {
    'anticipation': 0, "joy": 1, 'fear': 2, 'sadness': 3, 'disgust': 4, 'trust': 5,
           'surprise': 6, 'anger': 7
}
y_train_data = y_train_data.map(category_to_symbol)
y = y_train_data.values


'\n# 建立 X, Y\nX = get_bert_embeddings(X_train_data.values)\ncategory_to_symbol = {\n    \'anticipation\': 0, "joy": 1, \'fear\': 2, \'sadness\': 3, \'disgust\': 4, \'trust\': 5,\n           \'surprise\': 6, \'anger\': 7\n}\ny_train_data = y_train_data.map(category_to_symbol)\ny = y_train_data.values\n'

---
## 訓練

In [None]:
# 划分訓練和測試集以及使用 DMatrix 建立 XGboost 的架構
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)
dtrain = xgb.DMatrix(X_train, label=y_train)
dval = xgb.DMatrix(X_val, label=y_val)

# 設定 XGBoost 參數
params = {
   'objective': 'multi:softmax',
    'num_class': len(np.unique(y)),
    'max_depth': 5,
    'eta': 0.1,
    'n_estimators': 100,
    'min_child_weight': 1,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'eval_metric': 'mlogloss',
}

# 訓練 XGboost 
bst = xgb.train(params, dtrain, num_boost_round=100, evals=[(dval, 'eval')], early_stopping_rounds=10)


In [None]:
# 保存模型並且進行預測驗證集
bst.save_model('my_xgb_model.json')  # 可以使用.json或.bin等格式
y_pred = bst.predict(dval)
print(classification_report(y_val, y_pred))

---
## 對測試集進行預測並輸出

In [10]:
loaded_model = xgb.Booster()
loaded_model.load_model('/kaggle/input/xgboost_twitter/scikitlearn/default/1/my_xgb_model.json')  # 載入模型

In [11]:
# Get the test data from df
test_data = df[df['identification'] == 'test'] 
X_test_data = test_data.drop(['tweet_id', 'identification'], axis=1)
X_test_data = X_test_data['text'] + ' ' + X_test_data['hashtags'].apply(lambda x: ' '.join(x))
X_test_embeddings = get_bert_embeddings(X_test_data.values)
dtest = xgb.DMatrix(X_test_embeddings)

100%|██████████| 411972/411972 [53:43<00:00, 127.80it/s]


In [12]:
y_test_pred = loaded_model.predict(dtest)
y_test_pred_series = pd.Series(y_test_pred)
category_to_symbol = {
    'anticipation': 0, "joy": 1, 'fear': 2, 'sadness': 3, 'disgust': 4, 'trust': 5,
           'surprise': 6, 'anger': 7
}
# 創建反向映射字典
symbol_to_category = {v: k for k, v in category_to_symbol.items()}
# 轉換成原先的標籤
y_test_pred_series = y_test_pred_series.map(symbol_to_category)

In [36]:
submission = pd.DataFrame({
    'id': test_data['tweet_id'].values,
    'emotion': y_test_pred_series.values
})
submission.to_csv('/kaggle/working/submission.csv', index=False)

In [37]:
submission.head()

Unnamed: 0,id,emotion
0,0x28b412,anticipation
1,0x2de201,joy
2,0x218443,joy
3,0x2939d5,anticipation
4,0x26289a,trust
