In [1]:
from enum import Enum
import os
import numpy as np
import pandas as pd
import os
import re
from sklearn.base import clone
from sklearn.metrics import cohen_kappa_score
from sklearn.model_selection import StratifiedKFold
from scipy.optimize import minimize
from concurrent.futures import ThreadPoolExecutor
from tqdm import tqdm
import polars as pl
import polars.selectors as cs
import matplotlib.pyplot as plt
from matplotlib.ticker import MaxNLocator, FormatStrFormatter, PercentFormatter
import seaborn as sns
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
from keras.models import Model
from keras.layers import Input, Dense
from keras.optimizers import Adam
import torch
import torch.nn as nn
import torch.optim as optim
from colorama import Fore, Style
from IPython.display import clear_output
import warnings
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor
from catboost import CatBoostRegressor
from sklearn.ensemble import (
    VotingRegressor,
    RandomForestRegressor,
    GradientBoostingRegressor,
)
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.pipeline import Pipeline
import copy

warnings.filterwarnings("ignore")
pd.options.display.max_columns = None

### Change environment when running in different terminal

In [2]:
class EXEC_ENV_ENUM(Enum):
    COLAB = 1
    KAGGLE = 2
    LOCAL = 3

"""
丟到 kaggle 要改的地方
"""
# EXEC_ENV = EXEC_ENV_ENUM.COLAB
# EXEC_ENV = EXEC_ENV_ENUM.KAGGLE
EXEC_ENV = EXEC_ENV_ENUM.LOCAL

#### sorting and combined data

In [3]:
import json
import pandas as pd

# 存储提取数据的列表
tweets_data = []

# 读取 JSON 文件
with open('dm-2024-isa-5810-lab-2-homework/tweets_DM.json', 'r') as file:
    for line in file:
        try:
            # 解析 JSON 每行
            tweet = json.loads(line)
            
            # 提取字段
            tweet_id = tweet["_source"]["tweet"].get("tweet_id")
            text = tweet["_source"]["tweet"].get("text")
            hashtags = tweet["_source"]["tweet"].get("hashtags", [])
            crawldate = tweet.get("_crawldate")
            score = tweet.get("_score")
            index = tweet.get("_index")
            doc_type = tweet.get("_type")
            
            # 衍生字段
            hashtag_count = len(hashtags)
            text_length = len(text) if text else 0
            contains_keyword = "<LH>" in text if text else False

            # 将提取的字段存入字典，添加到列表
            tweets_data.append({
                "tweet_id": tweet_id,
                "text": text,
                "hashtags": hashtags,
                "crawldate": crawldate,
                "_score": score,
                "_index": index,
                "_type": doc_type,
                "hashtag_count": hashtag_count,
                "text_length": text_length,
                "contains_keyword": contains_keyword
            })
        except json.JSONDecodeError as e:
            print("JSON decode error:", e)

# 转换为 DataFrame
df = pd.DataFrame(tweets_data)

In [4]:
classify = pd.read_csv('dm-2024-isa-5810-lab-2-homework/data_identification.csv')  # 包含 tweet_id 和 identification
emotion = pd.read_csv('dm-2024-isa-5810-lab-2-homework/emotion.csv')  # 包含 tweet_id 和 emotion
data = pd.merge(df, classify, on='tweet_id', how='inner')
data = pd.merge(data, emotion, on='tweet_id', how='outer')
# data # 將情緒特徵及資料分類欄位加入相同dataframe

In [5]:
# 將 data_identification 中 identification 欄位值為 'train' 的選出來作為訓練集
train_data = data[data['identification'] == 'train']

# 將 identification 欄位值為 'test' 的選出來作為測試集
test_data = data[data['identification'] == 'test']

# 所有情感emotion in emotion.csv
emotions = data['emotion'].unique()


In [6]:
SEED = 42
n_splits = 5  # Statified K-Fold 的 fold 數
LGBM_Params = {
    "learning_rate": 0.046,
    "max_depth": 12,
    "num_leaves": 478,
    "min_data_in_leaf": 13,
    "feature_fraction": 0.893,
    "bagging_fraction": 0.784,
    "bagging_freq": 4,
    "lambda_l1": 10,  # Increased from 6.59
    "lambda_l2": 0.01,  # Increased from 2.68e-06
    # 'device': 'gpu'
}
XGB_Params = {
    "learning_rate": 0.05,
    "max_depth": 6,
    "n_estimators": 200,
    "subsample": 0.8,
    "colsample_bytree": 0.8,
    "reg_alpha": 1,  # Increased from 0.1
    "reg_lambda": 5,  # Increased from 1
    "random_state": SEED,
    # 'tree_method': 'gpu_hist',
}
CatBoost_Params_original = {
    "learning_rate": 0.05,
    "depth": 6,
    "iterations": 200,
    "random_seed": SEED,
    "verbose": 0,
    "l2_leaf_reg": 10,  # Increase this value
    # 'task_type': 'GPU'
}
if EXEC_ENV != EXEC_ENV_ENUM.LOCAL:  # Enable GPU
    LGBM_Params['device'] = 'gpu'
    XGB_Params['tree_method'] = 'gpu_hist'
    CatBoost_Params_original['task_type'] = 'GPU'

In [7]:
import pandas as pd

## save to pickle file
train_data.to_pickle("train_data.pkl") 
test_data.to_pickle("test_data.pkl")

## load a pickle file
train_data = pd.read_pickle("train_data.pkl")
test_data = pd.read_pickle("test_data.pkl")
# already check the size is 411972

#### Loading pkl files

In [8]:
import pandas as pd

## save to pickle file
train_data.to_pickle("train_data.pkl") 
test_data.to_pickle("test_data.pkl")

## load a pickle file
train_data = pd.read_pickle("train_data.pkl")
test_data = pd.read_pickle("test_data.pkl")
train_data # already check the size is 411972

Unnamed: 0,tweet_id,text,hashtags,crawldate,_score,_index,_type,hashtag_count,text_length,contains_keyword,identification,emotion
1,0x1c7f10,o m g Shut Up And Dance though #BlackMirror <LH>,[BlackMirror],2015-05-16 10:36:47,242,hashtag_tweets,tweets,1,48,True,train,joy
2,0x1c7f11,On #twitch <LH> on the #Destinybeta #Destiny #...,"[twitch, Destinybeta, Destiny, Destiny2, Desti...",2016-10-15 20:46:37,915,hashtag_tweets,tweets,12,136,True,train,anticipation
5,0x1c7f14,A nice sunny wak this morning not many <LH> ar...,[],2016-07-04 07:22:56,939,hashtag_tweets,tweets,0,126,True,train,joy
6,0x1c7f15,I'm one of those people who love candy corn......,"[Confession, NationalCandyCornDay, CouldEatThe...",2016-04-16 12:53:40,181,hashtag_tweets,tweets,5,135,True,train,joy
7,0x1c7f16,@metmuseum What are these? They look like some...,[],2017-04-22 17:50:28,970,hashtag_tweets,tweets,0,102,True,train,disgust
...,...,...,...,...,...,...,...,...,...,...,...,...
1867529,0x38fe18,@LJPBR @FifthHarmony Um My vote For @FifthHar...,[],2016-12-06 11:10:57,922,hashtag_tweets,tweets,0,83,True,train,sadness
1867530,0x38fe19,Where is #WesHoolahan?! #WALvIRL #COYBIG <LH>,"[WesHoolahan, WALvIRL, COYBIG]",2015-02-01 18:04:28,77,hashtag_tweets,tweets,3,46,True,train,anticipation
1867531,0x38fe1a,@mattmfm Fake news! <LH> propagated by Tumpkin...,"[not, maga]",2016-12-20 17:19:58,25,hashtag_tweets,tweets,2,64,True,train,surprise
1867533,0x38fe1c,..today was brutal ..#Hungover,[],2016-09-13 06:31:27,639,hashtag_tweets,tweets,0,31,False,train,disgust


### preprocessing

In [9]:
from sklearn.feature_extraction.text import CountVectorizer
import nltk

# build analyzers (bag-of-words)
BOW_500 = CountVectorizer(max_features=500, tokenizer=nltk.word_tokenize) 

# apply analyzer to training data
BOW_500.fit(train_data['text'])

X_train = BOW_500.transform(train_data['text'])

y_train = train_data['emotion']

X_test = BOW_500.transform(test_data['text'])

In [10]:
X_train = X_train.astype(np.float32)
X_test = X_test.astype(np.float32)

In [11]:
print(train_data["text"].head())

1     o m g Shut Up And Dance though #BlackMirror <LH>
2    On #twitch <LH> on the #Destinybeta #Destiny #...
5    A nice sunny wak this morning not many <LH> ar...
6    I'm one of those people who love candy corn......
7    @metmuseum What are these? They look like some...
Name: text, dtype: object


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedKFold
from sklearn.base import clone

def TrainML(model_class, train_data, test_data):
    # 確保文本數據是字符串
    train_data["text"] = train_data["text"].astype(str)

    # 文本向量化
    vectorizer = TfidfVectorizer(max_features=500)  # 可以調整 max_features
    X = vectorizer.fit_transform(train_data["text"]).toarray()

    # 標籤編碼
    encoder = LabelEncoder()
    y = encoder.fit_transform(train_data["emotion"])

    """
    使用 Stratified K-Fold 分層抽樣確保各類別比例一致
    """
    skf = StratifiedKFold(n_splits=5)

    # 模型訓練
    for train_index, val_index in skf.split(X, y):
        X_train, X_val = X[train_index], X[val_index]
        y_train, y_val = y[train_index], y[val_index]

        # 克隆模型並訓練
        model = clone(model_class)
        model.fit(X_train, y_train)
        print("Validation Score:", model.score(X_val, y_val))

    # 使用整個訓練集進行最終模型訓練
    model_class.fit(X, y)

    # 測試集預測
    test_data["text"] = test_data["text"].astype(str)
    X_test = vectorizer.transform(test_data["text"]).toarray()
    return model_class.predict(X_test)

from sklearn.ensemble import VotingRegressor
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor
from sklearn.ensemble import RandomForestRegressor

# 初始化基礎模型
Light = LGBMRegressor()
XGBoost = XGBRegressor()
RandomForest = RandomForestRegressor()

# 定義投票回歸模型
voting_model = VotingRegressor(
    estimators=[
        ("lightgbm", Light),
        ("xgboost", XGBoost),
        ("randomforest", RandomForest),
    ]
)

Submission1 = TrainML(voting_model, train_data, test_data)
print("Submission:", Submission1)

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.543294 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 120862
[LightGBM] [Info] Number of data points in the train set: 1164450, number of used features: 500
[LightGBM] [Info] Start training from score 3.765624


### LightGBM

In [None]:
from lightgbm import LGBMClassifier

# 定义 LightGBM 模型
model = LGBMClassifier(n_estimators=300, random_state=42)

# 训练模型
model.fit(X_train, y_train)

# 测试集预测
y_test_pred_lgbm = model.predict(X_test)

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.697855 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2304
[LightGBM] [Info] Number of data points in the train set: 1455563, number of used features: 500
[LightGBM] [Info] Start training from score -3.597599
[LightGBM] [Info] Start training from score -1.765956
[LightGBM] [Info] Start training from score -2.347948
[LightGBM] [Info] Start training from score -3.124281
[LightGBM] [Info] Start training from score -1.037008
[LightGBM] [Info] Start training from score -2.018196
[LightGBM] [Info] Start training from score -3.396874
[LightGBM] [Info] Start training from score -1.957809


### RandomForest

In [None]:
from catboost import CatBoostClassifier

# 定义 CatBoostClassifier 模型
model = CatBoostClassifier(iterations=300, random_state=42, verbose=0)

# 训练模型
model.fit(X_train, y_train)

# 测试集预测

y_test_pred = model.predict(X_test)

In [None]:

# 测试集预测
y_test_pred_rf = model.predict(X_test)

In [None]:
# 保存预测结果
test_data["emotion"] = y_test_pred_rf
test_data[["tweet_id", "emotion"]].to_csv("test_predictions.csv", index=False)