數據處理:
我的數據是在筆電(windows)上跑的，這段程式我把訓練和測試資料分開，從留言的_source 欄位中提取內容（ tweet_id、hashtags 和 text），組成新的 DataFrame。並整合data_identification 和 emotion，再把缺實值補unknow(但好像多此一舉)，最後輸出成檔案。

In [None]:
import pandas as pd
import json

# 定義檔案路徑
paths = {
    "split": r"C:\Users\user\Desktop\lab2\data_identification.csv",
    "emotion": r"C:\Users\user\Desktop\lab2\emotion.csv",
    "tweets": r"C:\Users\user\Desktop\lab2\tweets_DM.json",
    "train_output": r"C:\Users\user\Desktop\lab2\processed_train.csv",
    "test_output": r"C:\Users\user\Desktop\lab2\processed_test.csv",
}

def load_csv(path, description):
    """加載 CSV 檔案並檢查"""
    try:
        data = pd.read_csv(path, on_bad_lines="skip")
        print(f"成功載入 {description}: {data.shape}")
        return data
    except Exception as e:
        raise ValueError(f"載入 {description} 出錯: {e}")

def load_json_lines(path):
    """加載 JSON 行格式的檔案"""
    tweets_list = []
    with open(path, 'r') as file:
        for line in file:
            try:
                tweets_list.append(json.loads(line))
            except json.JSONDecodeError:
                print(f"跳過無效的 JSON 行: {line.strip()}")
    print(f"成功載入 tweets_data: {len(tweets_list)} 行")
    return pd.DataFrame(tweets_list)

# 加載資料
data_identification = load_csv(paths["split"], "data_identification")
emotion_data = load_csv(paths["emotion"], "emotion_data")
tweets_data = load_json_lines(paths["tweets"])

# 提取 tweet 資料
tweets_data['_source'] = tweets_data['_source'].apply(lambda x: x['tweet'] if 'tweet' in x else {})
df = pd.DataFrame({
    'tweet_id': tweets_data['_source'].apply(lambda x: x.get('tweet_id', None)),
    'hashtags': tweets_data['_source'].apply(lambda x: x.get('hashtags', [])),
    'text': tweets_data['_source'].apply(lambda x: x.get('text', "")),
})
print(f"成功提取 tweet 資料: {df.shape}")

# 合併資料
df = df.merge(data_identification, on="tweet_id", how="left")
df = df.merge(emotion_data, on="tweet_id", how="left")
print(f"合併資料後的形狀: {df.shape}")

# 處理缺失值
df['emotion'] = df['emotion'].fillna('unknown')

# 分割訓練與測試資料
train_data = df[df['identification'] == 'train']
test_data = df[df['identification'] == 'test']

# 移除多餘欄位，確保與目標格式一致
test_data = test_data[['tweet_id', 'hashtags', 'text', 'identification']]

# 儲存處理後的資料
train_data.to_csv(paths["train_output"], index=False)
test_data.to_csv(paths["test_output"], index=False)
print("成功儲存處理後的訓練資料與測試資料！")

# 檢查結果
print("處理後的訓練資料：")
print(train_data.head())
print("處理後的測試資料：")
print(test_data.head())


這邊採用XGBoost(因為查到他是kaggle常勝軍)，這邊抽樣6000筆資料，採用TFIDF，並且把Train分割成80:20作為訓練及驗證

In [1]:
import pandas as pd
import xgboost as xgb
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, confusion_matrix, f1_score
import matplotlib.pyplot as plt
import seaborn as sns
import os

train_path = '/home/dc0206/Desktop/3.9.20/DM/processed_train.csv'
test_path = '/home/dc0206/Desktop/3.9.20/DM/test000.csv'
output_dir = '/home/dc0206/Desktop/3.9.20/DM'

os.makedirs(output_dir, exist_ok=True)

sample_size = 6000  
train_data = pd.read_csv(train_path, engine='python', on_bad_lines='skip').sample(n=sample_size, random_state=42)

train_data['text'] = train_data['text'].fillna("")
train_data['emotion'] = train_data['emotion'].fillna("unknown")

label_encoder = LabelEncoder()
train_data['label_encoded'] = label_encoder.fit_transform(train_data['emotion'])

tfidf_vectorizer = TfidfVectorizer(
    max_features=4000,  
    stop_words='english',  
    ngram_range=(1, 2)  
)
X_train = tfidf_vectorizer.fit_transform(train_data['text'])
y_train = train_data['label_encoded']


test_data = pd.read_csv(test_path, engine='python', on_bad_lines='skip')
test_data['text'] = test_data['text'].fillna("")
X_test = tfidf_vectorizer.transform(test_data['text'])

X_train_split, X_val_split, y_train_split, y_val_split = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

X_train_split = X_train_split.toarray()
X_val_split = X_val_split.toarray()
X_test = X_test.toarray()

xgb_model = xgb.XGBClassifier(
    tree_method='hist',  
    device='cuda',       
    random_state=42
)


param_grid = {
    'learning_rate': [0.01, 0.1],
    'max_depth': [3, 5, 7],
    'n_estimators': [100, 200],
    'subsample': [0.8, 1.0]
}

grid_search = GridSearchCV(
    estimator=xgb_model,
    param_grid=param_grid,
    scoring='f1_weighted', 
    cv=3,
    verbose=2,
    n_jobs=-1
)


grid_search.fit(X_train_split, y_train_split)
print("最佳:", grid_search.best_params_)

optimized_xgb = grid_search.best_estimator_
optimized_xgb.fit(X_train_split, y_train_split)
y_val_pred = optimized_xgb.predict(X_val_split)
val_f1 = f1_score(y_val_split, y_val_pred, average='weighted')
print(f"F1: {val_f1:.4f}")

y_test_pred = optimized_xgb.predict(X_test)
predicted_labels = label_encoder.inverse_transform(y_test_pred)

output_file = os.path.join(output_dir, "final_submission_xgb.csv")
test_data['predicted_emotion'] = predicted_labels
test_data[['tweet_id', 'predicted_emotion']].to_csv(output_file, index=False)
print(f"结果已保存: {output_file}")

if 'emotion' in test_data.columns:
    true_labels = label_encoder.transform(test_data['emotion'])
    print("\n class report:")
    print(classification_report(true_labels, y_test_pred, target_names=label_encoder.classes_))

    
    conf_matrix = confusion_matrix(true_labels, y_test_pred)
    plt.figure(figsize=(10, 8))
    sns.heatmap(conf_matrix, annot=True, fmt="d", cmap="Blues", xticklabels=label_encoder.classes_, yticklabels=label_encoder.classes_)
    plt.xlabel("Predicted Labels")
    plt.ylabel("True Labels")
    plt.title("Confusion Matrix for XGBoost")
    conf_matrix_file = os.path.join(output_dir, "confusion_matrix_xgb.png")
    plt.savefig(conf_matrix_file)
    plt.close()
    print(f"混淆: {conf_matrix_file}")


Fitting 3 folds for each of 24 candidates, totalling 72 fits


Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.




[CV] END learning_rate=0.01, max_depth=3, n_estimators=100, subsample=1.0; total time=  39.9s


Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.




[CV] END learning_rate=0.01, max_depth=3, n_estimators=100, subsample=1.0; total time=  40.5s


Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.


Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.




[CV] END learning_rate=0.01, max_depth=3, n_estimators=100, subsample=0.8; total time=  40.8s
[CV] END learning_rate=0.01, max_depth=3, n_estimators=100, subsample=1.0; total time=  40.9s


Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.




[CV] END learning_rate=0.01, max_depth=3, n_estimators=100, subsample=0.8; total time=  41.3s


Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.




[CV] END learning_rate=0.01, max_depth=3, n_estimators=100, subsample=0.8; total time=  42.0s


Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.




[CV] END learning_rate=0.01, max_depth=5, n_estimators=100, subsample=0.8; total time=  50.9s


Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.


Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.




[CV] END learning_rate=0.01, max_depth=5, n_estimators=100, subsample=1.0; total time=  51.7s
[CV] END learning_rate=0.01, max_depth=5, n_estimators=100, subsample=0.8; total time=  51.9s


Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.




[CV] END learning_rate=0.01, max_depth=5, n_estimators=100, subsample=0.8; total time=  52.7s


Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.




[CV] END learning_rate=0.01, max_depth=3, n_estimators=200, subsample=1.0; total time= 1.3min


Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.




[CV] END learning_rate=0.01, max_depth=3, n_estimators=200, subsample=1.0; total time= 1.4min


Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.




[CV] END learning_rate=0.01, max_depth=3, n_estimators=200, subsample=1.0; total time= 1.4min


Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.


Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.




[CV] END learning_rate=0.01, max_depth=3, n_estimators=200, subsample=0.8; total time= 1.4min
[CV] END learning_rate=0.01, max_depth=3, n_estimators=200, subsample=0.8; total time= 1.4min


Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.




[CV] END learning_rate=0.01, max_depth=3, n_estimators=200, subsample=0.8; total time= 1.4min
[CV] END learning_rate=0.01, max_depth=5, n_estimators=100, subsample=1.0; total time=  51.2s
[CV] END learning_rate=0.01, max_depth=5, n_estimators=100, subsample=1.0; total time=  53.3s
[CV] END learning_rate=0.01, max_depth=7, n_estimators=100, subsample=0.8; total time= 1.1min
[CV] END learning_rate=0.01, max_depth=7, n_estimators=100, subsample=0.8; total time= 1.1min
[CV] END learning_rate=0.01, max_depth=7, n_estimators=100, subsample=1.0; total time= 1.1min
[CV] END learning_rate=0.01, max_depth=7, n_estimators=100, subsample=0.8; total time= 1.2min
[CV] END learning_rate=0.01, max_depth=7, n_estimators=100, subsample=1.0; total time= 1.2min
[CV] END learning_rate=0.01, max_depth=5, n_estimators=200, subsample=0.8; total time= 1.9min
[CV] END learning_rate=0.01, max_depth=5, n_estimators=200, subsample=1.0; total time= 1.9min
[CV] END learning_rate=0.01, max_depth=7, n_estimators=100, 



[CV] END learning_rate=0.01, max_depth=7, n_estimators=200, subsample=1.0; total time= 2.2min
[CV] END learning_rate=0.1, max_depth=3, n_estimators=200, subsample=0.8; total time= 1.5min
[CV] END learning_rate=0.1, max_depth=3, n_estimators=200, subsample=0.8; total time= 1.5min
[CV] END learning_rate=0.1, max_depth=3, n_estimators=200, subsample=1.0; total time= 1.4min
[CV] END learning_rate=0.01, max_depth=7, n_estimators=200, subsample=1.0; total time= 2.1min
[CV] END learning_rate=0.1, max_depth=3, n_estimators=200, subsample=0.8; total time= 1.5min
[CV] END learning_rate=0.1, max_depth=5, n_estimators=100, subsample=0.8; total time=  53.9s
[CV] END learning_rate=0.1, max_depth=5, n_estimators=100, subsample=1.0; total time=  53.4s
[CV] END learning_rate=0.01, max_depth=7, n_estimators=200, subsample=1.0; total time= 2.2min
[CV] END learning_rate=0.1, max_depth=5, n_estimators=100, subsample=0.8; total time=  55.1s
[CV] END learning_rate=0.1, max_depth=5, n_estimators=100, subsampl

Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.




[CV] END learning_rate=0.1, max_depth=7, n_estimators=100, subsample=0.8; total time= 1.1min
[CV] END learning_rate=0.1, max_depth=7, n_estimators=100, subsample=0.8; total time= 1.1min


Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.




[CV] END learning_rate=0.1, max_depth=5, n_estimators=200, subsample=0.8; total time= 1.7min
[CV] END learning_rate=0.1, max_depth=5, n_estimators=200, subsample=0.8; total time= 1.7min
[CV] END learning_rate=0.1, max_depth=5, n_estimators=200, subsample=1.0; total time= 1.6min
[CV] END learning_rate=0.1, max_depth=5, n_estimators=200, subsample=1.0; total time= 1.6min
[CV] END learning_rate=0.1, max_depth=5, n_estimators=200, subsample=0.8; total time= 1.7min


Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.




[CV] END learning_rate=0.1, max_depth=5, n_estimators=200, subsample=1.0; total time= 1.6min
[CV] END learning_rate=0.1, max_depth=7, n_estimators=200, subsample=0.8; total time= 1.5min


Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.




[CV] END learning_rate=0.1, max_depth=7, n_estimators=200, subsample=0.8; total time= 1.5min
[CV] END learning_rate=0.1, max_depth=7, n_estimators=200, subsample=1.0; total time= 1.3min
[CV] END learning_rate=0.1, max_depth=7, n_estimators=200, subsample=0.8; total time= 1.4min
[CV] END learning_rate=0.1, max_depth=7, n_estimators=200, subsample=1.0; total time=  51.6s
[CV] END learning_rate=0.1, max_depth=7, n_estimators=200, subsample=1.0; total time=  51.3s
最佳: {'learning_rate': 0.1, 'max_depth': 7, 'n_estimators': 200, 'subsample': 0.8}


Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.




F1: 0.3997


XGBoostError: [02:59:48] /workspace/src/c_api/../common/device_helpers.cuh:393: Memory allocation error on worker 0: std::bad_alloc: cudaErrorMemoryAllocation: out of memory
- Free memory: 12638224384
- Requested memory: 13183104000

Stack trace:
  [bt] (0) /home/dc0206/pytorch_env/lib/python3.9/site-packages/xgboost/lib/libxgboost.so(+0x22dcbc) [0x75b1a9e2dcbc]
  [bt] (1) /home/dc0206/pytorch_env/lib/python3.9/site-packages/xgboost/lib/libxgboost.so(+0x8a9886) [0x75b1aa4a9886]
  [bt] (2) /home/dc0206/pytorch_env/lib/python3.9/site-packages/xgboost/lib/libxgboost.so(+0xe938c) [0x75b1a9ce938c]
  [bt] (3) /home/dc0206/pytorch_env/lib/python3.9/site-packages/xgboost/lib/libxgboost.so(+0x8fb29c) [0x75b1aa4fb29c]
  [bt] (4) /home/dc0206/pytorch_env/lib/python3.9/site-packages/xgboost/lib/libxgboost.so(+0x926d66) [0x75b1aa526d66]
  [bt] (5) /home/dc0206/pytorch_env/lib/python3.9/site-packages/xgboost/lib/libxgboost.so(+0xca36cb) [0x75b1aa8a36cb]
  [bt] (6) /home/dc0206/pytorch_env/lib/python3.9/site-packages/xgboost/lib/libxgboost.so(+0x57be18) [0x75b1aa17be18]
  [bt] (7) /home/dc0206/pytorch_env/lib/python3.9/site-packages/xgboost/lib/libxgboost.so(+0x57c1e0) [0x75b1aa17c1e0]
  [bt] (8) /home/dc0206/pytorch_env/lib/python3.9/site-packages/xgboost/lib/libxgboost.so(+0x5d0389) [0x75b1aa1d0389]



more rf

In [None]:
import os
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, confusion_matrix, f1_score
import matplotlib.pyplot as plt
import seaborn as sns
import xgboost as xgb

train_path = '/home/dc0206/Desktop/3.9.20/DM/processed_train.csv'
test_path = '/home/dc0206/Desktop/3.9.20/DM/test000.csv'
output_dir = '/home/dc0206/Desktop/3.9.20/DM'

os.makedirs(output_dir, exist_ok=True)

sample_size = 6000
train_data = pd.read_csv(train_path, engine='python', on_bad_lines='skip').sample(n=sample_size, random_state=42)
test_data = pd.read_csv(test_path, engine='python', on_bad_lines='skip')

train_data['text'] = train_data['text'].fillna("")
train_data['emotion'] = train_data['emotion'].fillna("unknown")

label_encoder = LabelEncoder()
train_data['label_encoded'] = label_encoder.fit_transform(train_data['emotion'])

tfidf_vectorizer = TfidfVectorizer(
    max_features=4000,
    stop_words='english',
    ngram_range=(1, 2)
)
X_train = tfidf_vectorizer.fit_transform(train_data['text'])
y_train = train_data['label_encoded']

test_data['text'] = test_data['text'].fillna("")
X_test = tfidf_vectorizer.transform(test_data['text'])

X_train_split, X_val_split, y_train_split, y_val_split = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

X_train_split = X_train_split.toarray()
X_val_split = X_val_split.toarray()
X_test = X_test.toarray()

xgb_model = xgb.XGBClassifier(
    tree_method='hist',
    use_label_encoder=False,
    eval_metric='logloss',
    random_state=42
)

param_grid = {
    'learning_rate': [0.01, 0.1],
    'max_depth': [3, 5, 7],
    'n_estimators': [100, 200, 300],
    'subsample': [0.8, 1.0],
    'colsample_bytree': [0.8, 1.0]
}

grid_search = GridSearchCV(
    estimator=xgb_model,
    param_grid=param_grid,
    scoring='f1_weighted',
    cv=3,
    verbose=2,
    n_jobs=-1
)

grid_search.fit(X_train_split, y_train_split)
print("最佳:", grid_search.best_params_)

optimized_xgb = grid_search.best_estimator_
optimized_xgb.fit(X_train_split, y_train_split)

y_val_pred = optimized_xgb.predict(X_val_split)
val_f1 = f1_score(y_val_split, y_val_pred, average='weighted')
print(f"F1: {val_f1:.4f}")

y_test_pred = optimized_xgb.predict(X_test)
predicted_labels = label_encoder.inverse_transform(y_test_pred)

test_data['predicted_emotion'] = predicted_labels
test_output_file = os.path.join(output_dir, "test_results_xgb.csv")
test_data[['tweet_id', 'predicted_emotion']].to_csv(test_output_file, index=False)
print(f"結果已保存: {test_output_file}")

if 'emotion' in test_data.columns:
    true_labels = label_encoder.transform(test_data['emotion'])
    class_report = classification_report(true_labels, y_test_pred, target_names=label_encoder.classes_)
    report_file = os.path.join(output_dir, "classification_report_xgb.txt")
    with open(report_file, "w") as f:
        f.write(class_report)
    print(f"class report: {report_file}")

    conf_matrix = confusion_matrix(true_labels, y_test_pred)
    plt.figure(figsize=(10, 8))
    sns.heatmap(conf_matrix, annot=True, fmt="d", cmap="Blues", xticklabels=label_encoder.classes_, yticklabels=label_encoder.classes_)
    plt.xlabel("Predicted Labels")
    plt.ylabel("True Labels")
    plt.title("Confusion Matrix for XGBoost")
    conf_matrix_file = os.path.join(output_dir, "confusion_matrix_xgb.png")
    plt.savefig(conf_matrix_file)
    plt.close()
    print(f"混淆: {conf_matrix_file}")


Fitting 3 folds for each of 72 candidates, totalling 216 fits


Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.



[CV] END colsample_bytree=0.8, learning_rate=0.01, max_depth=3, n_estimators=100, subsample=0.8; total time=  10.2s
[CV] END colsample_bytree=0.8, learning_rate=0.01, max_depth=3, n_estimators=100, subsample=1.0; total time=  10.6s
[CV] END colsample_bytree=0.8, learning_rate=0.01, max_depth=3, n_estimators=100, subsample=0.8; total time=  10.7s
[CV] END colsample_bytree=0.8, learning_rate=0.01, max_depth=3, n_estimators=100, subsample=1.0; total time=  11.0s
[CV] END colsample_bytree=0.8, learning_rate=0.01, max_depth=3, n_estimators=100, subsample=1.0; total time=  11.1s
[CV] END colsample_bytree=0.8, learning_rate=0.01, max_depth=3, n_estimators=100, subsample=0.8; total time=  11.2s


Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.



[CV] END colsample_bytree=0.8, learning_rate=0.01, max_depth=3, n_estimators=200, subsample=0.8; total time=  20.5s
[CV] END colsample_bytree=0.8, learning_rate=0.01, max_depth=3, n_estimators=200, subsample=1.0; total time=  21.1s


Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.



[CV] END colsample_bytree=0.8, learning_rate=0.01, max_depth=3, n_estimators=200, subsample=1.0; total time=  21.8s
[CV] END colsample_bytree=0.8, learning_rate=0.01, max_depth=3, n_estimators=200, subsample=0.8; total time=  22.1s
[CV] END colsample_bytree=0.8, learning_rate=0.01, max_depth=3, n_estimators=200, subsample=0.8; total time=  22.1s
[CV] END colsample_bytree=0.8, learning_rate=0.01, max_depth=3, n_estimators=200, subsample=1.0; total time=  22.4s


Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.



[CV] END colsample_bytree=0.8, learning_rate=0.01, max_depth=5, n_estimators=100, subsample=0.8; total time=  11.7s
[CV] END colsample_bytree=0.8, learning_rate=0.01, max_depth=5, n_estimators=100, subsample=0.8; total time=  12.0s


Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.



[CV] END colsample_bytree=0.8, learning_rate=0.01, max_depth=5, n_estimators=100, subsample=1.0; total time=  12.2s
[CV] END colsample_bytree=0.8, learning_rate=0.01, max_depth=5, n_estimators=100, subsample=0.8; total time=  12.4s


Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.



[CV] END colsample_bytree=0.8, learning_rate=0.01, max_depth=3, n_estimators=300, subsample=0.8; total time=  30.5s
[CV] END colsample_bytree=0.8, learning_rate=0.01, max_depth=3, n_estimators=300, subsample=0.8; total time=  30.5s


Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.



[CV] END colsample_bytree=0.8, learning_rate=0.01, max_depth=3, n_estimators=300, subsample=0.8; total time=  31.4s
[CV] END colsample_bytree=0.8, learning_rate=0.01, max_depth=3, n_estimators=300, subsample=1.0; total time=  31.8s


Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.



[CV] END colsample_bytree=0.8, learning_rate=0.01, max_depth=5, n_estimators=100, subsample=1.0; total time=  12.0s
[CV] END colsample_bytree=0.8, learning_rate=0.01, max_depth=5, n_estimators=100, subsample=1.0; total time=  11.8s


Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.



[CV] END colsample_bytree=0.8, learning_rate=0.01, max_depth=3, n_estimators=300, subsample=1.0; total time=  30.7s


Parameters: { "use_label_encoder" } are not used.



[CV] END colsample_bytree=0.8, learning_rate=0.01, max_depth=3, n_estimators=300, subsample=1.0; total time=  31.4s


Parameters: { "use_label_encoder" } are not used.



[CV] END colsample_bytree=0.8, learning_rate=0.01, max_depth=5, n_estimators=200, subsample=0.8; total time=  23.5s
[CV] END colsample_bytree=0.8, learning_rate=0.01, max_depth=5, n_estimators=200, subsample=0.8; total time=  23.7s
[CV] END colsample_bytree=0.8, learning_rate=0.01, max_depth=5, n_estimators=200, subsample=1.0; total time=  23.6s


Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.



[CV] END colsample_bytree=0.8, learning_rate=0.01, max_depth=5, n_estimators=200, subsample=0.8; total time=  24.8s
[CV] END colsample_bytree=0.8, learning_rate=0.01, max_depth=7, n_estimators=100, subsample=0.8; total time=  13.7s
[CV] END colsample_bytree=0.8, learning_rate=0.01, max_depth=7, n_estimators=100, subsample=0.8; total time=  14.1s
[CV] END colsample_bytree=0.8, learning_rate=0.01, max_depth=5, n_estimators=200, subsample=1.0; total time=  24.3s


Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.



[CV] END colsample_bytree=0.8, learning_rate=0.01, max_depth=5, n_estimators=200, subsample=1.0; total time=  24.5s


In [2]:
import os
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, f1_score
import matplotlib.pyplot as plt
import seaborn as sns


train_path = '/home/dc0206/Desktop/3.9.20/DM/processed_train.csv'
test_path = '/home/dc0206/Desktop/3.9.20/DM/test000.csv'
output_dir = '/home/dc0206/Desktop/3.9.20/DM'

os.makedirs(output_dir, exist_ok=True)

sample_size = 10000  
train_data = pd.read_csv(train_path, engine='python', on_bad_lines='skip').sample(n=sample_size, random_state=42)
test_data = pd.read_csv(test_path, engine='python', on_bad_lines='skip')


train_data['text'] = train_data['text'].fillna("")
train_data['emotion'] = train_data['emotion'].fillna("unknown")


label_encoder = LabelEncoder()
train_data['label_encoded'] = label_encoder.fit_transform(train_data['emotion'])

tfidf_vectorizer = TfidfVectorizer(
    max_features=6000,  
    stop_words='english',  
    ngram_range=(1, 2)  
)
X_train = tfidf_vectorizer.fit_transform(train_data['text'])
y_train = train_data['label_encoded']


test_data['text'] = test_data['text'].fillna("")
X_test = tfidf_vectorizer.transform(test_data['text'])


X_train_split, X_val_split, y_train_split, y_val_split = train_test_split(X_train, y_train, test_size=0.2, random_state=42)


X_train_split = X_train_split.toarray()
X_val_split = X_val_split.toarray()
X_test = X_test.toarray()

rf_model = RandomForestClassifier(random_state=42)

param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [10, 20, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'bootstrap': [True, False]
}

grid_search = GridSearchCV(
    estimator=rf_model,
    param_grid=param_grid,
    scoring='f1_weighted', 
    cv=3,
    verbose=2,
    n_jobs=-1
)

grid_search.fit(X_train_split, y_train_split)
print("最佳:", grid_search.best_params_)

optimized_rf = grid_search.best_estimator_
optimized_rf.fit(X_train_split, y_train_split)
y_val_pred = optimized_rf.predict(X_val_split)
val_f1 = f1_score(y_val_split, y_val_pred, average='weighted')
print(f"F1: {val_f1:.4f}")

y_test_pred = optimized_rf.predict(X_test)
predicted_labels = label_encoder.inverse_transform(y_test_pred)

test_data['predicted_emotion'] = predicted_labels
test_output_file = os.path.join(output_dir, "test_results_rf.csv")
test_data[['tweet_id', 'predicted_emotion']].to_csv(test_output_file, index=False)
print(f"結果已保存: {test_output_file}")

if 'emotion' in test_data.columns:
    true_labels = label_encoder.transform(test_data['emotion'])
    class_report = classification_report(true_labels, y_test_pred, target_names=label_encoder.classes_)
    report_file = os.path.join(output_dir, "classification_report_rf.txt")
    with open(report_file, "w") as f:
        f.write(class_report)
    print(f"分類報告已保存: {report_file}")

    conf_matrix = confusion_matrix(true_labels, y_test_pred)
    plt.figure(figsize=(10, 8))
    sns.heatmap(conf_matrix, annot=True, fmt="d", cmap="Blues", xticklabels=label_encoder.classes_, yticklabels=label_encoder.classes_)
    plt.xlabel("Predicted Labels")
    plt.ylabel("True Labels")
    plt.title("Confusion Matrix for Random Forest")
    conf_matrix_file = os.path.join(output_dir, "confusion_matrix_rf.png")
    plt.savefig(conf_matrix_file)
    plt.close()
    print(f"混淆: {conf_matrix_file}")


開始超參數搜索...
Fitting 3 folds for each of 162 candidates, totalling 486 fits
[CV] END bootstrap=True, max_depth=10, min_samples_leaf=1, min_samples_split=5, n_estimators=100; total time=   6.0s
[CV] END bootstrap=True, max_depth=10, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   6.2s
[CV] END bootstrap=True, max_depth=10, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   6.3s
[CV] END bootstrap=True, max_depth=10, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   6.5s
[CV] END bootstrap=True, max_depth=10, min_samples_leaf=1, min_samples_split=5, n_estimators=100; total time=   6.5s
[CV] END bootstrap=True, max_depth=10, min_samples_leaf=1, min_samples_split=5, n_estimators=100; total time=   7.0s
[CV] END bootstrap=True, max_depth=10, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time=  11.2s
[CV] END bootstrap=True, max_depth=10, min_samples_leaf=1, min_samples_split=5, n_estimators=200; total tim



[CV] END bootstrap=False, max_depth=None, min_samples_leaf=1, min_samples_split=10, n_estimators=100; total time= 2.1min
[CV] END bootstrap=False, max_depth=None, min_samples_leaf=1, min_samples_split=5, n_estimators=200; total time= 4.2min
[CV] END bootstrap=False, max_depth=None, min_samples_leaf=1, min_samples_split=5, n_estimators=200; total time= 4.0min
[CV] END bootstrap=False, max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=300; total time= 5.8min
[CV] END bootstrap=False, max_depth=None, min_samples_leaf=2, min_samples_split=2, n_estimators=100; total time= 1.7min
[CV] END bootstrap=False, max_depth=None, min_samples_leaf=2, min_samples_split=2, n_estimators=100; total time= 1.8min
[CV] END bootstrap=False, max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=300; total time= 6.1min
[CV] END bootstrap=False, max_depth=None, min_samples_leaf=2, min_samples_split=2, n_estimators=100; total time= 1.9min
[CV] END bootstrap=False, max_depth=Non

更多naive_bayes的嘗試用multi是因為有查到他比較能處理不平衡的資料

In [4]:
import os
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, confusion_matrix, f1_score
import matplotlib.pyplot as plt
import seaborn as sns


train_path = '/home/dc0206/Desktop/3.9.20/DM/processed_train.csv'
test_path = '/home/dc0206/Desktop/3.9.20/DM/test000.csv'
output_dir = '/home/dc0206/Desktop/3.9.20/DM'
os.makedirs(output_dir, exist_ok=True)


sample_size = 6000
train_data = pd.read_csv(train_path, engine='python', on_bad_lines='skip').sample(n=sample_size, random_state=42)
test_data = pd.read_csv(test_path, engine='python', on_bad_lines='skip')
train_data['text'] = train_data['text'].fillna("")
train_data['emotion'] = train_data['emotion'].fillna("unknown")

label_encoder = LabelEncoder()
train_data['label_encoded'] = label_encoder.fit_transform(train_data['emotion'])

tfidf_vectorizer = TfidfVectorizer(
    max_features=4000,  
    stop_words='english',  
    ngram_range=(1, 2)  
)
X_train = tfidf_vectorizer.fit_transform(train_data['text'])
y_train = train_data['label_encoded']
test_data['text'] = test_data['text'].fillna("")
X_test = tfidf_vectorizer.transform(test_data['text'])
X_train_split, X_val_split, y_train_split, y_val_split = train_test_split(X_train, y_train, test_size=0.2, random_state=42)
nb_model = MultinomialNB()

nb_model.fit(X_train_split, y_train_split)

y_val_pred = nb_model.predict(X_val_split)
val_f1 = f1_score(y_val_split, y_val_pred, average='weighted')
print(f"F1: {val_f1:.4f}")

y_test_pred = nb_model.predict(X_test)
predicted_labels = label_encoder.inverse_transform(y_test_pred)

test_data['predicted_emotion'] = predicted_labels
test_output_file = os.path.join(output_dir, "test_results_nb.csv")
test_data[['tweet_id', 'predicted_emotion']].to_csv(test_output_file, index=False)
print(f"结果已保存: {test_output_file}")


if 'emotion' in test_data.columns:
    true_labels = label_encoder.transform(test_data['emotion'])
    class_report = classification_report(true_labels, y_test_pred, target_names=label_encoder.classes_)
    report_file = os.path.join(output_dir, "classification_report_nb.txt")
    with open(report_file, "w") as f:
        f.write(class_report)
    print(f"class report: {report_file}")

    
    conf_matrix = confusion_matrix(true_labels, y_test_pred)
    plt.figure(figsize=(10, 8))
    sns.heatmap(conf_matrix, annot=True, fmt="d", cmap="Blues", xticklabels=label_encoder.classes_, yticklabels=label_encoder.classes_)
    plt.xlabel("Predicted Labels")
    plt.ylabel("True Labels")
    plt.title("Confusion Matrix for Naive Bayes")
    conf_matrix_file = os.path.join(output_dir, "confusion_matrix_nb.png")
    plt.savefig(conf_matrix_file)
    plt.close()
    print(f"混淆: {conf_matrix_file}")


F1: 0.3588
结果已保存: /home/dc0206/Desktop/3.9.20/DM/test_results_nb.csv
