In [41]:
import pandas as pd

# Load the five result CSVs that the following cells label, combine and average.
# The file names are listed in the same order as the variables they are bound to.
df_baseline, df_baseline_english, df_trad, df_kd, df_kd_cls = (
    pd.read_csv(csv_name)
    for csv_name in (
        'baseline_chinese_new.csv',
        'baseline_english_new.csv',
        'traditional_baseline.csv',
        'kd_soft.csv',
        'kd_cls.csv',
    )
)


In [42]:
# Give every row of df_kd and df_kd_cls a constant "exp" label (the column is
# created if it is missing and overwritten if it already exists). The labels
# are the ones the export cell looks up later: 'kd_soft_title+content' and
# 'kd_cls_title+content'.
df_kd = df_kd.assign(exp='kd_soft_title+content')
df_kd_cls = df_kd_cls.assign(exp='kd_cls_title+content')

In [43]:
# Rename the 'bert-base' experiment labels in df_baseline to 'bert-base-chinese'.
# Series.replace matches whole cell values, so only rows whose "exp" is exactly
# one of the two keys below are changed; every other label is left untouched.
bert_label_renames = {
    'bert-base_title': 'bert-base-chinese_title',
    'bert-base_title+content': 'bert-base-chinese_title+content',
}
df_baseline['exp'] = df_baseline['exp'].replace(bert_label_renames)


In [44]:
# Stack all result tables into a single frame; ignore_index=True discards the
# per-file row indices and renumbers the rows 0..n-1.
df_combined = pd.concat(
    [
        df_baseline,
        df_baseline_english,
        df_trad,
        df_kd,
        df_kd_cls,
    ],
    ignore_index=True,
)
# Preview the first five rows.
df_combined.head()

Unnamed: 0,fold,exp,best_macro_f1,time_sec,acc,f1_macro,f1_weighted,prec_macro,recall_macro,loss
0,0.0,bert-base-chinese_title,0.659177,78.6,0.671924,0.659177,0.672602,0.655759,0.663533,0.915977
1,0.0,bert-base-chinese_title+content,0.754696,766.9,0.763407,0.754696,0.764848,0.748396,0.764742,0.677751
2,0.0,ckip-bert_title,0.660602,83.9,0.676656,0.656988,0.675883,0.660289,0.654164,0.932246
3,0.0,ckip-bert_title+content,0.745607,770.4,0.749211,0.728313,0.7484,0.731537,0.725525,0.728508
4,0.0,chinese-roberta-wwm-ext _title,0.664365,79.4,0.676656,0.661789,0.677152,0.659125,0.665577,0.890206


In [45]:
# Average every column over the rows (folds) that share the same "exp" label,
# then discard the columns that are not reported: the fold number itself plus
# best_macro_f1, time_sec and loss. "exp" is the group index while the columns
# are dropped and is turned back into a regular column at the end.
df_avg = (
    df_combined
    .groupby('exp')
    .mean()
    .drop(columns=['fold', 'best_macro_f1', "time_sec", "loss"])
    .reset_index()
)
df_avg.head()

Unnamed: 0,exp,acc,f1_macro,f1_weighted,prec_macro,recall_macro
0,Logistic Regression_title,0.650346,0.598446,0.632838,0.660712,0.587795
1,Logistic Regression_title+content,0.693936,0.653348,0.682307,0.698193,0.642581
2,POLITICS_title,0.66993,0.651493,0.669003,0.658533,0.647415
3,POLITICS_title+content,0.776058,0.761262,0.774773,0.770212,0.755911
4,SVM_title,0.660766,0.605537,0.6405,0.695427,0.592785


In [46]:
from datetime import datetime

# Date stamp (YYYY-MM-DD) that becomes part of the output workbook's file name.
current_date = datetime.now().strftime('%Y-%m-%d')

# Split df_avg into two tables: one for the title+content experiments and one
# for the title-only experiments.
# regex=False makes str.contains match the text literally, so the '+' needs no
# escaping. The earlier non-raw pattern 'title\+content' relied on an invalid
# string escape sequence, which Python reports as a warning (SyntaxWarning on
# 3.12+).
is_title_plus_content = df_avg['exp'].str.contains('title+content', regex=False)
is_title_only = df_avg['exp'].str.contains('title', regex=False) & ~is_title_plus_content
df_title_plus_content = df_avg[is_title_plus_content].reset_index(drop=True)
df_title = df_avg[is_title_only].reset_index(drop=True)

# Put the rows of each table into a fixed presentation order.
# .loc with a list of labels returns the rows in exactly that order; it raises
# KeyError if a listed experiment is missing, and experiments that are not
# listed are left out of the exported sheet.
# Note: the space in 'chinese-roberta-wwm-ext _title' is intentional; it matches
# the label as it appears in the combined data.
order_title = [
    'Logistic Regression_title',
    'SVM_title',
    'XGBoost_title',
    'bert-base-chinese_title',
    'chinese-roberta-wwm-ext _title',
    'ckip-bert_title',
    'bert-base_title',
    'roberta-base_title',
    'POLITICS_title'
]
df_title = df_title.set_index('exp').loc[order_title].reset_index()

order_title_plus_content = [
    'Logistic Regression_title+content',
    'SVM_title+content',
    'XGBoost_title+content',
    'bert-base-chinese_title+content',
    'chinese-roberta-wwm-ext _title+content',
    'ckip-bert_title+content',
    'bert-base_title+content',
    'roberta-base_title+content',
    'POLITICS_title+content',
    'kd_soft_title+content',
    'kd_cls_title+content'
]
df_title_plus_content = df_title_plus_content.set_index('exp').loc[order_title_plus_content].reset_index()

# Save both tables to one Excel workbook, one sheet per table.
with pd.ExcelWriter(f'results_{current_date}.xlsx') as writer:
    df_title.to_excel(writer, sheet_name='title', index=False)
    df_title_plus_content.to_excel(writer, sheet_name='title+content', index=False)