## Feature-based Approach: Text Embedding + Random Forest Classifier

In [2]:
import json
import pandas as pd

In [3]:
# Locations of the three dataset splits, relative to the notebook's working directory.
train_path, dev_path, test_path = (
    '../Data/team_train.json',
    '../Data/team_dev.json',
    '../Data/team_test.json',
)

# Parse each split from its UTF-8 encoded JSON file.
with open(train_path, 'r', encoding='utf-8') as file:
    data = json.loads(file.read())

with open(dev_path, 'r', encoding='utf-8') as file:
    data_dev = json.loads(file.read())

with open(test_path, 'r', encoding='utf-8') as file:
    data_test = json.loads(file.read())

In [None]:
import pandas as pd

def build_dataframe(data, has_label=True):
    """
    Turn a sequence of records into a DataFrame.

    Parameters
    ----------
    data : iterable
        Records indexable by position: ``record[0]`` is the ID, ``record[1]``
        and ``record[2]`` are the two posts, and ``record[3]`` is the label
        (only read when ``has_label`` is True).
    has_label : bool
        When True, a ``'class'`` column is added from ``record[3]``.

    Returns
    -------
    pandas.DataFrame
        Columns ``'ID'``, ``'post1'``, ``'post2'`` and, if requested, ``'class'``.
    """
    # Materialize once so the records can be traversed per column.
    rows = list(data)

    frame = pd.DataFrame({
        'ID': [row[0] for row in rows],
        'post1': [row[1] for row in rows],
        'post2': [row[2] for row in rows],
    })

    if not has_label:
        return frame

    frame['class'] = [row[3] for row in rows]
    return frame

In [8]:
# The train and dev splits carry labels; the test split does not.
df_train, df_dev = (
    build_dataframe(split, has_label=True) for split in (data, data_dev)
)
df_test = build_dataframe(data_test, has_label=False)

In [11]:
# Data cleaning
import re
def clean_text(text):
    """
    Collapse every whitespace run that contains a newline into a single '。'.

    Whitespace runs without a newline are left untouched, so ordinary spaces
    inside a line survive.
    """
    newline_run = re.compile(r'\s*\n+\s*')
    return newline_run.sub('。', text)

# Model: Text Embedding 3 (`text-embedding-3-large`)
Get embeddings for the training, validation, and test sets.

In [14]:
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from scipy.spatial.distance import pdist, squareform
import numpy as np
import matplotlib as plt
import seaborn as sns 
from sklearn.model_selection import train_test_split

In [None]:
from openai import OpenAI

import os

# Take the key from the OPENAI_API_KEY environment variable so the real secret never has
# to be pasted into the notebook; the placeholder remains as a fallback when it is unset.
client = OpenAI(api_key=os.environ.get('OPENAI_API_KEY', 'your OpenAI API key'))

def build_embeddings(data, output_path=None, has_label=True, model="text-embedding-3-large"):
    """
    Generate embeddings for the input data and return as a DataFrame.

    Parameters
    ----------
    data : list
        Input data in the format (ID, post1, post2, class) or (ID, post1, post2).
    output_path : str or None
        If provided, saves the resulting DataFrame as a CSV file. In the CSV the
        ``embeddings_concat`` column is written as a full Python-list literal
        (the in-memory frame keeps NumPy arrays).
    has_label : bool
        True if the data includes class labels (train/dev), False for test set.
    model : str
        The name of the embedding model to use.

    Returns
    -------
    df : pandas.DataFrame
        A DataFrame containing the original data along with generated embeddings:
        ``embeds1`` / ``embeds2`` hold the per-post embeddings and
        ``embeddings_concat`` holds their concatenation as a 1-D NumPy array.
    """
    IDs, posts1, posts2, labels = [], [], [], []
    embeds1, embeds2 = [], []
    embeddings_concat = []

    for instance in data:
        ID, p1, p2 = instance[0], instance[1], instance[2]
        IDs.append(ID)
        posts1.append(p1)
        posts2.append(p2)
        if has_label:
            labels.append(instance[3])

        # Generate one embedding per post; the text is cleaned before it is sent.
        e1 = client.embeddings.create(input=clean_text(str(p1)), model=model).data[0].embedding
        e2 = client.embeddings.create(input=clean_text(str(p2)), model=model).data[0].embedding
        embeds1.append(e1)
        embeds2.append(e2)

        # Pair-level feature: the two post embeddings joined end to end.
        embeddings_concat.append(np.concatenate([e1, e2]))

    # Build the DataFrame.
    df = pd.DataFrame({
        "ID": IDs,
        "post1": posts1,
        "post2": posts2,
        "embeds1": embeds1,
        "embeds2": embeds2,
        "embeddings_concat": embeddings_concat,
    })

    if has_label:
        df["class"] = labels

    # Write the CSV.
    if output_path:
        # Cells are written with str(); NumPy abbreviates arrays of more than 1000
        # elements with "...", which would silently drop most of each vector.
        # Serialize the concatenated vectors as plain lists instead. `assign`
        # returns a copy, so the returned frame still holds the ndarrays.
        df.assign(
            embeddings_concat=[vec.tolist() for vec in embeddings_concat]
        ).to_csv(output_path, index=False)

    return df


In [None]:
# Embed each split with build_embeddings and save the result as a CSV under ../Data.
# This rebinds df_train / df_dev / df_test, replacing the frames built by build_dataframe.
# Each call requests two embeddings per record from the API, so this cell is slow to re-run.
df_train = build_embeddings(data, output_path="../Data/train_embedding.csv", has_label=True)
df_dev   = build_embeddings(data_dev, output_path="../Data/dev_embedding.csv", has_label=True)
# The test split has no labels, so no 'class' column is produced for it.
df_test  = build_embeddings(data_test, output_path="../Data/test_embedding.csv", has_label=False)

## Random Forest Classifier

In [15]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report
from sklearn.utils import shuffle
from sklearn.model_selection import KFold

In [None]:
# Every cell of 'embeddings_concat' holds one 1-D vector. np.array(column) would give a
# 1-D object array of arrays, which scikit-learn cannot convert to a numeric matrix, so
# stack the rows into a 2-D (n_samples, n_features) array instead.
X_concat_train = np.vstack(df_train['embeddings_concat'].tolist())
y_train = np.array(df_train['class'])

X_concat_dev = np.vstack(df_dev['embeddings_concat'].tolist())
y_dev = np.array(df_dev['class'])

# The test split has no labels, so only its features are built.
X_concat_test = np.vstack(df_test['embeddings_concat'].tolist())

### Training and Validation

In [None]:
# Train the classifier on the concatenated embeddings.
# Features and labels are shuffled together with a fixed seed before cross-validation.
train_shuffle, y_shuffle = shuffle(X_concat_train, y_train, random_state=42)
# Seed the forest so that repeated runs of the search produce the same result.
clf_concat = RandomForestClassifier(random_state=42)
# Hyperparameter tuning: every combination in this grid is evaluated.
param_grid = {
    'n_estimators': [10, 50, 100, 200, 300],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

grid_search = GridSearchCV(
    estimator=clf_concat,
    param_grid=param_grid,
    # Seeded folds keep the cross-validation splits identical between runs.
    cv=KFold(n_splits=5, shuffle=True, random_state=42),
    n_jobs=-1,
    verbose=2,
)
grid_search.fit(train_shuffle, y_shuffle)

# Best model. GridSearchCV fits clones of `clf_concat`, so `clf_concat` itself stays
# unfitted; `best_clf` is the estimator to use for prediction.
best_clf = grid_search.best_estimator_

In [None]:
# Display the hyperparameter combination selected by the grid search.
grid_search.best_params_

In [None]:
# Predict with `best_clf`, the estimator refit by the grid search. `clf_concat` is only
# the template that GridSearchCV clones; it is never fitted, so calling predict on it fails.
# Training
y_pred_train = best_clf.predict(X_concat_train)
print("Training:\n", classification_report(df_train['class'], y_pred_train))
# Validation
y_pred_dev = best_clf.predict(X_concat_dev)
print("Validation:\n", classification_report(df_dev['class'], y_pred_dev))

In [None]:
# Save the test predictions for submission to Kaggle.
# Use `best_clf` (the refit grid-search winner); `clf_concat` itself is never fitted.
y_pred_concat = best_clf.predict(X_concat_test)
test_data = pd.DataFrame()
test_data['y_pred'] = y_pred_concat
# Index by record ID so the CSV's first column is the ID.
test_data.index = df_test['ID']

# Save as .csv
output_file = '../Result/RF_predicted.csv'
test_data.to_csv(output_file)

print(f'Results saved to {output_file}')

Results saved to ../Result/predicted_results_optimized.csv
