# 0. Setup

In [1]:
# Mount Google Drive at /content/drive (Colab-only) so files stored under
# /content/drive/MyDrive can be read and written by this notebook.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
!pip install openai

import pandas as pd
import csv
import os
from openai import OpenAI
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score, confusion_matrix
import numpy as np



In [3]:
# Configure the API key.
# Credentials must never be stored in the notebook: the key is read from the
# OPENAI_API_KEY environment variable, falling back to an interactive prompt.
# NOTE(review): a key was previously hardcoded in this cell; treat it as
# leaked and revoke it in the OpenAI dashboard.
api_key = os.environ.get("OPENAI_API_KEY")
if not api_key:
    # Local import: only needed for the interactive fallback.
    from getpass import getpass
    api_key = getpass("OpenAI API key: ")
# Configure the model parameters.
gpt_model = "gpt-4-1106-preview"
temperature = 0.1
top_p = 0.5
penal = 0.0  # presumably a frequency/presence penalty value; confirm where it is used
max_input_token_length = 4096
# Initialise the OpenAI client.
client = OpenAI(api_key=api_key)

# 1. Functions for Processing Dataset

In [4]:
# 构造函数：加载数据，处理数据
def import_data(file_name):
    """Load a CSV file into a DataFrame and prepare it for classification.

    The file is parsed with ``csv.reader``: the first row supplies the column
    names and each following row becomes one record (values stay strings).
    The ``ID``, ``Technique`` and ``Vulnerability`` columns are removed when
    present, and ``Manipulative`` (when present) is cast to ``category``.

    Args:
        file_name: Path to a UTF-8 encoded, comma-separated file.

    Returns:
        pandas.DataFrame holding the remaining columns.
    """
    # newline='' leaves line-ending handling to the csv module, so newlines
    # embedded in quoted fields are preserved.
    with open(file_name, 'r', newline='', encoding='utf-8') as handle:
        rows = list(csv.reader(handle, delimiter=',', quoting=csv.QUOTE_MINIMAL))

    # The first row is the header; everything after it is data.
    header = rows[0] if rows else None
    records = rows[1:]
    frame = pd.DataFrame(records, columns=header)

    # Drop the columns that are not needed downstream, if they exist.
    for unused in ('ID', 'Technique', 'Vulnerability'):
        if unused in frame.columns:
            frame = frame.drop([unused], axis=1)

    # Store the label column as a categorical.
    if 'Manipulative' in frame.columns:
        frame['Manipulative'] = frame['Manipulative'].astype('category')
    return frame

In [5]:
# 构造函数：分割数据集
def split_train_test(dataframe, train_ratio, test_ratio, random_state=17):
    """Split a DataFrame into shuffled, stratified train and test sets.

    The rows are shuffled, split with ``train_test_split`` stratified on the
    ``Manipulative`` column, and each resulting part is shuffled once more.
    Every step uses the same ``random_state``.

    Args:
        dataframe: DataFrame containing a ``Manipulative`` column.
        train_ratio: Value passed to ``train_test_split`` as ``train_size``.
        test_ratio: Value passed to ``train_test_split`` as ``test_size``.
        random_state: Seed used for every shuffle and for the split.

    Returns:
        Tuple ``(train, test)`` of DataFrames with fresh 0..n-1 indexes.
    """
    def _reshuffle(frame):
        # frac=1 samples every row, i.e. a full permutation of the frame.
        return frame.sample(frac=1, random_state=random_state).reset_index(drop=True)

    shuffled = _reshuffle(dataframe)
    split_options = dict(
        train_size=train_ratio,
        test_size=test_ratio,
        stratify=shuffled['Manipulative'],
        random_state=random_state,
    )
    train_part, test_part = train_test_split(shuffled, **split_options)
    return _reshuffle(train_part), _reshuffle(test_part)

# 2. Process Dataset

In [6]:
# Load and preprocess the dataset with import_data, then print how many rows
# carry each value of the 'Manipulative' column.
dataframe = import_data('/content/drive/MyDrive/COLING2025 MetanlManip Intent/dataset/mentalmanip_con.csv')
total_counts = dataframe['Manipulative'].value_counts()
print("------Total counts in dataset------")
print(total_counts)

------Total counts in dataset------
Manipulative
1    2016
0     899
Name: count, dtype: int64


In [7]:
# Split the dataset: 0.7 is passed as the train ratio and 0.3 as the test ratio.
train, test = split_train_test(dataframe, 0.7, 0.3)
print(f"Train samples = {len(train)}, Test samples = {len(test)}")
# Count the rows per 'Manipulative' value in the training set.
train_counts = train['Manipulative'].value_counts()
print("\n------Counts in the training set------")
print(train_counts)
# Count the rows per 'Manipulative' value in the test set.
test_counts = test['Manipulative'].value_counts()
print("\n------Counts in the test set------")
print(test_counts)

Train samples = 2040, Test samples = 875

------Counts in the training set------
Manipulative
1    1411
0     629
Name: count, dtype: int64

------Counts in the test set------
Manipulative
1    605
0    270
Name: count, dtype: int64


In [8]:
# Save the training and test sets as CSV files on the mounted Drive
# (index=False keeps the row index out of the files).
train.to_csv('/content/drive/MyDrive/COLING2025 MetanlManip Intent/dataset/train.csv', index=False)
test.to_csv('/content/drive/MyDrive/COLING2025 MetanlManip Intent/dataset/test.csv', index=False)