<a href="https://colab.research.google.com/github/ChuLinh02/AI4LI/blob/main/MentalQLM.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Import libraries
import pandas as pd
import numpy as np
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from scipy.spatial.distance import cdist
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.stats import zscore

In [2]:
# Mount Google Drive (Colab only) so files under /content/gdrive can be read;
# the dataset paths used in the usage cells below live there.
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [3]:
class DataSelector:
    """Select a small, diverse subset of (post, response) pairs from a CSV file.

    Pipeline (run end-to-end by ``select_and_save_data``):
      1. score every pair with the perplexity of a causal language model,
      2. discard pairs whose perplexity z-score is, in absolute value, at or
         above ``z_score_threshold`` (and pairs with a non-finite perplexity),
      3. embed the remaining pairs with TF-IDF,
      4. pick up to ``k`` of them with k-center greedy selection,
      5. write the picked pairs to ``output_path``.

    Attributes:
        data: list of ``(post, response)`` tuples read from ``data_path``.
        perplexities: one perplexity per entry of ``data`` (set by ``filter_data``).
        filtered_indices: positions in ``data`` that survived filtering.
        filtered_data: the surviving pairs, in the order of ``filtered_indices``.
        vector_embeddings: dense TF-IDF matrix, one row per entry of ``filtered_data``.
    """

    def __init__(self, model_name: str, data_path: str, output_path: str, k: int, z_score_threshold: float):
        """Load the tokenizer, the language model and the dataset.

        Args:
            model_name: model identifier passed to ``from_pretrained``.
            data_path: CSV file containing ``post`` and ``response`` columns.
            output_path: destination CSV for the selected pairs.
            k: number of pairs to select.
            z_score_threshold: pairs with ``|z| >= z_score_threshold`` are dropped.
        """
        self.model_name = model_name
        self.data_path = data_path
        self.output_path = output_path
        self.k = k
        self.z_score_threshold = z_score_threshold

        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        if self.tokenizer.pad_token_id is None:
            # Some tokenizers ship without a pad token; reuse EOS so padding works.
            self.tokenizer.pad_token_id = self.tokenizer.eos_token_id

        self.model = AutoModelForCausalLM.from_pretrained(model_name)
        self.model.eval()

        self.data = self.load_data()
        self.perplexities = None
        self.filtered_indices = None
        self.filtered_data = None
        self.vector_embeddings = None

    def load_data(self):
        """Read ``data_path`` and return a list of ``(post, response)`` tuples."""
        df = pd.read_csv(self.data_path)
        return list(zip(df['post'], df['response']))

    def compute_perplexity(self, instruction: str, output: str) -> float:
        """Return the language-model perplexity of ``"{instruction} {output}"``.

        The text is truncated to the tokenizer's ``model_max_length``. Positions
        that the attention mask marks as padding are excluded from the loss.
        Returns ``nan`` when the text has fewer than two tokens, because there is
        no next-token prediction to score.
        """
        text = f"{instruction} {output}"
        encodings = self.tokenizer(
            text,
            return_tensors="pt",
            truncation=True,
            max_length=self.tokenizer.model_max_length,
            padding=True
        )

        input_ids = encodings.input_ids
        attention_mask = encodings.attention_mask

        if input_ids.size(1) < 2:
            return float("nan")

        with torch.no_grad():
            outputs = self.model(input_ids, attention_mask=attention_mask)
            logits = outputs.logits

        # Position t predicts token t + 1.
        shift_logits = logits[:, :-1, :].contiguous()
        shift_labels = input_ids[:, 1:].clone()

        # Mask padding via the attention mask rather than by comparing with
        # pad_token_id: the pad token may be aliased to EOS, and genuine EOS
        # tokens must still be scored.
        shift_labels[attention_mask[:, 1:] == 0] = -100

        loss_fct = torch.nn.CrossEntropyLoss(ignore_index=-100)
        loss = loss_fct(
            shift_logits.view(-1, shift_logits.size(-1)),
            shift_labels.reshape(-1)
        )

        return torch.exp(loss).item()

    def filter_data(self):
        """Score every pair and keep those whose perplexity is not an outlier.

        Sets ``perplexities`` (aligned with ``data``), ``filtered_indices``
        (positions in ``data`` that were kept) and ``filtered_data``.
        """
        self.perplexities = np.array(
            [self.compute_perplexity(x, y) for x, y in self.data], dtype=float
        )

        # A single NaN/inf would turn every z-score into NaN and silently drop
        # the whole dataset, so z-scores are computed over finite values only.
        finite = np.isfinite(self.perplexities)
        keep = np.zeros(self.perplexities.shape, dtype=bool)
        scores = self.perplexities[finite]
        if scores.size:
            if scores.min() == scores.max():
                # Zero variance: every z-score is 0 by definition (zscore() would
                # return NaN here and discard everything).
                keep[finite] = 0.0 < self.z_score_threshold
            else:
                keep[finite] = np.abs(zscore(scores)) < self.z_score_threshold

        self.filtered_indices = np.flatnonzero(keep)
        self.filtered_data = [self.data[i] for i in self.filtered_indices]

    def vectorize_data(self):
        """Build dense TF-IDF vectors for ``filtered_data``.

        Raises:
            ValueError: if no samples survived filtering.
        """
        if not self.filtered_data:
            raise ValueError("No samples left after perplexity filtering; cannot build TF-IDF vectors.")
        tfidf = TfidfVectorizer()
        # str() guards against non-string cells (e.g. NaN read by pandas).
        text_data = [" ".join(map(str, pair)) for pair in self.filtered_data]
        self.vector_embeddings = tfidf.fit_transform(text_data).toarray()

    def k_center_greedy(self) -> list:
        """Pick up to ``k`` mutually distant samples from ``vector_embeddings``.

        The first center is the filtered sample whose perplexity is closest to
        the mean perplexity of the filtered samples. Each following center is
        the not-yet-selected sample farthest (Euclidean) from its nearest
        already-selected center.

        Returns:
            Distinct row indices into ``vector_embeddings`` / ``filtered_data``.
            At most ``min(k, number of filtered samples)`` indices are returned.

        Raises:
            ValueError: if embeddings are missing or not aligned with perplexities.
        """
        if self.vector_embeddings is None:
            raise ValueError("vectorize_data() must be called before k_center_greedy().")

        embeddings = self.vector_embeddings
        n_samples = len(embeddings)
        if n_samples == 0 or self.k <= 0:
            return []
        n_centers = min(self.k, n_samples)

        # Perplexities are stored for ALL samples; restrict them to the filtered
        # ones so that positions line up with the embedding rows.
        ppl = np.asarray(self.perplexities, dtype=float)
        kept = getattr(self, "filtered_indices", None)
        if kept is not None:
            ppl = ppl[kept]
        if len(ppl) != n_samples:
            raise ValueError("perplexities are not aligned with vector_embeddings.")

        seed = int(np.argmin(np.abs(ppl - ppl.mean())))
        selected_indices = [seed]

        # min_dist[i] = distance from sample i to its nearest selected center.
        # Selected samples are pinned to -inf so they can never be picked again,
        # even when the data contains exact duplicates.
        min_dist = cdist(embeddings[[seed]], embeddings, metric="euclidean")[0]
        min_dist[seed] = -np.inf

        while len(selected_indices) < n_centers:
            new_center = int(np.argmax(min_dist))
            selected_indices.append(new_center)
            to_new = cdist(embeddings[[new_center]], embeddings, metric="euclidean")[0]
            min_dist = np.minimum(min_dist, to_new)
            min_dist[new_center] = -np.inf

        return selected_indices

    def select_and_save_data(self):
        """Run filter -> vectorize -> k-center selection and write the result CSV.

        The output has the columns ``query`` and ``gpt-3.5-turbo`` holding the
        selected posts and responses respectively.
        """
        self.filter_data()
        self.vectorize_data()
        selected_indices = self.k_center_greedy()
        final_selected_data = [self.filtered_data[i] for i in selected_indices]
        selected_df = pd.DataFrame(final_selected_data, columns=['query', 'gpt-3.5-turbo'])
        selected_df.to_csv(self.output_path, index=False)
        print(f"Selected data saved to {self.output_path}")

In [4]:
# Usage: run the selection pipeline on the DR (reddit) training split.
# GPT-2 scores the perplexity of every (post, response) pair, pairs whose
# |z-score| is >= 2 are dropped, and k=10 samples are requested from the rest.
# The result is written to selected_reddit_train_data.csv in the working directory.
selector = DataSelector(
      model_name="gpt2",
      data_path="/content/gdrive/MyDrive/AI4LI/IMHI dataset/train_data/complete_data/DR/reddit_train.csv",
      output_path="selected_reddit_train_data.csv",
      k=10,
      z_score_threshold=2
  )

selector.select_and_save_data()

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Selected data saved to selected_reddit_train_data.csv


In [5]:
# Usage: same pipeline on the dreaddit training split (GPT-2 perplexity,
# |z-score| < 2 kept, k=10 samples requested). Note that this rebinds the
# notebook-level name `selector` and reloads the tokenizer and model.
# The result is written to selected_dreaddit_train_data.csv in the working directory.
selector = DataSelector(
      model_name="gpt2",
      data_path="/content/gdrive/MyDrive/AI4LI/IMHI dataset/train_data/complete_data/dreaddit/dreaddit-train.csv",
      output_path="selected_dreaddit_train_data.csv",
      k=10,
      z_score_threshold=2
  )

selector.select_and_save_data()

Selected data saved to selected_dreaddit_train_data.csv
