In [None]:
# Install necessary packages
!pip install transformers sentencepiece scikit-learn

import pandas as pd
import os
import pathlib
import csv
import sentencepiece
from transformers import T5Tokenizer, T5ForConditionalGeneration
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score, f1_score
import torch

Successfully installed huggingface-hub-0.16.4 safetensors-0.3.2 sentencepiece-0.1.99 tokenizers-0.13.3 transformers-4.31.0


In [None]:
# Clone the repository
!git clone https://github.com/NUS-IDS/CW-CURE

# Get the path to the cloned repository
path_to_folder = pathlib.Path("CW-CURE/CURE_data")

# Load the data (use only 100 rows from each dataset)
alzheimers_data = pd.read_csv("CW-CURE/CURE_data/alzheimers/train.tsv", sep="\t", quoting=csv.QUOTE_NONE).head(800)
cancer_data = pd.read_csv("CW-CURE/CURE_data/cancer/train.tsv", sep="\t", quoting=csv.QUOTE_NONE).head(800)
diabetes_data = pd.read_csv("CW-CURE/CURE_data/diabetes/train.tsv", sep="\t", quoting=csv.QUOTE_NONE).head(800)
depression_data = pd.read_csv("CW-CURE/CURE_data/depression/train.tsv", sep="\t", quoting=csv.QUOTE_NONE).head(800)

# Combine all datasets into a single dataframe
combined_data = pd.concat([alzheimers_data, cancer_data, diabetes_data, depression_data], ignore_index=True)

# Shuffle the combined dataset
combined_data = combined_data.sample(frac=1).reset_index(drop=True)

# Split the combined dataset into train and test sets (adjust the test_size as needed)
train_data, test_data = train_test_split(combined_data, test_size=0.2, random_state=42)

# Store the original labeled training data in a separate variable
labeled_train_data = train_data.copy()

# Initialize T5 tokenizer and model
tokenizer = T5Tokenizer.from_pretrained('t5-small')
model = T5ForConditionalGeneration.from_pretrained('t5-small')


Cloning into 'CW-CURE'...
remote: Enumerating objects: 73, done.[K
remote: Counting objects: 100% (73/73), done.[K
remote: Compressing objects: 100% (49/49), done.[K
remote: Total 73 (delta 32), reused 59 (delta 24), pack-reused 0[K
Receiving objects: 100% (73/73), 1.05 MiB | 7.43 MiB/s, done.
Resolving deltas: 100% (32/32), done.


Downloading (…)ve/main/spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

You are using the legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This means that tokens that come after special tokens will not be properly handled. We recommend you to read the related pull request available at https://github.com/huggingface/transformers/pull/24565


Downloading (…)lve/main/config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

In [None]:
# Self-training loop
#
# Each iteration:
#   1. fine-tunes the model on the current training pool, framing the task as
#      text-to-text with the target strings 'true' / 'false';
#   2. generates a label for every tweet in the unlabeled pool (test_data);
#   3. rebuilds the training pool as the original labeled rows plus the
#      pseudo-labeled rows (stacked as new rows, not as an extra column).
#
# NOTE(review): test_data doubles as the unlabeled pool here. Its true labels are
# never used for training, but its tweets are; keep that in mind when reporting
# metrics on it.
num_iterations = 1  # Number of self-training iterations
batch_size = 16     # Mini-batch size for fine-tuning and for generation
learning_rate = 3e-4

torch.manual_seed(42)  # dropout is active during fine-tuning; seed it for repeatable runs
device = model.device  # run on whichever device the model already lives on
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

# Always start from the original labeled rows so re-running the cell does not
# stack pseudo-labeled rows from a previous run on top of each other.
train_data = labeled_train_data.copy()
test_tweets = test_data['Tweet'].astype(str).tolist()  # Ensure tweets are strings

for i in range(num_iterations):
    train_tweets = train_data['Tweet'].astype(str).tolist()  # Ensure tweets are strings
    # NOTE(review): assumes 'IsCheckworthy' holds 0/1 (or boolean) values, so that
    # astype(bool) maps them to the intended classes — confirm against the TSV schema.
    train_targets = ['true' if flag else 'false' for flag in train_data['IsCheckworthy'].astype(bool)]

    # Train the model on the labeled train_data (one pass, mini-batched)
    model.train()
    for start in range(0, len(train_tweets), batch_size):
        stop = start + batch_size
        batch = tokenizer(train_tweets[start:stop], return_tensors='pt', padding=True, truncation=True).to(device)
        target_ids = tokenizer(train_targets[start:stop], return_tensors='pt', padding=True)['input_ids']
        # Padding positions in the targets must not contribute to the loss
        target_ids[target_ids == tokenizer.pad_token_id] = -100

        loss = model(
            input_ids=batch['input_ids'],
            attention_mask=batch['attention_mask'],
            labels=target_ids.to(device),
        ).loss
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    # Generate pseudo-labels for the unlabeled test_data using the trained model
    model.eval()
    decoded_labels = []
    with torch.no_grad():
        for start in range(0, len(test_tweets), batch_size):
            batch = tokenizer(test_tweets[start:start + batch_size], return_tensors='pt', padding=True, truncation=True).to(device)
            generated = model.generate(
                input_ids=batch['input_ids'],
                attention_mask=batch['attention_mask'],
                max_new_tokens=5,  # the targets are single short words
            )
            # Decode the generated pseudo-labels
            decoded_labels.extend(tokenizer.batch_decode(generated, skip_special_tokens=True))

    # Convert pseudo-labels to binary values (anything other than 'true' counts as 0)
    pseudo_labels = [1 if label.strip().lower() == 'true' else 0 for label in decoded_labels]

    # Add the pseudo-labeled data to the training set: the predictions belong to the
    # test_data rows, so they overwrite that copy's label column and the rows are
    # appended below the original labeled rows.
    pseudo_labeled_data = test_data.copy()
    pseudo_labeled_data['IsCheckworthy'] = pseudo_labels
    train_data = pd.concat([labeled_train_data, pseudo_labeled_data], axis=0, ignore_index=True)




In [None]:

# Calculate metrics for the model's predictions.
#
# `pseudo_labels` holds one 0/1 prediction per row of test_data, in row order
# (produced by the self-training cell), so it is scored against the labels of
# test_data. Comparing two columns of combined_data would not involve the model
# at all, because combined_data is never updated with predictions.
label_column = 'IsCheckworthy'  # NOTE(review): presumably the target the pseudo-labels predict — confirm

# NOTE(review): assumes the label column holds 0/1 (or boolean) values; astype(bool)
# turns every non-empty string into True — confirm against the TSV schema.
true_labels = test_data[label_column].astype(bool).to_numpy()
predicted_labels = pd.Series(pseudo_labels, dtype=bool).to_numpy()
assert len(true_labels) == len(predicted_labels), "pseudo_labels must have one entry per test_data row"

# Calculate TP, FP, TN, FN
TP = int((true_labels & predicted_labels).sum())
FP = int((~true_labels & predicted_labels).sum())
TN = int((~true_labels & ~predicted_labels).sum())
FN = int((true_labels & ~predicted_labels).sum())

# Compute precision, recall, and F1 score; zero_division=0 reports 0.0 instead of
# dividing by zero when there are no predicted (or no actual) positives
precision = precision_score(true_labels, predicted_labels, zero_division=0)
recall = recall_score(true_labels, predicted_labels, zero_division=0)
f1 = f1_score(true_labels, predicted_labels, zero_division=0)

print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)


Precision: 1.0
Recall: 1.0
F1 Score: 1.0
