# 1. Load Dataset

In [None]:
# Shell escape: print the NVIDIA driver/CUDA versions and the GPU attached to this runtime.
!nvidia-smi

Tue May 13 06:00:11 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.54.15              Driver Version: 550.54.15      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  Tesla T4                       Off |   00000000:00:04.0 Off |                    0 |
| N/A   70C    P0             30W /   70W |    2466MiB /  15360MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

In [None]:
# Install the core libraries into the notebook runtime.
# NOTE(review): the import cell below also imports matplotlib, peft, Bio (biopython),
# langchain, requests and the Google Gen AI SDKs, none of which are installed by this
# line -- confirm they are preinstalled in the target runtime. Versions are not pinned.
!pip install transformers torch scikit-learn pandas

In [None]:
import re
import os
import json
import pandas as pd
import matplotlib.pyplot as plt
import torch
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
#Pre-train BERT:
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from sklearn.preprocessing import LabelEncoder
from peft import get_peft_model, LoraConfig
#Confidence Score System:
import requests
from Bio import Entrez
from langchain import LLMChain, PromptTemplate
from langchain.chains import RetrievalQA
from langchain.vectorstores import FAISS
from langchain.embeddings import OpenAIEmbeddings
#Gemini Model:
from google import genai
from google.genai import types
import base64
import google.generativeai as genai

(1) Covid Fake News Dataset:

In [None]:
# Convert the COVID-19 fake-news JSON files into chat-style JSONL records
# (one systemInstruction plus a user turn and a model turn per tweet).
json_files = [
    'Cleaned_Covid19_Train.json',
    'Cleaned_Covid19_Dev.json',
]

# Instruction attached to every record. Kept verbatim so regenerated files match
# the ones produced by earlier runs of this notebook.
COVID_SYSTEM_INSTRUCTION = "Classification the content is Fake, Real, or Misleading"


def _build_covid_entry(tweet, label):
    """Return one chat-style record for a tweet and its label.

    Args:
        tweet: Raw tweet text.
        label: Label string stored as the model turn.

    Returns:
        dict with `systemInstruction` and `contents` (user turn, model turn).
    """
    # Keep only runs of word characters; punctuation (including URL separators) is dropped.
    reconstructed_tweet = ' '.join(re.findall(r'\b\w+\b', tweet))
    return {
        "systemInstruction": {
            "role": "assistant",
            "parts": [{"text": COVID_SYSTEM_INSTRUCTION}],
        },
        "contents": [
            {
                "role": "user",
                # The prompt ends with an empty "LABEL:" slot; the answer lives in the model turn only.
                "parts": [{"text": f"TRANSCRIPT: \n{reconstructed_tweet}\n\n LABEL:"}],
            },
            {
                "role": "model",
                "parts": [{"text": label}],
            },
        ],
    }


data_dict = {}
for json_file in json_files:
    # Each input file is a single JSON array of {'tweet': ..., 'label': ...} objects.
    with open(json_file, 'r', encoding='utf-8') as file:
        data = json.load(file)

    jsonl_data = []
    for entry in data:
        tweet = entry['tweet']
        label = entry['label']
        jsonl_data.append(_build_covid_entry(tweet, label))

    # Write one JSON object per line next to the input file (.json -> .jsonl).
    output_file = json_file.replace('.json', '.jsonl')
    with open(output_file, 'w', encoding='utf-8') as outfile:
        for entry in jsonl_data:
            json.dump(entry, outfile)
            outfile.write('\n')
    print(f"Processed {json_file} and saved to {output_file}.")
    data_dict[json_file] = jsonl_data

# The processed records are keyed by the original input filenames.
covid_train_data = data_dict['Cleaned_Covid19_Train.json']
covid_dev_data = data_dict['Cleaned_Covid19_Dev.json']

# Preview (the message now names the variable that is actually printed).
print(f"First few entries from covid_train_data:\n{covid_train_data[:5]}")


Processed Cleaned_Covid19_Train.json and saved to Cleaned_Covid19_Train.jsonl.
Processed Cleaned_Covid19_Dev.json and saved to Cleaned_Covid19_Dev.jsonl.
First few entries from claims_test_data:
[{'systemInstruction': {'role': 'assistant', 'parts': [{'text': 'Classification the content is Fake, Real, or Misleading'}]}, 'contents': [{'role': 'user', 'parts': [{'text': 'TRANSCRIPT: \nThe CDC currently reports 99031 deaths In general the discrepancies in death counts between different sources are small and explicable The death toll stands at roughly 100000 people today\n\n LABEL:'}]}, {'role': 'model', 'parts': [{'text': 'real'}]}]}, {'systemInstruction': {'role': 'assistant', 'parts': [{'text': 'Classification the content is Fake, Real, or Misleading'}]}, 'contents': [{'role': 'user', 'parts': [{'text': 'TRANSCRIPT: \nStates reported 1121 deaths a small rise from last Tuesday Southern states reported 640 of those deaths https t co YASGRTT4ux\n\n LABEL:'}]}, {'role': 'model', 'parts': [{'

(2) Health Fact Dataset:

In [None]:
import json
import re
import os

# Convert the HealthFact files (one JSON object per line) into chat-style JSONL records.
json_files = [
    'healthfact_traindata.json',
    'cleaned_healthfact_test.json',
    'cleaned_healthfact_dev.json'
]


def _build_healthfact_entry(claim, explanation, label):
    """Return one chat-style record for a HealthFact claim.

    Args:
        claim: Claim text; reduced to word tokens before being placed in the prompt.
        explanation: Explanation text, inserted unchanged.
        label: Label string stored as the model turn.

    Returns:
        dict with `systemInstruction` and `contents` (user turn, model turn).
    """
    # Keep only words and numbers.
    reconstructed_claim = ' '.join(re.findall(r'\b\w+\b', claim))
    return {
        "systemInstruction": {
            "role": "assistant",
            "parts": [{"text": "You are a helpful assistant."}],
        },
        "contents": [
            {
                "role": "user",
                # The prompt must NOT contain the label: it is the model input (and, later,
                # the BERT classifier input), so embedding the answer leaks the target.
                # It now ends with an empty "LABEL:" slot, like the COVID-19 records.
                "parts": [{"text": f"CLAIM: {reconstructed_claim}\nEXPLANATION: {explanation}\nLABEL:"}],
            },
            {
                "role": "model",
                "parts": [{"text": label}],
            },
        ],
    }


data_dict = {}
for json_file in json_files:
    jsonl_data = []
    with open(json_file, 'r', encoding='utf-8') as file:
        for line in file:
            if not line.strip():
                continue  # blank lines are not records
            try:
                entry = json.loads(line)
            except json.JSONDecodeError as e:
                # Report the malformed line and keep going with the rest of the file.
                print(f"Error decoding JSON: {e}")
                continue
            claim = entry['claim']
            explanation = entry['explanation']
            label = entry['label']
            jsonl_data.append(_build_healthfact_entry(claim, explanation, label))
    # Keyed by the original input filename.
    data_dict[json_file] = jsonl_data

healthfact_train_data = data_dict['healthfact_traindata.json']
healthfact_test_data = data_dict['cleaned_healthfact_test.json']
healthfact_dev_data = data_dict['cleaned_healthfact_dev.json']

# Print the first few entries for verification
print(f"First few entries from healthfact_train_data:\n{healthfact_train_data[:5]}")

# Write each processed split next to its input (.json -> .jsonl), one record per line.
for json_file, jsonl_data in data_dict.items():
    output_file = json_file.replace('.json', '.jsonl')
    with open(output_file, 'w', encoding='utf-8') as outfile:
        for entry in jsonl_data:
            json.dump(entry, outfile)
            outfile.write('\n')
    print(f"Processed {json_file} and saved to {output_file}.")

First few entries from healthfact_train_data:
[{'systemInstruction': {'role': 'assistant', 'parts': [{'text': 'You are a helpful assistant.'}]}, 'contents': [{'role': 'user', 'parts': [{'text': 'CLAIM: The money the Clinton Foundation took from from foreign governments while Hillary Clinton was secretary of state is clearly illegal The Constitution says you can t take this stuff\nEXPLANATION: "Gingrich said the Clinton Foundation ""took money from from foreign governments while (Hillary Clinton) was secretary of state. It is clearly illegal. … The Constitution says you can’t take this stuff."" A clause in the Constitution does prohibit U.S. officials such as former Secretary of State Hillary Clinton from receiving gifts, or emoluments, from foreign governments. But the gifts in this case were donations from foreign governments that went to the Clinton Foundation, not Hillary Clinton. She was not part of the foundation her husband founded while she was secretary of state. Does that viol

(3) Scifact Dataset:

In [None]:
import json
import re

# Convert the SciFact 3-class files (one JSON object per line) into chat-style JSONL records.
jsonl_files = [
    'dev_3class.jsonl',
    'train_3class.jsonl'
]


def _build_scifact_entry(claim, evidence, label):
    """Return one chat-style record for a SciFact claim.

    Args:
        claim: Claim text; reduced to word tokens before being placed in the prompt.
        evidence: Evidence text (the `evidence_text` field), inserted unchanged.
        label: Label string stored as the model turn.

    Returns:
        dict with `systemInstruction` and `contents` (user turn, model turn).
    """
    # Keep only words and numbers.
    reconstructed_claim = ' '.join(re.findall(r'\b\w+\b', claim))
    return {
        "systemInstruction": {
            "role": "assistant",
            "parts": [{"text": "You are a helpful assistant."}],
        },
        "contents": [
            {
                "role": "user",
                # The prompt must NOT contain the label: it is the model input (and, later,
                # the BERT classifier input), so embedding the answer leaks the target.
                # It now ends with an empty "LABEL:" slot, like the COVID-19 records.
                "parts": [{"text": f"CLAIM: {reconstructed_claim}\nEVIDENCE: {evidence}\nLABEL:"}],
            },
            {
                "role": "model",
                "parts": [{"text": label}],
            },
        ],
    }


data_dict = {}
for jsonl_file in jsonl_files:
    processed_data = []
    with open(jsonl_file, 'r', encoding='utf-8') as file:
        for line in file:
            if not line.strip():
                continue  # blank lines are not records
            try:
                entry = json.loads(line)
            except json.JSONDecodeError as e:
                # Report the malformed line and keep going with the rest of the file.
                print(f"Error decoding JSON: {e}")
                continue
            claim = entry['claim']
            explanation = entry['evidence_text']
            label = entry['label']
            processed_data.append(_build_scifact_entry(claim, explanation, label))
    # Keyed by the original input filename.
    data_dict[jsonl_file] = processed_data

scifact_train_data = data_dict['train_3class.jsonl']
# The dev split is what the rest of the notebook uses as the SciFact test set.
scifact_test_data = data_dict['dev_3class.jsonl']

# Print the first few entries for verification
print(f"First few entries from scifact_train_data:\n{scifact_train_data[:5]}")

# Write each processed split to <name>_processed.jsonl, one record per line.
for jsonl_file, processed_data in data_dict.items():
    output_file = jsonl_file.replace('.jsonl', '_processed.jsonl')
    with open(output_file, 'w', encoding='utf-8') as outfile:
        for entry in processed_data:
            json.dump(entry, outfile)
            outfile.write('\n')
    print(f"Processed {jsonl_file} and saved to {output_file}.")

First few entries from scifact_train_data:
[{'systemInstruction': {'role': 'assistant', 'parts': [{'text': 'You are a helpful assistant.'}]}, 'contents': [{'role': 'user', 'parts': [{'text': 'CLAIM: 0 dimensional biomaterials lack inductive properties\nEVIDENCE: \nLABEL: Misleading'}]}, {'role': 'model', 'parts': [{'text': 'Misleading'}]}]}, {'systemInstruction': {'role': 'assistant', 'parts': [{'text': 'You are a helpful assistant.'}]}, 'contents': [{'role': 'user', 'parts': [{'text': 'CLAIM: 1 in 5 million in UK have abnormal PrP positivity\nEVIDENCE: RESULTS Of the 32,441 appendix samples 16 were positive for abnormal PrP, indicating an overall prevalence of 493 per million population (95% confidence interval 282 to 801 per million).\nLABEL: False'}]}, {'role': 'model', 'parts': [{'text': 'False'}]}]}, {'systemInstruction': {'role': 'assistant', 'parts': [{'text': 'You are a helpful assistant.'}]}, 'contents': [{'role': 'user', 'parts': [{'text': 'CLAIM: 1 1 of colorectal cancer pat

# 2. Data Exploration


In [None]:
def _extract_labels(df):
    """Return a Series of labels for `df`, or None when none can be found.

    A flat `label` column is used when present. Otherwise the label is read from the
    chat-style `contents` column: the text of the first part of the first turn whose
    role is 'model'.
    """
    if 'label' in df.columns:
        return df['label']
    if 'contents' not in df.columns:
        return None

    def _model_text(turns):
        if not isinstance(turns, (list, tuple)):
            return None
        for turn in turns:
            if isinstance(turn, dict) and turn.get('role') == 'model':
                parts = turn.get('parts') or []
                if parts and isinstance(parts[0], dict):
                    return parts[0].get('text')
        return None

    labels = df['contents'].apply(_model_text).dropna()
    return None if labels.empty else labels


# Function to explore a dataset
def explore_dataset(data, dataset_name):
    """Print a quick overview of a dataset and plot its label distribution.

    Args:
        data: Sequence of records (dicts) accepted by `pd.DataFrame`.
        dataset_name: Human-readable name used in the printed header.

    Returns:
        None. Everything is printed; the label distribution is also shown as a bar chart.
    """
    print(f"Exploring dataset: {dataset_name}")
    print(f"Number of entries: {len(data)}")

    # Convert to DataFrame for easier analysis
    df = pd.DataFrame(data)

    # Nothing to describe for an empty frame (describe() would raise on it).
    if df.empty:
        print("Dataset is empty; nothing to explore.")
        print("\n" + "-" * 40 + "\n")
        return

    # Display the first few entries
    print("First few entries:")
    print(df.head())

    # Display basic statistics
    print("\nBasic statistics:")
    print(df.describe(include='all'))

    # The records built in this notebook have no flat 'label' column (only
    # 'systemInstruction' and 'contents'), so labels are pulled from the model turn.
    labels = _extract_labels(df)
    if labels is not None:
        label_counts = labels.value_counts()
        print("\nLabel distribution:")
        print(label_counts)

        # Plot the label distribution
        label_counts.plot(kind='bar', title='Label Distribution')
        plt.xlabel('Labels')
        plt.ylabel('Counts')
        plt.show()

    print("\n" + "-" * 40 + "\n")

# Run the same overview on each training split.
for _records, _title in (
    (covid_train_data, "Cleaned Covid19 Train Data"),
    (healthfact_train_data, "Healthfact Train Data"),
    (scifact_train_data, "SciFact Train Data"),
):
    explore_dataset(_records, _title)

Exploring dataset: Cleaned Covid19 Train Data
Number of entries: 6420
First few entries:
                                   systemInstruction  \
0  {'role': 'assistant', 'parts': [{'text': 'Clas...   
1  {'role': 'assistant', 'parts': [{'text': 'Clas...   
2  {'role': 'assistant', 'parts': [{'text': 'Clas...   
3  {'role': 'assistant', 'parts': [{'text': 'Clas...   
4  {'role': 'assistant', 'parts': [{'text': 'Clas...   

                                            contents  
0  [{'role': 'user', 'parts': [{'text': 'TRANSCRI...  
1  [{'role': 'user', 'parts': [{'text': 'TRANSCRI...  
2  [{'role': 'user', 'parts': [{'text': 'TRANSCRI...  
3  [{'role': 'user', 'parts': [{'text': 'TRANSCRI...  
4  [{'role': 'user', 'parts': [{'text': 'TRANSCRI...  

Basic statistics:
                                        systemInstruction  \
count                                                6420   
unique                                                  1   
top     {'role': 'assistant', 'parts': [{'

# 3. Training Strategy

In [None]:
# Materialise the training splits as JSONL files:
#   * HealthFact + SciFact stacked into one frame -> pre-training data
#   * COVID-19                                    -> fine-tuning data
_jsonl_options = dict(orient='records', lines=True)

healthfact_df = pd.DataFrame(healthfact_train_data)
scifact_df = pd.DataFrame(scifact_train_data)

# Row-wise stack with a fresh 0..n-1 index.
combined_pretrain_df = pd.concat([healthfact_df, scifact_df], ignore_index=True)
combined_pretrain_df.to_json('combined_pretrain_data.jsonl', **_jsonl_options)

covid_df = pd.DataFrame(covid_train_data)
covid_df.to_json('covid_finetune_data.jsonl', **_jsonl_options)

for _message in (
    "Datasets combined and saved for train dataset:",
    "1. Combined Pre-train Data: combined_pretrain_data.jsonl",
    "2. COVID-19 Fine-tune Data: covid_finetune_data.jsonl",
):
    print(_message)

Datasets combined and saved for train dataset:
1. Combined Pre-train Data: combined_pretrain_data.jsonl
2. COVID-19 Fine-tune Data: covid_finetune_data.jsonl


In [None]:
# Materialise the evaluation splits as JSONL files:
#   * HealthFact test + SciFact test stacked into one frame
#   * COVID-19 dev split
_jsonl_options = dict(orient='records', lines=True)

healthfact_df_test = pd.DataFrame(healthfact_test_data)
scifact_df_test = pd.DataFrame(scifact_test_data)

# Row-wise stack with a fresh 0..n-1 index.
combined_pretrain_df_test = pd.concat([healthfact_df_test, scifact_df_test], ignore_index=True)
combined_pretrain_df_test.to_json('combined_pretrain_test_data.jsonl', **_jsonl_options)

covid_df_test = pd.DataFrame(covid_dev_data)
covid_df_test.to_json('covid_finetune_test_data.jsonl', **_jsonl_options)

for _message in (
    "Datasets combined and saved for Test dataset:",
    "1. Combined Pre-train Data: combined_pretrain_test_data.jsonl",
    "2. COVID-19 Fine-tune Data: covid_finetune_test_data.jsonl",
):
    print(_message)

Datasets combined and saved for Test dataset:
1. Combined Pre-train Data: combined_pretrain_test_data.jsonl
2. COVID-19 Fine-tune Data: covid_finetune_test_data.jsonl


# 4. Before Fine-Tuning: Gemini 2.0 Flash Model Prompt + Label

In [None]:
# Inference helper: send one prompt to a Gemini model and return the reply text.
def generate_response(prompt, gemini_model=None):
    """Generate a text reply for `prompt`.

    Args:
        prompt: Text sent to the model's `generate_content` method.
        gemini_model: Optional object exposing `generate_content(prompt)`. When omitted,
            the notebook-level `model` variable is used, as before.

    Returns:
        The `.text` attribute of the response.

    Raises:
        NameError: If `gemini_model` is omitted and no global `model` exists.

    Note:
        Pass `gemini_model` explicitly where possible: no earlier cell defines `model`,
        and later cells rebind `model` to a BERT classifier, which has no
        `generate_content` method.
    """
    active_model = model if gemini_model is None else gemini_model
    response = active_model.generate_content(prompt)
    return response.text

# A single unlabeled claim, sent as-is (no examples and no instruction in the prompt).
prompt = (
    '\n'
    'Claim: "6 10 Sky s EdConwaySky explains the latest COVID19 data and government announcement Get more on the coronavirus data here https t co jvGZlSbFjH https t co PygSKXesBg"\n'
)

# Ask the model and show its free-text answer.
response = generate_response(prompt)
print(response)



---


This claim appears to be a tweet or social media post promoting a segment on Sky News with Ed Conway explaining the latest COVID-19 data and government announcement. It also provides links to further information.

**Here's a breakdown of the elements:**

* **"6 10"**: This likely refers to the time the tweet was posted, possibly 6:10 AM or PM.
* **"Sky s"**:  This is likely a shortened form of "Sky News's".
* **"EdConwaySky"**: This is probably the Twitter handle for Ed Conway, who is likely a Sky News correspondent.
* **"explains the latest COVID19 data and government announcement"**: This describes the content of the segment being promoted.
* **"Get more on the coronavirus data here"**: This is a call to action, encouraging viewers to click on the provided links.
* **"https t co jvGZlSbFjH https t co PygSKXesBg"**: These are shortened URLs likely leading to Sky News's website or relevant articles.  It's important to note that link shorteners like "t.co" can hide the true destination of the link.

**Potential Issues and Things to Consider:**

* **Data Accuracy:**  While the claim itself isn't making a specific factual statement, the accuracy of the COVID-19 data presented in the Sky News segment would be dependent on the sources used by Ed Conway.
* **Bias:**  It's important to be aware of potential biases. Sky News, like any news organization, has a perspective. Viewers should critically evaluate the information presented.
* **Outdated Information:**  COVID-19 data and government announcements change rapidly. The information presented in the segment might be outdated by the time you see the tweet.
* **Link Safety:** Always be cautious when clicking on shortened links, especially from unfamiliar sources.  While Sky News is a reputable organization, it's good practice to be vigilant.  You can use a link expander to see the actual URL before clicking.

**In Conclusion:**

The claim itself is a straightforward promotion of a news segment.  However, critical evaluation of the information presented in the segment is still necessary.  Consider the source, potential biases, the date of the information, and always be cautious when clicking on links.


---




# 5. Model Initialization

In [None]:
# Upgrade the Google Gen AI SDK (the `google.genai` package imported at the top of the notebook).
!pip install --upgrade google-genai
# Interactive login that sets up Application Default Credentials for Google Cloud clients.
!gcloud auth application-default login
# Upgrade the Vertex AI SDK (provides the `vertexai` package imported in the next cell).
!pip install --upgrade google-cloud-aiplatform

In [None]:

from google.colab import auth as google_auth
google_auth.authenticate_user()

import vertexai
from vertexai.generative_models import GenerativeModel
from vertexai.preview.tuning import sft

# Point the Vertex AI SDK at the project/region that holds the tuning budget
# (the SIT319 project is used because that is where the Google Cloud funding lives).
vertexai.init(project="sit319-25t1-nguyen-ae806d0", location="us-central1")

# Base model to tune. The variable keeps its historical name even though the
# model id is a Gemini 2.0 Flash-Lite release.
gemini_pro = GenerativeModel("gemini-2.0-flash-lite-001")

# Launch a supervised tuning job on the COVID-19 JSONL file stored in Cloud Storage.
# NOTE(review): epochs=100 is a very large number of passes for supervised tuning --
# confirm this is intentional (cost, over-fitting).
tuning_options = dict(
    source_model=gemini_pro,
    train_dataset="gs://daftt/Cleaned_Covid19_Train-7.jsonl",
    tuned_model_display_name="covid_tuning",
    epochs=100,
    learning_rate_multiplier=1,
)
sft_tuning_job = sft.train(**tuning_options)




---

/usr/local/lib/python3.11/dist-packages/google/auth/_default.py:76: UserWarning: Your application has authenticated using end user credentials from Google Cloud SDK without a quota project. You might receive a "quota exceeded" or "API not enabled" error. See the following page for troubleshooting: https://cloud.google.com/docs/authentication/adc-troubleshooting/user-creds.
  warnings.warn(_CLOUD_SDK_CREDENTIALS_WARNING)

INFO:vertexai.tuning._tuning:Creating SupervisedTuningJob
/usr/local/lib/python3.11/dist-packages/google/auth/_default.py:76: UserWarning: Your application has authenticated using end user credentials from Google Cloud SDK without a quota project. You might receive a "quota exceeded" or "API not enabled" error. See the following page for troubleshooting: https://cloud.google.com/docs/authentication/adc-troubleshooting/user-creds.
  warnings.warn(_CLOUD_SDK_CREDENTIALS_WARNING)

INFO:vertexai.tuning._tuning:SupervisedTuningJob created. Resource name: projects/181085238689/locations/us-central1/tuningJobs/1012173862948831232

INFO:vertexai.tuning._tuning:To use this SupervisedTuningJob in another session:
INFO:vertexai.tuning._tuning:tuning_job = sft.SupervisedTuningJob('projects/181085238689/locations/us-central1/tuningJobs/1012173862948831232')

INFO:vertexai.tuning._tuning:View Tuning Job:
https://console.cloud.google.com/vertex-ai/generative/language/locations/us-central1/tuning/tuningJob/1012173862948831232?project=181085238689



---



# 6. Training and Evaluation (Fine-Tuning): Gemini 2.0 Flash + BERT

A. Pre-train on HealthFact + SciFact for General fact-checking ability.

In [None]:
import pandas as pd
import torch
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from sklearn.preprocessing import LabelEncoder
def _turn_text(turn_index):
    """Build an accessor returning the first text part of the turn at `turn_index`."""
    return lambda turns: turns[turn_index]['parts'][0]['text']


# Turn 0 is the user prompt, turn 1 is the model answer (the label).
_prompt_of = _turn_text(0)
_answer_of = _turn_text(1)

# Combined HealthFact + SciFact files (train / validation).
train_combined_data = pd.read_json('combined_pretrain_data.jsonl', lines=True)
val_combined_data = pd.read_json('combined_pretrain_test_data.jsonl', lines=True)

# COVID-19 files (train / validation); consumed by the fine-tuning cell.
train_covid_data = pd.read_json('covid_finetune_data.jsonl', lines=True)
val_covid_data = pd.read_json('covid_finetune_test_data.jsonl', lines=True)

# Inputs and targets are pulled out of the nested `contents` column.
train_claims = train_combined_data['contents'].apply(_prompt_of).tolist()
train_labels = train_combined_data['contents'].apply(_answer_of).tolist()
val_claims = val_combined_data['contents'].apply(_prompt_of).tolist()
val_labels = val_combined_data['contents'].apply(_answer_of).tolist()

# String labels -> integer ids; the mapping is learned from the training labels only.
label_encoder = LabelEncoder()
label_encoder.fit(train_labels)
train_labels = label_encoder.transform(train_labels)
val_labels = label_encoder.transform(val_labels)

# BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Truncate to 128 tokens and pad each split.
_tokenizer_options = dict(truncation=True, padding=True, max_length=128)
train_encodings = tokenizer(train_claims, **_tokenizer_options)
val_encodings = tokenizer(val_claims, **_tokenizer_options)

# Dataset wrapper pairing tokenizer encodings with labels
class ClaimsDataset(torch.utils.data.Dataset):
    """Map-style dataset that serves one encoded example plus its label per index.

    Attributes are stored as given:
        encodings: mapping of field name -> per-example sequences.
        labels: indexable collection of labels, one per example.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        # The dataset size is defined by the number of labels.
        return len(self.labels)

    def __getitem__(self, idx):
        # Convert every encoding field for this example to a tensor, then add the label.
        sample = {}
        for field, column in self.encodings.items():
            sample[field] = torch.tensor(column[idx])
        sample['labels'] = torch.tensor(self.labels[idx])
        return sample

# Wrap the encoded splits for the Trainer.
train_dataset = ClaimsDataset(train_encodings, train_labels)
val_dataset = ClaimsDataset(val_encodings, val_labels)

# BERT classifier with one output per distinct training label.
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=len(set(train_labels)))


def _pretrain_metrics(p):
    """Compute weighted classification metrics and one-vs-rest ROC AUC from an eval prediction."""
    predicted = p.predictions.argmax(-1)
    # Softmax over the class dimension turns logits into per-class probabilities for ROC AUC.
    class_probs = torch.softmax(torch.tensor(p.predictions), dim=1).numpy()
    return {
        'accuracy': accuracy_score(p.label_ids, predicted),
        'precision': precision_score(p.label_ids, predicted, average='weighted'),
        'recall': recall_score(p.label_ids, predicted, average='weighted'),
        'f1': f1_score(p.label_ids, predicted, average='weighted'),
        'roc_auc': roc_auc_score(p.label_ids, class_probs, multi_class='ovr'),
    }


# Three epochs, batch size 8, evaluation at the end of every epoch.
training_args = TrainingArguments(
    output_dir='./results/pretrain',
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs/pretrain',
    logging_steps=10,
    eval_strategy="epoch",
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=_pretrain_metrics,
)

# Run pre-training.
trainer.train()

# Persist the model weights and tokenizer side by side.
model_save_path = "./my_trained_model"
model.save_pretrained(model_save_path)
tokenizer.save_pretrained(model_save_path)

print(f"Model and tokenizer saved to: {model_save_path}")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize?ref=models
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mhieunguyen23032001[0m ([33mhieunguyen23032001-deakin-university[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1,Roc Auc
1,0.2973,0.28865,0.875817,0.885717,0.875817,0.877511,0.989003
2,0.3579,0.281554,0.890077,0.892447,0.890077,0.889494,0.991119
3,0.208,0.34989,0.89899,0.902847,0.89899,0.90031,0.991414


Model and tokenizer saved to: ./my_trained_model


B. Training and Evaluation Covid Fake News Dataset

In [None]:
# Fine-tune on the COVID-19 fake-news data.
#
# Fixes relative to the previous version of this cell:
#   * the datasets paired COVID-19 encodings with the pre-training labels
#     (`train_labels` / `val_labels`) instead of the COVID-19 labels;
#   * `trainer.train()` re-ran the pre-training Trainer, so the model and datasets
#     built here were never used;
#   * the ROC AUC metric passed a 2-column score matrix with multi_class='ovr',
#     which scikit-learn rejects for binary targets.

def _first_text(turn_index):
    """Build an accessor returning the first text part of the turn at `turn_index`."""
    return lambda turns: turns[turn_index]['parts'][0]['text']


# Turn 0 is the user prompt, turn 1 is the model answer (the label).
train_covid_claims = train_covid_data['contents'].apply(_first_text(0)).tolist()
train_covid_labels = train_covid_data['contents'].apply(_first_text(1)).tolist()
val_covid_claims = val_covid_data['contents'].apply(_first_text(0)).tolist()
val_covid_labels = val_covid_data['contents'].apply(_first_text(1)).tolist()

# Convert string labels to integers (mapping learned from the training labels).
label_encoder = LabelEncoder()
train_covid_labels = label_encoder.fit_transform(train_covid_labels)
val_covid_labels = label_encoder.transform(val_covid_labels)

# Load the BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Tokenize the COVID-19 inputs
train_encodings = tokenizer(train_covid_claims, truncation=True, padding=True, max_length=128)
val_encodings = tokenizer(val_covid_claims, truncation=True, padding=True, max_length=128)

# Pair the COVID-19 encodings with the COVID-19 labels.
train_dataset = ClaimsDataset(train_encodings, train_covid_labels)
val_dataset = ClaimsDataset(val_encodings, val_covid_labels)

# Start from the checkpoint saved by the pre-training cell so that stage is actually
# reused. The label set differs from pre-training, so the classification head is
# re-initialised (ignore_mismatched_sizes=True).
# NOTE(review): switch back to 'bert-base-uncased' if a from-scratch baseline is wanted.
model = BertForSequenceClassification.from_pretrained(
    model_save_path,
    num_labels=len(set(train_covid_labels)),
    ignore_mismatched_sizes=True,
)


def compute_covid_metrics(p):
    """Weighted classification metrics plus ROC AUC for binary or multi-class labels."""
    predicted = p.predictions.argmax(-1)
    class_probs = torch.softmax(torch.tensor(p.predictions), dim=1).numpy()
    if class_probs.shape[1] == 2:
        # Binary case: roc_auc_score expects the score of the positive class only.
        roc_auc = roc_auc_score(p.label_ids, class_probs[:, 1])
    else:
        roc_auc = roc_auc_score(p.label_ids, class_probs, multi_class='ovr')
    return {
        'accuracy': accuracy_score(p.label_ids, predicted),
        'precision': precision_score(p.label_ids, predicted, average='weighted'),
        'recall': recall_score(p.label_ids, predicted, average='weighted'),
        'f1': f1_score(p.label_ids, predicted, average='weighted'),
        'roc_auc': roc_auc,
    }


# Same schedule as pre-training, with separate output/log directories.
covid_training_args = TrainingArguments(
    output_dir='./results/covid_finetune',
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs/covid_finetune',
    logging_steps=10,
    eval_strategy="epoch",
)

# A new Trainer bound to the COVID-19 model and datasets.
trainer = Trainer(
    model=model,
    args=covid_training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_covid_metrics,
)

# Fine-tune the model
trainer.train()

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1,Roc Auc
1,0.4012,0.396093,0.887701,0.890988,0.887701,0.888816,0.990407
2,0.1776,0.489731,0.888295,0.88907,0.888295,0.887639,0.990609
3,0.1193,0.606078,0.888889,0.892492,0.888889,0.890275,0.990721


TrainOutput(global_step=4152, training_loss=0.15090772818669906, metrics={'train_runtime': 942.5088, 'train_samples_per_second': 35.22, 'train_steps_per_second': 4.405, 'total_flos': 2183571289474560.0, 'train_loss': 0.15090772818669906, 'epoch': 3.0})

# 7. Test Prompt and Label

In [None]:
def generate(
    claim="""Clearly the Obama administration did not leave any kind of game plan for something like this""",
    example_label=None,
):
  """
  Stream the tuned endpoint's answer for `claim` and print it as it arrives.

  The request is a three-part conversation: a fixed example claim (user), the
  label for that example (model), and finally `claim` (user).

  Args:
      claim: Text to classify. Defaults to the sample claim this cell was
          originally written for.
      example_label: Label text shown as the model's answer to the fixed
          example claim. When None, the notebook-level variable `label` is
          used, which must then already be defined.

  Returns:
      None. The streamed text is written to stdout.
  """
  # Import locally: the notebook's import cell binds `genai` twice and the later
  # `import google.generativeai as genai` would otherwise shadow the module that
  # provides `Client`.
  from google import genai
  from google.genai import types

  if example_label is None:
    # Fall back to the notebook-level `label` variable.
    example_label = label

  client = genai.Client(
      vertexai=True,
      project="181085238689",
      location="us-central1",
  )

  msg3_text1 = types.Part.from_text(text=claim)

  model = "projects/181085238689/locations/us-central1/endpoints/5419770989749731328"
  contents = [
    types.Content(
      role="user",
      parts=[
        types.Part.from_text(text="""Multiple Facebook posts claim that Aussies will be fined if they are found to be talking about conspiracies to do with COVID 19 The posts included a screenshot of a segment from an Australian breakfast television show as evidence for the claim""")
      ]
    ),
    types.Content(
      role="model",
      parts=[
        types.Part.from_text(text=example_label)
      ]
    ),
    types.Content(
      role="user",
      parts=[
        msg3_text1
      ]
    ),
  ]

  # Every harm category below is configured with the same "OFF" threshold.
  harm_categories = [
    "HARM_CATEGORY_HATE_SPEECH",
    "HARM_CATEGORY_DANGEROUS_CONTENT",
    "HARM_CATEGORY_SEXUALLY_EXPLICIT",
    "HARM_CATEGORY_HARASSMENT",
  ]
  generate_content_config = types.GenerateContentConfig(
    temperature = 0.2,
    top_p = 0.8,
    max_output_tokens = 1024,
    response_modalities = ["TEXT"],
    safety_settings = [
      types.SafetySetting(category=category, threshold="OFF")
      for category in harm_categories
    ],
  )

  for chunk in client.models.generate_content_stream(
    model = model,
    contents = contents,
    config = generate_content_config,
    ):
    chunk_text = chunk.text
    # Skip chunks that carry no text so the literal "None" is never printed.
    if chunk_text:
      print(chunk_text, end="")

generate()

In [None]:
/usr/local/lib/python3.11/dist-packages/google/auth/_default.py:76: UserWarning: Your application has authenticated using end user credentials from Google Cloud SDK without a quota project. You might receive a "quota exceeded" or "API not enabled" error. See the following page for troubleshooting: https://cloud.google.com/docs/authentication/adc-troubleshooting/user-creds.
  warnings.warn(_CLOUD_SDK_CREDENTIALS_WARNING)

fake

# 8. Implement the Confidence Scoring System

In [None]:
!pip install transformers torch requests beautifulsoup4



In [None]:
!pip install biopython

Collecting biopython
  Downloading biopython-1.85-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (13 kB)
Downloading biopython-1.85-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.3 MB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/3.3 MB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.4/3.3 MB[0m [31m42.2 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.3/3.3 MB[0m [31m45.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: biopython
Successfully installed biopython-1.85


In [None]:
!pip install langchain-community

Collecting langchain-community
  Downloading langchain_community-0.3.22-py3-none-any.whl.metadata (2.4 kB)
Collecting dataclasses-json<0.7,>=0.5.7 (from langchain-community)
  Downloading dataclasses_json-0.6.7-py3-none-any.whl.metadata (25 kB)
Collecting pydantic-settings<3.0.0,>=2.4.0 (from langchain-community)
  Downloading pydantic_settings-2.9.1-py3-none-any.whl.metadata (3.8 kB)
Collecting httpx-sse<1.0.0,>=0.4.0 (from langchain-community)
  Downloading httpx_sse-0.4.0-py3-none-any.whl.metadata (9.0 kB)
Collecting marshmallow<4.0.0,>=3.18.0 (from dataclasses-json<0.7,>=0.5.7->langchain-community)
  Downloading marshmallow-3.26.1-py3-none-any.whl.metadata (7.3 kB)
Collecting typing-inspect<1,>=0.4.0 (from dataclasses-json<0.7,>=0.5.7->langchain-community)
  Downloading typing_inspect-0.9.0-py3-none-any.whl.metadata (1.5 kB)
Collecting python-dotenv>=0.21.0 (from pydantic-settings<3.0.0,>=2.4.0->langchain-community)
  Downloading python_dotenv-1.1.0-py3-none-any.whl.metadata (24 kB

Retrieve articles

In [None]:
import requests
from bs4 import BeautifulSoup

def retrieve_articles(query, timeout=10):
    """
    Retrieve article titles and summaries from PubMed for a query.

    The query is sent to the public PubMed search page (this scrapes the HTML
    result page; it does not call a structured API) and the response is parsed
    with BeautifulSoup.

    Args:
        query: Free-text search string.
        timeout: Seconds to wait for the HTTP response before `requests` raises.

    Returns:
        A list of {'title': str, 'summary': str} dicts (possibly empty), or
        None when the response status code is not 200.
    """
    base_url = "https://pubmed.ncbi.nlm.nih.gov/"

    # Let requests build the query string so characters such as '&', '#' or '?'
    # inside the query are escaped instead of corrupting the URL.
    response = requests.get(base_url, params={"term": query}, timeout=timeout)

    if response.status_code != 200:
        return None

    soup = BeautifulSoup(response.content, 'html.parser')

    # NOTE(review): the tag/class selectors below are the original placeholders;
    # verify them against PubMed's current markup -- if nothing matches, the
    # result is an empty list.
    articles = []
    for article_tag in soup.find_all('div', class_='docsum'):
        title_tag = article_tag.find('a', class_='docsum-title')
        if title_tag is None:
            # Without a title the entry is unusable; skip it rather than crash.
            continue
        summary_tag = article_tag.find('div', class_='abstract')
        articles.append({
            'title': title_tag.text.strip(),
            # A result may have no summary element; keep the entry with an empty one.
            'summary': summary_tag.text.strip() if summary_tag is not None else '',
        })

    return articles

Implement "Trust Score" calculation

In [None]:
def calculate_trust_score(prediction, retrieved_articles):
    """
    Calculate a trust score as the fraction of articles that mention `prediction`.

    An article counts as supporting when `prediction` occurs (case-insensitive
    substring match) in its title or in its abstract/summary text; every other
    article counts as contradicting.

    Args:
        prediction: Text to look for in each article.
        retrieved_articles: Iterable of dicts with a 'title' and either an
            'abstract' or a 'summary' entry. None or an empty collection is
            treated as "no articles found".

    Returns:
        A float in [0.0, 1.0]; 0.0 when there are no articles.
    """
    if not retrieved_articles:
        return 0.0  # No articles found (or retrieval failed and returned None)

    needle = prediction.lower()
    support_count = 0
    total_articles = 0

    for article in retrieved_articles:
        total_articles += 1
        title = (article.get('title') or '').lower()
        # Accept either key: the retrieval step stores the text under 'summary'.
        body = (article.get('abstract') or article.get('summary') or '').lower()
        if needle in title or needle in body:
            support_count += 1

    if total_articles == 0:
        return 0.0  # Non-empty-looking iterable that yielded nothing

    return support_count / total_articles  # Simple ratio of supporting articles

Implement LLM Responses

In [None]:
def predict_with_confidence(claim, low_confidence_threshold=0.5):
    """
    Predict the label for a claim and calculate a trust score from retrieved articles.

    Relies on the notebook-level `model`, `label_encoder` and `prepare_input`.

    Args:
        claim: Claim text to classify and to search articles for.
        low_confidence_threshold: Trust scores below this value are reported as
            low confidence.

    Returns:
        A `(predicted_label_word, trust_score)` tuple.
    """
    model.eval()
    with torch.no_grad():
        inputs = prepare_input(claim)
        logits = model(**inputs).logits
        predicted_index = torch.argmax(logits, dim=-1).item()

    # Convert predicted label to word
    predicted_label_word = label_encoder.inverse_transform([predicted_index])[0]

    # Retrieve articles related to the claim. retrieve_articles returns None on a
    # non-200 response; treat that the same as "no articles found".
    retrieved_articles = retrieve_articles(claim) or []

    # Calculate the trust score.
    # NOTE(review): the score is computed from the claim text, not from the
    # predicted label -- confirm this is the intended comparison.
    trust_score = calculate_trust_score(claim, retrieved_articles)

    # Flag low-confidence responses
    if trust_score < low_confidence_threshold:
        print(f"Low confidence for claim: '{claim}'. Trust score: {trust_score:.2f}. Review needed.")
    else:
        print(f"High confidence for claim: '{claim}'. Trust score: {trust_score:.2f}.")

    return predicted_label_word, trust_score

Execute

In [None]:
# Run the prediction + trust-score pipeline on one sample claim.
# predict_with_confidence prints its own low/high-confidence line; the print
# below adds a one-line summary with the predicted label and the score.
claim = "Study Vaccine for Breast Ovarian Cancer Has Potential"
predicted_label_word, trust_score = predict_with_confidence(claim)
print(f"Predicted label for the claim '{claim}': '{predicted_label_word}', Trust Score: {trust_score:.2f}")

Low confidence for claim: 'Study Vaccine for Breast Ovarian Cancer Has Potential'. Trust score: 0.00. Review needed.
Predicted label for the claim 'Study Vaccine for Breast Ovarian Cancer Has Potential': 'True', Trust Score: 0.00
