In [None]:
from google.colab import files, drive
drive.mount("/content/drive")

Mounted at /content/drive


# Creating Claims to Pass into Models with the Hugging Face Summarizer

In [None]:
!pip install python-docx
!pip install transformers

Collecting python-docx
  Downloading python_docx-1.1.0-py3-none-any.whl (239 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/239.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m [32m235.5/239.6 kB[0m [31m7.1 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m239.6/239.6 kB[0m [31m6.1 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: python-docx
Successfully installed python-docx-1.1.0


In [None]:
from docx import Document
from transformers import pipeline
from transformers import AutoTokenizer
import os

In [None]:
# Summarization pipeline backed by the FLAN-T5 checkpoint fine-tuned on samsum.
# NOTE(review): the URL previously attached to this line pointed at
# philschmid/bart-large-cnn-samsum, which is not the checkpoint loaded here.
summarizer = pipeline("summarization", model="philschmid/flan-t5-base-samsum")
# Tokenizer for the same checkpoint; the cell below uses it to crop the context
# to its last 512 tokens before summarizing.
tokenizer = AutoTokenizer.from_pretrained('philschmid/flan-t5-base-samsum')

In [None]:
def read_docx_files_in_directory_HF(directory_path, out_path):
    """
    Summarize every .docx transcript in a directory and save each summary as .txt.

    For each document the first paragraph is skipped. Paragraphs that precede
    the one starting with 'Here is the conversation between' form the context:
    it is cropped to its last 512 tokens and summarized (max_length=100).
    Paragraphs after that marker form the conversation. The context summary and
    the conversation are joined and summarized once more, and the result is
    written to '<out_path>/<document name>.txt'.

    Parameters:
    directory_path (str): The path to the directory containing the .docx files.
    out_path (str): The path to the directory where the .txt files will be saved.
        It is created when it does not exist yet.

    Returns:
    None
    """
    # Without this, every write fails (and is only reported per file) when the
    # output directory has not been created by hand.
    os.makedirs(out_path, exist_ok=True)

    # sorted() makes the processing/printing order reproducible across runs.
    for filename in sorted(os.listdir(directory_path)):
        if not filename.endswith('.docx'):
            continue
        file_path = os.path.join(directory_path, filename)
        try:
            doc = Document(file_path)  # load in doc from file path
            context = []  # the conversational context (first part)
            convo = []  # the actual conversation that is deceptive or truthful
            hit = False  # becomes True once the conversation marker was seen
            context_summary = ''  # stays empty when the marker never appears

            # [1:] skips the first paragraph (the instructions).
            for line in doc.paragraphs[1:]:
                if line.text.startswith('Here is the conversation between'):
                    hit = True
                    # Keep only the last 512 tokens of the context
                    # (the model can only take a max of 512 tokens).
                    tokens = tokenizer.tokenize('\n'.join(context))
                    cropped_text = tokenizer.convert_tokens_to_string(tokens[-512:])
                    # max_length is only 100 so that the context summary plus the
                    # conversation stays short for the second summarization.
                    context_summary = summarizer(cropped_text, max_length=100)[0]['summary_text']
                    continue
                if hit:
                    convo.append(line.text)
                # Kept from the original: every line is also appended to the
                # context; this only matters if the marker occurs more than once.
                context.append(line.text)

            text_final = context_summary + "\n\n" + '\n'.join(convo)
            # NOTE(review): this input is not cropped; long conversations can
            # exceed the 512-token limit mentioned above - confirm acceptable.
            summarized_text = summarizer(text_final)[0]['summary_text']

            # Separate names so the except handlers below still report the
            # source .docx name rather than the output .txt name.
            out_name = os.path.splitext(filename)[0] + ".txt"
            out_file = os.path.join(out_path, out_name)
            print(out_file)

            with open(out_file, 'w', encoding='utf-8') as file:
                file.write(summarized_text)

        except IOError as e:
            print(f"Error reading file {filename}: {e}")
        except Exception as e:
            # Deliberate best-effort: one bad document must not stop the batch.
            print(f"An unexpected error occurred with file {filename}: {e}")

In [None]:
# Summarize the deceptive transcripts with the Hugging Face pipeline and write
# one .txt claim per transcript into the Hugging_Face/Deceptive folder.
deceptive_dir_input = r"/content/drive/MyDrive/RLTdataset(in use)/RLT_DeceptiveText"
deceptive_dir_output = r"/content/drive/MyDrive/Claims/Hugging_Face/Deceptive"
read_docx_files_in_directory_HF(deceptive_dir_input, deceptive_dir_output)

In [None]:
# Summarize the truthful transcripts with the Hugging Face pipeline and write
# one .txt claim per transcript into the Hugging_Face/Truthful folder.
truthful_dir_input = r"/content/drive/MyDrive/RLTdataset(in use)/RLT_TruthfulText"
truthful_dir_output = r"/content/drive/MyDrive/Claims/Hugging_Face/Truthful"
read_docx_files_in_directory_HF(truthful_dir_input, truthful_dir_output)

# Creating Claims to Pass into Models with GPT-4

In [None]:
!pip install -q openai
!pip install python-docx

In [None]:
from openai import OpenAI
from docx import Document
import os
import json

In [None]:
# SECURITY: never store the API key in the notebook. The key is read from the
# OPENAI_API_KEY environment variable, falling back to an interactive prompt.
# The key that used to be hard-coded on this line is exposed and must be revoked.
import getpass

client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY") or getpass.getpass("OpenAI API key: "))

In [None]:
def read_docx_files_in_directory_GPT(directory_path, out_path):
    """
    Build a claim-extraction prompt from every .docx transcript in a directory,
    send it to GPT via call_gpt, and save each reply as a .txt file.

    The prompt is the document's paragraphs joined by newlines, with two
    rewrites: the paragraph starting with 'Here is the conversation between'
    is cut after its first "judge/attorney" and ends with "that you are
    supposed to analyze.", and the paragraph right after it is replaced by the
    summarization instruction.

    Parameters:
    directory_path (str): The path to the directory containing the .docx files.
    out_path (str): The path to the directory where the .txt files will be saved.
        It is created when it does not exist yet.

    Returns:
    None
    """
    # Without this, every write fails (and is only reported per file) when the
    # output directory has not been created by hand.
    os.makedirs(out_path, exist_ok=True)

    # sorted() makes the processing order reproducible across runs.
    for filename in sorted(os.listdir(directory_path)):
        if not filename.endswith('.docx'):
            continue
        file_path = os.path.join(directory_path, filename)
        try:
            doc = Document(file_path)
            full_text = []
            skip_next = False  # True for the paragraph right after the marker
            for line in doc.paragraphs:
                line_text = line.text
                if skip_next:
                    skip_next = False
                    # The paragraph following the marker is replaced entirely.
                    line_text = '''Summarize this conversation and articulate it as a claim made by the individual speaking to the attorney/judge:\n'''
                if line_text.startswith('Here is the conversation between'):
                    line_text = line_text.split("judge/attorney")[0] + "judge/attorney that you are supposed to analyze."
                    skip_next = True
                full_text.append(line_text)
            # Join all text elements into a single string
            prompt = '\n'.join(full_text)
            output = call_gpt('gpt-4-0125-preview', prompt)

            # Separate names so the except handlers below still report the
            # source .docx name rather than the output .txt name.
            out_name = os.path.splitext(filename)[0] + ".txt"
            out_file = os.path.join(out_path, out_name)

            with open(out_file, 'w', encoding='utf-8') as file:
                file.write(output)

        except IOError as e:
            print(f"Error reading file {filename}: {e}")
        except Exception as e:
            # Deliberate best-effort: one bad document must not stop the batch.
            print(f"An unexpected error occurred with file {filename}: {e}")

In [None]:
# System prompt for claim extraction. call_gpt reads the module-level
# `system_spec` at call time, so this cell must be the most recent one to bind
# that name before the claim-extraction cells are run.
system_spec = '''
You extract the claim/claims made by a person when they are conversing with a judge/attorney.
Do not extract claims from the "conversational context". Only use this information to add context to the claims extracted
in the part of the conversation you are explicitly asked to analyze.
You will be concise and detail oriented in your summaries. They should not be more than 4 sentences long and should make a clear claim.
Additionally, they should not have the word "claim" in them, and should instead be stated as if they were a fact.
Here is an example of a claim extracted from a conversation between Bob and an attorney/judge:

Prompt:
"
Conversational Context:
Judge: Please state your name for the record
Bob: Bob Brown
Judge: What was your relation to Dylan Brown
Bob: He was my brother

Conversation:
Judge: Where were you at the time Dylan died?
Bob: I was at my office working.
Judge: Did you kill Dylan?
Bob: No sir, I did not.
Judge: Did you know Dylan was dead before you arrived at the house?
Bob: No sir, I did not know Dylan had died until I got home and saw his dead body laying on the floor.
"

Output:"Bob Brown did not kill Dylan Brown (his brother). He did not know Dylan had died until he got home and saw Dylan's dead body lying on the floor"
'''
#use system fingerprint for better understanding
def call_gpt(mdl, prompt, system_prompt=None):
    """
    Send one system message and one user message to the chat completions API.

    Parameters:
    mdl (str): Model name, passed as `model`.
    prompt (str): Content of the user message.
    system_prompt (str, optional): Content of the system message. When omitted,
        the module-level `system_spec` is used as it is bound at call time.

    Returns:
    The `message.content` of the first returned choice.
    """
    if system_prompt is None:
        # `system_spec` is rebound further down in this notebook, so callers that
        # need a specific prompt regardless of cell order should pass it in.
        system_prompt = system_spec
    response = client.chat.completions.create(
        model=mdl,
        temperature=0,  # 0 keeps the output as deterministic as the API allows
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": prompt},
        ]
    )
    return response.choices[0].message.content

In [None]:
# Extract GPT-4 claims for the deceptive transcripts and write one .txt claim
# per transcript into the GPT_4/Deceptive folder.
deceptive_dir_input = r"/content/drive/MyDrive/RLTdataset(in use)/RLT_DeceptiveText"
deceptive_dir_output = r"/content/drive/MyDrive/Claims/GPT_4/Deceptive"
read_docx_files_in_directory_GPT(deceptive_dir_input, deceptive_dir_output)

In [None]:
# Extract GPT-4 claims for the truthful transcripts and write one .txt claim
# per transcript into the GPT_4/Truthful folder.
truthful_dir_input = r"/content/drive/MyDrive/RLTdataset(in use)/RLT_TruthfulText"
truthful_dir_output = r"/content/drive/MyDrive/Claims/GPT_4/Truthful"
read_docx_files_in_directory_GPT(truthful_dir_input, truthful_dir_output)

# Condensing Case Facts to 400 Tokens (Method 1). The fact-verification models accept at most 512 tokens.

In [None]:
!pip install -q openai
!pip install python-docx

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m262.9/262.9 kB[0m [31m4.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m75.6/75.6 kB[0m [31m6.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m77.8/77.8 kB[0m [31m6.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m58.3/58.3 kB[0m [31m7.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting python-docx
  Downloading python_docx-1.1.0-py3-none-any.whl (239 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m239.6/239.6 kB[0m [31m5.4 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: python-docx
Successfully installed python-docx-1.1.0


In [None]:
from openai import OpenAI
from docx import Document
import os
import json

In [None]:
# SECURITY: never store the API key in the notebook. The key is read from the
# OPENAI_API_KEY environment variable, falling back to an interactive prompt.
# The key that used to be hard-coded on this line is exposed and must be revoked.
import getpass

client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY") or getpass.getpass("OpenAI API key: "))

In [None]:
def read_case_files_in_directory_GPT(directory_path, out_path):
    """
    Ask GPT (via call_gpt) to summarize the case facts in every .docx file of a
    directory and save each reply as a .txt file with the same base name.

    The prompt is 'Summarize the facts from this case:' followed by the
    document's paragraphs, joined by newlines and wrapped in double quotes.

    Parameters:
    directory_path (str): The path to the directory containing the .docx files.
    out_path (str): The path to the directory where the .txt files will be saved.
        It is created when it does not exist yet.

    Returns:
    None
    """
    # Without this, every write fails (and is only reported per file) when the
    # output directory has not been created by hand.
    os.makedirs(out_path, exist_ok=True)

    # sorted() makes the processing order reproducible across runs.
    for filename in sorted(os.listdir(directory_path)):
        if not filename.endswith('.docx'):
            continue
        file_path = os.path.join(directory_path, filename)
        try:
            doc = Document(file_path)
            # Join all paragraph texts into a single string
            prompt = '\n'.join(line.text for line in doc.paragraphs)
            prompt = '''Summarize the facts from this case:"\n''' + prompt + '''\n"'''
            output = call_gpt('gpt-4-0125-preview', prompt)

            # Separate names so the except handlers below still report the
            # source .docx name rather than the output .txt name.
            out_name = os.path.splitext(filename)[0] + ".txt"
            out_file = os.path.join(out_path, out_name)

            with open(out_file, 'w', encoding='utf-8') as file:
                file.write(output)

        except IOError as e:
            print(f"Error reading file {filename}: {e}")
        except Exception as e:
            # Deliberate best-effort: one bad document must not stop the batch.
            print(f"An unexpected error occurred with file {filename}: {e}")

In [None]:
# System prompt for condensing case facts. NOTE: this rebinds `system_spec`,
# the module-level name that call_gpt reads at call time.
system_spec = '''
You summarize case facts, paying special attention to details and hard facts from the case.
You will be concise in your summarizations, not exceeding 400 tokens but not giving an output that is less than 350 tokens unless the input itself is less than 350 tokens.
'''
#use system fingerprint for better understanding
def call_gpt(mdl, prompt):
    """
    Query the chat completions API with the module-level `system_spec` as the
    system message and `prompt` as the user message.

    The reply is capped at 400 tokens and requested with temperature 0.

    Parameters:
    mdl (str): Model name, passed as `model`.
    prompt (str): Content of the user message.

    Returns:
    The `message.content` of the first returned choice.
    """
    chat_messages = [
        {"role": "system", "content": system_spec},
        {"role": "user", "content": prompt},
    ]
    completion = client.chat.completions.create(
        model=mdl,
        max_tokens=400,
        temperature=0,
        messages=chat_messages,
    )
    first_choice = completion.choices[0]
    return first_choice.message.content

In [None]:
# Condense every case-facts document and store the results under Claims/Case_Facts(Condensed).
read_case_files_in_directory_GPT(r"/content/drive/MyDrive/RLTdataset(in use)/RLTCaseFacts", r"/content/drive/MyDrive/Claims/Case_Facts(Condensed)")

# Sending Claims to the Hugging Face Fact-Verification Models

In [None]:
#After both datasets have already been created
!pip install python-docx
!pip install transformers
!pip install torch

#!pip install fact_checking #gpt_2 model: https://huggingface.co/fractalego/fact-checking?text=Once+upon+a+time%2C+and+my+main+character+%28as+well+as+Charles+Blue+Moore%29+from+the+DC+Comics+character+Batman.%0A%0A%0AClaim%3A

In [None]:
#General Imports
from docx import Document
from transformers import pipeline
from transformers import AutoTokenizer
import os
import pandas as pd

#gpt_2 model

from transformers import (
    GPT2LMHeadModel,
    GPT2Tokenizer,
)

#from fact_checking import FactChecker

#RoBERTa model: https://huggingface.co/Dzeniks/roberta-fact-check?text=Ryan+Burns+and+Jody+Arias+had+pleasant+conversations+discussing+shared+interests+in+books+like+%22Atlas+Shrugged%2C%22+%22Think+and+Grow+Rich%2C%22+and+%22How+to+Win+Friends+and+Influence+People.%22+They+connected+over+common+interests%2C+and+Ryan+found+Jody%27s+entrepreneurial+spirit+attractive.+Ryan+was+interested+in+getting+to+know+Jody+better+and+potentially+dating+her+due+to+their+compatibility+and+engaging+conversations.

import torch
from transformers import RobertaTokenizer, RobertaForSequenceClassification

In [None]:
from google.colab import files, drive
drive.mount("/content/drive")

Mounted at /content/drive


In [None]:
# Table of deceptive cases; read_txt_file_deceptive uses its "Case #" and
# "Case Name" columns to map a claim file to its case.
# NOTE(review): this path uses "My Drive" while the other cells use "MyDrive" - confirm both resolve.
file_path = "/content/drive/My Drive/RLTdataset(in use)/Deceptive_Cases.xlsx"
deceptive = pd.read_excel(file_path)
print(deceptive)

    Case #                                          Case Name
0        1                                 NC vs Amanda Hayes
1        2                                 NC vs Amanda Hayes
2        3                                 NC vs Amanda Hayes
3        4                                 NC vs Amanda Hayes
4        5                                 NC vs Amanda Hayes
5        6                                 NC vs Amanda Hayes
6        7                                 NC vs Amanda Hayes
7        8                                 NC vs Amanda Hayes
8        9                                 NC vs Amanda Hayes
9       10                                 NC vs Amanda Hayes
10      11                                 NC vs Amanda Hayes
11      12                                 NC vs Amanda Hayes
12      13                                 NC vs Amanda Hayes
13      14                                   AZ vs Jodi Arias
14      15                                   AZ vs Jodi Arias
15      

In [None]:
# Table of truthful cases; read_txt_file_truthful uses its "Case #" and
# "Case Name" columns to map a claim file to its case.
# NOTE(review): this path uses "My Drive" while the other cells use "MyDrive" - confirm both resolve.
file_path = "/content/drive/My Drive/RLTdataset(in use)/Truthful_Cases.xlsx"
truthful = pd.read_excel(file_path)
print(truthful)

    Case #                            Case Name
0        3              GA vs Andrea Sneiderman
1        4              GA vs Andrea Sneiderman
2        5              GA vs Andrea Sneiderman
3        6              GA vs Andrea Sneiderman
4        7              GA vs Andrea Sneiderman
5        8                     AZ vs Jodi Arias
6        9                     AZ vs Jodi Arias
7       10                     AZ vs Jodi Arias
8       11                     AZ vs Jodi Arias
9       12                     AZ vs Jodi Arias
10      13                     AZ vs Jodi Arias
11      14                     AZ vs Jodi Arias
12      15                     AZ vs Jodi Arias
13      27               UT vs. Martin MacNeill
14      29                 FL v. Bessman Okafor
15      30               Shooting of James Boyd
16      31                 FL v. Bessman Okafor
17      32                 FL v. Bessman Okafor
18      33                 FL v. Bessman Okafor
19      34                 FL v. Bessman

In [None]:
def Find_Case_Facts(directory_path):
    """
    Iterates through a specified directory, reads the content of each .txt file,
    and stores the filename (without extension) and content in a dictionary.

    Parameters:
    directory_path (str): The path to the directory containing the .txt files.

    Returns:
    dict: A dictionary with filenames (minus '.txt') as keys and file contents as values.
    """
    files_content_dict = {}
    for filename in os.listdir(directory_path):
        if filename.endswith('.txt'):
            file_path = os.path.join(directory_path, filename)
            # Explicit encoding so the result does not depend on the platform default.
            with open(file_path, 'r', encoding='utf-8') as file:
                files_content_dict[filename[:-4]] = file.read()
    return files_content_dict

In [None]:
def Find_Case_Facts(directory_path):
    """
    Collect the text of every '.txt' file found directly in a directory.

    Parameters:
    directory_path (str): The path to the directory to scan.

    Returns:
    dict: Maps each file name with its '.txt' suffix removed to that file's content.
    """
    contents_by_name = {}
    txt_names = [name for name in os.listdir(directory_path) if name.endswith('.txt')]
    for txt_name in txt_names:
        with open(os.path.join(directory_path, txt_name), 'r') as handle:
            contents_by_name[txt_name[:-4]] = handle.read()
    return contents_by_name

In [None]:
def Find_Case_Facts(directory_path):
    """
    Load the paragraph text of every .docx file found directly in a directory.

    A file that cannot be read is reported with print() and left out of the result.

    Parameters:
    directory_path (str): The path to the directory containing the .docx files.

    Returns:
    dict: Maps each file name with its '.docx' suffix removed to the file's
        paragraphs joined by newlines.
    """
    contents_by_name = {}
    for entry_name in os.listdir(directory_path):
        if not entry_name.endswith('.docx'):
            continue
        docx_path = os.path.join(directory_path, entry_name)
        try:
            paragraphs = [para.text for para in Document(docx_path).paragraphs]
            contents_by_name[entry_name[:-5]] = '\n'.join(paragraphs)
        except IOError as e:
            print(f"Error reading file {entry_name}: {e}")
        except Exception as e:
            print(f"An unexpected error occurred with file {entry_name}: {e}")
    return contents_by_name

In [None]:
# Load the case facts; the read_txt_file_* helpers index `case_facts` by the
# "Case Name" value of the matching row.
case_facts_path = r'/content/drive/MyDrive/RLTdataset(in use)/RLTCaseFacts'
case_facts = Find_Case_Facts(case_facts_path)

In [None]:
def find_row_by_case_number(df, case_number):
    """
    Select the rows of `df` whose "Case #" column equals `case_number`.

    Parameters:
    df (pandas.DataFrame): Table with a "Case #" column.
    case_number: Value to look for in that column.

    Returns:
    pandas.DataFrame with the matching row(s), or None (after printing a
    notice) when no row matches.
    """
    matches = df.loc[df['Case #'] == case_number]
    if not matches.empty:
        return matches
    print(f"No row found with Case # {case_number}.")
    return None

In [None]:
def read_txt_file_deceptive(file_path, directory):
    """
    Read one deceptive claim file and look up the case facts (evidence) for it.

    Parameters:
    file_path (str): Name of the claim file inside `directory`. It must be
        '<case number>.txt' because the part before '.txt' is parsed with int().
    directory (str): Directory that contains the claim file.

    Returns:
    tuple: (claim, facts) - the file content and the `case_facts` entry for the
        "Case Name" of the first row of `deceptive` with that case number.

    Raises:
    ValueError: if the file is not a .txt file, or its stem is not an integer.
    LookupError: if the case number is missing from `deceptive`, or the case
        name is missing from `case_facts` (KeyError).
    """
    full_file_path = os.path.join(directory, file_path)
    if not full_file_path.endswith('.txt'):
        # This used to fall through to `return claim, facts` with both names
        # unbound, which surfaced as an UnboundLocalError.
        raise ValueError(f"Expected a .txt claim file, got {file_path!r}")

    with open(full_file_path, 'r', encoding='utf-8') as file:
        claim = file.read()

    case_number = int(file_path[:-4])
    filtered_df = find_row_by_case_number(deceptive, case_number)
    if filtered_df is None:
        # Indexing None below would otherwise raise an unhelpful TypeError.
        raise LookupError(f"Case # {case_number} not found in the deceptive cases table")
    facts = case_facts[filtered_df['Case Name'].values[0]]

    return claim, facts

In [None]:
def read_txt_file_truthful(file_path, directory):
    """
    Read one truthful claim file and look up the case facts (evidence) for it.

    Parameters:
    file_path (str): Name of the claim file inside `directory`. It must be
        '<case number>.txt' because the part before '.txt' is parsed with int().
    directory (str): Directory that contains the claim file.

    Returns:
    tuple: (claim, facts) - the file content and the `case_facts` entry for the
        "Case Name" of the first row of `truthful` with that case number.

    Raises:
    ValueError: if the file is not a .txt file, or its stem is not an integer.
    LookupError: if the case number is missing from `truthful`, or the case
        name is missing from `case_facts` (KeyError).
    """
    full_file_path = os.path.join(directory, file_path)
    if not full_file_path.endswith('.txt'):
        # This used to fall through to `return claim, facts` with both names
        # unbound, which surfaced as an UnboundLocalError.
        raise ValueError(f"Expected a .txt claim file, got {file_path!r}")

    with open(full_file_path, 'r', encoding='utf-8') as file:
        claim = file.read()

    case_number = int(file_path[:-4])
    filtered_df = find_row_by_case_number(truthful, case_number)
    if filtered_df is None:
        # Indexing None below would otherwise raise an unhelpful TypeError.
        raise LookupError(f"Case # {case_number} not found in the truthful cases table")
    facts = case_facts[filtered_df['Case Name'].values[0]]

    return claim, facts

In [None]:
# Dump the full text of every loaded case-facts entry (long output).
for c in case_facts.values():
  print(c)

Defendant: Marissa Devault
Victim(s): Dale Harrell
Court Decision: Convicted of first-degree murder for bludgeoning her husband to death with a hammer and sentenced to life in prison​.
Case facts:

The murder of Dale Harrell (December 18, 1974 – February 9, 2009) occurred after he was fatally attacked on January 14, 2009, by his wife Marissa-Suzanne "Reese" DeVault (born November 6, 1977) in Maricopa County, Arizona. Her trial made national and global headlines. The case was noted as being very similar to that of the murder of Travis Alexander by Jodi Arias, with whom DeVault was in contact and whose murder trial occurred in the same courthouse one year earlier. Though she faced the possibility of a death penalty for her crime, DeVault was sentenced to life in prison. She is imprisoned at Perryville within the Arizona Department of Corrections.
Murder and investigation
On January 14, 2009, DeVault entered the master bedroom that she and Harrell shared at their home in Gilbert, Arizona.

RoBERTa Model

In [None]:
'''# Setting Up FilePaths (testing)
GPT_4_Deceptive_Path = '/content/drive/MyDrive/Claims/GPT_4/Deceptive'
GPT_4_Truthful_Path = '/content/drive/MyDrive/Claims/GPT_4/Truthful'
file_name = '001.txt'
claim, evidence = read_txt_file_deceptive(file_name, GPT_4_Deceptive_Path)

# Load the tokenizer and model
tokenizer = RobertaTokenizer.from_pretrained('Dzeniks/roberta-fact-check')
model = RobertaForSequenceClassification.from_pretrained('Dzeniks/roberta-fact-check')

# Define the claim with evidence to classify
#claim = "Albert Einstein work in the field of computer science"
#evidence = "Albert Einstein was a German-born theoretical physicist, widely acknowledged to be one of the greatest and most influential physicists of all time."

# Tokenize the claim with evidence
x = tokenizer.encode_plus(claim, evidence, return_tensors="pt")

model.eval()
with torch.no_grad():
  prediction = model(**x)

#label = torch.argmax(outputs[0]).item()

#print(f"Label: {label}")
label = torch.argmax(prediction.logits).item()
print (prediction)
print(prediction.logits)
print(torch.argmax(prediction.logits))
print(torch.argmax(prediction.logits).item())
# Assuming the model's output labels are binary (0 or 1), where 0 could mean 'False' and 1 could mean 'True'.
# The interpretation of labels should be adjusted based on the actual model documentation.
if label == 0:
    print("Label: False")
elif label == 1:
    print("Label: True")'''

In [None]:
# Load the tokenizer and model for the RoBERTa fact-check classifier.
# NOTE: `tokenizer` and `model` are rebound to 'roberta-base' further down in
# this notebook, so re-run this cell before re-running the loops right below.
tokenizer = RobertaTokenizer.from_pretrained('Dzeniks/roberta-fact-check')
model = RobertaForSequenceClassification.from_pretrained('Dzeniks/roberta-fact-check')

In [None]:
# RoBERTa fact-check over the deceptive claims.
# NOTE(review): this cell was labelled "GPT 4 Deceptive" but the path below
# points at the Hugging_Face claims - confirm which set was intended.
import os

# Specify the directory to iterate over
directory_path = '/content/drive/MyDrive/Claims/Hugging_Face/Deceptive'

truthful_count = 0
deceptive_count = 0
model.eval()  # inference mode; does not need to be repeated per file
for entry in os.listdir(directory_path):
    if not entry.endswith('.txt'):
        continue  # read_txt_file_deceptive only handles '<case number>.txt' files
    claim, facts = read_txt_file_deceptive(entry, directory_path)
    # Claim + full case facts can exceed the model's 512-token limit, so the
    # pair is truncated (longest sequence first) instead of overflowing.
    x = tokenizer.encode_plus(claim, facts, return_tensors="pt", truncation=True, max_length=512)

    with torch.no_grad():
        prediction = model(**x)

    label = torch.argmax(prediction.logits).item()

    # Model has two labels label 0 (supports) and label 1 (refutes)
    # For our purposes, 0 = Truthful and 1 = deceptive
    if label == 0:
        truthful_count += 1
    elif label == 1:
        deceptive_count += 1



In [None]:
# Show the truthful / deceptive tallies, one per line.
print(truthful_count, deceptive_count, sep="\n")

5
38


In [None]:
# RoBERTa fact-check over the GPT-4 truthful claims.
import os

# Specify the directory to iterate over
directory_path = '/content/drive/MyDrive/Claims/GPT_4/Truthful'

truthful_count = 0
deceptive_count = 0
model.eval()  # inference mode; does not need to be repeated per file
for entry in os.listdir(directory_path):
    if not entry.endswith('.txt'):
        continue  # read_txt_file_truthful only handles '<case number>.txt' files
    claim, facts = read_txt_file_truthful(entry, directory_path)
    # Claim + full case facts can exceed the model's 512-token limit, so the
    # pair is truncated (longest sequence first) instead of overflowing.
    x = tokenizer.encode_plus(claim, facts, return_tensors="pt", truncation=True, max_length=512)

    with torch.no_grad():
        prediction = model(**x)

    label = torch.argmax(prediction.logits).item()

    # Model has two labels label 0 (supports) and label 1 (refutes)
    # For our purposes, 0 = Truthful and 1 = deceptive
    if label == 0:
        truthful_count += 1
    elif label == 1:
        deceptive_count += 1
















































In [None]:
# Show the truthful / deceptive tallies, one per line.
print(truthful_count, deceptive_count, sep="\n")

2
41


In [None]:
# Length of the first row of `x['input_ids']`; `x` is whatever encoding the
# most recently run loop left behind.
len(x['input_ids'][0])

1073

In [None]:
# Print each element of x['input_ids'][0] on its own line.
for i in x['input_ids'][0]:
  print(i)

GPT 2 Model

In [None]:
!pip install transformers

In [None]:
# Use a pipeline as a high-level helper
from transformers import pipeline

# Text-generation pipeline backed by the fractalego/fact-checking checkpoint.
pipe = pipeline("text-generation", model="fractalego/fact-checking")

# Toy claim/evidence pair. NOTE: this rebinds the notebook-level `claim`.
claim = "The moon is made of cheese."
evidence = "The moon is a large, rocky satellite orbiting the Earth."

# NOTE(review): confirm this "Claim: ... Evidence: ..." layout matches the prompt format the model expects.
input_text = f"Claim: {claim} Evidence: {evidence}"

result = pipe(input_text)
print(result)

# Using Embeddings to Extract Evidence (Method 2)

In [None]:
!pip install transformers
!pip install torch
!pip install scikit-learn
!pip install python-docx

Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m23.7/23.7 MB[0m [31m19.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting nvidia-cuda-runtime-cu12==12.1.105 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m823.6/823.6 kB[0m [31m23.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting nvidia-cuda-cupti-cu12==12.1.105 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (14.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m14.1/14.1 MB[0m [31m25.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting nvidia-cudnn-cu12==8.9.2.26 (from torch)
  Downloading nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl (731.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [None]:
from google.colab import files, drive
drive.mount("/content/drive")

Mounted at /content/drive


In [None]:
from transformers import RobertaTokenizer, RobertaModel
import torch
from sklearn.metrics.pairwise import cosine_similarity

# Initialize tokenizer and model used for embeddings.
# NOTE: this rebinds `tokenizer` and `model`, the names that earlier cells bind
# to 'Dzeniks/roberta-fact-check'. chunk_text and get_embedding read these
# globals, so they use 'roberta-base' once this cell has run.
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
model = RobertaModel.from_pretrained('roberta-base')

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
def chunk_text(text, chunk_size=400):
    """
    Split `text` into consecutive pieces of at most `chunk_size` tokens.

    The text is encoded with the module-level `tokenizer` (no special tokens),
    the token ids are sliced into windows, and each window is decoded back to
    a string.

    Parameters:
    text (str): Text to split.
    chunk_size (int): Maximum number of token ids per piece.

    Returns:
    list: The decoded pieces, in document order.
    """
    token_ids = tokenizer.encode(text, add_special_tokens=False)
    pieces = []
    for start in range(0, len(token_ids), chunk_size):
        window = token_ids[start:start + chunk_size]
        pieces.append(tokenizer.decode(window, clean_up_tokenization_spaces=True))
    return pieces

def get_embedding(text):
    """
    Embed `text` with the module-level `tokenizer` and `model`.

    The input is truncated to 512 tokens and the embedding is the mean of the
    model's last hidden state over dim=1.

    Parameters:
    text (str): Text to embed.

    Returns:
    torch.Tensor: The averaged last hidden state.
    """
    inputs = tokenizer(text, return_tensors='pt', padding=True, truncation=True, max_length=512)
    # Inference only: skip autograd bookkeeping. Callers detach the result
    # anyway, so the returned values are unchanged.
    with torch.no_grad():
        outputs = model(**inputs)
    # Use the last hidden state mean as the sentence embedding
    return outputs.last_hidden_state.mean(dim=1)

def find_most_relevant_chunk(document, claim):
    """
    Find the chunk of `document` whose embedding is most similar to `claim`.

    The document is split with chunk_text (400 tokens per chunk) and every
    chunk is compared to the claim by cosine similarity of their
    get_embedding vectors.

    Parameters:
    document (str): Text to search (e.g. the facts of one case).
    claim (str): Claim to match against the document.

    Returns:
    tuple: (most_relevant_chunk, highest_similarity). For a document that
        yields no chunks this is (None, -1).
    """
    chunks = chunk_text(document, chunk_size=400)
    # The claim vector does not change per chunk, so convert it once up front.
    claim_vector = get_embedding(claim).detach().numpy()

    highest_similarity = -1
    most_relevant_chunk = None

    for chunk in chunks:
        chunk_vector = get_embedding(chunk).detach().numpy()
        similarity = cosine_similarity(claim_vector, chunk_vector)[0][0]

        if similarity > highest_similarity:
            highest_similarity = similarity
            most_relevant_chunk = chunk

    return most_relevant_chunk, highest_similarity

In [None]:
# Load the fact-check tokenizer and classifier under their own names, separate
# from the `tokenizer`/`model` globals that chunk_text and get_embedding read.
fact_tokenizer = RobertaTokenizer.from_pretrained('Dzeniks/roberta-fact-check')
fact_model = RobertaForSequenceClassification.from_pretrained('Dzeniks/roberta-fact-check')

config.json:   0%|          | 0.00/686 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/499M [00:00<?, ?B/s]

In [None]:
# GPT-4 deceptive claims, checked against the most relevant chunk of the case facts.
import os

# Specify the directory to iterate over
directory_path = '/content/drive/MyDrive/Claims/GPT_4/Deceptive'

truthful_count = 0
deceptive_count = 0
fact_model.eval()  # inference mode; does not need to be repeated per file
for entry in os.listdir(directory_path):
    if not entry.endswith('.txt'):
        continue  # read_txt_file_deceptive only handles '<case number>.txt' files
    claim, facts = read_txt_file_deceptive(entry, directory_path)
    most_relevant_chunk, highest_similarity = find_most_relevant_chunk(facts, claim)

    # A 400-token chunk plus a long claim can still exceed 512 tokens, so the
    # pair is truncated to the model's limit.
    x = fact_tokenizer.encode_plus(claim, most_relevant_chunk, return_tensors="pt", truncation=True, max_length=512)

    with torch.no_grad():
        prediction = fact_model(**x)

    label = torch.argmax(prediction.logits).item()

    # Model has two labels label 0 (supports) and label 1 (refutes)
    # For our purposes, 0 = Truthful and 1 = deceptive
    if label == 0:
        truthful_count += 1
    elif label == 1:
        deceptive_count += 1



In [None]:
# Show the truthful / deceptive tallies, one per line.
print(truthful_count, deceptive_count, sep="\n")

3
40


In [None]:
# GPT-4 truthful claims, checked against the most relevant chunk of the case facts.
# (This cell was labelled "GPT 4 Deceptive", but it reads the Truthful folder.)
import os

# Specify the directory to iterate over
directory_path = '/content/drive/MyDrive/Claims/GPT_4/Truthful'

truthful_count = 0
deceptive_count = 0
fact_model.eval()  # inference mode; does not need to be repeated per file
for entry in os.listdir(directory_path):
    if not entry.endswith('.txt'):
        continue  # read_txt_file_truthful only handles '<case number>.txt' files
    claim, facts = read_txt_file_truthful(entry, directory_path)
    most_relevant_chunk, highest_similarity = find_most_relevant_chunk(facts, claim)

    # A 400-token chunk plus a long claim can still exceed 512 tokens, so the
    # pair is truncated to the model's limit.
    x = fact_tokenizer.encode_plus(claim, most_relevant_chunk, return_tensors="pt", truncation=True, max_length=512)

    with torch.no_grad():
        prediction = fact_model(**x)

    label = torch.argmax(prediction.logits).item()

    # Model has two labels label 0 (supports) and label 1 (refutes)
    # For our purposes, 0 = Truthful and 1 = deceptive
    if label == 0:
        truthful_count += 1
    elif label == 1:
        deceptive_count += 1



In [None]:
# Show the truthful / deceptive tallies, one per line.
print(truthful_count, deceptive_count, sep="\n")

3
40


In [None]:
# Ad-hoc check of the chunk retrieval on a single case.
# `case_facts` is the dict returned by Find_Case_Facts, while
# find_most_relevant_chunk expects one document string, so one case is
# selected by name here.
# NOTE(review): "AZ vs Jodi Arias" is taken from the "Case Name" column shown
# above - confirm it is the case this check was meant for.
most_relevant_chunk, highest_similarity = find_most_relevant_chunk(case_facts["AZ vs Jodi Arias"], "Arias pleaded guilty on Septermber 11")

Token indices sequence length is longer than the specified maximum sequence length for this model (1370 > 512). Running this sequence through the model will result in indexing errors
