In [None]:
# Mount Google Drive at /content/drive so later cells can read the CSVs stored there.
# Only needed (and only works) when running in Google Colab, which provides `google.colab`.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# Installation Steps

In [None]:
#Install tqdm for progress bars
!pip install --quiet tqdm

#Install pandas
!pip install --quiet pandas

#Install sklearn
!pip install --quiet sklearn

#Install termcolor
!pip install --quiet termcolor

[0m

# Verify Installations

In [None]:
# NOTE(review): despite the "Verify Installations" heading, this command asks apt to
# install python3 rather than checking anything — confirm whether a version check was intended.
!sudo apt install python3

# Data Preprocessing

## Fetch data from filtered csvs

In [None]:
import os

import numpy as np
import pandas as pd

from termcolor import colored
from tqdm import tqdm

# Local path, kept for running outside Colab:
# testPath = "./filtered_data/test.csv"

# For Google Colab: read the preprocessed test split from the mounted Drive folder
testPath = "./drive/MyDrive/preprocessed_data/test.csv"
dfTest = pd.read_csv(testPath)

# Preview the first rows to confirm the file loaded as expected
dfTest.head()

Unnamed: 0,claim,label,annotated-questions,label-binary
0,"Says Kentucky Rep. Andy Barr ""would let shady ...",half-true,"1. Has Barr received $36,550 from payday lende...",1
1,"""New reports show Kevin Nicholson made over $1...",barely-true,1. Did Nicholson make $1 million consulting fo...,0
2,Says that unless the recipient called back abo...,pants-fire,1. Will people be taken into local police cust...,0
3,"""Donald Trump said he was excited for the 2008...",half-true,"1. Was Donald Trump ""Excited"" for the 2008 hou...",1
4,"""The president has said the national security ...",barely-true,1. Has Obama cited climate change as the top n...,0


## Optional code to verify dataframe

In [None]:
#Questions are of type str
print(type(dfTest.loc[0, 'annotated-questions']))
dfTest.loc[0, 'annotated-questions']

<class 'str'>


"1. Has Barr received $36,550 from payday lenders? \n2. Did Barr vote for legislation that would weaken restrictions for payday lenders? \n3. Are there any protections for service members using payday lending services? \n4. Has Barr's voting record directly affected protection for veterans against payday lenders? \n5. Did Bar accept $36,000 from pay day lenders? \n6. Does Bar let pay day lenders take advantage of troops?"

## Preprocess Question strings

In [None]:
# Convert each question string into an array for easier evaluation
dfTest['annotated-questions'] = dfTest['annotated-questions'].str.split('\n')
dfTest.head()

Unnamed: 0,claim,label,annotated-questions,label-binary
0,"Says Kentucky Rep. Andy Barr ""would let shady ...",half-true,"[1. Has Barr received $36,550 from payday lend...",1
1,"""New reports show Kevin Nicholson made over $1...",barely-true,[1. Did Nicholson make $1 million consulting f...,0
2,Says that unless the recipient called back abo...,pants-fire,[1. Will people be taken into local police cus...,0
3,"""Donald Trump said he was excited for the 2008...",half-true,"[1. Was Donald Trump ""Excited"" for the 2008 ho...",1
4,"""The president has said the national security ...",barely-true,[1. Has Obama cited climate change as the top ...,0


## Evaluate Baseline Quin Performance

In [None]:
# Placeholder columns for the Quin results, each filled with its default value.
# Insertion order of the dict is the order in which the columns are appended.
quin_column_defaults = {
    'Quin-supporting': 0,
    'Quin-refuting': 0,
    'Quin-label': "",
    'Quin-label-binary': 0,
    'Quin-evidence': "",
    'Quin-performance': 0,
}
for column_name, default_value in quin_column_defaults.items():
    dfTest[column_name] = default_value

dfTest.head()

Unnamed: 0,claim,label,annotated-questions,label-binary,Quin-supporting,Quin-refuting,Quin-label,Quin-label-binary,Quin-evidence,Quin-performance
0,"Says Kentucky Rep. Andy Barr ""would let shady ...",half-true,"[1. Has Barr received $36,550 from payday lend...",1,0,0,,0,,0
1,"""New reports show Kevin Nicholson made over $1...",barely-true,[1. Did Nicholson make $1 million consulting f...,0,0,0,,0,,0
2,Says that unless the recipient called back abo...,pants-fire,[1. Will people be taken into local police cus...,0,0,0,,0,,0
3,"""Donald Trump said he was excited for the 2008...",half-true,"[1. Was Donald Trump ""Excited"" for the 2008 ho...",1,0,0,,0,,0
4,"""The president has said the national security ...",barely-true,[1. Has Obama cited climate change as the top ...,0,0,0,,0,,0


In [None]:
!pip install aiohttp
!pip install nest-asyncio
!pip install BeautifulSoup

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting aiohttp
  Downloading aiohttp-3.8.4-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.0/1.0 MB[0m [31m13.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting aiosignal>=1.1.2
  Downloading aiosignal-1.3.1-py3-none-any.whl (7.6 kB)
Collecting async-timeout<5.0,>=4.0.0a3
  Downloading async_timeout-4.0.2-py3-none-any.whl (5.8 kB)
Collecting multidict<7.0,>=4.5
  Downloading multidict-6.0.4-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (114 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m114.2/114.2 KB[0m [31m11.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting frozenlist>=1.1.1
  Downloading frozenlist-1.3.3-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl (158 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32

### Note: 

This evaluation relies on the Quin backend hosted at: https://empty-meadow-488.fly.dev/api2


In [None]:
import asyncio
import aiohttp
import nest_asyncio
import json

# Patch asyncio so that asyncio.run() can be used even though the notebook
# already has an event loop running.
nest_asyncio.apply()

# Base URL of the Quin backend that every claim is sent to
quin_url = 'https://empty-meadow-488.fly.dev/api2'

async def fetchResutls(url, claim):
  """Send one claim to the Quin backend and return the raw response body.

  Args:
    url: Endpoint to query.
    claim: Claim text, sent as the `query` URL parameter.

  Returns:
    The response body as text (the caller parses it as JSON).

  Raises:
    aiohttp.ClientResponseError: if the server answers with an HTTP error status.
  """
  params = {'query': claim}
  async with aiohttp.ClientSession() as session:
    async with session.get(url, params=params) as response:
      # Fail here on 4xx/5xx instead of handing an error page to json.loads downstream
      response.raise_for_status()
      return await response.text()

async def evaluate(df):
  """Query Quin for every claim in `df` and store the results in the Quin-* columns.

  For each row the claim is sent to `quin_url`. When the response type is
  "statement", the supporting/refuting counts and the veracity rating are
  stored; otherwise the counts are set to 0 and the label to
  "Unable to evaluate". The JSON-encoded evidence list is stored in both cases.

  Args:
    df: DataFrame with a 'claim' column; it is modified in place.

  Returns:
    The same DataFrame, so the result of asyncio.run(evaluate(df)) is usable.
  """
  # Cap on the stored evidence string.
  # NOTE(review): presumably chosen to stay under a spreadsheet cell limit — confirm.
  max_evidence_chars = 32700

  # Iterate over the actual index labels so frames without a 0..n-1 index also work
  for i in tqdm(df.index):
    claim = df.loc[i, 'claim']
    response = json.loads(await fetchResutls(quin_url, claim))
    data = response['data']

    if data['type'] == "statement":
      df.loc[i, 'Quin-supporting'] = data['supporting']
      df.loc[i, 'Quin-refuting'] = data['refuting']
      df.loc[i, 'Quin-label'] = data['veracity_rating']
    else:
      df.loc[i, 'Quin-supporting'] = 0
      df.loc[i, 'Quin-refuting'] = 0
      df.loc[i, 'Quin-label'] = "Unable to evaluate"

    # Evidence is stored the same way for both response types
    evidence = json.dumps(data['results'])
    if len(evidence) > max_evidence_chars:
      evidence = evidence[:max_evidence_chars] + " ...evidence too long"
    df.loc[i, 'Quin-evidence'] = evidence

  return df

# Run the evaluation over every claim; evaluate() writes the Quin results into dfTest in place.
# dfResult only captures evaluate()'s return value — inspect dfTest for the filled-in columns.
dfResult = asyncio.run(evaluate(dfTest))

100%|██████████| 200/200 [08:56<00:00,  2.68s/it]


AttributeError: ignored

In [None]:
# Preview the first rows now that the Quin result columns have been populated
dfTest.head()

Unnamed: 0,claim,label,annotated-questions,label-binary,Quin-supporting,Quin-refuting,Quin-label,Quin-label-binary,Quin-evidence,Quin-performance
0,"Says Kentucky Rep. Andy Barr ""would let shady ...",half-true,"[1. Has Barr received $36,550 from payday lend...",1,1,0,Not enough evidence,0,"[{""snippet"": ""<b> Says Kentucky Rep. Andy Barr...",0
1,"""New reports show Kevin Nicholson made over $1...",barely-true,[1. Did Nicholson make $1 million consulting f...,0,4,0,Probably True,0,"[{""snippet"": ""<b> \""New reports show Kevin Nic...",0
2,Says that unless the recipient called back abo...,pants-fire,[1. Will people be taken into local police cus...,0,2,0,Not enough evidence,0,"[{""snippet"": ""Legislative Regulation c. Letter...",0
3,"""Donald Trump said he was excited for the 2008...",half-true,"[1. Was Donald Trump ""Excited"" for the 2008 ho...",1,3,0,Probably True,0,"[{""snippet"": "" look at his history. <b>donald ...",0
4,"""The president has said the national security ...",barely-true,[1. Has Obama cited climate change as the top ...,0,0,0,Not enough evidence,0,[],0


In [None]:
# Checkpoint the raw Quin responses (step 0).
# Create the output folder first: to_csv raises OSError if the directory does not exist.
os.makedirs("./result", exist_ok=True)
dfTest.to_csv("./result/evalTest-step0.csv", index=False, encoding='utf-8-sig', header=True)

## Compute fact-checking performance

### Step 1: Convert Quin labels into binary values

We reclassify Quin's original labels into three categories — True, False, and Not evaluated — encoded as 1, 0, and -1 respectively, so that they can be compared directly with the binary labels of the annotated dataset.

**New True labels are represented by:**

Original labels:  
1. Probably True

**New False labels are represented by:**

Original labels:  
1. Probably False

**New Not evaluated labels are represented by:**

Original labels:  
1. Not enough evidence
2. ? Ambiguous
3. Unable to evaluate


In [None]:
def map_labels(df):
    """Encode the textual 'Quin-label' column as 1 / 0 / -1 in 'Quin-label-binary'.

    1 = Probably True, 0 = Probably False, -1 = not evaluated. Labels that are
    missing or not listed in the mapping are also treated as not evaluated (-1),
    which keeps the column integer-typed instead of introducing NaN.

    Args:
        df: DataFrame with a 'Quin-label' column; it is modified in place.

    Returns:
        The same DataFrame with 'Quin-label-binary' filled in.
    """
    # map labels to 1, 0, -1 according to the above classification
    label_equivalent = {
        'Probably True': 1,
        'Probably False': 0,
        '? Ambiguous': -1,
        'Unable to evaluate': -1,
        'Not enough evidence': -1,
    }

    # Series.map yields NaN for anything outside the mapping; fold those into -1
    df['Quin-label-binary'] = (
        df['Quin-label'].map(label_equivalent).fillna(-1).astype(int)
    )
    return df

# Fill 'Quin-label-binary' from the textual Quin labels, then preview the result
dfTest = map_labels(dfTest)
dfTest.head()

Unnamed: 0,claim,label,annotated-questions,label-binary,Quin-supporting,Quin-refuting,Quin-label,Quin-label-binary,Quin-evidence,Quin-performance
0,"Says Kentucky Rep. Andy Barr ""would let shady ...",half-true,"[1. Has Barr received $36,550 from payday lend...",1,1,0,Not enough evidence,-1,"[{""snippet"": ""<b> Says Kentucky Rep. Andy Barr...",0
1,"""New reports show Kevin Nicholson made over $1...",barely-true,[1. Did Nicholson make $1 million consulting f...,0,4,0,Probably True,1,"[{""snippet"": ""<b> \""New reports show Kevin Nic...",0
2,Says that unless the recipient called back abo...,pants-fire,[1. Will people be taken into local police cus...,0,2,0,Not enough evidence,-1,"[{""snippet"": ""Legislative Regulation c. Letter...",0
3,"""Donald Trump said he was excited for the 2008...",half-true,"[1. Was Donald Trump ""Excited"" for the 2008 ho...",1,3,0,Probably True,1,"[{""snippet"": "" look at his history. <b>donald ...",0
4,"""The president has said the national security ...",barely-true,[1. Has Obama cited climate change as the top ...,0,0,0,Not enough evidence,-1,[],0


In [None]:
# Checkpoint the frame after label binarisation (step 1).
# Create the output folder first: to_csv raises OSError if the directory does not exist.
os.makedirs("./result", exist_ok=True)
dfTest.to_csv("./result/evalTest-step1.csv", index=False, encoding='utf-8-sig', header=True)

### Step 2: Evaluate baseline Quin performance

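The `Quin-performance` column is set to 1 when Quin's binary label matches the annotator's binary label and to 0 otherwise. Rows that Quin could not evaluate (-1) are therefore counted as unmatched whenever the annotated label is 0 or 1.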

In [None]:
def evalQuin(df):
  """Score Quin against the annotated labels and print summary statistics.

  Sets 'Quin-performance' to 1 where 'label-binary' equals 'Quin-label-binary'
  and to 0 elsewhere, then prints the accuracy (rounded percentage), the number
  of matched and unmatched rows, and how many unmatched rows carry one of the
  "not evaluated" Quin labels.

  Args:
    df: DataFrame with 'label-binary', 'Quin-label-binary' and 'Quin-label'
      columns; 'Quin-performance' is written in place.
  """
  not_evaluated_labels = ['? Ambiguous', 'Unable to evaluate', 'Not enough evidence']

  # Column-wise comparison: works for any index, not only the default 0..n-1
  is_match = df['label-binary'] == df['Quin-label-binary']
  df['Quin-performance'] = is_match.astype(int)

  Matched = int(is_match.sum())
  Unmatched = len(df) - Matched
  # Only unmatched rows are counted here, as in the row-by-row definition
  UnableToEval = int((~is_match & df['Quin-label'].isin(not_evaluated_labels)).sum())

  # An empty frame has no accuracy to speak of; report 0 instead of dividing by zero
  Accuracy = round((Matched / len(df)) * 100) if len(df) else 0
  print("\nAccuracy: {}%".format(Accuracy))
  print("Matched: {}".format(Matched))
  print("Unmatched: {}".format(Unmatched))
  print("UnableToEval: {}".format(UnableToEval))

# Score Quin on the test set; fills 'Quin-performance' and prints the summary
evalQuin(dfTest)


100%|██████████| 200/200 [00:00<00:00, 8687.46it/s]


Accuracy: 16%
Matched: 33
Unmatched: 167
UnableToEval: 143





In [None]:
# Checkpoint the frame with the per-claim performance column (step 2).
# Create the output folder first: to_csv raises OSError if the directory does not exist.
os.makedirs("./result", exist_ok=True)
dfTest.to_csv("./result/evalTest-step2.csv", index=False, encoding='utf-8-sig', header=True)