# SECTION 0: Install packages if required

Install necessary packages as required

In [None]:
!pip install cloud-tpu-client==0.10 torch==1.11.0 https://storage.googleapis.com/tpu-pytorch/wheels/colab/torch_xla-1.11-cp37-cp37m-linux_x86_64.whl

In [None]:
pip install fuzzywuzzy

In [None]:
pip install python-Levenshtein

In [None]:
pip install polyfuzz

In [None]:
pip install flair

In [None]:
pip install polyfuzz[flair]


# SECTION 1: Cleaning the Dataset

Importing necessary libraries

In [3]:
import json
import random
import re
import os
import sys
import string
import math
import numpy as np
from tqdm import tqdm
from collections import Counter
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import nltk

In [None]:
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

Import files from Drive

Link for the Dataset:
https://drive.google.com/drive/folders/10CQ_BADLLNgALyTwC_18gmYzWL58cpG-?usp=sharing

The Google Drive folder is shared, just add it to the base directory of your Drive ("/content/drive/MyDrive") and the mounting will work fine

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


Function to read data from json file to a list. instance_num is used to read the first n data in the json.

In [1]:
def load_dataset_json(path, instance_num = 1e7):
    data = []
    with open(path, 'r') as openfile:
        #data = json.load(openfile)
        for iline, line in enumerate(openfile.readlines()):
            data.append(json.loads(line))
            if iline + 1 >= instance_num:
                break
    return data

Function to read data from Wikipedia articles

In [13]:
def load_wiki_json(path, instance_num = 1e5):
    data = []
    with open(path, 'r') as openfile:
        #data = json.load(openfile)
        for iline, line in enumerate(openfile.readlines()):
            data.append(json.loads(line))
            if iline + 1 >= instance_num:
                break
    for d in data:
      d.pop('lines')            
    return data

Loading the dataset

In [None]:
test_path = '/content/drive/MyDrive/data/fever-data/test.jsonl'
test_data = load_dataset_json(path=test_path, instance_num=20)

for sample in test_data[:10]:
    print(sample)

{'id': 89296, 'claim': 'Henry Spencer is played by a Greek actor.'}
{'id': 78554, 'claim': 'John Ritter died in October.'}
{'id': 83809, 'claim': '13 Reasons Why is the only television series of 2012 in the drama-mystery genre.'}
{'id': 49758, 'claim': 'Playboy is a magazine.'}
{'id': 22973, 'claim': 'Alternative metal is the genre in which Alice in Chains usually performs.'}
{'id': 181494, 'claim': 'Sam Peckinpah directed The Wild Bunch.'}
{'id': 161592, 'claim': "The St. John's water dog is a breed of domestic dog that was first bred in Newfoundland."}
{'id': 117342, 'claim': 'Horseshoe crabs are not used in fertilizer.'}
{'id': 172204, 'claim': 'Sia (musician) has received an award presented by the cable channel MTV.'}
{'id': 95552, 'claim': 'Artificial intelligence raises concern.'}


Loading all wikipedia articles

In [14]:
wiki_data=[]
os.chdir('/content/drive/MyDrive/data/wiki-pages')
i=1
for file in os.listdir():
  print('\r', math.floor(100*i/109), '% done', end = '')
  wiki_data.extend(load_wiki_json(path = file))
  i += 1
os.chdir('..')  
os.chdir('..')  
wiki_data.remove(wiki_data[0])

 100 % done

Cleaning the claims to get tokens

In [None]:
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()
for sample in test_data:
    tokens =  re.split(r'[ -,._]', sample['claim'])
    tokens = list(filter(lambda token: token not in string.punctuation, tokens))
    sample['token'] = [lemmatizer.lemmatize(w.lower()) for w in tokens if not w.lower() in stop_words]

**Sample Output**: 

> Tokens obtained from the claims



In [None]:
for sample in test_data[:10]:   
    print(sample['token'])

['henry', 'spencer', 'played', 'greek', 'actor']
['john', 'ritter', 'died', 'october']
['13', 'reason', 'television', 'series', '2012', 'drama-mystery', 'genre']
['playboy', 'magazine']
['alternative', 'metal', 'genre', 'alice', 'chain', 'usually', 'performs']
['sam', 'peckinpah', 'directed', 'wild', 'bunch']
['st', 'john', 'water', 'dog', 'breed', 'domestic', 'dog', 'first', 'bred', 'newfoundland']
['horseshoe', 'crab', 'used', 'fertilizer']
['sia', 'musician', 'received', 'award', 'presented', 'cable', 'channel', 'mtv']
['artificial', 'intelligence', 'raise', 'concern']


Cleaning the wiki articles to get tokens

In [None]:
n = len(wiki_data)
i = 1
prt = 0
for sample in wiki_data:
  if math.floor(100*(i/n)) == prt:
    print('\r', prt, '% done', end = '')
    prt += 1
  tokens = re.split(r'[_,–. ]', sample['id'])
  tokens = list(filter(lambda token: token not in string.punctuation, tokens))
  sample['token'] = [lemmatizer.lemmatize(w.lower()) for w in tokens if not w.lower() in stop_words]
  i+=1

 10 % done

**Sample Output**: 

> Tokens obtained from the wiki articles

In [None]:
for sample in wiki_data[:10]:
    print(sample['token'])

['sin', 'sukju']
['south', 'oroville', 'california']
['southwest', 'golf', 'classic']
['st', "philip's", 'cathedral', 'san', 'felipe']
['st', "bartholomew's", 'church', 'chipping']
['soulmate', '-lrb-disambiguation-rrb-']
['spanish', 'flu', 'research']
['society', 'cultural', 'anthropology']
['skip', 'ewing']
['somerset', 'academy', '-lrb-pembroke', 'pine', 'florida-rrb-']


# SECTION 2: Document Retrieval

Import necessary packages

In [7]:
from polyfuzz.models import TFIDF
from polyfuzz import PolyFuzz
import fuzzywuzzy
from fuzzywuzzy import fuzz
from polyfuzz.models import Embeddings
from flair.embeddings import TransformerWordEmbeddings
from operator import length_hint

Setting up the thresholds

In [8]:
fuzz_threshold = 55
similarity_threshold = 0.8
num_doc = 10
num_claim = 5

Document Selection:
>  Two models designed:
* Tf - Idf
* BERT

>   Document selection done in 3 layers:
1.   Layer 1: Filter out articles which has similarity < fuzz_threshold, using fuzzy matching
2.   Layer 2: Get Tf-Idf similarity score
3.   Layer 3: Get a similiraty score using a BERT model

Each article which crosses the threshold are assigned a score considering the similarity obtained from all 3 layers. Top 10 (can be varied by the variable num_doc) articles are chosen for the next step.







Designing the models

In [None]:
tfidf = TFIDF(n_gram_range=(3, 3), min_similarity=0)
model1 = PolyFuzz(tfidf)
embeddings = TransformerWordEmbeddings('bert-base-multilingual-cased')
bert = Embeddings(embeddings, min_similarity=0)
model2 = PolyFuzz(bert)

Randomly selecting 5 (variable num_claim) claims for which the evidences are retrieved

In [None]:
res = random.sample(range(0, len(test_data)), num_claim)

Document Selection

In [None]:
document = []
for k in res:
    print('Starting on claim: ', test_data[k]['claim'])
    tup=[]
    n = len(wiki_data)
    prt = 0
    for i in range(n):
      if math.floor(100*(i+1)/n) == prt:
        print("\r",prt,'% done',end="")
        prt += 1
      count = 0
      similarity = fuzz.token_sort_ratio(test_data[k]['token'],wiki_data[i]['token'])
      if similarity > fuzz_threshold:
        model1.match(test_data[k]['token'],wiki_data[i]['token'])
        l = length_hint(sample['token'])
        count = round(similarity*l*0.01,1)
        for p in model1.get_matches()['Similarity']:
          if p > similarity_threshold:
            count+=1
        if count > round(similarity*l*0.01,1):
          model2.match(test_data[k]['token'],wiki_data[i]['token'])
          for p in model2.get_matches()['Similarity']:
            if p > similarity_threshold:
              count+=1
        if count > 3:
          tup.append((i,count)) 
    tup.sort(key = lambda x: x[1], reverse=True)
    document.append(tup[:num_doc])
    del tup
    print('\n')
del model1
del model2    

Starting on claim:  Henry Spencer is played by a Greek actor.
 100 % done

Starting on claim:  Sam Peckinpah directed The Wild Bunch.
 100 % done

Starting on claim:  Horseshoe crabs are not used in fertilizer.
 100 % done

Starting on claim:  The Portland Trail Blazers have thrice gone to the NBA Finals.
 100 % done

Starting on claim:  Brazilian Girls is a group.
 100 % done



Output for Section 2:

**DOCUMENT RETRIEVAL**

In [None]:
i = 0
for doc in document:
  if i>=num_claim:
    break
  print("Claim: ",test_data[res[i]]['claim'])
  for d in doc:
    print(d,'\t\t',wiki_data[d[0]]['id'])
  print('\n')
  i+=1  

Claim:  Henry Spencer is played by a Greek actor.
(2625886, 7.2) 		 Henry_Spencer_Ashbee
(2638145, 7.2) 		 Henry_C._Spencer
(2660251, 7.2) 		 Henry_E._Spencer
(3469927, 7.2) 		 Henry_Spencer_Palmer
(3566599, 7.2) 		 Lord_Henry_Spencer
(4560605, 7.1) 		 Henry_Spencer_Berkeley
(3096802, 6.9) 		 List_of_actors_who_have_played_Sherlock_Holmes
(4039115, 6.9) 		 List_of_Greek_actors
(4542972, 6.9) 		 Henry_Elvins_Spencer
(4555676, 6.9) 		 Henry_Spencer


Claim:  Sam Peckinpah directed The Wild Bunch.
(3974591, 6.9) 		 Sam_Peckinpah_bibliography
(4004022, 6.9) 		 Sam_Peckinpah
(5278822, 6.9) 		 Butch_Cassidy's_Wild_Bunch
(4171832, 4.9) 		 Bueng_Sam_Phan_District


Claim:  Horseshoe crabs are not used in fertilizer.
(3937879, 7.2) 		 Horseshoe_crab
(3844797, 7.0) 		 Mangrove_horseshoe_crab
(720281, 6.9) 		 Atlantic_horseshoe_crab
(1603546, 5.5) 		 Battle_of_the_House_in_the_Horseshoe
(3502453, 5.5) 		 Horseshoe_Crater
(3776655, 5.2) 		 Darling's_horseshoe_bat
(3803243, 5.2) 		 Dent's_horseshoe

# SECTION 3: Sentence Selection

Import packages

In [None]:
from nltk.tokenize import sent_tokenize

Setting number of evidence sentences required

In [None]:
num_sentence = 5

Results obtained from normal string matching without any model

In [None]:
i = 0
for doc in document:
  if i>=num_claim:
    break
  print('Claim: ',test_data[res[i]]['claim'])
  print('Evidence: ')
  evidence=[]
  for d in doc:
    token_text = sent_tokenize(wiki_data[d[0]]['text'])
    for s in token_text:
      t = re.split(r'[_,–. ]', s)
      t = list(filter(lambda token: token not in string.punctuation, t))
      tok = [lemmatizer.lemmatize(w.lower()) for w in t if not w.lower() in stop_words]
      similarity = fuzz.token_sort_ratio(test_data[res[i]]['token'],tok)+d[1]
      evidence.append((s,similarity))
  evidence.sort(key = lambda x: x[1], reverse=True)
  if len(evidence) < 1:
    print('Not enough evidence')
  for e in evidence[:num_sentence]:
    print(e[0])
  print('\n')
  i+=1  

Claim:  Henry Spencer is played by a Greek actor.
Evidence: 
This is a list of Greek actors .
Henry Spencer -LRB- born 1955 -RRB- is a Canadian computer programmer and space enthusiast .
The list of actors who have played Sherlock Holmes in film , television , stage , or radio includes :
He is coauthor , with David Lawrence , of the book Managing Usenet .
Spencer was succeeded as mayor by Mark P. Taylor in 1851 .


Claim:  Sam Peckinpah directed The Wild Bunch.
Evidence: 
A list of books and essays about Sam Peckinpah :   Peckinpah
Peckinpah 's combative personality , marked by years of alcohol and drug abuse , affected his professional legacy .
He was given the nickname `` Bloody Sam '' owing to the violence in his films .
It was popularized by the 1969 movie , Butch Cassidy and the Sundance Kid , and took its name from the original Wild Bunch .
Peckinpah 's films generally deal with the conflict between values and ideals , and the corruption of violence in human society .


Claim:  H

Sentence Selection:
>  Two models designed:
* Tf - Idf
* BERT

>   Sentence selection done in 2 layers:
1.   Layer 1: Get Tf-Idf similarity score
2.   Layer 2: Get a similiraty score using a BERT model

Each claim - candidate sentence pair passed thorough both the layers. Top 5 (can be varied by the variable num_sentences) evidence sentences are chosen as the final result.







Design the 2 models

In [None]:
embeddings = TransformerWordEmbeddings('bert-base-multilingual-cased')
bert = Embeddings(embeddings, min_similarity=0)
model4 = PolyFuzz(bert)
tfidf = TFIDF(n_gram_range=(3, 3))
model3 = PolyFuzz(tfidf)

## Final Output from the Project:

In [None]:
k = 0
for doc in document:
  if k>=num_claim:
    break
  print('######################')
  print('Claim: ',test_data[res[k]]['claim'])
  print('Evidence: ')
  evidence=[]
  for d in doc:
    token_text = sent_tokenize(wiki_data[d[0]]['text'])
    for s in token_text:
      t = re.split(r'[_,–. ]', s)
      t = list(filter(lambda token: token not in string.punctuation, t))
      tok = [lemmatizer.lemmatize(w.lower()) for w in t if not w.lower() in stop_words]
      if len(tok) < 2:
        continue
      model3.match(test_data[res[k]]['token'],tok)
      count = 0
      l = length_hint(test_data[res[k]]['token'])
      for i in model3.get_matches()['Similarity']:
        if i > similarity_threshold:
          count+=i
      if count>0:
        model4.match(test_data[res[k]]['token'],tok)
        for i in model4.get_matches()['Similarity']:
          if i > similarity_threshold:
            count+=i
      evidence.append((s,count))
  evidence.sort(key = lambda x: x[1], reverse=True)
  if len(evidence) < 1:
    print('Not enough evidence\n')
    k+=1
    continue
  for e in evidence[:num_sentence]:
    print(e[0])
  print('\n')
  k+=1  

######################
Claim:  Henry Spencer is played by a Greek actor.
Evidence: 
Henry Spencer Ashbee -LRB- 21 April 1834 -- 29 July 1900 -RRB- was a book collector , writer , and bibliographer .
Henry Christian Spencer -LRB- 1915 -- 2000 -RRB- was an American chemical engineer and executive at the Kerite Company in Seymour , Connecticut .
Henry Evans Spencer -LRB- born June 13 , 1807 in Columbia - now part of Cincinnati -RRB- was a notable Cincinnati resident and was Mayor of Cincinnati from 1843-1851 .
Major General Henry Spencer Palmer -LRB- 30 April 1838 -- 10 February 1893 -RRB- was a British army military engineer and surveyor , noted for his work in developing Yokohama harbor in the Empire of Japan as a foreign advisor to the Japanese government
Lord Henry John Spencer -LRB- 20 December 1770 -- 3 July 1795 -RRB- was a British diplomat and politician .


######################
Claim:  Sam Peckinpah directed The Wild Bunch.
Evidence: 
David Samuel `` Sam '' Peckinpah -LRB- -LSB

#ACCURACY (*OPTIONAL)

We can test accuracy by testing on the train set. Choosing some random samples from the training set, we can run our model on it and comparing them manually with the evidence already provided in them.

In [4]:
train_path = '/content/drive/MyDrive/data/fever-data/train.jsonl'
train_data = load_dataset_json(path=train_path, instance_num=20)

In [5]:
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()
for sample in train_data:
    tokens =  re.split(r'[ -,._]', sample['claim'])
    tokens = list(filter(lambda token: token not in string.punctuation, tokens))
    sample['token'] = [lemmatizer.lemmatize(w.lower()) for w in tokens if not w.lower() in stop_words]

In [9]:
tfidf = TFIDF(n_gram_range=(3, 3), min_similarity=0)
model1 = PolyFuzz(tfidf)
embeddings = TransformerWordEmbeddings('bert-base-multilingual-cased')
bert = Embeddings(embeddings, min_similarity=0)
model2 = PolyFuzz(bert)

In [10]:
res = random.sample(range(0, len(train_data)), num_claim)

In [11]:
for i in res:
  print('#####################')
  for s1 in train_data[i]['evidence']:
    for s2 in s1:
      print(s2[2])

#####################
Roman_Atwood
Roman_Atwood
#####################
History_of_art
#####################
Stranger_Things
#####################
Nikolaj_Coster-Waldau
Fox_Broadcasting_Company
#####################
Ryan_Gosling
Chad


In [None]:
document = []
for k in res:
    print('Starting on claim: ', train_data[k]['claim'])
    tup=[]
    n = len(wiki_data)
    prt = 0
    for i in range(n):
      if math.floor(100*(i+1)/n) == prt:
        print("\r",prt,'% done',end="")
        prt += 1
      count = 0
      similarity = fuzz.token_sort_ratio(train_data[k]['token'],wiki_data[i]['token'])
      if similarity > fuzz_threshold:
        model1.match(train_data[k]['token'],wiki_data[i]['token'])
        l = length_hint(sample['token'])
        count = round(similarity*l*0.01,1)
        for p in model1.get_matches()['Similarity']:
          if p > similarity_threshold:
            count+=1
        if count > round(similarity*l*0.01,1):
          model2.match(train_data[k]['token'],wiki_data[i]['token'])
          for p in model2.get_matches()['Similarity']:
            if p > similarity_threshold:
              count+=1
        if count > 3:
          tup.append((i,count)) 
    tup.sort(key = lambda x: x[1], reverse=True)
    document.append(tup[:num_doc])
    del tup
    print('\n')
del model1
del model2    

In [None]:
i = 0
for doc in document:
  if i>=num_claim+1:
    break
  print("Claim: ",train_data[res[i]]['claim'])
  for d in doc:
    print(d,'\t\t',wiki_data[d[0]]['id'])
  print('\n')
  i+=1  

Claim:  Nikolaj Coster-Waldau worked with the Fox Broadcasting Company.
(4022682, 7.4) 		 List_of_Manila_Broadcasting_Company_stations
(2087124, 7.3) 		 Far_East_Broadcasting_Company


Claim:  Roman Atwood is a content creator.
(3242020, 5.4) 		 Acts_of_Roman_Congregations
(4822710, 5.4) 		 Content_creation
(5123468, 4.4) 		 Romani_contemporary_art


Claim:  History of art includes architecture, dance, sculpture, music, painting, poetry literature, theatre, narrative, film, photography and graphic arts.


Claim:  Adrienne Bailon is an accountant.
(4586735, 7.6) 		 Adrienne_Bailon
(3263190, 5.6) 		 ASEAN_Federation_of_Accountants
(3264933, 5.6) 		 Accountant_General
(1783151, 5.5) 		 International_Federation_of_Accountants
(2890988, 5.5) 		 Chartered_Accountants_Ireland
(4616484, 5.5) 		 Accountant_General_of_the_Federation
(4681547, 5.5) 		 Certified_National_Accountant
(993021, 5.4) 		 Forensic_accountant
(1882480, 5.4) 		 Pan_African_Federation_of_Accountants
(2058077, 5.4) 		 Nation

In [None]:
k = 0
for doc in document:
  if k>=num_claim+1:
    break
  print('######################')
  print('Claim: ',train_data[res[k]]['claim'])
  print('Evidence: ')
  evidence=[]
  for d in doc:
    token_text = sent_tokenize(wiki_data[d[0]]['text'])
    for s in token_text:
      t = re.split(r'[_,–. ]', s)
      t = list(filter(lambda token: token not in string.punctuation, t))
      tok = [lemmatizer.lemmatize(w.lower()) for w in t if not w.lower() in stop_words]
      if len(tok) < 2:
        continue
      model3.match(train_data[res[k]]['token'],tok)
      count = 0
      l = length_hint(train_data[res[k]]['token'])
      for i in model3.get_matches()['Similarity']:
        if i > similarity_threshold:
          count+=i
      if count>0:
        model4.match(train_data[res[k]]['token'],tok)
        for i in model4.get_matches()['Similarity']:
          if i > similarity_threshold:
            count+=i
      evidence.append((s,count,wiki_data[d[0]]['id']))
  evidence.sort(key = lambda x: x[1], reverse=True)
  if len(evidence) < 1:
    print('Not enough evidence\n')
    k+=1
    continue
  for e in evidence[:num_sentence]:
    print(e[2],'\n',e[0])
  print('\n')
  k+=1  