# Preparation

In [2]:
from sentence_transformers import SentenceTransformer
import scipy
import os
import pandas as pd
import numpy as np
import re

In [2]:
# LOAD BERT SENTENCE MODEL
# Pretrained checkpoints are published by the sentence-transformers project:
#   - Natural Language Inference (NLI) models:
#     https://github.com/UKPLab/sentence-transformers/blob/master/docs/pretrained-models/nli-models.md
#   - Semantic Textual Similarity (STS) models:
#     https://github.com/UKPLab/sentence-transformers/blob/master/docs/pretrained-models/sts-models.md
# Swap MODEL_NAME for any checkpoint listed there to try a different encoder.
MODEL_NAME = 'bert-base-nli-mean-tokens'

model = SentenceTransformer(MODEL_NAME)

In [3]:
# Try different encodings and separators until the CSV parses sensibly.
def read_csv(filepath):
    """Read a CSV file whose text encoding and field separator are unknown.

    Every separator in ``seps`` is combined with every encoding in
    ``encodings`` (same candidates and same order as before).  A parse that
    yields more than one column is returned immediately.  A parse that yields
    a single column is only kept as a fallback, because reading a ``;``- or
    tab-delimited file with ``sep=','`` "succeeds" by cramming every row into
    one column; the remaining separators are tried before settling for it.

    Parameters
    ----------
    filepath : str
        Path to the file.  Only paths ending in ``.csv`` are read.

    Returns
    -------
    pandas.DataFrame or None
        The parsed frame, or ``None`` when ``filepath`` does not end in
        ``.csv`` (kept for backward compatibility with existing callers).

    Raises
    ------
    ValueError
        If no separator/encoding combination can parse the file.
    OSError
        If the file cannot be opened at all (e.g. it does not exist).  This
        is no longer masked as an encoding problem.
    """
    if os.path.splitext(filepath)[1] != '.csv':
        return None

    seps = [',', ';', '\t']                    # ',' is the pandas default
    # None is the pandas default.  ISO-8859-1 can decode any byte sequence,
    # so the entries after it are only reached when parsing (not decoding)
    # fails.
    encodings = [None, 'utf-8', 'ISO-8859-1', 'utf-16', 'ascii']

    single_column_fallback = None
    for sep in seps:
        for encoding in encodings:
            try:
                frame = pd.read_csv(filepath, encoding=encoding, sep=sep)
            except ValueError:
                # Covers decoding errors (UnicodeError) and pandas'
                # ParserError / EmptyDataError, all ValueError subclasses.
                # Anything else (missing file, permissions, ...) propagates.
                continue
            if frame.shape[1] > 1:
                return frame
            if single_column_fallback is None:
                single_column_fallback = frame
            # This separator decoded fine but did not split the rows; another
            # encoding will not change that, so move on to the next separator.
            break

    if single_column_fallback is not None:
        # Genuinely single-column file: same result the first attempt gave.
        return single_column_fallback
    raise ValueError("{!r} is has no encoding in {} or seperator in {}"
                     .format(filepath, encodings, seps))

# Set up corpus

In [8]:
# Directory holding the corpus CSV.  Set the THESIS_DATA_DIR environment
# variable to run the notebook on another machine; when it is unset the
# original author's location is used, so the resulting path is unchanged.
DATA_DIR = os.environ.get(
    'THESIS_DATA_DIR',
    '/Users/macos/Documents/OneDrive/LEARN_EDI/ThesisProject/Code/ThesisProject',
)
filename = os.path.join(DATA_DIR, 'PoynterCovid19Database_Reference_Article.csv')

# Only these columns are loaded from the CSV.
fields = ['docID', 'content', 'accuracy', 'date', 'region', 'explanation', 'reference_url', 'reference_html', 'reference_text']
df = pd.read_csv(filename, usecols=fields)

# One entry per CSV row, in row order, so list position i matches df row i.
# NOTE(review): a row with a missing `content` would show up here as NaN
# rather than a string - confirm the column is fully populated.
sentences = df.content.tolist()
sentences

                    

['The confinement of Spain has been “the most severe in the world”',
 'CDCs denied that SARSCoV2 was isolated and said PCR tests don’t work',
 'These people were gasified in a beach, as a response to COVID-19 resurgences',
 'COVID-19 is caused by a bacteria and some countries found the cure in antibiotics',
 'H1N1 vaccine in Brazil is made in China',
 'COVID-19 can be prevented and fought through washing your teeth',
 'Alberto Fernández (president of Argentina) said: “With Macri we would have had 10,000 deaths from coronavirus”',
 'A study says that the COVID-19 vaccine will cause sterility',
 'COVID-19 vaccine causes cancer, genetic modifications in humans and causes ‘homosexuality’',
 'Exposing the public to the virus will end the pandemic before vaccines are available',
 'Doctor Luis Martínez made several claims that circulated in viral videos.',
 'The WHO says that covid is an endemic virus.',
 'No vaccines against COVID-19 in the world have been through pre-clinical trials',
 'The

In [9]:
# The corpus is the list of sentences built above; encoding it yields one
# 1-D embedding vector per sentence.
# NOTE(review): an earlier version of this comment said each vector has 78
# columns; BERT-base checkpoints normally produce 768-dimensional vectors,
# so that was presumably a typo - confirm with sentence_embeddings.shape.
sentence_embeddings = model.encode(sentences)

# print('Sample BERT embedding vector - length', len(sentence_embeddings[0]))

# Show the first embedding to illustrate that components can be negative.
first_embedding = sentence_embeddings[0]
print('Sample BERT embedding vector - note includes negative values', first_embedding)

Sample BERT embedding vector - note includes negative values [-0.31657708 -0.08714552  0.08483327 -0.06961886 -0.48633435 -0.6176461
  0.68463004  1.2688848   0.7726604  -0.31759274 -0.30851185  0.13908762
  0.1218169   0.65443325  0.5437663   0.6161888  -0.33997437 -0.48260477
  0.0212503  -1.025387    0.10687941 -0.38362727  0.64766496 -0.5637954
  1.200797   -0.35971817 -0.56121314 -0.8324255  -0.8415982   0.02972327
 -0.0577215  -0.2680142  -0.5989976   0.01630904  0.14596373  1.0827708
 -0.06652933 -0.81775737 -0.10552037 -0.6681382   0.8377586  -0.3785214
  0.31860125  0.03293499 -0.3352435  -0.08806635  0.51710826  0.12831512
  0.8077717  -0.6402124  -0.6772317   0.79344755  0.8738781   0.15799794
 -0.27087086 -0.66594213 -0.41702494 -0.50691676 -0.26818877 -0.7542284
  0.11915894  0.8412625  -0.45978346  0.25967634 -0.40939677  0.47996244
  0.48410255  0.6231768  -1.3762779  -0.6390427   0.1083585  -1.2701049
 -0.27909172 -0.42796713 -0.79435205 -0.98953724 -0.3440558   0.14062

In [12]:
# Persist the corpus embeddings to disk, then load them straight back and
# display the result as a quick round-trip check.
EMBEDDINGS_FILE = 'corpus_content_sentence_embedding.npy'

np.save(EMBEDDINGS_FILE, sentence_embeddings)
copy = np.load(EMBEDDINGS_FILE)
copy

array([[-0.31657708, -0.08714552,  0.08483327, ..., -1.0332795 ,
         0.27905363, -0.16434515],
       [ 0.60393685,  0.9642001 ,  0.8390832 , ...,  0.5080613 ,
        -0.41359526, -0.07227249],
       [ 0.25111535, -0.17284997,  0.7768563 , ..., -0.20110221,
        -0.05521448, -0.15867141],
       ...,
       [-0.4015222 , -0.40147984,  0.73259574, ..., -0.12461199,
         0.71240634,  0.2060195 ],
       [ 0.2847222 ,  0.28602582,  0.68743694, ...,  0.0647982 ,
        -0.2579594 ,  0.04684826],
       [ 0.15641083,  0.57378566,  0.82513463, ...,  0.50043845,
         0.12180942,  0.39115906]], dtype=float32)

# PERFORM SEMANTIC SEARCH

In [3]:
query_id = 0
# Results file, opened for writing (truncates any existing file).  It is left
# open on purpose: nothing in this cell writes to it, so it is presumably
# used by later cells, which must close it when they are done.
RankedIROutput = open('document_retrieval_results.txt', 'w')

# step1: extract the original queries from file
# The `with` block closes the query file as soon as it has been read; the
# handle was previously left open for the rest of the session.
with open("queries_toy.txt", "r") as query_ranked_file:
    query_ranked_str = query_ranked_file.read()

# One query per line.  A trailing newline in the file produces a final empty
# string in the list, exactly as before.
query_ranked_list = re.split(r'\n', query_ranked_str)
query_ranked_list

['This video about what Vitamin C has been proven to do is amazing! Didn’t know 60 out of 60 cases of polio were cured with Vitamin C in the 1930’s. And yes, it kills viruses.',
 'It keeps getting bizarre ! Holding your breath may increase risk of getting Covid-19, say IIT Madras researchers | Hindustan Times',
 'Drinking warm water kills viruses, gargling salt water, breath holding to self diagnose fibrosis..',
 '@Jar_O_Cats May cost $3,000+, at least we can test. — Only three US states can test for coronavirus, says public lab group',
 'Bathi he said 5G brought Corona Virus',
 "COVID: China's Sinovac vaccine found to be 50.4% effective in Brazil trials",
 "I reckon my current tube of salt and vinegar @Pringles could be a cure for coronavirus, I'm not even a quarter of the way down the tube and I feel my lips and mouth disintegrating, coronavirus would have no chance! Face savouring food I love the pain they cause me"]