# Preparation

In [3]:
import math
import os
import re

import numpy as np
import pandas as pd
import scipy
from sentence_transformers import SentenceTransformer
from stemming.porter2 import stem

# preprocessing method
def string_tokenise(string):
    """Split ``string`` into word tokens.

    A token is a maximal run of ``\\w`` characters (letters, digits and
    underscore); everything else acts as a separator and is discarded.

    Returns:
        list of str: the tokens in order of appearance.
    """
    return re.findall(r"\w+", string)


def case_fold(list1):
    """Lower-case every token in ``list1``.

    Returns:
        list of str: a new list with ``.lower()`` applied to each element;
        the input list is not modified.
    """
    lowered = []
    for token in list1:
        lowered.append(token.lower())
    return lowered


def stopping(list1, stopword_path="englishST.txt"):
    """Remove stop words from a list of tokens.

    Args:
        list1: list of (already case-folded) string tokens.
        stopword_path: path of a whitespace-separated stop-word file.
            Defaults to ``"englishST.txt"`` in the current working directory.

    Returns:
        list of str: the tokens of ``list1`` that do not appear in the
        stop-word file, in their original order.

    Raises:
        OSError: if the stop-word file cannot be opened.
    """
    # The context manager closes the file handle even when this function is
    # called once per document.
    with open(stopword_path, 'r') as stopfile:
        # A set gives O(1) membership tests instead of scanning a list per token.
        stopwords = set(stopfile.read().split())
    return [token for token in list1 if token not in stopwords]


def normalise(list1):
    """Stem every token in ``list1`` with the imported ``stem`` function.

    Returns:
        list: one stemmed token per input token, in the same order.
    """
    return [stem(token) for token in list1]

In [4]:
# Load the Sentence-BERT model used to embed both corpus sentences and queries.
# Other pretrained checkpoints are listed in the sentence-transformers docs:
#   NLI models: https://github.com/UKPLab/sentence-transformers/blob/master/docs/pretrained-models/nli-models.md
#   STS models: https://github.com/UKPLab/sentence-transformers/blob/master/docs/pretrained-models/sts-models.md

model = SentenceTransformer('bert-base-nli-mean-tokens')

In [5]:
# Try different encodings and separators until the CSV parses sensibly.
def read_csv(filepath):
    """Read a CSV file, probing several separators and encodings.

    For each separator (``,``, ``;``, tab) the encodings are tried in order
    until one parses. A parse that yields more than one column is returned
    immediately. A file read with the wrong separator usually still "parses"
    into a single column, so a single-column result is only kept as a
    fallback and returned if no separator produces several columns.

    Args:
        filepath: path of the file to read.

    Returns:
        pandas.DataFrame, or ``None`` when ``filepath`` does not have a
        ``.csv`` extension.

    Raises:
        ValueError: if no separator/encoding combination can decode and
            parse the file (``pandas.errors.EmptyDataError``, a ``ValueError``
            subclass, propagates for an empty file).
        OSError: if the file cannot be opened, e.g. it does not exist.
    """
    if os.path.splitext(filepath)[1] != '.csv':
        return None
    seps = [',', ';', '\t']                    # ',' is the pandas default
    encodings = [None, 'utf-8', 'ISO-8859-1', 'utf-16', 'ascii']  # None is the pandas default
    fallback = None
    for sep in seps:
        for encoding in encodings:
            try:
                frame = pd.read_csv(filepath, encoding=encoding, sep=sep)
            except (UnicodeError, pd.errors.ParserError):
                # Wrong encoding or inconsistent field count: try the next combination.
                continue
            if frame.shape[1] > 1:
                return frame
            if fallback is None:
                fallback = frame
            # This separator decoded fine; other encodings cannot add columns.
            break
    if fallback is not None:
        return fallback
    raise ValueError("{!r} could not be read with any encoding in {} or separator in {}"
                     .format(filepath, encodings, seps))

# Set up and preprocess

In [6]:
# Load the fact-check corpus.
# NOTE(review): this is an absolute, machine-specific path; consider moving it
# to a configurable data directory so the notebook runs on other machines.
filename = '/Users/macos/Documents/OneDrive/LEARN_EDI/ThesisProject/Code/ThesisProject/PoynterCovid19Database_Reference_Article.csv'
fields = ['docID', 'content', 'accuracy', 'date', 'region', 'explanation', 'reference_url', 'reference_html', 'reference_text']
df = pd.read_csv(filename, usecols=fields)

# `sentences` (the explanation column) is what gets preprocessed and embedded;
# `original_sentences` (the content column) is what is shown in search results.
# Series.tolist() already returns a fresh list, so no extra copy is needed.
sentences = df.explanation.tolist()
original_sentences = df.content.tolist()

# Read the raw queries, one per line; the context manager closes the file.
with open("queries_toy.txt", "r") as query_ranked_file:
    query_ranked_str = query_ranked_file.read()
query_ranked_list = re.split(r'\n', query_ranked_str)

# `queries` is overwritten with preprocessed text later on;
# `original_queries` keeps the raw text for display.
queries = query_ranked_list.copy()
original_queries = query_ranked_list.copy()

                    

In [7]:
# Display the raw queries read from queries_toy.txt as a quick sanity check.
query_ranked_list

['This video about what Vitamin C has been proven to do is amazing! Didn’t know 60 out of 60 cases of polio were cured with Vitamin C in the 1930’s. And yes, it kills viruses.',
 'It keeps getting bizarre ! Holding your breath may increase risk of getting Covid-19, say IIT Madras researchers | Hindustan Times',
 'Drinking warm water kills viruses, gargling salt water, breath holding to self diagnose fibrosis..',
 '@Jar_O_Cats May cost $3,000+, at least we can test. — Only three US states can test for coronavirus, says public lab group',
 'Bathi he said 5G brought Corona Virus',
 "COVID: China's Sinovac vaccine found to be 50.4% effective in Brazil trials",
 "I reckon my current tube of salt and vinegar @Pringles could be a cure for coronavirus, I'm not even a quarter of the way down the tube and I feel my lips and mouth disintegrating, coronavirus would have no chance! Face savouring food I love the pain they cause me"]

In [8]:
# Apply the same tokenise -> case-fold -> stop-word removal -> stemming
# pipeline to the queries and to the corpus, storing each result back as a
# single space-joined string.
separator = ' '

# Queries: read the raw text from query_ranked_list, write into `queries`.
query_id = 0
for query_id, query in enumerate(query_ranked_list, start=1):  # ids start from 1
    query_term = normalise(stopping(case_fold(string_tokenise(query))))
    queries[query_id - 1] = separator.join(query_term)

# Corpus: each entry of `sentences` is replaced in place by its preprocessed form.
sentence_id = 0
for sentence_id, sentence in enumerate(sentences, start=1):  # ids start from 1
    sentence_term = normalise(stopping(case_fold(string_tokenise(sentence))))
    sentences[sentence_id - 1] = separator.join(sentence_term)


In [62]:
# Encode the preprocessed corpus sentences with the SBERT model; the result
# holds one embedding vector per entry of `sentences`.
# NOTE(review): an earlier comment claimed each vector has 78 columns; BERT-base
# models normally produce 768-dimensional vectors - confirm with
# len(sentence_embeddings[0]).
# NOTE(review): this cell's execution count (62) is out of sequence with its
# neighbours; it must run before the cell that loads the .npy file below.
sentence_embeddings = model.encode(sentences)

# print('Sample BERT embedding vector - length', len(sentence_embeddings[0]))

# Persist the embeddings so later cells can reload them without re-encoding.
np.save('SBERT_preprocessed_explanation_embeddings.npy', sentence_embeddings)
print('Sample BERT embedding vector - note includes negative values', sentence_embeddings[0])

Sample BERT embedding vector - note includes negative values [-4.46851373e-01  6.44912839e-01  2.68110126e-01  2.02320486e-01
 -1.23803256e-04 -5.24920821e-01 -6.90814331e-02  7.68318594e-01
  7.40365326e-01 -3.10796499e-01  1.70575514e-01  7.55888641e-01
  6.26585066e-01  3.98445874e-01  4.81432915e-01  7.70725191e-01
 -6.59495592e-01 -1.99322507e-01  3.69697541e-01 -1.38589454e+00
  4.72655147e-02  1.95070520e-01  5.32901883e-01 -1.99137226e-01
  1.50625181e+00  9.46245566e-02 -3.89028996e-01 -8.50002408e-01
 -1.28851485e+00  4.51728255e-01 -1.18132100e-01  7.44592845e-02
 -8.70879292e-01 -3.04195613e-01  5.88325977e-01  3.06757420e-01
 -2.28482142e-01 -4.64737356e-01  3.85917068e-01 -2.75615960e-01
  1.26702785e+00 -2.96556443e-01  2.23576158e-01 -1.45893320e-01
 -1.18200529e+00  3.55330199e-01  1.62914023e-02  4.09404814e-01
  1.53746475e-02 -7.93889105e-01 -4.24650759e-01  8.92061174e-01
  1.08023906e+00  6.31982684e-02 -3.06436539e-01 -1.40429929e-01
  7.47833967e-01 -2.04360262e

# PERFORM SEMANTIC SEARCH

In [9]:
# Reload the cached corpus embeddings saved by the encoding step.
# np.load already returns a fresh in-memory array, so no extra .copy() is
# needed, and no global named `copy` is left behind.
sentence_embeddings = np.load('SBERT_preprocessed_explanation_embeddings.npy')

In [10]:
# Embed the preprocessed queries with the same model used for the corpus.
query_embeddings = model.encode(queries)

# Number of closest corpus sentences to report per query (cosine similarity).
number_top_matches = 5 #@param {type: "number"}

print("Semantic Search Results")

# Pair each embedding with its raw query text positionally. Looking the text up
# with queries.index(query) returns the FIRST match, which prints the wrong
# original text when two queries preprocess to the same string.
for original_query, query_embedding in zip(original_queries, query_embeddings):
    # Cosine distance from this query to every corpus sentence (smaller = closer).
    distances = scipy.spatial.distance.cdist([query_embedding], sentence_embeddings, "cosine")[0]

    # (corpus index, distance) pairs, closest first.
    ranked = sorted(enumerate(distances), key=lambda pair: pair[1])

    print("\n\n======================\n\n")
    print("Query:", original_query)  # show the original text rather than the stemmed tokens
    print("\nTop %d most similar sentences in corpus:" % number_top_matches)

    for idx, distance in ranked[:number_top_matches]:
        # Cosine similarity = 1 - cosine distance.
        print(original_sentences[idx].strip(), "(Cosine Score: %.4f)" % (1-distance))

Semantic Search Results




Query: This video about what Vitamin C has been proven to do is amazing! Didn’t know 60 out of 60 cases of polio were cured with Vitamin C in the 1930’s. And yes, it kills viruses.

Top 5 most similar sentences in corpus:
A picture claiming that “Bill Gates’ vaccines” would have paralyzed 496k children in India. (Cosine Score: 0.8098)
Bill Gates faces trial in India for illegally testing vaccines. (Cosine Score: 0.7996)
“Arsenic album-30 homeopathic medicine can prevent coronavirus. Information given by Ministry of AYUSH.” (Cosine Score: 0.7954)
Multiple posts shared repeatedly on Facebook and Twitter since March 2020 claim that Israel has reported zero deaths from the novel coronavirus, which causes the disease COVID-19. The posts also claim Israeli citizens have protected themselves from COVID-19 by drinking a remedy of hot water, lemon and baking soda, which purportedly “kills” the virus. (Cosine Score: 0.7944)
There is evidence that chlorine dioxide cure

# Semantic search after TFIDF ranked search

In [None]:
# TF-IDF ranked retrieval: score every document against every query and write
# the best matches per query to a results file.
#
# NOTE(review): `record`, `docID_list` and `dataframe` are not defined in the
# visible part of this notebook; they must exist before this cell runs.
#   - `record` is used as {term: {docID: "pos, pos, ..."}} - presumably a
#     positional inverted index; confirm against the indexing code.
#   - `docID_list` supplies the collection size N.
#   - `dataframe` needs 'docID' and 'content' columns.
# NOTE(review): the output file name is taken from the previously commented-out
# `open('document_retrieval_results.txt', 'w')`; confirm it is the intended target.

MAX_RESULTS_PER_QUERY = 10  # number of ranked documents written per query

# Read the raw queries, one per line; the context manager closes the file.
with open("queries_toy.txt", "r") as query_ranked_file:
    query_ranked_str = query_ranked_file.read()
# Lines produced by splitting on '\n' carry no trailing delimiter, so they are
# used as-is; slicing off the last character would drop a real character.
query_ranked_list = re.split(r'\n', query_ranked_str)

total_docs = len(docID_list)  # N: total number of entries in the dataset

with open('document_retrieval_results.txt', 'w') as RankedIROutput:
    for query_id, query in enumerate(query_ranked_list, start=1):  # query ids start from 1
        # Same preprocessing as used for indexing.
        query_term = normalise(stopping(case_fold(string_tokenise(query))))

        # doc_score: {docID: summed TF-IDF weight over the distinct query terms}.
        # dict.fromkeys de-duplicates while keeping first-occurrence order, so a
        # term repeated in the query contributes once.
        doc_score = {}
        for term in dict.fromkeys(query_term):
            if term not in record:
                continue
            postings = record[term]
            doc_freq = len(postings)  # local name: must not clobber the DataFrame `df`
            idf = math.log10(total_docs / doc_freq)
            for doc, positions in postings.items():
                tf = len(re.split(r', ', positions))
                weight = (1 + math.log10(tf)) * idf
                doc_score[doc] = doc_score.get(doc, 0) + weight

        # Highest score first; a query with no matching term yields an empty list.
        search_result = sorted(doc_score.items(), key=lambda x: x[1], reverse=True)

        # Output line format: queryID,docID,score ||content
        for doc_id, score in search_result[:MAX_RESULTS_PER_QUERY]:
            content = dataframe.loc[dataframe['docID'] == doc_id]['content'].item()
            RankedIROutput.write(str(query_id) + ',' + str(doc_id) + ',' + str(round(score, 4)) +
                                 ' ||' + content + '\n')