In [None]:
!pip install json
!pip install spacy
!pip install https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.1/en_core_sci_scibert-0.5.1.tar.gz
!pip install transformers
!pip install pyvis
!pip install spacy-transformers

In [1]:
import re
from sys import stdin
import os
import json
import spacy
from transformers import pipeline
import torch


# Three-class sentiment model used to score each sentence individually.
classifier = pipeline(
    "text-classification",
    model="j-hartmann/sentiment-roberta-large-english-3-classes",
)
#second_opinion = pipeline("text-classification", model="nlptown/bert-base-multilingual-uncased-sentiment")

# Zero-shot pipeline backed by roberta-large-mnli; the candidate labels below
# are passed to it by inference_calculation().
inference = pipeline(
    'zero-shot-classification',
    model='roberta-large-mnli',
)
inference_labels = [
    'entailment',
    'neutral',
    'contradiction',
]

# Folder containing the text files to analyse.
documents_folder = './texts'

# Regex fragments used to identify author-year citations in running text,
# e.g. "Smith et al. (2019)" or "Jones and Brown, 2015, p. 42".
# All fragments are raw strings so that "\(" and "\." reach the regex engine
# unchanged instead of being treated as (invalid) Python string escapes.
author = r"(?:[A-Z][A-Za-z'`-]+)"
# The period after "et al" is optional but must be a literal period.
etal = r"(?:et al\.?)"
additional = r"(?:,? (?:(?:and |& )?" + author + "|" + etal + "))"
year_num = r"(?:19|20)[0-9][0-9]"
# Optional page reference: ", p 12", ", p. 12", ", pp 12" or ", pp. 12".
page_num = r"(?:, pp?\.? [0-9]+)?"
# Either a bare year ("..., 2019") or a parenthesised one ("... (2019)").
year = r"(?:,? *" + year_num + page_num + r"| *\(" + year_num + page_num + r"\))"
# The outer capturing group wraps the complete author-year citation.
name_year_regex = "(" + author + additional + "*" + year + ")"

# Fallback for numeric citation styles such as "[1]".
num_bracket_regex = r"(?:\[\d+\])"

def get_docs(documents_folder):
    """Return the paths of all regular files directly inside a folder.

    Args:
        documents_folder: Directory to list (sub-directories are not descended).

    Returns:
        A list of paths, each built by joining ``documents_folder`` with an
        entry name, in the order reported by ``os.listdir``.
    """
    candidate_paths = (
        os.path.join(documents_folder, entry)
        for entry in os.listdir(documents_folder)
    )
    # Keep regular files only; directories are dropped.
    return [path for path in candidate_paths if os.path.isfile(path)]


def check_if_sentence_matches(text, regex):
    """Tell whether ``regex`` matches anywhere inside ``text``.

    Args:
        text: String to search.
        regex: Regular expression to look for.

    Returns:
        True when at least one match exists, False otherwise.
    """
    return re.search(regex, text) is not None


def extract_text_from_file(text_file):
    """Read a text file and return its content as a single line.

    Args:
        text_file: Path of the file to read.

    Returns:
        The file content with every newline replaced by a single space.

    Raises:
        UnicodeDecodeError: If the file is not valid UTF-8.
    """
    print(f"Reading {text_file}")

    # Decode explicitly as UTF-8: without an encoding, open() uses the
    # platform default (e.g. cp1252 on Windows), which garbles or rejects
    # non-ASCII characters.
    with open(text_file, 'r', encoding='utf-8') as file:
        text = file.read().replace('\n', ' ')

    return text

# Consider removing
#def check_if_neutral(sen, original_value):
#    sentiment = second_opinion(sen)[0]
#
#    rate = sentiment['label'][0]
#
#    modifier = 0
#
#    for i in range(1,6):
#      if str(i) == rate:
#        modifier = -1 + (0.4 * i) + ((1- sentiment['score']) * (-0.4))
#
#    return (1 - original_value) * modifier


def mapping_function(sen, sentiment, value):
    """Convert a sentiment label and its confidence into a signed score.

    Args:
        sen: The sentence text (not used in the computation).
        sentiment: Label string; 'negative' and 'neutral' are special-cased.
        value: Confidence reported for the label.

    Returns:
        0 for 'neutral', ``-value`` for 'negative', and ``value`` unchanged
        for any other label.
    """
    if sentiment == 'neutral':
        return 0
    if sentiment == 'negative':
        return -value
    return value


def format_sentiment_result(sen):
    """Classify one sentence and return its signed sentiment score.

    Args:
        sen: Object exposing the sentence string as ``.text``.

    Returns:
        The value produced by ``mapping_function`` for the first result the
        module-level ``classifier`` returns for the sentence.
    """
    sentence_text = sen.text
    top_result = classifier(sentence_text)[0]
    label = top_result["label"]
    confidence = top_result["score"]
    return mapping_function(sentence_text, label, confidence)

def inference_calculation(sen, inferring_sen):
    """Run the zero-shot pipeline on two sentences joined by a space.

    Args:
        sen: First sentence (string).
        inferring_sen: Second sentence (string), appended after ``sen``.

    Returns:
        The first entry of the ``'labels'`` list in the pipeline result.
    """
    # NOTE(review): the zero-shot pipeline appears to rank the candidate labels
    # against the concatenated text rather than test whether one sentence
    # entails the other -- confirm this is the intended use of the model.
    combined_text = " ".join((sen, inferring_sen))
    outcome = inference(combined_text, inference_labels)
    print(outcome)
    return outcome['labels'][0]

def get_score_modifier(inference_result):
    """Map an inference label to a multiplier for a neighbouring score.

    Args:
        inference_result: Label string to evaluate.

    Returns:
        1 for 'entailment' or 'contradiction', 0 for anything else.
    """
    return 1 if inference_result in ('entailment', 'contradiction') else 0

def get_modified_score(citation_section, sen_num, cit_sen, min_abs_score=0.2):
    """Return a neighbouring sentence's score, gated by its relation to the citation.

    Args:
        citation_section: Mapping whose ``sen_num`` entry is ``[text, score]``.
        sen_num: -1 for the sentence before the citation sentence, 1 for the
            sentence after it.
        cit_sen: Text of the citation sentence.
        min_abs_score: Scores whose absolute value is below this threshold
            are treated as 0 without running the inference model.

    Returns:
        0 when the score is below the threshold; otherwise the score
        multiplied by ``get_score_modifier`` of the inference label.

    Raises:
        ValueError: If the score passes the threshold and ``sen_num`` is
            neither -1 nor 1.
    """
    score = citation_section[sen_num][1]
    if abs(score) < min_abs_score:
        return 0

    neighbour_text = citation_section[sen_num][0]
    # The two sentences are always passed in document order.
    if sen_num == -1:
        inference_result = inference_calculation(neighbour_text, cit_sen)
    elif sen_num == 1:
        inference_result = inference_calculation(cit_sen, neighbour_text)
    else:
        # Previously this fell through to an unbound local variable.
        raise ValueError(f"sen_num must be -1 or 1, got {sen_num!r}")

    return get_score_modifier(inference_result) * score

def calculate_score(citation_section):
    """Combine the citation sentence's score with its neighbours' contributions.

    Args:
        citation_section: Mapping with key 0 -> ``[text, score]`` for the
            citation sentence and optional keys -1 / 1 for the previous and
            next sentence.

    Returns:
        The citation score plus the modified scores of whichever neighbours
        are present (an absent neighbour contributes 0).
    """
    cit_sen = citation_section[0][0]
    cit_score = citation_section[0][1]

    prev_score = (
        get_modified_score(citation_section, -1, cit_sen)
        if -1 in citation_section
        else 0
    )
    next_score = (
        get_modified_score(citation_section, 1, cit_sen)
        if 1 in citation_section
        else 0
    )

    return cit_score + prev_score + next_score


def analyze_citation_sentences(sens, regex):
    """Score every citation sentence together with its non-citing neighbours.

    Args:
        sens: List of sentence objects exposing their string as ``.text``.
        regex: Pattern that identifies a citation inside a sentence.

    Returns:
        A list with one dict per sentence matching ``regex``. Key 0 holds
        ``[text, score]`` of the citation sentence; keys -1 and 1 hold the
        same for the previous / next sentence when that sentence exists and
        does not itself match ``regex``; ``"final_score"`` holds the result
        of ``calculate_score`` for the dict.
    """
    collected = []
    previous = None

    # Pair each sentence with its successor; the last one is paired with None.
    for current, following in zip(sens, sens[1:] + [None]):
        if check_if_sentence_matches(current.text, regex):
            current_score = format_sentiment_result(current)
            section = {}

            # A neighbour only counts as context when it is not a citation
            # sentence in its own right.
            if previous and not check_if_sentence_matches(previous.text, regex):
                section[-1] = [previous.text, format_sentiment_result(previous)]

            section[0] = [current.text, current_score]

            if following and not check_if_sentence_matches(following.text, regex):
                section[1] = [following.text, format_sentiment_result(following)]

            section["final_score"] = calculate_score(section)
            collected.append(section)

        previous = current

    return collected

def extract_citation_sections(text, nlp):
    """Split a text into sentences and analyse those that contain citations.

    Args:
        text: Document text.
        nlp: Callable returning an object whose ``.sents`` iterates over the
            sentences of ``text``.

    Returns:
        The list produced by ``analyze_citation_sentences`` for the
        author-year and numeric-bracket citation patterns.
    """
    sentences = list(nlp(text).sents)
    # Accept either citation style.
    citation_pattern = "|".join((name_year_regex, num_bracket_regex))
    return analyze_citation_sentences(sentences, citation_pattern)


def save_citation_sections(text_file, json_dir, nlp):
    """Analyse one text file and write its citation sections as JSON.

    Args:
        text_file: Path of the text file to analyse.
        json_dir: Existing directory that receives ``<paper id>.json``.
        nlp: Sentence-splitting pipeline handed to ``extract_citation_sections``.
    """
    content = extract_text_from_file(text_file)

    # The paper id is the file name with ".txt" removed.
    # NOTE(review): replace() strips every ".txt" occurrence, not only a
    # trailing extension -- confirm file names never contain it elsewhere.
    paperId = os.path.basename(text_file).replace(".txt", "")

    json_data = {
        "id": paperId,
        "sentences": extract_citation_sections(content, nlp),
    }

    file_path = os.path.join(json_dir, paperId + ".json")
    with open(file_path, "w") as out_file:
        json.dump(json_data, out_file, indent=4)

    print(f"Citation sections written to {file_path}")


# Collect the input files and load the spaCy model that is passed on as `nlp`.
docs = get_docs(documents_folder)
nlp = spacy.load('en_core_sci_scibert')

# Choose an output directory that does not exist yet: "./jsonFiles" when it is
# free, otherwise the first unused "./jsonFiles_(n)" counting up from n = 1.
original_dir_name = "./jsonFiles"
cnt = 1
if os.path.exists(original_dir_name):
    while os.path.exists(f"{original_dir_name}_({cnt})"):
        cnt += 1
    original_dir_name = original_dir_name + f"_({cnt})"

os.makedirs(original_dir_name)

# Write one JSON file per input document, printing a separator line before each.
for doc in docs:
    print("_________________________________________________________________________________________________________________________")
    save_citation_sections(doc, original_dir_name, nlp)

print("Citations extracted!")

  from .autonotebook import tqdm as notebook_tqdm


_________________________________________________________________________________________________________________________
Reading ./texts\AburaedEtAl2017.txt
Citation sections written to ./jsonFiles_(1)\AburaedEtAl2017.json
_________________________________________________________________________________________________________________________
Reading ./texts\AburaedEtAl2018.txt
Citation sections written to ./jsonFiles_(1)\AburaedEtAl2018.json
_________________________________________________________________________________________________________________________
Reading ./texts\AburaedEtAl2020.txt
Citation sections written to ./jsonFiles_(1)\AburaedEtAl2020.json
_________________________________________________________________________________________________________________________
Reading ./texts\BosselutEtAl2019.txt
Citation sections written to ./jsonFiles_(1)\BosselutEtAl2019.json
______________________________________________________________________________________________________

In [None]:
import json
import os
import re

# Regex fragments used to identify author-year citations in running text,
# e.g. "Smith et al. (2019)" or "Jones and Brown, 2015, p. 42".
# All fragments are raw strings so that "\(" and "\." reach the regex engine
# unchanged instead of being treated as (invalid) Python string escapes.
author = r"(?:[A-Z][A-Za-z'`-]+)"
# The period after "et al" is optional but must be a literal period.
etal = r"(?:et al\.?)"
additional = r"(?:,? (?:(?:and |& )?" + author + "|" + etal + "))"
year_num = r"(?:19|20)[0-9][0-9]"
# Optional page reference: ", p 12", ", p. 12", ", pp 12" or ", pp. 12".
page_num = r"(?:, pp?\.? [0-9]+)?"
# Either a bare year ("..., 2019") or a parenthesised one ("... (2019)").
year = r"(?:,? *" + year_num + page_num + r"| *\(" + year_num + page_num + r"\))"
# The outer capturing group wraps the complete author-year citation.
name_year_regex = "(" + author + additional + "*" + year + ")"

# Fallback for numeric citation styles such as "[1]".
num_bracket_regex = r"(?:\[\d+\])"

# Combined pattern: an author-year citation or a numeric bracket citation.
regex = name_year_regex + "|" + num_bracket_regex

# Read the JSON files from the directory the extraction cell wrote to in this
# session. That cell falls back to "./jsonFiles_(n)" whenever "./jsonFiles"
# already exists, so a hard-coded "./jsonFiles" would load results left over
# from an earlier run.
documents_folder = original_dir_name

docs = get_docs(documents_folder)

def get_cited_papers(sentence, pattern=None):
    """Return the text of every citation found in a sentence.

    Args:
        sentence: Sentence text to scan.
        pattern: Citation regex to use; defaults to the module-level ``regex``.

    Returns:
        A list with the full matched text of each citation, in order of
        appearance.
    """
    if pattern is None:
        pattern = regex
    # Use the whole match (group 0). re.findall would return the single
    # capturing group, which is empty whenever the numeric "[n]" alternative
    # matches, so bracket citations would come back as ''.
    return [match.group(0) for match in re.finditer(pattern, sentence)]

def id_sentiment_id(citer, sentences):
    """Build ``[citer, score, cited]`` triples for a paper's citation sentences.

    Args:
        citer: Identifier of the citing paper.
        sentences: Iterable of dicts, each with a ``"final_score"`` entry and
            a ``"0"`` entry whose first element is the citation sentence text.

    Returns:
        A list with one ``[citer, final_score, cited]`` list for every
        citation that ``get_cited_papers`` finds in each sentence.
    """
    relations = []

    for entry in sentences:
        final_score = entry["final_score"]
        citing_text = entry["0"][0]
        relations.extend(
            [citer, final_score, cited_paper]
            for cited_paper in get_cited_papers(citing_text)
        )

    return relations

# One list of [citer, score, cited] triples per JSON document.
graphs = []

for doc in docs:
    # Read the JSON data from the file
    with open(doc, 'r', encoding='utf-8') as json_file:
        data = json.load(json_file)

    # Named paper_id (not `id`) so the builtin id() is not shadowed for the
    # rest of the kernel session.
    paper_id = data["id"]
    sentences = data["sentences"]
    print(f"Reading: {paper_id}")

    graph = id_sentiment_id(paper_id, sentences)
    graphs.append(graph)

    print(graph)
    print("___________________________________________________________________________________________")

In [None]:
import pyvis
from pyvis import network as net
import networkx as nx
from IPython.core.display import display, HTML

def gradient_color(mapped_value):
    """Pick the colour that represents a sentiment score.

    Args:
        mapped_value: Signed sentiment score; may lie outside [-1, 1].

    Returns:
        '#1D9A60' for scores above 0.5, '#3F7190' for scores above -0.5 and
        up to 0.5, and '#9B3131' for every score of -0.5 or below.
    """
    # Scores of -1 or below used to fall through all thresholds and return an
    # empty colour string; they now get the negative colour.
    if mapped_value > 0.5:
        return '#1D9A60'
    if mapped_value > -0.5:
        return '#3F7190'
    return '#9B3131'

def _normalize_cited_label(raw_citation):
    """Turn a matched citation such as "Smith et al. (2019)" into "SmithEtAl2019"."""
    # Rewrite the "et al" marker as a whole word first, so that the letters
    # "et" / "al" inside author names (e.g. "Peters", "Gallagher") stay intact.
    label = re.sub(r'\bet al(?![A-Za-z])\.?', 'EtAl', raw_citation)
    # Then drop spaces, commas, periods and parentheses.
    return re.sub(r'[ ,.()]', '', label)


g = net.Network(notebook=True, cdn_resources='in_line')#, directed=True)

# node_list maps a node label to its numeric id (the label's index in the list).
node_list = []
rel_dict = []

for graph in graphs:
    for nodes_rel in graph:
        # nodes_rel is [citer, score, cited].
        citer = nodes_rel[0]
        cited = _normalize_cited_label(nodes_rel[2])
        if citer not in node_list:
            node_list.append(citer)
        if cited not in node_list:
            node_list.append(cited)

        rel_color = gradient_color(nodes_rel[1])
        edge_width = abs(nodes_rel[1]) + 0.5

        citer_ref = node_list.index(citer)
        cited_ref = node_list.index(cited)

        g.add_node(citer_ref, label = citer)
        g.add_node(cited_ref, label = cited, color = rel_color)
        # The width is squared so stronger sentiment gives a clearly thicker edge.
        g.add_edge(citer_ref, cited_ref, color = rel_color, width = (edge_width*edge_width))


g.show("WIP_KG.html")
display(HTML('WIP_KG.html'))