In [None]:
!pip install json
!pip install spacy
!pip install https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.1/en_core_sci_scibert-0.5.1.tar.gz
!pip install transformers
!pip install pyvis
!pip install spacy-transformers

In [2]:
import re
import sys
from sys import stdin
import os
import json
from transformers import pipeline
import torch
import spacy
import pyvis
from pyvis import network as net
import networkx as nx
from IPython.core.display import display, HTML
sys.path.append("code")
import citation_data_manipulation as manipulate_data

# Prepare a pipeline for sentiment-analysis
classifier = pipeline("text-classification", model="j-hartmann/sentiment-roberta-large-english-3-classes")

# Prepare for Natural Language Inference (NLI) using nli-roberta-base
inference = pipeline('zero-shot-classification', model='roberta-large-mnli')
inference_labels = ['entailment', 'neutral', 'contradiction']

# Load en_core_sci_scibert model to separate sentences
nlp = spacy.load('en_core_sci_scibert')

# Prepare Name Entity Recognition (NER)
ner = pipeline("token-classification", model="Jean-Baptiste/roberta-large-ner-english", aggregation_strategy="simple")

# input
input_folder = './texts'
output_folder = "./jsonFiles"

#Regex necessary for identifying citations in text
author = "(?:[A-Z][A-Za-z'`-]+)"
etal = "(?:et al.?)"
additional = "(?:,? (?:(?:and |& )?" + author + "|" + etal + "))"
year_num = "(?:19|20)[0-9][0-9]"
page_num = "(?:, p.? [0-9]+)?"
year = "(?:,? *" + year_num + page_num + "| *\(" + year_num + page_num + "\))"
name_year_regex = "(" + author + additional + "*" + year + ")"

#In case the regex type above is not used we assume the regex might look like this: [1]
num_bracket_regex = r"(\[\d+\])"
regex = name_year_regex + "|" + num_bracket_regex
graphs = []
# For paper identification
paper_data = {}

# Start of json to node and edge creation

def get_cited_papers(paper_id, sentence):
  using_et_al = False
  if '[' in sentence:
    citations = re.findall(num_bracket_regex, sentence)
  else:
    citations = re.findall(name_year_regex, sentence)
    using_et_al = True
  global_ids = []

  for citation in citations:
    if using_et_al:
      cit_id = citation
    else:
      cit_id = citation[1].replace('[','').replace(']','')
    global_ids.append(manipulate_data.append_reference_data(paper_data, paper_id, cit_id))

  return global_ids


def id_sentiment_id(citer, sentences):
  entities_relations = []

  for sentence in sentences:
    score = sentence["final_score"]
    cited_papers = sentence["0"][2]
    for cited in cited_papers:
      citation_relations = [citer, score, cited]
      entities_relations.append(citation_relations)

  return entities_relations
# End of json to node and edge creation


# Start of KG generation
def gradient_color(mapped_value):
    # Define the color points and colors for the gradient
    color_points = [-1, -0.5, 0.5]
    colors = ['#9B3131', '#3F7190', '#1D9A60']
    color = ''

    for i in range(3):
      if mapped_value > color_points[i]:
        color = colors[i]

    return color


def get_cited_from_global_id(global_id):
  if isinstance(global_id, int):
    author = paper_data[global_id]["Author"]
    title = paper_data[global_id]["Title"]
    year = paper_data[global_id]["Year"]
    title_first_words = ' '.join(title.split()[:3])
    node = f"{author}, {year}, {title_first_words}..."
    return node
  return global_id


def get_ref_code(citation_id):
  for paper in paper_data:
    data = paper_data[paper]
    if data["ID"] == citation_id:
      return paper

  return len(paper_data)


def visualize_KG(html_name):
  g=net.Network(notebook=True, cdn_resources='in_line')#, directed=True)

  for graph in graphs:
    for nodes_rel in graph:
      citer = nodes_rel[0]
      cited = get_cited_from_global_id(nodes_rel[2])

      rel_color = gradient_color(nodes_rel[1])
      edge_width = abs(nodes_rel[1]) + 0.5

      citer_ref = get_ref_code(citer)
      cited_ref = nodes_rel[2]

      citer = get_cited_from_global_id(citer)

      g.add_node(citer_ref, label = citer)
      g.add_node(cited_ref, label = cited, color = rel_color)
      g.add_edge(citer_ref, cited_ref, color = rel_color, width = (edge_width*edge_width))

  g.show(html_name)
  display(HTML(html_name))
# End of KG generation


def prepare_entities(documents):
  for doc in documents:
    # Read the JSON data from the file
    with open(doc, 'r') as json_file:
      data = json.load(json_file)

    id = data["id"]
    sentences = data["sentences"]

    graph = id_sentiment_id(id, sentences)
    graphs.append(graph)


def get_docs(documents_folder):
    files = []
    for file_name in os.listdir(documents_folder):
        file_path = os.path.join(documents_folder, file_name)
        if os.path.isfile(file_path):
            files.append(file_path)
    return files


def has_citation(text):
    pattern = re.compile(regex)
    match = pattern.search(text)
    return match is not None


def format_sentiment_result(sen):
    data = classifier(sen.text)[0]
    sentiment = data["label"]
    value = data["score"]

    if sentiment == 'negative':
      value = -value
    elif sentiment == 'neutral':
      value = 0

    return value


def get_context(sen, inferring_sen):
  sequence_to_classify = sen + " " + inferring_sen

  result = inference(sequence_to_classify, inference_labels)
  labels = result['labels']

  return labels[0]


def get_contextual_result(sentence, context, is_prev_sentence):
  result_modifier = 1
  sentiment_value = format_sentiment_result(sentence)
  if context == 'contradiction':
    if is_prev_sentence:
      result_modifier = -1
  else:
    if not is_prev_sentence:
      result_modifier = -1

  return result_modifier * sentiment_value


def prepare_citation_sections(paper_id, sens):
  prev_sen = None
  prev_context_value = next_context_value = 0
  citation_json = []

  for sen, next_sen in zip(sens, sens[1:] + [None]):
    if has_citation(sen.text):
      sen_value = format_sentiment_result(sen)
      cited_papers = get_cited_papers(paper_id, sen.text)
      citation_section = {}

      if prev_sen and not has_citation(prev_sen.text):
        context = get_context(prev_sen.text, sen.text)
        if not context == 'neutral':
          prev_context_value = get_contextual_result(prev_sen, context, True)
          citation_section[-1] = [prev_sen.text, prev_context_value]

      citation_section[0] = [sen.text, sen_value, cited_papers]

      if next_sen and not has_citation(next_sen.text):
        context = get_context(sen.text, next_sen.text)
        if not context == 'neutral':
          next_context_value = get_contextual_result(next_sen, context, False)
          citation_section[1] = [next_sen.text, next_context_value]

      citation_section["final_score"] = sen_value + prev_context_value + next_context_value
      citation_json.append(citation_section)

  prev_sen = sen

  return citation_json


def text_to_json(paper_id, text, nlp):
    doc = nlp(text)
    sens = list(doc.sents)

    citation_json = prepare_citation_sections(paper_id, sens)

    return citation_json


def extract_text_from_file(text_file):
    print(f"Reading {text_file}")
    title = ''

    with open(text_file, 'r') as file:
        text = file.read()

    if 'Title: ' in text:
      title = text.split('\n')[0].replace('Title: ', '')
      text = text.split("\n",1)[1].replace('\n', ' ')

    return text, title


def save_citation_sections(text_file, json_dir, nlp):
    text, paper_id = extract_citing_data(text_file)

    citation_json = text_to_json(paper_id, text, nlp)

    json_data = {
        "id": paper_id,
        "sentences": citation_json
    }

    file_path = os.path.join(json_dir, paper_id+".json")

    with open(file_path, "w") as file:
        json.dump(json_data, file, indent=4)

    print(f"  Citation sections written to {file_path}")


def get_citing_author_year(citing_paper_id):
  publication_year = re.search(r"\d{4}$", citing_paper_id).group()

  author =  citing_paper_id.replace(publication_year, '')

  author = re.sub("EtAl$", '', author, flags=re.IGNORECASE).replace(r".*And.*", " and ")

  return author, publication_year


def extract_citing_data(text_file):
  text, title = extract_text_from_file(text_file)
  paper_id = os.path.basename(text_file).replace(".txt", "")
  author, publication_year = get_citing_author_year(paper_id)

  paper_data[len(paper_data)] = {
      "ID": paper_id,
      "Author": author,
      "Title": title,
      "Year": publication_year
  }

  return text, paper_id


docs = get_docs(input_folder)
cnt = 1
if os.path.exists(output_folder):
  while os.path.exists(f"{output_folder}_({cnt})"):
    cnt += 1
  output_folder = output_folder + f"_({cnt})"

os.makedirs(output_folder)

for doc in docs:
  if not 'bibliography' in doc:
    print("_________________________________________________________________________________________________________________________")
    save_citation_sections(doc, output_folder, nlp)

print("Citations extracted!")

json_docs = get_docs(output_folder)
prepare_entities(json_docs)

print(paper_data)
with open('citations.json', "w") as file:
  json.dump(paper_data, file, indent=2)

visualize_KG('WIP_KG.html')

  from IPython.core.display import display, HTML


Downloading (…)lve/main/config.json:   0%|          | 0.00/725 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


Downloading pytorch_model.bin:   0%|          | 0.00/1.43G [00:00<?, ?B/s]

Some weights of the model checkpoint at j-hartmann/sentiment-roberta-large-english-3-classes were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Downloading (…)okenizer_config.json:   0%|          | 0.00/1.07k [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/772 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/688 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/1.43G [00:00<?, ?B/s]

Some weights of the model checkpoint at roberta-large-mnli were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Downloading (…)olve/main/vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

_________________________________________________________________________________________________________________________
Reading ./texts\AburaedEtAl2017.txt




  Citation sections written to ./jsonFiles\AburaedEtAl2017.json
_________________________________________________________________________________________________________________________
Reading ./texts\AburaedEtAl2018.txt
  Citation sections written to ./jsonFiles\AburaedEtAl2018.json
_________________________________________________________________________________________________________________________
Reading ./texts\AburaedEtAl2020.txt
  Citation sections written to ./jsonFiles\AburaedEtAl2020.json
_________________________________________________________________________________________________________________________
Reading ./texts\BosselutEtAl2019.txt
  Citation sections written to ./jsonFiles\BosselutEtAl2019.json
_________________________________________________________________________________________________________________________
Reading ./texts\CarlsonEtAl2010.txt


UnicodeDecodeError: 'charmap' codec can't decode byte 0x9d in position 637: character maps to <undefined>