In [None]:
!pip install json
!pip install spacy
!pip install https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.1/en_core_sci_scibert-0.5.1.tar.gz
!pip install transformers
!pip install pyvis
!pip install spacy-transformers

In [2]:
import re
import sys
from sys import stdin
import os
import json
from transformers import pipeline
import torch
import spacy
import pyvis
from pyvis import network as net
import networkx as nx
from IPython.core.display import display, HTML
sys.path.append("code")
import citation_data_manipulation as manipulate_data

# Detect CUDA once; the result decides where the pipelines are placed
gpu_available = torch.cuda.is_available()

# With a GPU, every pipeline is pinned to CUDA device 0. Without one, no
# `device` argument is passed at all, so transformers applies its own default.
if gpu_available:
  print("Using GPU")
  _device_kwargs = {"device": 0}
else:
  _device_kwargs = {}

# Sentiment analysis pipeline (three classes: positive / neutral / negative)
classifier = pipeline(
  "text-classification",
  model="j-hartmann/sentiment-roberta-large-english-3-classes",
  **_device_kwargs,
)
# Natural Language Inference (NLI) is currently disabled:
#  inference = pipeline('zero-shot-classification', model='roberta-large-mnli', **_device_kwargs)
# Named Entity Recognition (NER); "simple" aggregation groups tokens into entities
ner = pipeline(
  "token-classification",
  model="Jean-Baptiste/roberta-large-ner-english",
  aggregation_strategy="simple",
  **_device_kwargs,
)
inference_labels = ['entailment', 'neutral', 'contradiction']

# Load the en_core_sci_scibert spaCy model. It is used below to split texts into
# sentences and to inspect token heads when looking for citation context.
nlp = spacy.load('en_core_sci_scibert')


# Folder the input text files are read from, and folder the per-paper JSON files
# are written to (the driver code may append a "_(n)" suffix if it already exists)
input_folder = './texts'
output_folder = "./jsonFiles"
# Initialize the shared state: `graphs` collects one list of citation relations
# per processed JSON document
graphs = []
# Paper metadata used for identification; the citing papers are stored under
# their paper id with "Author", "Title" and "Year" entries
paper_data = {}

# Regexes necessary for identifying citations in text.
# Every pattern is a raw string so that regex escapes such as \( reach the `re`
# module untouched instead of being parsed as (invalid) Python string escapes,
# which triggers a SyntaxWarning on recent Python versions.

# A capitalised surname; apostrophes, backticks and hyphens are allowed inside
author = r"(?:[A-Z][A-Za-z'`-]+)"
# NOTE(review): the "." here is unescaped, so it matches any character and not
# only a literal period. Kept as-is to leave matching behaviour unchanged.
etal = r"(?:et al.?)"
# Further authors ("and X", "& X", ", X") or an "et al" marker
additional = r"(?:,? (?:(?:and |& )?" + author + r"|" + etal + r"))"
# Four-digit year starting with 19 or 20
year_num = r"(?:19|20)[0-9][0-9]"
# Optional page reference such as ", p. 12" (same unescaped-dot caveat as above)
page_num = r"(?:, p.? [0-9]+)?"
# The year either follows directly (", 2017") or is wrapped in parentheses ("(2017)")
year = r"(?:,? *" + year_num + page_num + r"| *\(" + year_num + page_num + r"\))"
# Full author-year citation, captured as a single group
name_year_regex = r"(" + author + additional + r"*" + year + r")"
# In case the regex type above is not used we assume the citation might look like this: [1]
num_bracket_regex = r"(\[\d+\])"
# Either citation style
regex = name_year_regex + r"|" + num_bracket_regex

# Start of json to node and edge creation
def get_cited_papers(paper_id, sentence):
  """Register every paper cited in `sentence` and return the resulting ids.

  Numeric citations such as "[12]" take precedence; when the sentence contains
  none of them, author-year citations (e.g. "Smith et al. (2017)") are used.

  Args:
    paper_id: id of the citing paper.
    sentence: text of the sentence to scan for citations.

  Returns:
    A list with one value per citation found, as returned by
    `manipulate_data.append_reference_data`.
  """
  numeric_citations = re.findall(num_bracket_regex, sentence)
  if numeric_citations:
    # Strip the surrounding brackets: "[12]" -> "12". Indexing the match with
    # [1] would keep only the first digit and map "[12]" to reference "1".
    citation_ids = [citation[1:-1] for citation in numeric_citations]
  else:
    # Also reached when the sentence holds a "[" that is not a numeric
    # citation, so author-year citations in such sentences are not lost.
    citation_ids = re.findall(name_year_regex, sentence)

  return [
    manipulate_data.append_reference_data(paper_data, paper_id, cit_id)
    for cit_id in citation_ids
  ]

def id_sentiment_id(citer, sentences):
  """Build the citation relations of one citing paper.

  Args:
    citer: id of the citing paper.
    sentences: citation sections as loaded from the paper's JSON file; each
      one has a "final_score" entry and a "0" entry whose first element is the
      citing sentence text and whose third element is the list of cited ids.

  Returns:
    A list of `[citer, score, cited, topic]` lists, one per cited paper, where
    `topic` is the NER output for the citing sentence.
  """
  entities_relations = []

  for sentence in sentences:
    score = sentence["final_score"]
    cited_papers = sentence["0"][2]
    if not cited_papers:
      # Nothing cited: skip the (expensive) NER call altogether
      continue

    # The entities depend only on the sentence, so run the NER model once per
    # sentence rather than once per cited paper. The result is shared between
    # the relations of this sentence and must be treated as read-only.
    topic = ner(sentence["0"][0])
    for cited in cited_papers:
      entities_relations.append([citer, score, cited, topic])

  return entities_relations
# End of json to node and edge creation


# Start of KG generation
def gradient_color(mapped_value):
    """Map a sentiment score to the hex colour used in the graph.

    Scores above 0 are green, scores in ]-0.1; 0] are light blue and every
    lower score is red. Summed scores can fall to -1 or below; those are now
    red as well instead of yielding an empty colour string.

    Args:
        mapped_value: numeric sentiment score.

    Returns:
        The hex colour string for the score.
    """
    # Thresholds checked from the most positive bucket downwards
    if mapped_value > 0:
        return '#1D9A60'
    if mapped_value > -0.1:
        return '#90e0ef'
    return '#9B3131'

# get the label to put on the node based on the global ID
def get_citation_from_global_id(global_id):
  """Return the `(label, title)` pair shown on the node for `global_id`.

  The label is "<Author>, <Year>" taken from `paper_data`; an empty title is
  replaced by "Title not found!". Integer ids are looked up directly (a missing
  one raises KeyError); any other id that is not a key of `paper_data` yields
  `(global_id, "")`.
  """
  def label_and_title(entry):
    # Turn one paper_data entry into the node label and hover title
    author, title, year = entry["Author"], entry["Title"], entry["Year"]
    if len(title) == 0:
      title = "Title not found!"
    return f"{author}, {year}", title

  if isinstance(global_id, int):
    return label_and_title(paper_data[global_id])

  # Non-integer ids are matched against the stored keys by equality
  for key in paper_data:
    if key == global_id:
      return label_and_title(paper_data[key])

  return global_id, ""

# get the global_ID number/code based on the citation ID, specific to the citing papers
def get_ref_code(citation_id):
  """Return the `paper_data` key equal to `citation_id`.

  When no key matches, the current number of entries in `paper_data` is
  returned instead.
  """
  matching_keys = (key for key in paper_data if key == citation_id)
  return next(matching_keys, len(paper_data))

# After preparing all elements for the graph based on each paper specific graph, generate the nodes and edges accordingly
def visualize_KG(html_name):
  """Render the relations collected in `graphs` as a pyvis network.

  Each relation adds a node for the citing paper, a node for the cited paper
  and an edge between them. The edge and the cited node are coloured by the
  relation's score, and the edge's hover title lists the non-person named
  entities of the citing sentence. Relations whose cited label contains ", 0"
  are skipped. The HTML is written to `html_name` (UTF-8) and displayed inline.
  """
  kg = net.Network(notebook=True, cdn_resources='in_line')#, directed=True)

  # Walk every relation of every per-paper graph
  for relation in (rel for paper_graph in graphs for rel in paper_graph):
    citer_id = relation[0]
    cited_label, cited_title = get_citation_from_global_id(relation[2])

    # Comma-separated list of the entities that are not persons
    edge_label = ""
    for entity in relation[3]:
      if entity['entity_group'] == 'PER':
        continue
      word = entity['word']
      edge_label = word if len(edge_label) == 0 else f"{edge_label}, {word}"

    score = relation[1]
    rel_color = gradient_color(score)
    edge_width = abs(score) + 0.5

    citer_ref = get_ref_code(citer_id)
    cited_ref = relation[2]
    citer_label, citer_title = get_citation_from_global_id(citer_id)

    if ', 0' in cited_label:
      continue

    kg.add_node(citer_ref, label = citer_label, title = citer_title)
    kg.add_node(cited_ref, label = cited_label, color = rel_color, title = cited_title)
    if len(edge_label) == 0:
      edge_label = 'No Named Entity found!'
    kg.add_edge(citer_ref, cited_ref, color = rel_color, width = (edge_width*edge_width), title = edge_label)

  # Write the page explicitly as UTF-8 rather than relying on the platform default
  page = kg.generate_html()
  with open(html_name, mode='w', encoding='utf-8') as out_file:
    out_file.write(page)
  display(HTML(page))
# End of KG generation

# For each document get the id of the reference papers to be used as entities
def prepare_entities(documents):
  """Load each JSON document and append its citation relations to `graphs`.

  Args:
    documents: paths of JSON files holding an "id" and a "sentences" entry.
  """
  for json_path in documents:
    # Read the JSON data from the file
    with open(json_path, 'r') as handle:
      payload = json.load(handle)

    citing_id = payload["id"]
    citation_sections = payload["sentences"]

    relations = id_sentiment_id(citing_id, citation_sections)
    graphs.append(relations)

# Function that gets the documents availble in a folder
def get_docs(documents_folder):
    """Return the paths of the regular files directly inside `documents_folder`.

    Sub-directories are skipped; the order is the one given by `os.listdir`.
    """
    candidates = (
        os.path.join(documents_folder, entry)
        for entry in os.listdir(documents_folder)
    )
    return [path for path in candidates if os.path.isfile(path)]

# load if only KG is needed to be generated
def load_paper_data(file_path):
  """Load previously saved paper metadata into the module-level `paper_data`.

  Used when only the knowledge graph needs to be generated. Without the
  `global` declaration the loaded data would be bound to a local name and
  discarded, leaving `paper_data` empty.

  Args:
    file_path: path of the JSON file to read.
  """
  global paper_data
  # NOTE(review): JSON object keys are always strings, so any integer keys that
  # were dumped come back as strings; confirm how integer ids are looked up.
  with open(file_path, 'r', encoding='utf-8') as json_file:
    paper_data = json.load(json_file)

# Check if a given text contains a citation
def has_citation(text):
    """Tell whether `text` contains a citation in either supported style."""
    return re.search(regex, text) is not None

# Change the result of the Sent. Analysis so as to have the score go from ]-1;1[
def format_sentiment_result(sen):
    """Return the signed sentiment score of `sen`.

    The text of `sen` is classified; the top prediction's score is returned
    unchanged, negated for a 'negative' label, or replaced by 0 for 'neutral'.
    """
    prediction = classifier(sen.text)[0]
    label, confidence = prediction["label"], prediction["score"]

    if label == 'negative':
      return -confidence
    if label == 'neutral':
      return 0
    return confidence

def get_context_from_noun_phrases(sen1, sen2):
    """Tell whether any token of `sen2` has a syntactic head whose text occurs in `sen1`.

    `sen2` is parsed with `nlp`; the sentence and each inspected token/head
    pair are printed along the way. The check is a plain substring test.
    """
    parsed = nlp(sen2)
    print(sen2)
    for token in parsed:
        head_text = token.head.text
        print(token.text + "--> " + head_text)
        if head_text in sen1:
            return True
    return False

def get_context_from_similarity(sen1, sen2):
  """Return the similarity between the parsed `sen1` and the parsed `sen2`."""
  first_doc, second_doc = nlp(sen1), nlp(sen2)
  return first_doc.similarity(second_doc)

def prepare_citation_sections(paper_id, sens):
  """Build one citation section per sentence of `sens` that contains a citation.

  A section stores the citing sentence under key 0 as
  `[text, sentiment, cited_papers]`. The previous and next sentences are added
  under keys -1 and 1 as `[text, sentiment]` when they hold no citation
  themselves and are found to be related to the citing sentence. The
  "final_score" entry is the sum of the citing sentence's sentiment and the
  sentiment of the context sentences that were added.

  Args:
    paper_id: id of the citing paper.
    sens: list of sentence objects exposing a `.text` attribute.

  Returns:
    The list of citation sections, in sentence order.
  """
  prev_sen = None
  citation_json = []

  for sen, next_sen in zip(sens, sens[1:] + [None]):
    if has_citation(sen.text):
      # Start every citation with a clean context. These values used to be
      # initialised once before the loop, so a citation without usable context
      # silently inherited the context score of an earlier citation.
      prev_context_value = next_context_value = 0

      sen_value = format_sentiment_result(sen)
      cited_papers = get_cited_papers(paper_id, sen.text)
      citation_section = {}

      # Preceding sentence: only counted when it is not a citation itself
      if prev_sen and not has_citation(prev_sen.text):
        print("taken into consideration")
        if get_context_from_noun_phrases(prev_sen.text, sen.text):
          prev_context_value = format_sentiment_result(prev_sen)
          citation_section[-1] = [prev_sen.text, prev_context_value]

      citation_section[0] = [sen.text, sen_value, cited_papers]

      # Following sentence: same rule as for the preceding one
      if next_sen and not has_citation(next_sen.text):
        if get_context_from_noun_phrases(sen.text, next_sen.text):
          next_context_value = format_sentiment_result(next_sen)
          citation_section[1] = [next_sen.text, next_context_value]

      citation_section["final_score"] = sen_value + prev_context_value + next_context_value
      citation_json.append(citation_section)

    prev_sen = sen

  return citation_json


def text_to_json(paper_id, text, nlp):
    """Split `text` into sentences with `nlp` and return its citation sections."""
    sentences = list(nlp(text).sents)
    return prepare_citation_sections(paper_id, sentences)


def extract_text_from_file(text_file):
    """Read a paper's text file and split off its optional title line.

    When the first line contains 'Title: ', that line (with the marker removed)
    becomes the title and the rest of the file is returned with newlines
    replaced by spaces. Otherwise the text is returned unchanged with an empty
    title.

    Args:
        text_file: path of the text file to read.

    Returns:
        A `(text, title)` tuple.
    """
    print(f"Reading {text_file}")
    title = ''

    # Decode explicitly as UTF-8 (tolerating a BOM) instead of relying on the
    # platform's default encoding; undecodable bytes are replaced, not fatal.
    with open(text_file, 'r', encoding='utf-8-sig', errors='replace') as file:
        text = file.read()

    # Only the first line may carry the title. Testing the whole text would
    # discard the first line whenever 'Title: ' appears later in the body, and
    # partition() also copes with a file that has no newline after the title.
    first_line, _, remainder = text.partition('\n')
    if 'Title: ' in first_line:
        title = first_line.replace('Title: ', '')
        text = remainder.replace('\n', ' ')

    return text, title


def save_citation_sections(text_file, json_dir, nlp):
    """Extract the citation sections of `text_file` and save them as JSON.

    The output file is named after the paper id and placed in `json_dir`; it
    holds the paper id under "id" and the citation sections under "sentences".
    """
    text, paper_id = extract_citing_data(text_file)
    sections = text_to_json(paper_id, text, nlp)

    target_path = os.path.join(json_dir, paper_id+".json")
    with open(target_path, "w") as out_file:
        json.dump({"id": paper_id, "sentences": sections}, out_file, indent=4)

    print(f"  Citation sections written to {target_path}")


def get_citing_author_year(citing_paper_id):
  """Split a paper id such as "YousifEtAl2017" into author and year.

  Args:
    citing_paper_id: id of the form <Author...><4-digit year>.

  Returns:
    An `(author, publication_year)` tuple of strings. A trailing "EtAl" (any
    casing) is dropped from the author and an "And" joining two names becomes
    " and ". When the id does not end in four digits the year is '' and the
    author is derived from the whole id.
  """
  year_match = re.search(r"\d{4}$", citing_paper_id)
  if year_match:
    publication_year = year_match.group()
    # Cut off the trailing year only, leaving identical digits elsewhere alone
    author = citing_paper_id[:year_match.start()]
  else:
    # No trailing year: degrade gracefully instead of failing on a None match
    publication_year = ''
    author = citing_paper_id

  author = re.sub("EtAl$", '', author, flags=re.IGNORECASE)
  # str.replace() treats its argument literally, so a regex passed to it never
  # matched. Use re.sub, and only split on an "And" that sits between two name
  # parts, so that surnames such as "Anderson" stay intact.
  author = re.sub(r"(?<=[a-z])And(?=[A-Z])", " and ", author)

  return author, publication_year


def extract_citing_data(text_file):
  """Read a citing paper and register its metadata in `paper_data`.

  The paper id is the file name without ".txt"; author and year are derived
  from that id and the title comes from the file itself.

  Returns:
    A `(text, paper_id)` tuple.
  """
  text, title = extract_text_from_file(text_file)

  file_name = os.path.basename(text_file)
  paper_id = file_name.replace(".txt", "")
  author, publication_year = get_citing_author_year(paper_id)

  paper_data[paper_id] = dict(Author=author, Title=title, Year=publication_year)

  return text, paper_id


# Set to True to skip the citation extraction and rebuild the graph from the
# JSON files and citations.json of a previous run
only_kg_generation = False

if only_kg_generation:
  print("Citations extracted!")

  # Reuse the results of an earlier extraction
  output_folder = "./jsonFiles"
  load_paper_data("./citations.json")
else:
  docs = get_docs(input_folder)
  sorted_docs = sorted(docs)

  # Never overwrite an earlier run: pick the first unused "<folder>_(n)" name
  cnt = 1
  if os.path.exists(output_folder):
    while os.path.exists(f"{output_folder}_({cnt})"):
      cnt += 1
    output_folder = f"{output_folder}_({cnt})"

  os.makedirs(output_folder)

  # Files with 'bibliography' in their path are not treated as citing papers
  for doc in sorted_docs:
    if 'bibliography' not in doc:
      print("_________________________________________________________________________________________________________________________")
      save_citation_sections(doc, output_folder, nlp)

  print("Citations extracted!")

# Turn the per-paper JSON files into graph relations
json_docs = get_docs(output_folder)
prepare_entities(json_docs)

# Persist the paper metadata so that a later KG-only run can reload it
if not only_kg_generation:
  with open('citations.json', "w") as file:
    json.dump(paper_data, file, indent=2)

visualize_KG('WIP_KG.html')

  from IPython.core.display import display, HTML
Some weights of the model checkpoint at j-hartmann/sentiment-roberta-large-english-3-classes were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of the model checkpoint at roberta-large-mnli were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
- This IS expected if you are initializing Rob

_________________________________________________________________________________________________________________________
Reading .\texts\YousifEtAl2017.txt
  Citation sections written to .\jsonFiles\YousifEtAl2017.json
Citations extracted!
small_presentation_demo.html


UnicodeEncodeError: 'charmap' codec can't encode characters in position 263607-263621: character maps to <undefined>