In [None]:
!pip install json
!pip install spacy
!pip install https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.1/en_core_sci_sm-0.5.1.tar.gz
!pip install transformers

In [1]:
import re
from sys import stdin
import os
import json
import spacy
from transformers import pipeline

# Allocate a pipeline for sentiment-analysis.
# It is created once at module level and reused for every sentence that
# extract_citation_sections() scores. No model name is passed here, so the
# choice of model is left to the transformers library.
classifier = pipeline('sentiment-analysis')

# Folder containing the text files to process.
documents_folder = './texts'

# Regex building blocks for author-year citations such as
# "Smith (2019)", "Smith and Jones, 2020" or "Smith et al., 2018, p. 4".
# Raw strings are used so that backslashes reach the regex engine verbatim
# instead of relying on Python passing unknown escapes like "\(" through.
author = r"(?:[A-Z][A-Za-z'`-]+)"
# The trailing period is optional, but it must be a literal "." (not any char).
etal = r"(?:et al\.?)"
additional = r"(?:,? (?:(?:and |& )?" + author + "|" + etal + "))"
year_num = r"(?:19|20)[0-9][0-9]"
# Optional page reference, e.g. ", p. 12" or ", p 12".
page_num = r"(?:, p\.? [0-9]+)?"
# The year either follows directly ("Smith, 2019") or sits in parentheses
# ("Smith (2019)").
year = r"(?:,? *" + year_num + page_num + r"| *\(" + year_num + page_num + r"\))"
name_year_regex = "(" + author + additional + "*" + year + ")"

# Fallback for numeric citation styles: a number in square brackets, e.g. "[1]".
# Parenthesised numbers such as "(2)" are NOT matched by this pattern.
num_bracket_regex = r"(?:\[\d+\])"

def get_docs(documents_folder):
    """
    List the regular files located directly inside a folder.

    Sub-directories are skipped and not descended into. The order of the
    result is whatever order os.listdir() yields.

    Args:
        documents_folder (string): folder to scan

    Returns:
        list: paths (folder joined with entry name) of the files found
    """
    candidate_paths = (
        os.path.join(documents_folder, entry)
        for entry in os.listdir(documents_folder)
    )
    return [path for path in candidate_paths if os.path.isfile(path)]


def check_if_sentence_matches(text, regex):
    """
    Tell whether a regular expression matches anywhere inside a text.

    Args:
        text (string): text to search
        regex (string): regular expression to look for

    Returns:
        bool: True if the pattern occurs at least once in text, else False
    """
    return re.search(regex, text) is not None


def extract_text_from_file(text_file):
    """
    Read a text file and return its content as a single-line string.

    Every newline is replaced by a space so the text can be handed to the
    sentence splitter as one continuous string. The path being read is
    printed as a progress message.

    Args:
        text_file (string): file path

    Returns:
        string: full text of the file with newlines replaced by spaces
    """
    print(f"Reading {text_file}")

    # Decode explicitly as UTF-8: without an encoding argument open() uses the
    # platform default, which differs between operating systems.
    with open(text_file, 'r', encoding='utf-8') as file:
        text = file.read().replace('\n', ' ')

    return text


def extract_citation_sections(text, nlp):
    """
    Collect every citing sentence of a text together with its neighbours.

    The text is split into sentences with ``nlp``. A sentence counts as a
    citation when it matches either the author-year pattern or the
    bracketed-number pattern. For each citing sentence a section is built
    holding the sentence itself and, when they exist and are not citations
    themselves, the sentence before and the sentence after. Each stored
    sentence is paired with the output of the module-level ``classifier``.

    Args:
        text (string): full document text
        nlp: callable returning a document whose ``sents`` can be iterated

    Returns:
        list: one dict per citing sentence with the keys
            "previous_sentence" (optional), "citation_sentence" and
            "next_sentence" (optional); each value is [sentence text,
            classifier output]
        string: all sentences used in the sections, each preceded by a
            single space, in the order they were collected
    """
    citation_pattern = re.compile(name_year_regex + "|" + num_bracket_regex)

    def is_citation(sentence):
        return citation_pattern.search(sentence.text) is not None

    sentences = list(nlp(text).sents)
    last_index = len(sentences) - 1

    sections = []
    collected = []

    for index, current in enumerate(sentences):
        if not is_citation(current):
            continue

        before = sentences[index - 1] if index > 0 else None
        after = sentences[index + 1] if index < last_index else None

        current_value = classifier(current.text)
        section = {}

        # A neighbour is only included when it is not a citation itself.
        if before and not is_citation(before):
            section["previous_sentence"] = [before.text, classifier(before.text)]
            collected.append(before.text)

        section["citation_sentence"] = [current.text, current_value]
        collected.append(current.text)

        if after and not is_citation(after):
            section["next_sentence"] = [after.text, classifier(after.text)]
            collected.append(after.text)

        sections.append(section)

    # Every collected sentence is prefixed with one space, including the first.
    citation_text = "".join(" " + part for part in collected)

    return sections, citation_text


def save_citation_sections(text_file, json_dir, nlp):
    """
    Extract the citation sections of one text file and write them to disk.

    Two files are created in ``json_dir``, both named after the input file
    without its extension:
        <name>-citations.txt  plain text of all collected sentences
        <name>.json           the structured sections

    Args:
        text_file (string): path of the text file to process
        json_dir (string): existing directory that receives the output files
        nlp: sentence-splitting pipeline passed on to
            extract_citation_sections()
    """
    text = extract_text_from_file(text_file)

    citation_json, citation_text = extract_citation_sections(text, nlp)

    # NOTE: "id" is the fixed literal "example" for every document.
    json_data = {
        "id": "example",
        "sentences": citation_json
    }

    # Build both names from the stem rather than replacing ".txt" in the
    # name: for an input that does not end in ".txt" a replace() leaves the
    # name unchanged, so both outputs would share one path and the JSON
    # would overwrite the citation text.
    stem, _ = os.path.splitext(os.path.basename(text_file))
    text_path = os.path.join(json_dir, stem + "-citations.txt")
    file_path = os.path.join(json_dir, stem + ".json")

    # Write UTF-8 explicitly so non-ASCII characters in the sentences do not
    # depend on the platform default encoding.
    with open(text_path, "w", encoding="utf-8") as file:
        file.write(citation_text)

    with open(file_path, "w", encoding="utf-8") as file:
        json.dump(json_data, file, indent=4)

    print(f"Citation sections written to {file_path}")


# Gather the input files and load the scispaCy model used for sentence splitting.
docs = get_docs(documents_folder)
nlp = spacy.load('en_core_sci_sm')

# Pick an output directory that does not exist yet: "./jsonFiles" when it is
# free, otherwise the first free "./jsonFiles_(1)", "./jsonFiles_(2)", ...
original_dir_name = "./jsonFiles"
cnt = 1
if os.path.exists(original_dir_name):
    while os.path.exists(original_dir_name + f"_({cnt})"):
        cnt += 1
    original_dir_name += f"_({cnt})"

os.makedirs(original_dir_name)

# Process each document, printing a separator line before its log messages.
for doc in docs:
    print("_________________________________________________________________________________________________________________________")
    save_citation_sections(doc, original_dir_name, nlp)

print("Citations extracted!")

  from .autonotebook import tqdm as notebook_tqdm


_________________________________________________________________________________________________________________________
Reading ./texts\AburaedEtAl2017.txt
Citation sections written to ./jsonFiles_(1)\AburaedEtAl2017.json
_________________________________________________________________________________________________________________________
Reading ./texts\AburaedEtAl2018.txt
Citation sections written to ./jsonFiles_(1)\AburaedEtAl2018.json
_________________________________________________________________________________________________________________________
Reading ./texts\AburaedEtAl2020.txt
Citation sections written to ./jsonFiles_(1)\AburaedEtAl2020.json
_________________________________________________________________________________________________________________________
Reading ./texts\BosselutEtAl2019.txt
Citation sections written to ./jsonFiles_(1)\BosselutEtAl2019.json
______________________________________________________________________________________________________