In [1]:
import numpy as np
import pandas as pd
import csv
import json
import re
import random

In [2]:
def write_csv(data,fname):
    """Write rows of data to ``<fname>.csv``.

    Args:
        data: Iterable of rows; each row is an iterable of field values.
        fname: Output path without the ".csv" extension.

    Returns:
        The ``csv.writer`` that was used. Its underlying file is already
        closed when this function returns, so it cannot write further rows;
        it is returned only to stay compatible with existing callers.
    """
    # The csv module requires files to be opened with newline="" so that it
    # controls line endings itself. The previous value (",") is not a legal
    # newline argument for open() and raised ValueError on every call.
    with open(fname+".csv", "w", newline="") as f:
        writer = csv.writer(f)
        writer.writerows(data)
    return writer

def read_csv(fname):
    """Read ``<fname>.csv`` and return its rows.

    Args:
        fname: Input path without the ".csv" extension.

    Returns:
        A list of rows, each row being a list of string fields.
    """
    # newline="" hands line endings to the csv module untouched; without it,
    # universal-newline translation rewrites "\r\n" inside quoted fields.
    with open(fname+".csv", "r", newline="") as f:
        data = list(csv.reader(f))
    return data

def write_text_file(data,fname):
    """Write each item of ``data`` on its own line in ``<fname>.txt``.

    Args:
        data: Iterable of strings; every item is followed by a newline.
        fname: Output path without the ".txt" extension.
    """
    target_path = fname+'.txt'
    with open(target_path, 'w') as handle:
        emit = handle.write
        for line in data:
            emit(line)
            emit("\n")

# Create training data set for labeling

In [3]:
# Load the MIDAS people table, then pull out its name and ID columns
MIDAS_data = pd.read_csv("data/midas_network_profs_to_aa.csv")
MIDAS_people, MIDAS_IDs = MIDAS_data['midas_name'], MIDAS_data['aa_id']

In [4]:
# Load the journal-articles table; its 'PersonId' and 'DOI' columns are used
# below to look up each MIDAS person's papers
journal_articles = pd.read_csv("data/Journal_Articles.csv")

In [5]:
# Get DOIs for all MIDAS people
# Collect each person's DOIs in a plain list and build the array once at the
# end; calling np.concatenate inside the loop re-copies the whole array on
# every iteration.
midas_doi_list = []

for person_id in MIDAS_IDs:
    person_dois = journal_articles.loc[journal_articles['PersonId'] == person_id, 'DOI']
    # Skip articles without a DOI: a missing value would otherwise end up in
    # the array as the string 'nan'
    midas_doi_list.extend(person_dois.dropna().astype(str).tolist())

# dtype=str keeps this a string array even when no DOI was found at all
MIDAS_DOIs = np.array(midas_doi_list, dtype=str)

In [6]:
def clean_abstract(abstract):
    """Strip HTML tags and a leading "abstract:" label from an abstract.

    Only the exact spellings "abstract:" and "Abstract:" at the very start of
    the tag-free text are removed, together with any whitespace after them.
    Non-string input is returned unchanged.
    """
    if isinstance(abstract, str):
        # Tags go first so a label wrapped in markup ends up at position 0
        tag_free = re.sub(r'<.*?>', '', abstract)
        abstract = re.sub(r'^(abstract:|Abstract:)\s*', '', tag_free)
    return abstract

def clean_title(title):
    """Strip HTML tags and a leading "title:" label from a title.

    Only the exact spellings "title:" and "Title:" at the very start of the
    tag-free text are removed, together with any whitespace after them.
    Non-string input is returned unchanged.
    """
    if not isinstance(title, str):
        return title

    # Applied in order: drop HTML tags, then drop the leading label
    for pattern in (r'<.*?>', r'^(title:|Title:)\s*'):
        title = re.sub(pattern, '', title)
    return title


In [7]:
# Build the list of titles and abstracts for all MIDAS people publications

# Step 1: load the DOI index (JSON) that the lookups below read from
with open('data/doi_index.json', 'r') as index_file:
    linked_directory = json.load(index_file)

In [8]:
# Build one "title abstract" string per MIDAS DOI that has usable index data.
# Explicit checks replace the previous bare `except:`, which also swallowed
# KeyboardInterrupt and hid unrelated errors; the same entries are skipped.
combined_title_abstract_MIDAS = []
for DOI in MIDAS_DOIs:
    # The index is looked up by lower-cased DOI
    entry = linked_directory.get(str(DOI).lower())
    if not isinstance(entry, dict):
        # DOI not present in the index (or entry is not a record)
        continue

    abstract = clean_abstract(entry.get('abstract'))
    title = clean_title(entry.get('title'))

    # clean_* return non-strings unchanged, so a missing or null field shows
    # up here as a non-string and the entry is skipped
    if isinstance(title, str) and isinstance(abstract, str):
        combined_title_abstract_MIDAS.append(title + " " + abstract)

In [None]:
# Build a pool of "title abstract" strings from the DOI index at large
# (intended as likely negatives for the training data).
# NOTE(review): this takes the first `num_samples` usable entries in the
# index's iteration order -- it is not a random draw, and entries belonging to
# MIDAS people are not filtered out here.
num_samples = 5000

samples = []
for doi, entry in linked_directory.items():
    # Stop at exactly num_samples entries (the old `count > num_samples`
    # check let one extra entry through)
    if len(samples) >= num_samples:
        break
    if 'abstract' not in entry or 'title' not in entry:
        continue

    abstract = clean_abstract(entry['abstract'])
    title = clean_title(entry['title'])

    # clean_* return non-strings (e.g. a JSON null) unchanged; concatenating
    # those would raise TypeError, so skip such entries
    if not (isinstance(title, str) and isinstance(abstract, str)):
        continue

    samples.append(title + " " + abstract)

# Kept for any later cell that reads `count`
count = len(samples)

## Generate training data set

In [10]:
# Sizes of the two halves of the labeling set
num_MIDAS_papers = 250
num_non_MIDAS_papers = 250

# Draw without replacement: first from the MIDAS title+abstract strings,
# then from the general pool
potential_positives = np.random.choice(
    combined_title_abstract_MIDAS, size=num_MIDAS_papers, replace=False
)
potential_negatives = np.random.choice(
    samples, size=num_non_MIDAS_papers, replace=False
)

# MIDAS draws come first in the combined array, followed by the pool draws
total_data = np.concatenate([potential_positives, potential_negatives])

# Write one entry per line to output/abstracts_to_label_set2.txt
write_text_file(total_data, 'output/abstracts_to_label_set2')
