## The purpose of this notebook is to prep the raw data for citation network analyses

## Packages and Modules

In [1]:
# Data Manipulation
import pandas as pd 
import numpy as np 

# Data visualization
import matplotlib.pyplot as plt 
import seaborn as sns 

# NLP
import nltk 
nltk.data.path.append("../models/")
from nltk.util import ngrams  
from nltk import pos_tag
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import spacy
import re 

# Preferences
from IPython.display import clear_output

## The dataset

In [2]:
raw_data = pd.read_csv("../data/02_intermediate/newlines_apos_removed_paragraphs_added.csv")
data = raw_data.copy()
data.sample(5)

Unnamed: 0,article_title,authors,abstract,full_text,citation_info,processed,paragraph
3290,A modified incidental-teaching procedure for a...,"G G McGee, P J Krantz, D Mason, and L E McCla...",A modified incidental-teaching procedure was u...,"JOURNAL OF APPLIED BEHAVIOR ANALYSIS\n\n1983, ...",J Appl Behav Anal. 1983 Fall; 16(3): 329–338.,"JOURNAL OF APPLIED BEHAVIOR ANALYSIS 1983, 161...","['JOURNAL OF APPLIED BEHAVIOR ANALYSIS', '1983..."
8419,Predicting Reinforcement: Utility of the Motiv...,"Alan Poling,1 Amin Lotfizadeh,2 and Timothy L...",Other articles detail the historical developme...,BEHAV ANALYST (2017) 40:49–56\nDOI 10.1007/s40...,Behav Anal. 2017 Jun; 40(1): 49–56.,BEHAV ANALYST (2017) 40:49–56 DOI 10.1007/s406...,['BEHAV ANALYST (2017) 40:49–56\nDOI 10.1007/s...
1747,Some Effects of Noncontingent Positive Reinfor...,Einar T IngvarssonYOUNGSTOWN STATE UNIVERSITY,Functional analysis suggested that the problem...,"JOURNAL OF APPLIED BEHAVIOR ANALYSIS\n\n2008, ...",J Appl Behav Anal. 2008 Fall; 41(3): 435–440.,"JOURNAL OF APPLIED BEHAVIOR ANALYSIS 2008, 41,...","['JOURNAL OF APPLIED BEHAVIOR ANALYSIS', '2008..."
6534,Producing a change from competition to sharing...,Dennis R. Olvera and Don F. Hake,Pairs of high-school students matched-to-sampl...,JOURNAL OF THE EXPERIMENTAI. ANALYSIS OF BEHAV...,J Exp Anal Behav. 1976 Nov; 26(3): 321–333.,JOURNAL OF THE EXPERIMENTAI. ANALYSIS OF BEHAV...,['JOURNAL OF THE EXPERIMENTAI. ANALYSIS OF BEH...
5861,Performance in continuously available multiple...,Douglas Elliffe and Michael Davison,Three pigeons were given continuous access in ...,JOURNAL OF THE EXPERIMENTAL ANALYSIS OF BEHAVI...,J Exp Anal Behav. 1985 Nov; 44(3): 343–353.,JOURNAL OF THE EXPERIMENTAL ANALYSIS OF BEHAVI...,['JOURNAL OF THE EXPERIMENTAL ANALYSIS OF BEHA...


In [3]:
# Isolate the journal names
journal_names = []
for citation in data['citation_info'].astype(str):
    # Find the year via regex
    match = re.search(r'\d{4}', citation)
    if match:
        end_index = match.start()
        journal_name = citation[:end_index].strip()
    else:
        journal_name = citation  # If no year is found, assume the whole string is the journal name

    journal_names.append(journal_name)

data['journal'] = journal_names
data['journal'].unique()

array(['Anal Verbal Behav.', 'nan', 'VB News.', 'Behav Anal Pract.',
       'J Appl Behav Anal.', 'J Exp Anal Behav.', 'Perspect Behav Sci.',
       'Behav Anal.'], dtype=object)

In [4]:
# Swap out the journal shorthand with an easier to use abbreviation
journal_dict = {
    'Anal Verbal Behav.': 'TAVB', 
    'VB News.': 'TAVB', 
    'Behav Anal Pract.': 'BAP', 
    'J Appl Behav Anal.': ' JABA', 
    'J Exp Anal Behav.': 'JEAB', 
    'Perspect Behav Sci.': 'PoBS', 
    'Behav Anal': 'PoBS'
}
data['journal'] = data['journal'].map(journal_dict)
data['journal'].unique()

array(['TAVB', nan, 'BAP', ' JABA', 'JEAB', 'PoBS'], dtype=object)

In [5]:
# Replace "- "
data['processed'] = [str(val).replace("- ", "") for val in data['processed']]

In [6]:
# Function for separating the manuscript into the respective sections
def section_isolator(processing_col, section_start_name, next_section_name):
        new_col = []
        for val in processing_col:
            try:
                new_col.append(str(val).lower().split(f"{section_start_name}")[1])
            except:
                new_col.append(val)
        new_col = [str(val).split(f"{next_section_name}")[0] for val in new_col]
        return new_col

In [7]:
# Function to reverse a string
def reverse_string(s):
    return s[::-1]

# Function to find the last part of text after the last occurrence of 'reference'
def get_references_section(text):
    # Reverse the text and the word 'references'
    reversed_text = reverse_string(text.lower())
    reversed_word = reverse_string(" references")

    # Find the index of the last occurrence of 'references'
    idx = reversed_text.find(reversed_word)

    # If 'references' is found, return the part after it; otherwise, return the original text
    if idx != -1:
        # Calculate the actual index in the original text
        actual_idx = len(text) - idx - len("references")
        return text[actual_idx:].strip()
    else:
        return text

In [8]:
# Isolate the references section
new_col = []
for val in data['processed']:
    references_section = get_references_section(val)
    new_col.append(references_section)

data['references'] = new_col
data['references'] = data['references'].astype(str)

## Create a matrix of the count of each article being cited within the others

In [9]:
# Lowercase the titles and references once
data['article_title'] = data['article_title'].str.lower()
data['references'] = data['references'].str.lower()

# Create a dictionary to hold the data
cite_data = {}

# Get unique titles
unique_titles = data['article_title'].unique()
total_titles = len(unique_titles)

# Loop through unique titles
for index, title in enumerate(unique_titles, 1):
    # Escape any special characters in the title for regular expression
    escaped_title = re.escape(title)

    # Use vectorized string contains method with the escaped title
    cite_data[title] = data['references'].str.contains(escaped_title, regex=True).astype(int)

    # Print progress update every 50 articles
    if index % 50 == 0 or index == total_titles:
        clear_output()
        print(f"Processed {index} of {total_titles} titles.")

# Convert the dictionary to a DataFrame
cite_matrix = pd.DataFrame(cite_data)

Processed 9334 of 9334 titles.


In [10]:
cite_matrix.index = data['article_title']
cite_matrix.to_csv("../data/02_intermediate/citation_matrix.csv")