## The purpose of this notebook is to prep the raw data for citation network analyses

## Packages and Modules

In [1]:
# Data Manipulation
import pandas as pd 
import numpy as np 

# Data visualization
import matplotlib.pyplot as plt 
import seaborn as sns 

# NLP
import nltk 
nltk.data.path.append("../models/")
from nltk.util import ngrams  
from nltk import pos_tag
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import spacy

import re 

## The dataset

In [2]:
raw_data = pd.read_csv("../data/02_intermediate/newlines_apos_removed_paragraphs_added.csv")
data = raw_data.copy()
data.sample(5)

Unnamed: 0,article_title,authors,abstract,full_text,citation_info,processed,paragraph
6029,Intractable properties of responding under a f...,G. David Gentry and M. Jackson Marr,The behavior engendered by the fixed-interval ...,JOURNAL OF THE EXPERIMENTAL ANALYSIS OF BEHAVI...,J Exp Anal Behav. 1982 Mar; 37(2): 233–241.,JOURNAL OF THE EXPERIMENTAL ANALYSIS OF BEHAVI...,['JOURNAL OF THE EXPERIMENTAL ANALYSIS OF BEHA...
6827,Effects of two procedures for varying informat...,Stephen B. Kendall,Two experiments were conducted with pigeons to...,JOURNAL OF THE EXPERIMENTAL ANALYSIS OF BEHAVI...,J Exp Anal Behav. 1973 Jul; 20(1): 73–83.,JOURNAL OF THE EXPERIMENTAL ANALYSIS OF BEHAVI...,['JOURNAL OF THE EXPERIMENTAL ANALYSIS OF BEHA...
1976,An experimental analysis of reading interventi...,"Christine M Bonfiglio, Edward J Daly, 3rd, Bri...",This study examined the generalized effects of...,,J Appl Behav Anal. 2004 Spring; 37(1): 111–114.,,
4545,Naming And Categorization In Young Children: I...,"C. Fergus Lowe, Pauline J Horne, and J. Carl ...","Following pretraining with everyday objects, 1...",(cid:74)(cid:79)(cid:85)(cid:82)(cid:78)(cid:6...,J Exp Anal Behav. 2005 Jan; 83(1): 47–65.,(cid:74)(cid:79)(cid:85)(cid:82)(cid:78)(cid:6...,['(cid:74)(cid:79)(cid:85)(cid:82)(cid:78)(cid...
684,Practical Resources for Talking to Children wi...,"Julie L. Melendez, Isabella Maria C. Tan, Jasm...",Not Applicable,Behavior Analysis in Practice (2021) 14:451–46...,Behav Anal Pract. 2021 Jun; 14(2): 451–461.,Behavior Analysis in Practice (2021) 14:451–46...,['Behavior Analysis in Practice (2021) 14:451–...


In [3]:
# Isolate the journal names
journal_names = []
for citation in data['citation_info'].astype(str):
    # Find the year via regex
    match = re.search(r'\d{4}', citation)
    if match:
        end_index = match.start()
        journal_name = citation[:end_index].strip()
    else:
        journal_name = citation  # If no year is found, assume the whole string is the journal name

    journal_names.append(journal_name)

data['journal'] = journal_names
data['journal'].unique()

array(['Anal Verbal Behav.', 'nan', 'VB News.', 'Behav Anal Pract.',
       'J Appl Behav Anal.', 'J Exp Anal Behav.', 'Perspect Behav Sci.',
       'Behav Anal.'], dtype=object)

In [4]:
# Swap out the journal shorthand with an easier to use abbreviation
journal_dict = {
    'Anal Verbal Behav.': 'TAVB', 
    'VB News.': 'TAVB', 
    'Behav Anal Pract.': 'BAP', 
    'J Appl Behav Anal.': ' JABA', 
    'J Exp Anal Behav.': 'JEAB', 
    'Perspect Behav Sci.': 'PoBS', 
    'Behav Anal': 'PoBS'
}
data['journal'] = data['journal'].map(journal_dict)
data['journal'].unique()

array(['TAVB', nan, 'BAP', ' JABA', 'JEAB', 'PoBS'], dtype=object)

In [5]:
# Replace "- "
data['processed'] = [str(val).replace("- ", "") for val in data['processed']]

In [6]:
# Function for separating the manuscript into the respective sections
def section_isolator(processing_col, section_start_name, next_section_name):
        new_col = []
        for val in processing_col:
            try:
                new_col.append(str(val).lower().split(f"{section_start_name}")[1])
            except:
                new_col.append(val)
        new_col = [str(val).split(f"{next_section_name}")[0] for val in new_col]
        return new_col

In [14]:
# Function to reverse a string
def reverse_string(s):
    return s[::-1]

# Function to find the last part of text after the last occurrence of 'reference'
def get_references_section(text):
    # Reverse the text and the word 'references'
    reversed_text = reverse_string(text.lower())
    reversed_word = reverse_string(" references")

    # Find the index of the last occurrence of 'references'
    idx = reversed_text.find(reversed_word)

    # If 'references' is found, return the part after it; otherwise, return the original text
    if idx != -1:
        # Calculate the actual index in the original text
        actual_idx = len(text) - idx - len("references")
        return text[actual_idx:].strip()
    else:
        return text

In [39]:
# Isolate the references section
new_col = []
for val in data['processed']:
    references_section = get_references_section(val)
    new_col.append(references_section)

data['references'] = new_col
data['references'] = data['references'].astype(str)

Unnamed: 0,article_title,authors,abstract,full_text,citation_info,processed,paragraph,journal,references
0,Teaching Children With Autism Spectrum Disorde...,"Priya Patil,1 Tina M. Sidener,1 Heather Pane,1...",For most children with autism spectrum disorde...,The Analysis of Verbal Behavior (2021) 37:1–16...,Anal Verbal Behav. 2021 Jun; 37(1): 1–16.,The Analysis of Verbal Behavior (2021) 37:1–16...,['The Analysis of Verbal Behavior (2021) 37:1–...,TAVB,References American Speech-Language-Hearing As...
1,Teaching Children with Autism to Mand for Info...,"Megan L. Pyles, Amanda N. Chastain, and Caio ...",The current study evaluated a procedure used t...,The Analysis of Verbal Behavior (2021) 37:17–3...,Anal Verbal Behav. 2021 Jun; 37(1): 17–34.,The Analysis of Verbal Behavior (2021) 37:17–3...,['The Analysis of Verbal Behavior (2021) 37:17...,TAVB,References American Psychiatric Association. (...
2,An Evaluation of the Emergence of Untrained Ac...,"Bryan J. Blair,1 Lesley A. Shawler,2 Leif K. A...",The online version contains supplementary mate...,The Analysis of Verbal Behavior (2021) 37:35–5...,Anal Verbal Behav. 2021 Jun; 37(1): 35–56.,The Analysis of Verbal Behavior (2021) 37:35–5...,['The Analysis of Verbal Behavior (2021) 37:35...,TAVB,"References Albright, L., Reeve, K. F., Reeve, ..."
3,The Effects of Competing Verbal Behavior on Pe...,"Michael J. Harman,1 Tiffany Kodak,2 Leah Bohl,...",The purposes of this study were to evaluate th...,The Analysis of Verbal Behavior (2021) 37:57–7...,Anal Verbal Behav. 2021 Jun; 37(1): 57–76.,The Analysis of Verbal Behavior (2021) 37:57–7...,['The Analysis of Verbal Behavior (2021) 37:57...,TAVB,"References Clough, C. W., Meyer, C. S., & Migu..."
4,Naming of Stimuli in Equivalence Class Formati...,Guro Granerud and Erik Arntzen,"In the present study, two typically developing...",The Analysis of Verbal Behavior (2021) 37:77–9...,Anal Verbal Behav. 2021 Jun; 37(1): 77–96.,The Analysis of Verbal Behavior (2021) 37:77–9...,['The Analysis of Verbal Behavior (2021) 37:77...,TAVB,"References Arntzen, E., Granmo, S., & Fields, ..."
...,...,...,...,...,...,...,...,...,...
9333,The midwestern association of behavior analysi...,Margaret E. Peterson,,The Midwestern Association Of Behavior Analysi...,Behav Anal. 1978 Spring; 1(1): 3–15.,The Midwestern Association Of Behavior Analysi...,['The Midwestern Association Of Behavior Analy...,,The Midwestern Association Of Behavior Analysi...
9334,A behavioral approach to the teaching of compo...,Julie S. Vargas,,A Behavioral Approach\n\nto the Teaching of Co...,Behav Anal. 1978 Spring; 1(1): 16–24.,A Behavioral Approach to the Teaching of Compo...,"['A Behavioral Approach', 'to the Teaching of ...",,"References Chase, S. Gobbledygook. Power of wo..."
9335,Theory and technology in behavior analysis 1,Steven C. Hayes,The differences within behaviorism in general ...,Theory and Technology in Behavior Analysis'\n\...,Behav Anal. 1978 Spring; 1(1): 25–33.,Theory and Technology in Behavior Analysis' St...,"[""Theory and Technology in Behavior Analysis'""...",,"References 1. Michael, J. The relev-ance of an..."
9336,Engineering environments for behavioral opport...,Hal Markowitz,,Engineering Environments\n\nfor Behavioral Opp...,Behav Anal. 1978 Spring; 1(1): 34–47.,Engineering Environments for Behavioral Opport...,"['Engineering Environments', 'for Behavioral O...",,"References Bandur'a, M. The effects of an oper..."


## Turn the references col into a list of references

In [41]:
# Regular expression pattern to match years in parentheses
pattern = r'\([1-2][0-9]{3}\)'

for entry in data['references']:
    try:
        # Find all matches for the pattern in the current entry
        years = [match for match in re.finditer(pattern, entry)]
    except:
        print(f"Error processing entry: {entry}")

In [50]:
test_str = references[1]

'. Diagnostic and statistical manual of mental disorders (5th ed.). Arlington, VA: American Psychiatric Publishing. Arntzen, E. (2006)'

In [52]:
!pip install spacy
!python -m spacy download en_core_web_sm

[33mDEPRECATION: pyodbc 4.0.0-unsupported has a non-standard version number. pip 24.0 will enforce this behaviour change. A possible replacement is to upgrade to a newer version of pyodbc or contact the author to suggest that they release a version with a conforming version number. Discussion can be found at https://github.com/pypa/pip/issues/12063[0m[33m
[0mCollecting en-core-web-sm==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m46.7 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m


[33mDEPRECATION: pyodbc 4.0.0-unsupported has a non-standard version number. pip 24.0 will enforce this behaviour change. A possible replacement is to upgrade to a newer version of pyodbc or contact the author to suggest that they release a version with a conforming version number. Discussion can be found at https://github.com/pypa/pip/issues/12063[0m[33m
[0mInstalling collected packages: en-core-web-sm
Successfully installed en-core-web-sm-3.7.1
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


In [51]:
# Load spaCy's English language model
nlp = spacy.load("en_core_web_sm")

# Example reference entry
text = test_str

# Process the text with spaCy
doc = nlp(text)

# Iterate through tokens and find the position of the year
year_pos = -1
for token in doc:
    if token.text.isdigit() and len(token.text) == 4:
        year_pos = token.i
        break

# If a year is found, walk back and collect proper nouns
if year_pos != -1:
    authors = []
    for token in doc[:year_pos][::-1]:  # Reverse iterate till the year
        if token.pos_ == "PROPN":  # Proper noun
            authors.insert(0, token.text)  # Insert at beginning to maintain order
        else:
            # Stop if a non-proper noun is encountered
            break

    authors_text = ' '.join(authors)
    print("Authors:", authors_text)

OSError: [E050] Can't find model 'en_core_web_sm'. It doesn't seem to be a Python package or a valid path to a data directory.

In [48]:
entry = data['references'].iloc[72]
years = [match for match in re.finditer(pattern, entry)]
references = []
start = 0
for year in years:
    end = year.end()
    references.append(entry[start:end].strip())
    start = end

# Handle the last reference
if start < len(entry):
    references.append(entry[start:].strip())
references

['References American Psychiatric Association. (2013)',
 '. Diagnostic and statistical manual of mental disorders (5th ed.). Arlington, VA: American Psychiatric Publishing. Arntzen, E. (2006)',
 '. Delayed matching to sample: Probability of responding in accord with equivalence as a function of different delays. The Psychological Record, 56, 135–167. https://doi.org/10.1007/BF0339554 Barnes-Holmes, D., & Barnes-Holmes, Y. (2000)',
 '. Explaining complex behavior: Two perspectives on the concept of generalized operant classes. The Psychological Record, 50, 251–265. https://doi.org/10.1007/ BF03395355 Barros, R. D. S., Galvão, O. D. F., & McIlvane, W. J. (2002)',
 '. Generalized identity matching-to-sample in Cebus apella. The Psychological Record, 52, 441–460. https://doi.org/10.1007/BF03395197 Belisle, J., Dixon, M. R., Stanley, C., Munoz, B., & Daar, J. H. (2016)',
 '. Teaching foundational perspectivetaking skills to children with autism using the PEAK-T curriculum: Single-reversal “

In [44]:
data['references'].iloc[72]

"References American Psychiatric Association. (2013). Diagnostic and statistical manual of mental disorders (5th ed.). Arlington, VA: American Psychiatric Publishing. Arntzen, E. (2006). Delayed matching to sample: Probability of responding in accord with equivalence as a function of different delays. The Psychological Record, 56, 135–167. https://doi.org/10.1007/BF0339554 Barnes-Holmes, D., & Barnes-Holmes, Y. (2000). Explaining complex behavior: Two perspectives on the concept of generalized operant classes. The Psychological Record, 50, 251–265. https://doi.org/10.1007/ BF03395355 Barros, R. D. S., Galvão, O. D. F., & McIlvane, W. J. (2002). Generalized identity matching-to-sample in Cebus apella. The Psychological Record, 52, 441–460. https://doi.org/10.1007/BF03395197 Belisle, J., Dixon, M. R., Stanley, C., Munoz, B., & Daar, J. H. (2016). Teaching foundational perspectivetaking skills to children with autism using the PEAK-T curriculum: Single-reversal “I-You” deictic frames. Jou

In [None]:
# Add section cols
data['introduction'] = section_isolator(
    processing_col = data['processed'], 
    section_start_name='abstract', 
    next_section_name='method'
)

data['methods'] = section_isolator(
    processing_col = data['processed'], 
    section_start_name='method', 
    next_section_name='result'
)

data['results'] = section_isolator(
    processing_col = data['processed'], 
    section_start_name='result', 
    next_section_name='discussion'
)

data['discussion'] = section_isolator(
    processing_col = data['processed'], 
    section_start_name='discussion', 
    next_section_name='references'
)

In [None]:
data.sample(5)