In [72]:
import requests
import pandas as pd
import numpy as np


In [73]:
def create_query(issn, per_page = 50, num_page = 1):
    url = f"https://api.openalex.org/works?&sort=publication_year:desc&per-page={per_page}&page={num_page}&filter=locations.source.issn:{issn}&select=id,display_name,doi,publication_year,abstract_inverted_index"
    return url

#function to check the number of None for abstract 
def check(api_response):
    check = []
    for i, result in enumerate(api_response.json()["results"]):
        check.append(result["abstract_inverted_index"])
    return(sum([x is None for x in check]))

def create_df(api_response, label_num):
    title = []
    abstract = []
    year=[]
    id = []
    labels = [label_num]*50
    for i, result in enumerate(api_response.json()["results"]):
        if result["abstract_inverted_index"] is None:
            continue 
        else:   
            title.append(result["display_name"])
            year.append(result["publication_year"])
            abstract.append(result["abstract_inverted_index"])
            id.append(result["id"])
    
    return pd.DataFrame({"Title":title, "id":id, "publication_year": year, "abstract_index":abstract, "label":labels})

def convert_indices_to_abstract(abstractInvertedIndex):
    # Step 1 and 2: Create the word_index list and sort it by index
    word_index = []
    for k, v in abstractInvertedIndex.items():
        for index in v:
            word_index.append([k, index])
    word_index = sorted(word_index, key=lambda x: x[1])

    # Step 3: Join the words from word_index list with spaces to form the abstract
    abstract = []
    prev_index = None
    for word, index in word_index:
        if index == prev_index:
            abstract[-1] += ' ' + word  # Append the word to the previous word
        else:
            abstract.append(word)  # Add a new word to the abstract
        prev_index = index

    return ' '.join(abstract)

In [76]:
aer_issn = "1944-7981"
aer_query = create_query(aer_issn, 115, 1)
aer_response = requests.get(aer_query)
check(aer_response)


16

In [20]:
aer_df = create_df(aer_response, 0)   

In [21]:
#for American political science review 
apsr_issn = "1537-5943"
apsr_query = create_query(apsr_issn, 50, 1)
apsr_response = requests.get(apsr_query)
check(apsr_response)


0

In [22]:
apsr_df = create_df(apsr_response, 1)   

In [23]:
total_df = pd.concat([apsr_df,aer_df], axis = 0)
total_df = total_df.sample(frac = 1)
total_df.index = range(100)
total_df.head()

Unnamed: 0,Title,id,publication_year,abstract_index,label
0,Electronic Food Vouchers: Evidence from an At-...,https://openalex.org/W4323366278,2023,"{'We': [0], 'compare': [1], 'how': [2], 'in-ki...",0
1,Occupational Exposure to Capital-Embodied Tech...,https://openalex.org/W3129712029,2023,"{'We': [0, 35], 'study': [1], 'differences': [...",0
2,Women Also Know Stuff: Challenging the Gender ...,https://openalex.org/W4383737845,2023,"{'This': [0], 'article': [1], 'proposes': [2],...",1
3,Political Responsiveness to Conflict Victims: ...,https://openalex.org/W4322506576,2023,"{'Violence': [0], 'leaves': [1], 'significant'...",1
4,"Droughts, Deluges, and (River) Diversions: Val...",https://openalex.org/W4323366286,2023,"{'This': [0], 'paper': [1], 'develops': [2], '...",0


In [43]:
new_abstract = []
for i in total_df["abstract_index"]:
    full_abstract = convert_indices_to_abstract(i)
    new_abstract.append(full_abstract)
 

In [53]:
new_df = pd.DataFrame({'abstract':new_abstract, 'Label': total_df['label']})
print(len(new_df))
new_df.head()

100


Unnamed: 0,abstract,Label
0,We compare how in-kind food assistance and an ...,0
1,We study differences in exposure to factor-bia...,0
2,This article proposes a simple but powerful fr...,1
3,Violence leaves significant social groups at a...,1
4,This paper develops and applies a method to va...,0


In [71]:
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report

# Assuming you have your data stored in a DataFrame new_df with 'abstract' and 'Label' columns

X = new_df['abstract']
y = new_df['Label']

# Split data into training and testing sets for final evaluation
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=2022, stratify=y)

# Models
models = [
    ('Naive Bayes', MultinomialNB()),
    ('Logistic Regression', LogisticRegression(random_state=2022)),
    ('Random Forest', RandomForestClassifier(random_state=2022)),
    ('Support Vector Machine', SVC(random_state=2022))
]

# Pipelines
pipelines = [
    (name, Pipeline([
        ('vectorizer_bow', CountVectorizer(ngram_range=(1, 2))),
        ('model', model)
    ])) for name, model in models
]

for name, model in pipelines:
    # Perform cross-validation with 5 folds
    scores = cross_val_score(model, X, y, cv=5, scoring='accuracy')
    print(f"{name}: Mean Accuracy = {scores.mean():.4f}, Std Dev = {scores.std():.4f}")



Naive Bayes: Mean Accuracy = 0.9400, Std Dev = 0.0374
Logistic Regression: Mean Accuracy = 0.9300, Std Dev = 0.0510
Random Forest: Mean Accuracy = 0.9200, Std Dev = 0.0678
Support Vector Machine: Mean Accuracy = 0.6800, Std Dev = 0.0600
