Quick notes about this task 
- There might be an easier way to run the models using the ngrams provided by OpenAlex, but I was stuck on that for too long, so I decided to convert the abstract inverted indices back into plain-text abstracts and use those for classification.
- I assumed that all articles from the American Economic Review are economics related and all articles from the American Political Science Review are political science related. Some articles might be related to both, but I did not make that distinction here.
- I manually checked some of the abstracts, and one actually said "no abstract is available for this article ...". I didn't remove it and kept it in the model, which means it probably served as noise in the data, but this was not very common.

In [117]:
#import necessary packages for making database
import requests
import pandas as pd
import numpy as np

In [131]:
def create_query(issn, per_page=50, num_page=1):
    '''
    Build the OpenAlex "works" URL for one page of a journal's articles.

    Parameters:
        issn (str): The ISSN (International Standard Serial Number) of the academic journal.
        per_page (int): The number of results (articles) requested per page.
        num_page (int): The page number placed in the query.

    Returns:
        str: The query URL, sorted by publication year (descending), filtered to
        the given ISSN and selecting id, display_name, doi, publication_year
        and abstract_inverted_index.
    '''
    endpoint = "https://api.openalex.org/works?"
    arguments = [
        "sort=publication_year:desc",
        f"per-page={per_page}",
        f"page={num_page}",
        f"filter=locations.source.issn:{issn}",
        "select=id,display_name,doi,publication_year,abstract_inverted_index",
    ]
    # Every argument, including the first, is prefixed with "&".
    return endpoint + "".join("&" + argument for argument in arguments)

#count how many results in a response come without an abstract
def check(api_response):
    '''
    Count the results whose "abstract_inverted_index" is None.

    Parameters:
        api_response (requests.Response): The API response object obtained from the OpenAlex API.
            Its JSON body must contain a "results" list.

    Returns:
        int: The number of results that have no abstract inverted index.
    '''
    records = api_response.json()["results"]
    # Each comparison is a bool, so summing them counts the missing abstracts.
    return sum(record["abstract_inverted_index"] is None for record in records)

def create_df(api_response, label_num):
    '''
    Build a DataFrame of the articles in an OpenAlex API response that have an abstract.

    Results whose "abstract_inverted_index" is None are skipped, so the returned
    frame can have fewer rows than the response has results. Every kept row is
    given the same label.

    Parameters:
        api_response (requests.Response): The API response object obtained from the OpenAlex API.
            Its JSON body must contain a "results" list.
        label_num (int): The label number to assign to the articles in the DataFrame.

    Returns:
        pandas.DataFrame: One row per kept article, with the columns "Title", "id",
        "publication_year", "abstract_index" (the raw inverted index) and "label".

    '''
    # Keep only the records that actually carry an abstract.
    kept = [
        result for result in api_response.json()["results"]
        if result["abstract_inverted_index"] is not None
    ]

    return pd.DataFrame({
        "Title": [result["display_name"] for result in kept],
        "id": [result["id"] for result in kept],
        "publication_year": [result["publication_year"] for result in kept],
        "abstract_index": [result["abstract_inverted_index"] for result in kept],
        # Size the label column from the kept rows so that any number of
        # surviving records (not only exactly 100) builds a valid frame.
        "label": [label_num] * len(kept),
    })

def convert_indices_to_abstract(abstractInvertedIndex):
    '''
    Rebuild the plain-text abstract from its inverted index.

    Parameters:
        abstractInvertedIndex (dict): The abstract inverted index where keys are words and
            values are lists of the positions at which each word occurs.

    Returns:
        str: The words ordered by position and separated by single spaces.

    '''
    # Flatten the index into one (position, word) pair per occurrence.
    occurrences = [
        (position, word)
        for word, positions in abstractInvertedIndex.items()
        for position in positions
    ]

    # Order by position only. The sort is stable, so words that share a
    # position keep the order in which the dictionary yielded them.
    occurrences.sort(key=lambda pair: pair[0])

    # Words sharing a position are adjacent after sorting, so one space-join
    # yields the same text as gluing them together first.
    return ' '.join(word for _, word in occurrences)

In [132]:
#American Economic Review
aer_issn = "1944-7981"
# 116 records are requested in a single page; in the recorded run the check
# below reported 16 of them without an abstract.
aer_query = create_query(aer_issn, 116, 1)
# Bound the wait so a stalled connection cannot hang the notebook, and fail
# here on an HTTP error instead of later when the JSON body is read.
aer_response = requests.get(aer_query, timeout=30)
aer_response.raise_for_status()
check(aer_response)


16

In [133]:
# Build the American Economic Review frame; its rows are given label 0.
aer_df = create_df(aer_response, 0)

In [134]:
#American Political Science Review
apsr_issn = "1537-5943"
# 100 records are requested in a single page; in the recorded run the check
# below reported none of them without an abstract.
apsr_query = create_query(apsr_issn, 100, 1)
# Bound the wait so a stalled connection cannot hang the notebook, and fail
# here on an HTTP error instead of later when the JSON body is read.
apsr_response = requests.get(apsr_query, timeout=30)
apsr_response.raise_for_status()
check(apsr_response)


0

In [135]:
# Build the American Political Science Review frame; its rows are given label 1.
apsr_df = create_df(apsr_response, 1)

In [136]:
# Stack the two journals (APSR rows first, then AER rows) and renumber the
# rows 0..n-1 for however many rows the two frames contain.
total_df = pd.concat([apsr_df, aer_df], axis=0, ignore_index=True)
print(len(total_df))
total_df.head()

200


Unnamed: 0,Title,id,publication_year,abstract_index,label
0,Embedding Regression: Models for Context-Speci...,https://openalex.org/W4317434765,2023,"{'Social': [0], 'scientists': [1], 'commonly':...",1
1,Divided We Unite: The Nature of Partyism and t...,https://openalex.org/W4361275997,2023,"{'Highlighting': [0], 'the': [1, 52, 55, 75, 8...",1
2,Global Slavery in the Making of States and Int...,https://openalex.org/W4379986437,2023,"{'Despite': [0], 'having': [1], 'key': [2], 'i...",1
3,Modeling Spatial Heterogeneity and Historical ...,https://openalex.org/W4322769899,2023,"{'A': [0], 'wealth': [1], 'of': [2, 55], 'rece...",1
4,"Which Markets, Whose Rationality? Markets as P...",https://openalex.org/W4316466446,2023,"{'This': [0, 77], 'article': [1, 96], 'explica...",1


In [137]:
# Turn every inverted index back into plain abstract text, in row order.
new_abstract = [
    convert_indices_to_abstract(inverted_index)
    for inverted_index in total_df["abstract_index"]
]
 

In [138]:
# Pair each reconstructed abstract with the label of the row it came from.
new_df = pd.DataFrame(
    {
        'abstract': new_abstract,
        'Label': total_df['label'],
    }
)
print(new_df.shape[0])
new_df.head(5)

200


Unnamed: 0,abstract,Label
0,Social scientists commonly seek to make statem...,1
1,Highlighting the strength of “partyism” in man...,1
2,Despite having key implications for fundamenta...,1
3,A wealth of recent research in comparative pol...,1
4,This article explicates and critiques an under...,1


# Running the Models 
### I ran 4 models, each with 3 different n-gram ranges: unigrams only; unigrams and bigrams; and unigrams, bigrams, and trigrams

In [139]:
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report

# Features are the reconstructed abstract texts; targets are the journal labels.
X = new_df['abstract']
y = new_df['Label']

# Hold out 20% of the rows, stratified by label.
# NOTE(review): nothing in the cells visible here uses X_train / X_test /
# y_train / y_test; the cross-validation below runs on the full X and y.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=2022, stratify=y)

# UNIGRAM

# Classifiers to compare, each paired with the name used when reporting.
models = [
    ('Naive Bayes', MultinomialNB()),
    ('Logistic Regression', LogisticRegression(random_state=2022)),
    ('Random Forest', RandomForestClassifier(random_state=2022)),
    ('Support Vector Machine', SVC(random_state=2022))
]

# One pipeline per classifier: bag-of-words counts over single words, then the model.
pipelines = []
for name, model in models:
    steps = [
        ('vectorizer_bow', CountVectorizer(ngram_range=(1, 1))),
        ('model', model),
    ]
    pipelines.append((name, Pipeline(steps)))

# Report the mean and spread of 5-fold cross-validated accuracy per classifier.
for name, pipeline in pipelines:
    scores = cross_val_score(pipeline, X, y, cv=5, scoring='accuracy')
    print(f"{name}: Mean Accuracy = {scores.mean():.4f}, Std Dev = {scores.std():.4f}")



Naive Bayes: Mean Accuracy = 0.9300, Std Dev = 0.0430
Logistic Regression: Mean Accuracy = 0.9700, Std Dev = 0.0100
Random Forest: Mean Accuracy = 0.9850, Std Dev = 0.0200
Support Vector Machine: Mean Accuracy = 0.9000, Std Dev = 0.0632


In [140]:
# UNIGRAM, BIGRAM
# One pipeline per classifier in `models`: bag-of-words counts over one- and
# two-word n-grams, then the model.
pipelines = []
for name, model in models:
    steps = [
        ('vectorizer_bow', CountVectorizer(ngram_range=(1, 2))),
        ('model', model),
    ]
    pipelines.append((name, Pipeline(steps)))

# Report the mean and spread of 5-fold cross-validated accuracy per classifier.
for name, pipeline in pipelines:
    scores = cross_val_score(pipeline, X, y, cv=5, scoring='accuracy')
    print(f"{name}: Mean Accuracy = {scores.mean():.4f}, Std Dev = {scores.std():.4f}")


Naive Bayes: Mean Accuracy = 0.9350, Std Dev = 0.0436
Logistic Regression: Mean Accuracy = 0.9600, Std Dev = 0.0255
Random Forest: Mean Accuracy = 0.9650, Std Dev = 0.0374
Support Vector Machine: Mean Accuracy = 0.7550, Std Dev = 0.0458


In [141]:
# UNIGRAM, BIGRAM, TRIGRAM
# One pipeline per classifier in `models`: bag-of-words counts over one-, two-
# and three-word n-grams (ngram_range=(1, 3) covers every length in between),
# then the model.
pipelines = []
for name, model in models:
    steps = [
        ('vectorizer_bow', CountVectorizer(ngram_range=(1, 3))),
        ('model', model),
    ]
    pipelines.append((name, Pipeline(steps)))

# Report the mean and spread of 5-fold cross-validated accuracy per classifier.
for name, pipeline in pipelines:
    scores = cross_val_score(pipeline, X, y, cv=5, scoring='accuracy')
    print(f"{name}: Mean Accuracy = {scores.mean():.4f}, Std Dev = {scores.std():.4f}")



Naive Bayes: Mean Accuracy = 0.9350, Std Dev = 0.0436
Logistic Regression: Mean Accuracy = 0.9500, Std Dev = 0.0316
Random Forest: Mean Accuracy = 0.9150, Std Dev = 0.0464
Support Vector Machine: Mean Accuracy = 0.6100, Std Dev = 0.0583
