## Imports

In [1]:
import pickle
import requests
import pandas as pd
from joblib import Parallel, delayed
from tqdm import tqdm
import time

## Part 1: Web-scraping

## Part 2: Ready Made vs Custom Made Data

## Part 3: Gathering Research Articles using the OpenAlex API

In [None]:
# Read the pickled author table from disk.
# NOTE: pickle.load executes arbitrary code from the file; only use it on
# a trusted "datav2.pkl".
with open("datav2.pkl", "rb") as f:
    df = pickle.load(f)

# Keep only rows whose works_count lies strictly between 5 and 5000.
df = df.loc[
    (df["works_count"] > 5)
    & (df["works_count"] < 5000)
]

# Empty frames with the column layout of the two result tables.
# (They are not referenced again in the visible part of this file.)
papers = pd.DataFrame(
    columns=["id", "publication_year", "cited_by_count", "author_ids"]
)
abstracts = pd.DataFrame(
    columns=["id", "title", "abstract_inverted_index"]
)

# OpenAlex concept IDs used to build the works filter; get_data() treats the
# first four entries and the remaining three as two separate filter groups.
concept_ids = [
    "C144024400",  # Sociology
    "C15744967",   # Psychology
    "C162324750",  # Economics
    "C17744445",   # Political Science
    "C33923547",   # Mathematics
    "C121332964",  # Physics
    "C41008148",   # Computer Science
]

# Plain-list accumulators filled by the batch loop and converted to
# DataFrames once all batches are done.
paperdata, abstractdata = [], []

def get_data(i):
    """Fetch works for a batch of authors from the OpenAlex works endpoint.

    Parameters
    ----------
    i : list of str
        Strings that each contain ``"id:"`` followed by an author ID
        (everything after the first ``"id:"`` is used as the ID).

    Returns
    -------
    tuple of (list of dict, list of dict)
        ``(papers, abstracts)``. Each paper dict has the keys ``id``,
        ``publication_year``, ``cited_by_count`` and ``author_ids``; each
        abstract dict has ``id``, ``title`` and ``abstract_inverted_index``.
        ``([], [])`` is returned when all three attempts fail.

    Notes
    -----
    Any exception (network error, timeout, HTTP error status, malformed
    JSON) is printed and the whole batch is retried from the first page,
    up to three attempts in total.
    """
    ids = [aut.split("id:")[1] for aut in i]
    # Joined outside the f-string: reusing the enclosing quote character
    # inside an f-string expression is a SyntaxError before Python 3.12.
    author_filter = "|".join(ids)
    BASE_URL = (
        f"https://api.openalex.org/works?filter=author.id:({author_filter}),cited_by_count:>10,"
        f"authors_count:<10,concept.id:({'|'.join(concept_ids[:4])}),concept.id:({'|'.join(concept_ids[4:])})"
    )

    for _attempt in range(3):
        # Start every attempt with empty lists: a retry restarts at the first
        # page, so keeping earlier rows would duplicate them.
        papers = []
        abstracts = []

        try:
            cursor = "*"
            while cursor:
                reply = requests.get(
                    BASE_URL + f"&per-page=200&cursor={cursor}",
                    timeout=30,  # never hang a worker on a stalled connection
                )
                # Route HTTP errors (e.g. 429 rate limiting) to the retry
                # path instead of treating them as "no more results".
                reply.raise_for_status()
                response = reply.json()

                results = response.get("results")
                if not results:
                    break

                for result in results:
                    papers.append({
                        "id": result.get("id"),
                        "publication_year": result.get("publication_year"),
                        "cited_by_count": result.get("cited_by_count"),
                        "author_ids": [
                            auth["author"]["id"]
                            for auth in result.get("authorships", [])
                            if "author" in auth and "id" in auth["author"]
                        ],
                    })
                    abstracts.append({
                        "id": result.get("id"),
                        "title": result.get("title"),
                        "abstract_inverted_index": result.get("abstract_inverted_index"),
                    })

                cursor = response.get("meta", {}).get("next_cursor")
                if cursor:
                    time.sleep(1)  # pause between consecutive page requests

            return papers, abstracts

        except Exception as e:
            print(f"Error fetching work ID {ids}: {e}")
            time.sleep(1)

    return [], []

# Parallel processing
num_batch = 5          # number of joblib workers
batch_size = 100       # author URLs handled per outer iteration
sub_batch_size = 25    # author URLs sent to a single get_data() call

# Materialise the column once so each chunk is a plain positional list slice.
author_urls = df["works_api_url"].tolist()

for start in tqdm(range(0, len(author_urls), batch_size)):
    batch_indexes = author_urls[start:start + batch_size]
    # Step over the actual chunk length so the final, shorter chunk does not
    # yield empty sub-batches (which would query with an empty author filter).
    batches = [
        batch_indexes[offset:offset + sub_batch_size]
        for offset in range(0, len(batch_indexes), sub_batch_size)
    ]

    # Fetch data in parallel, one get_data() call per sub-batch
    results = Parallel(n_jobs=num_batch)(
        delayed(get_data)(batch) for batch in batches
    )

    # Collect results; sub-batches that returned nothing are skipped
    for pap, abstr in results:
        if pap and abstr:
            paperdata.extend(pap)
            abstractdata.extend(abstr)

    # Pause between outer batches before issuing the next round of requests
    time.sleep(2)

# Convert collected lists into DataFrames
paperdata_df = pd.DataFrame(paperdata)
abstractdata_df = pd.DataFrame(abstractdata)

# Save results
paperdata_df.to_csv("papers.csv", index=False)
abstractdata_df.to_csv("abstracts.csv", index=False)


## Part 4: The Network of Computational Social Scientists