# JSTOR data parser

In [None]:
import pandas as pd

In [None]:
import matplotlib.pyplot as plt

In [None]:
from collections import Counter

In [None]:
from tqdm.auto import tqdm

In [None]:
tqdm.pandas()

In [None]:
from pandas.api.types import is_list_like

In [None]:
from itertools import combinations

In [None]:
import ast

## Explore file with unix tools

In [None]:
!ls ..

In [None]:
!head -n 1 ../jstor_metadata_2025-07-30.jsonl

## Load into a pandas dataframe (chunks)

In [None]:
# Columns kept from every JSONL record; all other fields are dropped per chunk
# so the full file never has to sit in memory at once.
use_cols = ["title", "languages", "discipline_names", "content_type", "published_date", "url"]
dfs = []
# With `chunksize`, read_json returns a JsonReader that keeps the file open;
# the `with` block closes it even when a chunk fails to parse.
with pd.read_json("../jstor_metadata_2025-07-30.jsonl", lines=True, chunksize=100_000) as reader:
    for chunk in tqdm(reader):
        sub = chunk[use_cols]                                # drop other columns early
        sub = sub[sub["published_date"].notna()]             # drop null dates
        # The first four characters of published_date are read as the year.
        years = sub['published_date'].str[:4].astype(int)
        sub = sub.assign(year=years)
        sub = sub[sub["year"] > 1949]                        # keep 1950 and later only
        dfs.append(sub.drop(columns=["published_date"]))
df = pd.concat(dfs, ignore_index=True)

In [None]:
# Concatenate all chunks into one big DataFrame
#df = pd.concat(chunks, ignore_index=True)
print("DataFrame loaded with shape:", df.shape)
print(df.info())

In [None]:
#df['title'] = df['title'].astype(str)

In [None]:
df.head()

## One-percent sample

In [None]:
#sample_df = pd.read_csv('sample.csv')

In [None]:
sample_df = df.sample(frac=0.01, random_state=42)

In [None]:
sample_df.to_csv('sample_lim_cols.csv')

In [None]:
sample_df.shape

In [None]:
sample_df.head()

## Lineplots for words

In [None]:
def plot_title_keyword_frequencies(df, substrings):
    """Plot, per year, how often each substring occurs in titles.

    For every year, the number of case-insensitive, non-overlapping
    occurrences of each substring in ``title`` is divided by the total number
    of whitespace-separated tokens in that year's titles. The chart is saved
    as ``lineplot_<substrings joined by "_">.png`` in the working directory
    and then shown.

    Args:
        df: DataFrame with ``year`` and ``title`` columns. Rows missing either
            value are ignored. The frame itself is not modified.
        substrings: Literal strings to count (not regular expressions).
    """
    # Work on derived Series instead of adding helper columns: no writes to a
    # filtered frame and no extra column per keyword on a very large frame.
    df = df.dropna(subset=['year', 'title'])
    years = df['year']
    titles = df['title'].astype(str)
    lowered = titles.str.lower()

    # Denominator: total title tokens per year, computed once for all keywords.
    tokens_per_year = titles.str.split().str.len().groupby(years).sum()

    freq_data = {}
    for substring in substrings:
        needle = substring.lower()
        # Plain str.count keeps characters such as "." or "+" literal, whereas
        # Series.str.count would interpret the keyword as a regex.
        hits = lowered.map(lambda title, needle=needle: title.count(needle))
        freq_data[substring] = hits.groupby(years).sum() / tokens_per_year

    # One column per substring, indexed by year.
    freq_df = pd.DataFrame(freq_data)

    freq_df.plot(kind='line', figsize=(10, 6))
    plt.title("Relative Frequency of Strings in Title by Year")
    plt.xlabel("Year")
    plt.ylabel(f"Relative Frequency of {', '.join(substrings)} in Title")
    plt.grid(True)
    plt.tight_layout()
    # Joined outside the f-string: reusing the enclosing quote character inside
    # a replacement field is a SyntaxError before Python 3.12.
    joined = '_'.join(substrings)
    plt.savefig(f'lineplot_{joined}.png', dpi=300)
    plt.show()


In [None]:
plot_title_keyword_frequencies(df, substrings=['dataset', 'manuscript'])

In [None]:
#plot_title_keyword_frequencies(df_after_1949, substrings=['digital', 'computational'])

In [None]:
#plot_title_keyword_frequencies(df_after_1949, substrings=['database', 'programming', 'computer', 'server'])

In [None]:
#sample_df.head().to_csv('sample_top_5.csv')

## Year distribution

In [None]:
#df_after_1949["year"] = df_after_1949["year"].astype(int)

# Plot histogram
plt.figure(figsize=(8,5))
plt.hist(df["year"], bins=range(df["year"].min(), df["year"].max() + 2), edgecolor="black")
plt.xlabel("Year")
plt.ylabel("Count")
plt.title("Distribution of Years")
plt.show()

## Disciplines

In [None]:
# Example dataframe
example_df = pd.DataFrame({
    "year": [2000, 2001, 2002],
    "discipline_names": [
        ["Religion", "Jewish Studies"],
        ["History"],
        ["Religion", "Philosophy"]
    ]
})

# Explode the lists into separate rows
df_exploded = example_df.explode("discipline_names")

# Count frequencies
discipline_counts = df_exploded["discipline_names"].value_counts()

# Plot
plt.figure(figsize=(8,5))
discipline_counts.plot(kind="bar", edgecolor="black")
plt.xlabel("Discipline")
plt.ylabel("Frequency")
plt.title("Frequency of Disciplines in DataFrame")
plt.xticks(rotation=45, ha="right")
plt.tight_layout()
plt.show()

In [None]:
# Tally how often each discipline label occurs in df["discipline_names"].
# A cell may hold a list-like of labels, a single scalar label, or a missing value.

counts = Counter()

cells = tqdm(df["discipline_names"].values,
             total=len(df), desc="Counting disciplines")
for cell in cells:
    # Strings are scalars here, never collections of characters.
    is_collection = is_list_like(cell) and not isinstance(cell, (str, bytes))
    # Every occurrence counts; wrap the cell in set(...) for once-per-row counting.
    labels = cell if is_collection else (cell,)
    counts.update(label for label in labels if not pd.isna(label))

# Most frequent discipline first.
discipline_counts = pd.Series(counts, dtype="int64").sort_values(ascending=False)

# Bar chart of the top-N disciplines.
import matplotlib.pyplot as plt
topN = 85
top_disciplines = discipline_counts.head(topN)
plt.figure(figsize=(12,6))
top_disciplines.plot(kind="bar", edgecolor="black")
plt.title(f"Discipline frequencies (top {topN})")
plt.xlabel("Discipline")
plt.ylabel("Frequency")
plt.xticks(rotation=60, ha="right")
plt.tight_layout()
plt.savefig(f'top_{topN}_disciplines.png', dpi=300)
plt.show()

In [None]:
#plt.savefig(f'top_{topN}_disciplines_post_1949.png', dpi=300)

In [None]:
len(counts)

In [None]:
counts

In [None]:
counts['Language & Literature']

In [None]:
discipline_counts.shape

In [None]:
#discipline_counts.to_excel('disciplines_frequencies_post_1949.xlsx', header=False)

## Make an english-only Dataframe

In [None]:
df_eng_only = df[df['languages'].progress_apply(lambda langs: isinstance(langs, list) and 'eng' in langs)]

In [None]:
df_eng_only.shape

## Discipline groups comparison (2025-11-16)

In [None]:
# Humanities set
list_hum = [
    "Language & Literature", "History", "Religion", "Philosophy", "Art & Art History",
    "Classical Studies", "Archaeology", "Music", "Film Studies", "Performing Arts",
    "Folklore", "Museum Studies", "Cultural Studies",
]
hum_set = set(list_hum)

In [None]:
def normalize_disciplines(col):
    """Return a Series in which every cell of ``col`` is a list.

    Lists are passed through unchanged, other list-likes (tuple, ndarray,
    set) are converted to a list, missing scalars become ``[]`` and any other
    scalar ``x`` becomes ``[x]``.
    """
    def to_list(x):
        if isinstance(x, list):
            return x
        # Must come before pd.isna: on a tuple/ndarray pd.isna returns an
        # array, and using that in `if` raises "truth value is ambiguous".
        if is_list_like(x) and not isinstance(x, (str, bytes)):
            return list(x)
        if pd.isna(x):
            return []
        return [x]
    return col.apply(to_list)

def make_subsets(df):
    """Split ``df`` into a linguistics subset and a humanities-only subset.

    Returns:
        A ``(subset1, subset2)`` tuple. ``subset1`` holds rows with a
        discipline equal to "linguistics" (compared case-insensitively).
        ``subset2`` holds rows whose discipline list is non-empty and lies
        entirely inside ``hum_set``.
    """
    disciplines = normalize_disciplines(df["discipline_names"])

    def has_linguistics(names):
        # Exact label match after lower-casing, not a substring search.
        return "linguistics" in (str(name).lower() for name in names)

    def only_humanities(names):
        # At least one discipline, and none outside the humanities set.
        return len(names) > 0 and set(names).issubset(hum_set)

    subset1 = df[disciplines.apply(has_linguistics)]
    subset2 = df[disciplines.apply(only_humanities)]
    return subset1, subset2

def rel_freq_by_year(df, substring, text_col="title", year_col="year"):
    r"""Return, per year, the share of rows whose text matches ``substring``.

    Args:
        df: DataFrame holding ``text_col`` and ``year_col``.
        substring: Pattern searched case-insensitively as a regular
            expression (pass e.g. ``r"\bAI\b"`` for whole-word matches).
        text_col: Column searched for the pattern.
        year_col: Column used for grouping.

    Returns:
        Series indexed by year (ascending) with matches / total rows.
    """
    # na=False makes non-string cells count as "no match" instead of leaving
    # NaN in the mask, in line with the save_* helpers.
    mask = df[text_col].fillna("").str.contains(
        substring, case=False, regex=True, na=False
    )
    stats = (
        df.assign(match=mask)
          .groupby(year_col, observed=True)
          .agg(total=("match", "size"), matches=("match", "sum"))
          .sort_index()
    )
    stats["rel_freq"] = stats["matches"] / stats["total"]
    return stats["rel_freq"]

In [None]:
def plot_substring_by_subsets(subset1, subset2, substring, text_col="title", year_col="year"):
    """Plot the yearly relative frequency of ``substring`` for two subsets.

    Both curves come from ``rel_freq_by_year``. The figure is written to
    ``pics/disciplines_compared/query_<substring>_all_ling_vs_hum_only.png``
    and then shown.

    Args:
        subset1: Frame plotted as "Subset One: has linguistics".
        subset2: Frame plotted as "Subset Two: only humanities set".
        substring: Pattern forwarded to ``rel_freq_by_year``; it is also
            embedded verbatim in the output file name.
        text_col: Column searched for the pattern.
        year_col: Column used for grouping.
    """
    import os  # local import so this cell does not depend on the import cells

    freq1 = rel_freq_by_year(subset1, substring, text_col=text_col, year_col=year_col)
    freq2 = rel_freq_by_year(subset2, substring, text_col=text_col, year_col=year_col)

    plt.figure(figsize=(8, 4))
    plt.plot(freq1.index, freq1.values, label="Subset One: has linguistics")
    plt.plot(freq2.index, freq2.values, label="Subset Two: only humanities set")
    plt.xlabel("Year")
    plt.ylabel(f'Relative frequency of "{substring}" in {text_col}')
    plt.legend()
    plt.tight_layout()

    out_path = f'pics/disciplines_compared/query_{substring}_all_ling_vs_hum_only.png'
    # savefig does not create missing folders; without this a fresh checkout
    # fails with FileNotFoundError.
    os.makedirs(os.path.dirname(out_path), exist_ok=True)
    plt.savefig(out_path, dpi=300)
    plt.show()

In [None]:
subset1, subset2 = make_subsets(df_eng_only)

In [None]:
subset1.shape, subset2.shape

In [None]:
subset_ling = subset1
subset_hum_only = subset2

In [None]:
subset2.sample(100)

In [None]:
plot_substring_by_subsets(subset1=subset1, subset2=subset2, substring="programming")

In [None]:
list_of_substrings = [
    'programming',
    '\\bAI\\b',
    'Artificial intelligence',
    'Character recognition',
    'ChatGPT',
    'Clustering analysis',
    'Comput.+?\\b',
    'Computational',
    'Digital',
    'Distant reading',
    'Entity Recognition',
    'GenAI',
    'Generative AI',
    'Generative Artificial intelligence',
    'Humanities Computing',
    'Large language model',
    '\\bLLM\\b',
    'Literary Computing',
    'Machine learning',
    'Natural language processing',
    '\\bNLP\\b',
    'Named Entity Recognition',
    'Network analysis',
    '\\bOCR\\b',
    'Pattern recognition']

    #     '\\bNER\\b',

In [None]:
for substring in tqdm(list_of_substrings[3:]):
    plot_substring_by_subsets(subset1=subset_ling, 
                              subset2=subset_hum_only, substring=substring)

In [None]:
plot_substring_by_subsets(subset1=subset1, subset2=subset2, substring="\\bGenAI\\b")

In [None]:
plot_substring_by_subsets(subset1=subset1, subset2=subset2, substring="Digital Humanities")

### Some results verification

In [None]:
def save_titles(df, pattern, outfile, text_col="title", case=False, regex=True):
    r"""Write every title matching ``pattern`` to ``outfile``.

    Args:
        df: DataFrame holding ``text_col``.
        pattern: Substring or regex (use a raw string for ``\b`` etc.).
        outfile: Destination path; one CSV record per matching title, with no
            header and no index.
        text_col: Column searched and exported.
        case: Whether matching is case-sensitive.
        regex: Whether ``pattern`` is treated as a regular expression.
    """
    titles = df[text_col]
    # Missing titles are treated as empty text, so they never match.
    is_hit = titles.fillna("").str.contains(pattern, case=case, regex=regex, na=False)
    titles.loc[is_hit].to_csv(outfile, index=False, header=False)

In [None]:
def save_titles_with_year(
    df: pd.DataFrame,
    pattern,
    outfile,
    text_col: str = "title",
    year_col: str = "year",
    case: bool = False,
    regex: bool = True,
    sep: str = "\t",
    include_header: bool = False,
    missing_year_label: str = "Unknown"
):
    r"""Export title and year for every row whose title matches ``pattern``.

    Args:
        df: Source frame; must contain ``text_col`` and ``year_col``.
        pattern: Substring or regex (use a raw string for ``\b`` etc.).
        outfile: Destination path; one delimited record per matching row.
        text_col: Column searched and exported.
        year_col: Column exported next to the title.
        case: Whether matching is case-sensitive.
        regex: Whether ``pattern`` is treated as a regular expression.
        sep: Field delimiter.
        include_header: Whether to write the column names as a first row.
        missing_year_label: Value written where the year is missing.

    Raises:
        ValueError: If ``text_col`` or ``year_col`` is not a column of ``df``.
    """
    exported = [text_col, year_col]
    for name in exported:
        if name not in df.columns:
            raise ValueError(f"Column '{name}' not found in dataframe.")

    # Missing titles are treated as empty text, so they never match.
    hit_mask = df[text_col].fillna("").str.contains(
        pattern, case=case, regex=regex, na=False
    )

    # Copy so the fills below never write back into the caller's frame.
    rows = df.loc[hit_mask, exported].copy()
    rows[text_col] = rows[text_col].fillna("")
    rows[year_col] = rows[year_col].fillna(missing_year_label)

    rows.to_csv(outfile, sep=sep, index=False, header=include_header)

In [None]:
def save_titles_years_disciplines(
    df: pd.DataFrame,
    pattern,
    outfile,
    text_col: str = "title",
    year_col: str = "year",
    discipline_col: str = "discipline_names",
    url_col: str = "url",
    case: bool = False,
    regex: bool = True,
    sep: str = "\t",
    include_header: bool = False,
    missing_year_label: str = "Unknown",
    missing_discipline_label: str = "Unknown",
    newline_replacement: str = " "
):
    r"""Export title, year, disciplines and URL for rows whose title matches.

    Args:
        df: Source frame; must contain all four named columns.
        pattern: Substring or regex (use a raw string for ``\b`` etc.).
        outfile: Destination path; one delimited record per matching row.
        text_col: Column searched and exported.
        year_col: Year column.
        discipline_col: Discipline column.
        url_col: URL column.
        case: Whether matching is case-sensitive.
        regex: Whether ``pattern`` is treated as a regular expression.
        sep: Field delimiter.
        include_header: Whether to write the column names as a first row.
        missing_year_label: Value written where the year is missing.
        missing_discipline_label: Value written where the discipline is missing.
        newline_replacement: Text substituted for each run of CR/LF characters
            inside a title.

    Raises:
        ValueError: If any of the four columns is not present in ``df``.
    """
    exported = [text_col, year_col, discipline_col, url_col]
    for name in exported:
        if name not in df.columns:
            raise ValueError(f"Column '{name}' not found in dataframe.")

    # Matching runs on the raw title, before line breaks are collapsed.
    hit_mask = df[text_col].fillna("").str.contains(
        pattern, case=case, regex=regex, na=False
    )

    # Copy so the clean-up below never writes back into the caller's frame.
    rows = df.loc[hit_mask, exported].copy()

    titles = rows[text_col].fillna("")
    titles = titles.str.replace(r"[\r\n]+", newline_replacement, regex=True)
    rows[text_col] = titles.str.strip()
    rows[year_col] = rows[year_col].fillna(missing_year_label)
    rows[discipline_col] = rows[discipline_col].fillna(missing_discipline_label)

    rows.to_csv(outfile, sep=sep, index=False, header=include_header)

In [None]:
# Examples:
# substring search (case-insensitive)
save_titles(subset_hum_only, "Artificial intelligence", "titles_Artificial_intelligence_humanities.txt", case=False, regex=True)

In [None]:
for substring in tqdm(list_of_substrings):
    save_titles(subset_hum_only, substring, 
                f'data_output/disciplines_compared/titles_humanities_{substring.replace(" ","_")}.txt', 
                case=False, 
                regex=True)

In [None]:
for substring in tqdm(list_of_substrings):
    save_titles_with_year(subset_ling, substring, 
                f'data_output/disciplines_compared/titles_linguistics_{substring.replace(" ","_")}.txt', 
                case=False, 
                regex=True)

In [None]:
subset_hum_only.head()

## 2025-12-23 Save titles updated 

In [None]:
subset_hum_only_1990_2024.shape

In [None]:
for substring in tqdm(list_of_substrings):
    save_titles_years_disciplines(subset_hum_only_1990_2024, substring, 
                f'data_output/disciplines_compared_eng_only_1990_2024/titles_humanities_{substring.replace(" ","_")}.tsv', 
                case=False,
                include_header=True, 
                regex=True)

In [None]:
for substring in tqdm(list_of_substrings):
    save_titles_years_disciplines(subset_ling_1990_2024, substring, 
                f'data_output/disciplines_compared_eng_only_1990_2024/titles_linguistics_{substring.replace(" ","_")}.tsv', 
                case=False, 
                include_header=True, 
                regex=True)

In [None]:
substring = 'Pattern recognition'
save_titles_years_disciplines(subset_hum_only_1990_2024, substring, 
                f'data_output/disciplines_compared_eng_only_1990_2024/titles_humanities_{substring.replace(" ","_")}.csv', 
                case=False, 
                regex=True)

## For DH 2026 

In [None]:
subset_hum_only_1990 = subset_hum_only[subset_hum_only['year'] >= 1990]

In [None]:
subset_hum_only_1990.shape

In [None]:
plot_title_keyword_frequencies(subset_hum_only_1990, substrings=['Digital Humanities', 'Cultural Heritage', 'Humanities Computing', 'Artificial Intelligence', 'Generative AI'])

2025-12-10 UPD: without 2025

In [None]:
subset_hum_only_1990_2024 = subset_hum_only[subset_hum_only["year"].between(1990, 2024)]

In [None]:
subset_ling_1990_2024 = subset_ling[subset_ling["year"].between(1990, 2024)]

In [None]:
plot_title_keyword_frequencies(subset_hum_only_1990_2024, substrings=['Digital Humanities', 
                                                                 'Cultural Heritage', 
                                                                 'Humanities Computing',
                                                                'Artificial Intelligence', 
                                                                'Generative AI'])

In [None]:
def return_keyword_frequencies(df, substrings):
    """Return yearly relative frequencies of each substring in titles.

    For every year, the number of case-insensitive, non-overlapping
    occurrences of a substring in ``title`` is divided by the total number of
    whitespace-separated tokens in that year's titles.

    Args:
        df: DataFrame with ``year`` and ``title`` columns. Rows missing either
            value are ignored. The frame itself is not modified.
        substrings: Literal strings to count (not regular expressions).

    Returns:
        DataFrame indexed by year with one column per substring.
    """
    # Work on derived Series instead of adding helper columns: no writes to a
    # filtered frame and no extra column per keyword on a very large frame.
    df = df.dropna(subset=['year', 'title'])
    years = df['year']
    titles = df['title'].astype(str)
    lowered = titles.str.lower()

    # Denominator: total title tokens per year, computed once for all keywords.
    tokens_per_year = titles.str.split().str.len().groupby(years).sum()

    freq_data = {}
    for substring in substrings:
        needle = substring.lower()
        # Plain str.count keeps characters such as "." or "+" literal, whereas
        # Series.str.count would interpret the keyword as a regex.
        hits = lowered.map(lambda title, needle=needle: title.count(needle))
        freq_data[substring] = hits.groupby(years).sum() / tokens_per_year

    return pd.DataFrame(freq_data)

In [None]:
def return_raw_keyword_frequencies(df, substrings):
    """Return yearly raw occurrence counts of each substring in titles.

    Args:
        df: DataFrame with ``year`` and ``title`` columns. Rows missing either
            value are ignored. The frame itself is not modified.
        substrings: Literal strings to count case-insensitively (not regular
            expressions).

    Returns:
        DataFrame indexed by year with a ``total token_count`` column (sum of
        whitespace-separated title tokens) followed by one
        ``<substring>_count`` column per substring.
    """
    df = df.dropna(subset=['year', 'title'])
    years = df['year']
    titles = df['title'].astype(str)
    lowered = titles.str.lower()

    # Start with yearly token totals so they're available in the final DataFrame.
    yearly = titles.str.split().str.len().groupby(years).sum().to_frame('total token_count')

    for substring in substrings:
        needle = substring.lower()
        # Plain str.count keeps regex metacharacters literal; as a regex,
        # a keyword like "c++" would raise re.error.
        hits = lowered.map(lambda title, needle=needle: title.count(needle))
        yearly[f'{substring}_count'] = hits.groupby(years).sum()

    return yearly


In [None]:
freqs_hum = return_keyword_frequencies(subset_hum_only_1990_2024, substrings=['Digital Humanities', 
                                                                 'Cultural Heritage', 
                                                                 'Humanities Computing',
                                                                'Artificial Intelligence', 
                                                                'Generative AI'])

In [None]:
freqs_hum.T.head()

In [None]:
percentages_yearly_hum = freqs_hum.T

In [None]:
percentages_yearly_hum * 100

In [None]:
percentages_yearly_hum.to_csv('data_output/keyword_percentages_pure_humanities_1990_2024.csv')

In [None]:
raw_counts_hum = return_raw_keyword_frequencies((subset_hum_only_1990_2024), substrings=['Digital Humanities', 
                                                                 'Cultural Heritage', 
                                                                 'Humanities Computing',
                                                                'Artificial Intelligence', 
                                                                'Generative AI'])

In [None]:
raw_counts_hum.head()

In [None]:
raw_counts_hum.T.head()

In [None]:
raw_counts_hum.T.to_csv('data_output/keyword_raw_counts_pure_humanities_1990_2024.csv')

/ end of upd

In [None]:
for substring in tqdm(['Digital Humanities', 'Cultural Heritage', 'Humanities Computing', 'Artificial Intelligence', 'Generative AI']):
    plot_title_keyword_frequencies(subset_hum_only_1990, substrings=[substring])

## 2025-11-22 add words lookup

In [None]:
for substring in tqdm(['Digital Humanities', 'Cultural Heritage', 'Humanities Computing', 'Artificial Intelligence', 'Generative AI']):
    save_titles_years_disciplines(subset_hum_only_1990, substring, 
                f'output_for_dh_2026/Pure_Humanities_1990_2024/titles_humanities_{substring.replace(" ","_")}.tsv', 
                case=False, 
                regex=True)

In [None]:
df.shape

In [None]:
for substring in tqdm(['Humanities Computing']): #['Digital Humanities', 'Cultural Heritage', 'Humanities Computing', 'Artificial Intelligence', 'Generative AI']):
    save_titles_years_disciplines(df, substring, 
                f'output_for_dh_2026/All_10_mln/titles_all_{substring.replace(" ","_")}.tsv', 
                case=False, 
                regex=True)

### 2025-12-06 try extended humanities

In [None]:
## UPDATED make_subsets to take any item with at least one humanities discipline,
#  even if it also lists other disciplines.

def make_subset(df):
    """Return the rows of ``df`` that list at least one discipline in ``hum_set``.

    Rows may list other, non-humanities disciplines as well.
    """
    disciplines = normalize_disciplines(df["discipline_names"])
    # "Not disjoint" means at least one label is shared with the humanities set.
    touches_humanities = disciplines.apply(lambda names: not hum_set.isdisjoint(names))
    return df[touches_humanities]

In [None]:
subset_hum_ext = make_subset(df_eng_only)

In [None]:
subset_hum_ext.shape

In [None]:
subset_hum_ext_1990_2024 = subset_hum_ext[subset_hum_ext["year"].between(1990, 2024)]

In [None]:
subset_hum_ext_1990_2024.shape

In [None]:
plot_title_keyword_frequencies(subset_hum_ext_1990_2024, substrings=['Digital Humanities', 
                                                                 'Cultural Heritage', 
                                                                 'Humanities Computing', 
                                                                 'Artificial Intelligence', 
                                                                 'Generative AI'])

In [None]:
freqs_hum_ext_90_24 = return_keyword_frequencies((subset_hum_ext_1990_2024), substrings=['Digital Humanities', 
                                                                 'Cultural Heritage', 
                                                                 'Humanities Computing', 
                                                                 'Artificial Intelligence', 
                                                                 'Generative AI'])

In [None]:
freqs_hum_ext_90_24.T.head()

In [None]:
percentages_hum_ext = freqs_hum_ext_90_24.T * 100

In [None]:
percentages_hum_ext.head()

In [None]:
percentages_hum_ext.to_csv('data_output/keyword_percentages_extended_humanities_1990_2024.csv')

In [None]:
raw_counts_extended = return_raw_keyword_frequencies((subset_hum_ext_1990_2024), substrings=['Digital Humanities', 
                                                                 'Cultural Heritage', 
                                                                 'Humanities Computing',
                                                                'Artificial Intelligence', 
                                                                'Generative AI'])

In [None]:
raw_counts_extended.T.to_csv('data_output/keyword_raw_counts_extended_humanities_1990_2024.csv')

Widest: take all English-language items in all disciplines between 1990 and 2024

In [None]:
df_eng_only_1990_2024 = df_eng_only[df_eng_only["year"].between(1990, 2024)]

In [None]:
df_eng_only_1990_2024.shape

In [None]:
plot_title_keyword_frequencies(df_eng_only_1990_2024, substrings=['Digital Humanities', 
                                                                 'Cultural Heritage', 
                                                                 'Humanities Computing', 
                                                                 'Artificial Intelligence', 
                                                                 'Generative AI'])

### Try all

## Languages

In [None]:
df["languages"][:5]

In [None]:
# Tally how often each language code occurs in df["languages"].
# A cell may hold a list-like of codes, a single scalar code, or a missing value.

lang_counts = Counter()

cells = tqdm(df["languages"].values,
             total=len(df))
for cell in cells:
    # Strings are scalars here, never collections of characters.
    is_collection = is_list_like(cell) and not isinstance(cell, (str, bytes))
    # Every occurrence counts; wrap the cell in set(...) for once-per-row counting.
    codes = cell if is_collection else (cell,)
    lang_counts.update(code for code in codes if not pd.isna(code))

# Rebind the name from the Counter to a Series, most frequent language first.
lang_counts = pd.Series(lang_counts, dtype="int64").sort_values(ascending=False)


In [None]:
lang_counts

In [None]:

# (Optional) plot top-N languages
import matplotlib.pyplot as plt
topN = 50
plt.figure(figsize=(12,6))
lang_counts.head(topN).plot(kind="bar", edgecolor="black")
# The bars are language codes, so the labels say "Language" rather than the
# "Discipline" wording carried over from the discipline chart.
plt.xlabel("Language")
plt.ylabel("Frequency")
plt.title(f"Language frequencies (top {topN})")
plt.xticks(rotation=60, ha="right")
plt.tight_layout()
plt.savefig(f'top_{topN}_langs_post1949.png', dpi=300)
plt.show()

In [None]:
import matplotlib.pyplot as plt
topN = 30
plt.figure(figsize=(12,6))
lang_counts.head(topN).plot(kind="bar", edgecolor="black")
plt.xlabel("Language")
plt.ylabel("Frequency")
plt.title(f"Language frequencies (top {topN})")
plt.xticks(rotation=60, ha="right")
plt.tight_layout()
plt.savefig(f'top_{topN}_langs_post1949.png', dpi=300)
plt.show()

In [None]:
lang_counts[:10]

In [None]:
lang_counts[:10] / lang_counts.sum()

In [None]:
6.226694 * (10**-2)

In [None]:
lang_counts.to_excel('languages_frequencies_post1949.xlsx', header=False)

## Content types

In [None]:
df_after_1949.columns

In [None]:
sample_df.info()

In [None]:
sample_df['content_type'].value_counts()

In [None]:
# Count how many records fall into each content type.
# `df` is already limited to years after 1949 by the chunked load, and
# `df_after_1949` is not defined anywhere in this notebook.
content_type_counts = df["content_type"].value_counts()

# Plot
plt.figure(figsize=(8,5))
content_type_counts.plot(kind="bar", edgecolor="black")
# Labels describe content types, not disciplines.
plt.xlabel("Content type")
plt.ylabel("Frequency")
plt.title("Frequency of Content Types in DataFrame")
plt.xticks(rotation=45, ha="right")
plt.tight_layout()
plt.savefig('content_types.png')
plt.show()

In [None]:
content_type_counts 

In [None]:
content_type_counts / content_type_counts.sum()

# October 2025 work package

* Discipline category: calculate trends; co-occurrences.
* Year category: our study will focus on the years 1949-2024
* Frequencies of disciplines per year and per language. Do disciplines also appear in languages other than English?
* Nitza and Vered will create a list of search queries for the titles. Daniil will then review it. For this task, it would be better if we have txt/csv files with list of titles per discipline with indication of year. If possible, also csv per language. Is this possible? 
* DOI: Daniil will remove duplicates and re do the frequency graphs.
* Daniil, Nitza and I looked at our data from last year, when we started this research. Our manual advanced queries in JSTOR showed exactly the same steep decline in publications from 2018 (the peak year) onward.

Thank you both
Vered

### DOI deduplication

In [None]:
#for col in df.columns:
#    df[col] = None
#del df
#gc.collect()

In [None]:
df_after_1949.shape[0]

In [None]:
df_after_1949['ithaka_doi'].count()

In [None]:
df_after_1949['ithaka_doi'].nunique()

In [None]:
def _parse_list(x):
    """Return a Python list from a cell that may be a list or a stringified list."""
    if isinstance(x, list):
        items = x
    elif pd.isna(x):
        return []
    else:
        # Safely parse strings like "['A', 'B']" or '["A","B"]'
        items = ast.literal_eval(str(x))
    # Normalize: strip whitespace, drop empties, dedupe within a row
    cleaned = {str(i).strip() for i in items if str(i).strip()}
    return list(cleaned)

def cooccurrence_from_df(df, col="discipline_names"):
    """Build an undirected co-occurrence edge list from a column of label lists.

    Every cell of ``df[col]`` is parsed with ``_parse_list``; each unordered
    pair of labels found in the same row adds one to that pair's weight.

    Returns:
        DataFrame with columns ``Source``, ``Target`` and ``Weight``, sorted
        by ``Source`` then ``Target``, where ``Source`` <= ``Target``.
    """
    # Sorting the labels first gives every pair one canonical orientation;
    # rows with fewer than two labels yield no combinations at all.
    pair_counts = Counter(
        pair
        for labels in df[col].map(_parse_list)
        for pair in combinations(sorted(labels), 2)
    )

    records = [(source, target, weight) for (source, target), weight in pair_counts.items()]
    edge_frame = pd.DataFrame(records, columns=["Source", "Target", "Weight"])
    return edge_frame.sort_values(["Source", "Target"], ignore_index=True)


In [None]:
edges = cooccurrence_from_df(sample_df)

In [None]:
edges

In [None]:
edges.to_csv('edges_sample.csv')

In [None]:
# `df` is already limited to years after 1949 by the chunked load, and
# `df_after_1949` is not defined anywhere in this notebook.
edges_huge = cooccurrence_from_df(df)

In [None]:
edges_huge.to_csv('discipline_cooc.csv', index=False)

In [None]:
edges_df_filtered = edges_huge[edges_huge["Weight"] >= 1000].copy()
edges_df_filtered.to_csv("discipline_cooccurrence_ge1000.csv", index=False)

## Non-chunked loading crashes due to memory shortage

In [None]:
#df = pd.read_json("jstor_metadata_2025-07-30.jsonl", lines=True)

In [None]:
#print(df.info())
#print(df.head())