# JSTOR Data Processing and Cleaning

This notebook processes and cleans the full JSTOR title list. It focuses on processing dates, establishing publication lineage, binning data, adding wiki data, and creating a final dataframe for analysis.

### Load Libraries

In [1]:
import pandas as pd
import numpy as np
import altair as alt
import os
from datetime import datetime
import glob
alt.renderers.enable("mimetype")
alt.data_transformers.enable('default', max_rows=None)
from tqdm.notebook import tqdm
import warnings
warnings.filterwarnings('ignore')
import sys
sys.path.append("../")
from scripts.utils import *

#### Set Data Directory

In [2]:
set_data_directory_path("../data/public_data/")
data_directory_path = get_data_directory_path()

## JSTOR All Journals

### Load and Clean Data

In [3]:
def get_most_recent_file_by_date(dir_path, file_pattern):
    """Return the path of the matching file whose name ends in the latest date.

    Files are expected to be named ``<anything>_<YYYY-MM-DD>.<ext>``; the date
    is read from the last underscore-separated part of the file's base name.

    Args:
        dir_path: Directory to search.
        file_pattern: Glob pattern (relative to ``dir_path``) selecting candidates.

    Returns:
        The path (as returned by ``glob.glob``) of the most recently dated file.

    Raises:
        ValueError: If no file matches the pattern, or a matching file name does
            not end in a ``YYYY-MM-DD`` date.
    """
    candidates = glob.glob(os.path.join(dir_path, file_pattern))
    if not candidates:
        # max() on an empty list also raises ValueError, but with a message that
        # says nothing about which directory or pattern came up empty.
        raise ValueError(f"No files matching {file_pattern!r} found in {dir_path!r}")

    def extract_date(filename):
        # Work on the base name only so underscores in directory names cannot
        # leak into the date string.
        stem = os.path.splitext(os.path.basename(filename))[0]
        return datetime.strptime(stem.split('_')[-1], '%Y-%m-%d')

    return max(candidates, key=extract_date)

# Locate the newest title-list export (by the date embedded in its filename) and load it.
# os.path.join keeps the directory valid whether or not data_directory_path ends with a
# path separator; plain string concatenation silently produced a wrong path without one.
most_recent_file = get_most_recent_file_by_date(
    os.path.join(data_directory_path, 'jstor_titles'),
    'JSTOR_Global_AllArchiveTitles_*.txt',
)
global_txt_file = pd.read_csv(most_recent_file, sep="\t")

In [4]:
console.print(f"Currently, there are {len(global_txt_file)} titles in the JSTOR Global All Archive Titles list, created on {most_recent_file.split('_')[-1].replace('.txt', '')}.")

#### Trace Lineage and Parent Publications

Besides individual publication titles, there are also parent publications that contain multiple titles. We need to trace the lineage of these publications so that we do not double count them, and so that lineage can be used to select the titles with the most coverage.

In [5]:
# Provisional grouping key: the immediate predecessor's id when one is recorded,
# otherwise the title's own id.
global_txt_file['grouped_id'] = global_txt_file.preceding_publication_title_id.fillna(global_txt_file.title_id)
# Function to trace the lineage of a publication
def trace_lineage(title_id, df):
    """Follow ``preceding_publication_title_id`` links back to the root title.

    Args:
        title_id: Id of the title to start from; must be present in ``df['title_id']``.
        df: Frame with ``title_id`` and ``preceding_publication_title_id`` columns.

    Returns:
        The id of the earliest ancestor, i.e. the first title in the chain whose
        ``preceding_publication_title_id`` is null. A title with no predecessor
        is its own root.

    Raises:
        IndexError: If an id in the chain has no row in ``df``.
        ValueError: If the predecessor links form a cycle.
    """
    seen = set()
    current_id = title_id
    # Iterate instead of recursing so long chains cannot hit the recursion limit.
    while True:
        if current_id in seen:
            raise ValueError(
                f"Cycle detected in preceding_publication_title_id chain starting at {title_id!r}"
            )
        seen.add(current_id)
        # When a title_id appears on several rows, the first row decides.
        preceding_id = df.loc[df['title_id'] == current_id, 'preceding_publication_title_id'].iloc[0]
        if pd.isna(preceding_id):
            return current_id
        current_id = preceding_id

tqdm.pandas(desc="Tracing Lineage")
# Resolve every title to the root of its preceding-publication chain and store it in a
# new 'lineage' column. Each call filters the whole frame by title_id, so the cost grows
# with the square of the number of titles.
global_txt_file['lineage'] = global_txt_file['title_id'].progress_apply(lambda x: trace_lineage(x, global_txt_file))

# Function to map each publication to its parent
def map_parent(title_id, df):
    """Return the parent publication id recorded for ``title_id``.

    Looks at the first row of ``df`` whose ``title_id`` matches and returns its
    ``parent_publication_title_id``; when that value is null the title is
    treated as its own parent and ``title_id`` is returned.
    """
    matching_parents = df.loc[df['title_id'] == title_id, 'parent_publication_title_id']
    recorded_parent = matching_parents.iloc[0]
    if pd.isna(recorded_parent):
        return title_id
    return recorded_parent

tqdm.pandas(desc="Mapping Parent")
# Store each title's parent publication id (or its own id when it has no parent) in a
# new 'parent' column; like the lineage step, every call filters the whole frame.
global_txt_file['parent'] = global_txt_file['title_id'].progress_apply(lambda x: map_parent(x, global_txt_file))

Tracing Lineage:   0%|          | 0/4497 [00:00<?, ?it/s]

Mapping Parent:   0%|          | 0/4497 [00:00<?, ?it/s]

In [6]:
# Number of titles sharing each lineage root. rename_axis + reset_index(name=...) names
# the columns explicitly, so the result is ("lineage", "title_count") on every pandas
# version; renaming the default "index"/"lineage" columns only worked before pandas 2.0,
# where value_counts() started naming its output "count".
lineage_counts = (
    global_txt_file['lineage']
    .value_counts()
    .rename_axis('lineage')
    .reset_index(name='title_count')
)

# Calculate total number of publications
total_publications = lineage_counts['title_count'].sum()
subset_publications = len(lineage_counts)

# Create the base chart: one bar per lineage size, bar length = number of lineages of that size
chart = alt.Chart(lineage_counts).mark_bar().encode(
    y=alt.Y("title_count:O", title=None),
    x=alt.X("count():Q", title="Number of Publications"),
).properties(
    title = f"Subsetting to lineage means that we go from {total_publications} number of unique publication titles to {subset_publications}",
    width=400,
    height=200
)

# Display the chart
chart.configure_axisY(
    titleAngle=0,
    titleY=-10,
    titleX=-10,
)

<VegaLite 5 object>

If you see this message, it means the renderer has not been properly enabled
for the frontend that you are using. For more information, see
https://altair-viz.github.io/user_guide/troubleshooting.html


In [7]:
console.print(f"But only {lineage_counts[lineage_counts['title_count'] > 1].shape[0]} lineages have more than 1 publication, out of {len(lineage_counts)} total lineage publications (so a percentage of {round((lineage_counts[lineage_counts['title_count'] > 1].shape[0] / len(lineage_counts)) * 100, 2)}%). And only {lineage_counts[lineage_counts['title_count'] > 2].shape[0]} lineages have more than 2 publications.")

#### Process Dates 

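Some dates are earlier than 1677, which is outside the range `pd.Timestamp` can represent (it starts at 1677-09-21), so the dates are parsed into daily `pd.Period` objects instead.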

In [8]:
def conv(date):
    """Parse a ``'YYYY-MM-DD'`` string into a daily ``pd.Period``.

    ``pd.Period`` is used instead of ``pd.Timestamp`` because it can represent
    dates outside the nanosecond-timestamp range.

    Args:
        date: Date string in ``YYYY-MM-DD`` form, or a missing value
            (``None`` / ``NaN``).

    Returns:
        A ``pd.Period`` with daily frequency, or ``pd.NaT`` when ``date`` is missing.

    Raises:
        ValueError: If the string does not split into three integer parts.
    """
    # Missing dates arrive as float NaN from read_csv; pass them through as NaT so
    # the downstream null checks can skip them instead of crashing on .split().
    if pd.isna(date):
        return pd.NaT
    year, month, day = map(int, date.split("-"))
    return pd.Period(year=year, month=month, day=day, freq="D")

# Convert the first/last online issue date strings into daily pd.Period values, stored
# in new processed_* columns next to the raw strings.
tqdm.pandas(desc="Processing date_first_issue_online")
global_txt_file["processed_date_first_issue_online"] = global_txt_file.date_first_issue_online.progress_apply(conv)

tqdm.pandas(desc="Processing date_last_issue_online")
global_txt_file["processed_date_last_issue_online"] = global_txt_file.date_last_issue_online.progress_apply(conv)

Processing date_first_issue_online:   0%|          | 0/4497 [00:00<?, ?it/s]

Processing date_last_issue_online:   0%|          | 0/4497 [00:00<?, ?it/s]

In [9]:
# Earliest first-issue date and latest last-issue date across all titles.
earliest_date = global_txt_file.processed_date_first_issue_online.min()
latest_date = global_txt_file.processed_date_last_issue_online.max()
# latest_date comes from the *last* issue column, so the message says so (it used to
# describe it as a first-publication date).
console.print(f"Earliest date of a first publication in this list is  {earliest_date}. And the latest date of a last publication is {latest_date}")

#### Infer length of publications

In [10]:
def infer_lineage_length_publication(rows):
    """Add a ``lineage_active_years`` column holding the group's summed active years.

    For every row with both a first and a last processed issue date, the span
    ``last.year - first.year + 1`` (both end years counted) is added to a
    running total. The total is written to every row of ``rows``, which is
    modified in place and returned.
    """
    span_total = 0
    date_pairs = zip(
        rows['processed_date_first_issue_online'],
        rows['processed_date_last_issue_online'],
    )
    for first_issue, last_issue in date_pairs:
        # Rows missing either date contribute nothing.
        if pd.isnull(first_issue) or pd.isnull(last_issue):
            continue
        span_total += last_issue.year - first_issue.year + 1

    # Broadcast the group total to every row of the group.
    rows['lineage_active_years'] = span_total
    return rows

tqdm.pandas(desc="Infer length of publication by lineage")
# group_keys=False keeps 'lineage' out of the result index. Since pandas 2.0, apply()
# otherwise prepends the group key as an index level even when the function returns the
# group unchanged in shape, and a later groupby('lineage') then fails because the name is
# both an index level and a column.
global_txt_file = global_txt_file.groupby('lineage', group_keys=False).progress_apply(infer_lineage_length_publication)

Infer length of publication by lineage:   0%|          | 0/2851 [00:00<?, ?it/s]

In [11]:
def infer_title_length_publication(rows):
    """Add a ``title_active_years`` column holding the group's summed active years.

    Each row with both processed issue dates present contributes
    ``last.year - first.year + 1`` years (start and end year both counted).
    The sum is assigned to every row of ``rows``; the same frame is returned.
    """
    first_dates = rows['processed_date_first_issue_online']
    last_dates = rows['processed_date_last_issue_online']

    # One span per row that has both dates; rows with a missing date are skipped.
    yearly_spans = [
        last.year - first.year + 1
        for first, last in zip(first_dates, last_dates)
        if pd.notnull(first) and pd.notnull(last)
    ]

    rows['title_active_years'] = sum(yearly_spans)
    return rows

tqdm.pandas(desc="Infer length of publication by title")
# group_keys=False keeps 'title_id' out of the result index; since pandas 2.0 apply()
# would otherwise add it as an index level and make later groupby('title_id') calls
# ambiguous between the index level and the column.
global_txt_file = global_txt_file.groupby('title_id', group_keys=False).progress_apply(infer_title_length_publication)

Infer length of publication by title:   0%|          | 0/4469 [00:00<?, ?it/s]

We can now calculate how long a publication has been in existence. We calculate this in two ways: per lineage (all titles in a lineage combined) and per individual title.

In [12]:
# Get longest and shortest lineage
longest_lineage = global_txt_file[global_txt_file['lineage_active_years'] == global_txt_file['lineage_active_years'].max()]
longest_lineage_number_of_publications = global_txt_file[global_txt_file['lineage'] == longest_lineage['lineage'].iloc[0]]
console.print(f"The longest lineage is {longest_lineage['lineage'].iloc[0]} with {longest_lineage['lineage_active_years'].iloc[0]} active years. This lineage is comprised of {longest_lineage_number_of_publications.shape[0]} publications with the titles {', '.join(longest_lineage_number_of_publications['publication_title'].tolist())}.") 
console.print("The overall statistics for the lineage active years are as follows:")
console.print(global_txt_file[['lineage', 'lineage_active_years']].drop_duplicates().lineage_active_years.describe())

In [13]:
# Get longest and shortest title
longest_title = global_txt_file[global_txt_file['title_active_years'] == global_txt_file['title_active_years'].max()]
console.print(f"The longest title, not using lineage, is {longest_title['publication_title'].iloc[0]} with {longest_title['title_active_years'].iloc[0]} active years.")
console.print("The overall statistics for the title active years are as follows:")
console.print(global_txt_file[['publication_title', 'title_active_years']].drop_duplicates().title_active_years.describe())

In [14]:
# Independent two-sample t-test comparing active years per lineage with active years per title.
# NOTE(review): the lineage sample is de-duplicated per lineage, but the title sample keeps
# one value per row, so titles listed on several rows are counted more than once — confirm
# this is intended.
from scipy import stats
lineage_active_years = global_txt_file[['lineage', 'lineage_active_years']].drop_duplicates()['lineage_active_years'].dropna()
title_active_years = global_txt_file['title_active_years'].dropna()
t_stat, p_value = stats.ttest_ind(lineage_active_years, title_active_years)

console.print(f"T-statistic: {t_stat}")
console.print(f"P-value: {p_value}")

# State the conclusion the computed p-value supports; the message used to claim
# significance no matter what the test returned.
if p_value < 0.05:
    console.print("The p-value is less than 0.05, so we reject the null hypothesis that the means of the two groups are the same. This means that the lineage active years and title active years are significantly different and could have impact on downstream analyses.")
else:
    console.print("The p-value is not less than 0.05, so we fail to reject the null hypothesis that the means of the two groups are the same.")

#### Fill Missing Disciplines

In [15]:
def clean_and_count_disciplines(df):
    """Turn the ``discipline`` strings into lists and count disciplines per title.

    String values are split on ``;``, each label is stripped of surrounding
    whitespace, and empty labels are dropped, so ``"A; B"``, ``"A ; B"`` and
    ``"A;B;"`` all become ``['A', 'B']``. Non-string values (missing values, or
    lists from a previous run) are left untouched, which makes the function
    safe to run twice.

    Args:
        df: Frame with a ``discipline`` column; modified in place.

    Returns:
        The same frame, with ``discipline`` holding lists and a new
        ``discipline_count_per_title`` column (0 where there is no list).
    """
    def split_labels(value):
        if not isinstance(value, str):
            return value
        # Split on the bare separator and strip afterwards so spacing around ';'
        # does not matter; drop empties left by stray or trailing separators.
        labels = (part.strip() for part in value.split(';'))
        return [label for label in labels if label]

    df['discipline'] = df['discipline'].apply(split_labels)
    df['discipline_count_per_title'] = df['discipline'].apply(lambda x: len(x) if isinstance(x, list) else 0)
    return df

# Clean disciplines and count per title
global_txt_file = clean_and_count_disciplines(global_txt_file)

In [16]:
def aggregate_disciplines(group):
    """Attach the group's combined discipline list to every row.

    Collects the distinct labels found in the non-null ``discipline`` lists of
    ``group`` and adds two columns (same value on every row):

    * ``discipline_count_per_lineage`` - number of distinct labels.
    * ``aggregated_disciplines`` - the labels sorted and joined with ``'; '``,
      or ``None`` when the group has no labels.

    ``group`` is modified in place and returned.
    """
    distinct_labels = set()
    for labels in group['discipline'].dropna():
        distinct_labels.update(labels)

    ordered_labels = sorted(distinct_labels)
    group['discipline_count_per_lineage'] = len(ordered_labels)

    if ordered_labels:
        combined = '; '.join(ordered_labels)
    else:
        combined = None
    group['aggregated_disciplines'] = combined

    return group

tqdm.pandas(desc="Aggregate disciplines")
# Apply the function to each lineage group. group_keys=False keeps 'lineage' out of the
# result index; since pandas 2.0 apply() would otherwise add it as an index level and
# later groupby('lineage') calls would be ambiguous.
global_txt_file = global_txt_file.groupby('lineage', group_keys=False).progress_apply(aggregate_disciplines)

Aggregate disciplines:   0%|          | 0/2851 [00:00<?, ?it/s]

In [17]:
# Flag whether the rows of each title / each lineage share exactly one non-null
# aggregated discipline string.
# NOTE(review): aggregated_disciplines is assigned once per lineage group, so every row
# of a lineage holds the same value and unique_lineage_discipline is False only when that
# value is null — confirm whether the per-row 'discipline' column was meant here.
global_txt_file['unique_title_discipline'] = global_txt_file.groupby('title_id')['aggregated_disciplines'].transform(lambda x: x.nunique() == 1)
global_txt_file['unique_lineage_discipline'] = global_txt_file.groupby('lineage')['aggregated_disciplines'].transform(lambda x: x.nunique() == 1)

In [24]:
# Hand-maintained lookup from each discipline label to one of three broad categories:
# 'Social Sciences', 'Humanities' or 'STEM'.
discipline_categories = {
    'Social Sciences': 'Social Sciences',
    'Humanities': 'Humanities',
    'Area Studies': 'Social Sciences',
    'Science & Mathematics': 'STEM',
    'Language & Literature': 'Humanities',
    'History': 'Humanities',
    'Arts': 'Humanities',
    'Business & Economics': 'Social Sciences',
    'Political Science': 'Social Sciences',
    'Business': 'Social Sciences',
    'Biological Sciences': 'STEM',
    'Education': 'Social Sciences',
    'Art & Art History': 'Humanities',
    'Economics': 'Social Sciences',
    'Law': 'Social Sciences',
    'Sociology': 'Social Sciences',
    'Security Studies': 'Social Sciences',
    'Religion': 'Humanities',
    'Medicine & Allied Health': 'STEM',
    'Asian Studies': 'Social Sciences',
    'Sustainability': 'STEM',
    'Archaeology': 'Social Sciences',
    'Philosophy': 'Humanities',
    'Botany & Plant Sciences': 'STEM',
    'International Relations': 'Social Sciences',
    'Anthropology': 'Social Sciences',
    'American Studies': 'Social Sciences',
    'Mathematics': 'STEM',
    'Music': 'Humanities',
    'Ecology & Evolutionary Biology': 'STEM',
    'Peace & Conflict Studies': 'Social Sciences',
    'Classical Studies': 'Humanities',
    'Health Sciences': 'STEM',
    'Public Health': 'Social Sciences',
    'Zoology': 'STEM',
    'Middle East Studies': 'Social Sciences',
    'African Studies': 'Social Sciences',
    'Linguistics': 'Social Sciences',
    'Latin American Studies': 'Social Sciences',
    'Jewish Studies': 'Humanities',
    'General Science': 'STEM',
    'Architecture & Architectural History': 'Humanities',
    'Environmental Science': 'STEM',
    'Irish Studies': 'Humanities',
    'Health Policy': 'Social Sciences',
    'Management & Organizational Behavior': 'Social Sciences',
    'Statistics': 'STEM',
    'Finance': 'Social Sciences',
    'History of Science & Technology': 'STEM',
    'Public Policy & Administration': 'Social Sciences',
    'Geography': 'Social Sciences',
    'Psychology': 'Social Sciences',
    'Population Studies': 'Social Sciences',
    "Feminist & Women's Studies": 'Social Sciences',
    'Development Studies': 'Social Sciences',
    'Military Studies': 'Social Sciences',
    'Performing Arts': 'Humanities',
    'Social Work': 'Social Sciences',
    'Labor & Employment Relations': 'Social Sciences',
    'Bibliography': 'Humanities',
    'Slavic Studies': 'Humanities',
    'Science & Technology Studies': 'STEM',
    'Folklore': 'Humanities',
    'African American Studies': 'Social Sciences',
    'Gender Studies': 'Social Sciences',
    'European Studies': 'Social Sciences',
    'Technology': 'STEM',
    'Film Studies': 'Humanities',
    'Library Science': 'Social Sciences',
    'Agriculture': 'STEM',
    'Aquatic Sciences': 'STEM',
    'Museum Studies': 'Humanities',
    'Urban Studies': 'Social Sciences',
    'Engineering': 'STEM',
    'Environmental Studies': 'STEM',
    'Cultural Studies': 'Social Sciences',
    'Criminology & Criminal Justice': 'Social Sciences',
    'British Studies': 'Humanities',
    'Marketing & Advertising': 'Social Sciences',
    'Paleontology': 'STEM',
    'Communication Studies': 'Social Sciences',
    'Horticulture': 'STEM',
    'Garden & Landscape': 'Humanities',
    'Computer Science': 'STEM',
    'Transportation Studies': 'STEM',
    'American Indian Studies': 'Social Sciences',
    'Developmental & Cell Biology': 'STEM',
    'Geology': 'STEM',
    'Food Studies': 'Social Sciences',
    'Astronomy': 'STEM'
}
# One row per (title row, discipline) pair; the original row index is repeated for
# titles that carry several disciplines.
exploded_global_txt_file = global_txt_file.explode('discipline')
# Labels that are missing from discipline_categories (and null disciplines) map to NaN.
exploded_global_txt_file['discipline_category'] = exploded_global_txt_file['discipline'].map(discipline_categories)

def aggregate_disciplines_categories(group, aggregate_column):
    """Attach the group's combined discipline categories to every row.

    Args:
        group: Rows of one group; must contain a ``discipline_category`` column.
            Modified in place.
        aggregate_column: Suffix used in the names of the two output columns
            (e.g. ``"lineage"`` or ``"title"``).

    Returns:
        ``group`` with two extra columns, constant within the group:
        ``discipline_category_count_per_<aggregate_column>`` (number of distinct
        non-null categories) and
        ``aggregated_discipline_categories_<aggregate_column>`` (the categories
        sorted and joined with ``'; '``, or ``None`` if there are none).
    """
    count_column = f'discipline_category_count_per_{aggregate_column}'
    joined_column = f'aggregated_discipline_categories_{aggregate_column}'

    # Distinct, sorted category names present in this group.
    categories = sorted(set(group['discipline_category'].dropna()))

    group[count_column] = len(categories)
    group[joined_column] = '; '.join(categories) if categories else None
    return group

tqdm.pandas(desc="Aggregate disciplines")
# Aggregate the broad categories once per lineage and once per title. group_keys=False
# keeps the grouping column out of the result index; since pandas 2.0 apply() would
# otherwise add it as an index level, and the second groupby (or any later
# groupby('lineage')) would be ambiguous between the index level and the column.
exploded_global_txt_file = exploded_global_txt_file.groupby('lineage', group_keys=False).progress_apply(aggregate_disciplines_categories, aggregate_column="lineage")
exploded_global_txt_file = exploded_global_txt_file.groupby('title_id', group_keys=False).progress_apply(aggregate_disciplines_categories, aggregate_column="title")

Aggregate disciplines:   0%|          | 0/2851 [00:00<?, ?it/s]

Aggregate disciplines:   0%|          | 0/4469 [00:00<?, ?it/s]

In [32]:
# Repeat the lineage-vs-title t-test separately for each broad discipline category.
# The subsets are de-duplicated so each (lineage, category) and (title, category)
# combination contributes one active-years value.
subset_exploded_lineage = exploded_global_txt_file[['lineage', 'discipline_category', 'lineage_active_years']].drop_duplicates()
subset_exploded_title = exploded_global_txt_file[['title_id', 'discipline_category', 'title_active_years']].drop_duplicates()

unique_discipline_categories = exploded_global_txt_file['discipline_category'].dropna().unique().tolist()

for category in unique_discipline_categories:
    console.print(f"Exploring distribution for category {category}")
    # These names overwrite the overall lineage/title samples of the same name built for the earlier test.
    lineage_active_years = subset_exploded_lineage[subset_exploded_lineage.discipline_category == category][['lineage', 'lineage_active_years']].drop_duplicates()['lineage_active_years'].dropna()
    title_active_years = subset_exploded_title[subset_exploded_title.discipline_category == category]['title_active_years'].dropna()
    t_stat, p_value = stats.ttest_ind(lineage_active_years, title_active_years)

    console.print(f"T-statistic: {t_stat}")
    console.print(f"P-value: {p_value}")

In [34]:
# Same lineage-vs-title t-test, run once per individual discipline label; the results
# are collected as one record per discipline instead of being printed.
subset_exploded_lineage = exploded_global_txt_file[['lineage', 'discipline', 'lineage_active_years']].drop_duplicates()
subset_exploded_title = exploded_global_txt_file[['title_id', 'discipline', 'title_active_years']].drop_duplicates()

unique_disciplines = exploded_global_txt_file['discipline'].dropna().unique().tolist()
# Accumulates one {"discipline", "t_stat", "p_value"} dict per discipline.
dfs = []
for discipline in unique_disciplines:
    # console.print(f"Exploring distribution for discipline {discipline}")
    lineage_active_years = subset_exploded_lineage[subset_exploded_lineage.discipline == discipline][['lineage', 'lineage_active_years']].drop_duplicates()['lineage_active_years'].dropna()
    title_active_years = subset_exploded_title[subset_exploded_title.discipline == discipline]['title_active_years'].dropna()
    t_stat, p_value = stats.ttest_ind(lineage_active_years, title_active_years)

    data_dict = {
        "discipline": discipline,
        "t_stat": t_stat,
        "p_value": p_value
    }
    dfs.append(data_dict)

In [35]:
p_values_df = pd.DataFrame(dfs)

In [37]:
p_values_df.sort_values(by="p_value", ascending=True)

Unnamed: 0,discipline,t_stat,p_value
15,Social Sciences,15.053382,2.944239e-49
7,Science & Mathematics,11.419577,1.458738e-28
0,Humanities,10.269758,4.055736e-24
5,History,9.531080,1.353542e-20
12,Area Studies,9.143820,2.268516e-19
...,...,...,...
68,American Indian Studies,0.723132,4.800372e-01
58,Gender Studies,0.606632,5.467863e-01
87,Transportation Studies,0.476307,6.399199e-01
80,Geology,0.469872,6.527323e-01


#### Infer Publishers

In [None]:
def infer_missing_publisher(rows):
    """Add a ``processed_publisher_name`` column shared by all rows of the group.

    The value is derived from the distinct non-null ``publisher_name`` values
    in ``rows``:

    * none      -> ``NaN``
    * exactly 1 -> that name
    * several   -> the names joined with ``','`` (a notice is printed)

    Args:
        rows: Rows of one group; modified in place.

    Returns:
        ``rows`` with the ``processed_publisher_name`` column set.
    """
    known_names = [name for name in rows['publisher_name'].unique().tolist() if pd.notnull(name)]

    # Mutually exclusive branches: the result is never re-inspected with len(), so a
    # single non-string name or an already-joined string cannot be misread as a list.
    if len(known_names) > 1:
        print(f"Multiple publishers found for this publication {known_names}")
        resolved_name = ','.join(map(str, known_names))
    elif len(known_names) == 1:
        resolved_name = known_names[0]
    else:
        resolved_name = np.nan

    rows['processed_publisher_name'] = resolved_name

    return rows

tqdm.pandas(desc="Infer missing publisher")
# Order rows chronologically so publisher names are collected oldest-first within each lineage.
global_txt_file = global_txt_file.sort_values(by=['processed_date_first_issue_online'])
# group_keys=False keeps 'lineage' out of the result index; since pandas 2.0 apply()
# would otherwise add it as an index level and later groupby('lineage') calls would be
# ambiguous between the index level and the column.
global_txt_file = global_txt_file.groupby('lineage', group_keys=False).progress_apply(infer_missing_publisher)

In [22]:
# aggregate_disciplines_categories stores its result in suffixed columns
# ('aggregated_discipline_categories_lineage' / '..._title'), so the unsuffixed name does
# not exist on a fresh run. Select the lineage-level column and expose it under the
# unsuffixed name used by the rest of this cell.
lineage_global_txt_file = exploded_global_txt_file[['lineage', 'lineage_active_years', 'aggregated_disciplines', 'date_first_issue_online', 'date_last_issue_online', 'unique_lineage_discipline', 'aggregated_discipline_categories_lineage']].rename(columns={'aggregated_discipline_categories_lineage': 'aggregated_discipline_categories'}).drop_duplicates()
# get earliest first-issue date and latest last-issue date for each lineage
# NOTE(review): groupby drops rows with a null key by default, so lineages whose
# aggregated disciplines/categories are null disappear here; pass dropna=False to keep them.
lineage_global_txt_file = lineage_global_txt_file.groupby(['lineage',  'lineage_active_years', 'aggregated_disciplines', 'unique_lineage_discipline', 'aggregated_discipline_categories']).agg({'date_first_issue_online': 'min', 'date_last_issue_online': 'max'}).reset_index()
len(lineage_global_txt_file), len(global_txt_file)

(2846, 4497)

#### Bin Publications Based on Inferred Length

###### By Lineage

In [None]:
global_txt_file.groupby(['lineage', 'lineage_active_years', 'discipline'])

In [None]:
# Split lineage_active_years into (up to) four quantile bins; labels=False yields the
# integer bin number and duplicates='drop' merges bins whose edges coincide.
global_txt_file['lineage_active_years_bin'] = pd.qcut(global_txt_file['lineage_active_years'], q=4, duplicates='drop', labels=False)
# Turn the bin number into a string label such as "binned_0".
global_txt_file['lineage_active_years_bin'] = "binned_" + global_txt_file['lineage_active_years_bin'].astype(str) 
global_txt_file.lineage_active_years_bin.value_counts()

In [None]:
# Timestamp versions of the raw issue dates for plotting. errors='coerce' turns any
# value that cannot be parsed or represented as a Timestamp into NaT instead of raising.
global_txt_file['coerced_date_first_issue_online'] = pd.to_datetime(global_txt_file['date_first_issue_online'], format='%Y-%m-%d', errors='coerce')
global_txt_file['coerced_date_last_issue_online'] = pd.to_datetime(global_txt_file['date_last_issue_online'], format='%Y-%m-%d', errors='coerce')

In [None]:
# Number of titles by first-issue date, coloured by lineage active-years bin.
lineage_numb_pubs_by_first_issue = alt.Chart(global_txt_file[['title_id', 'coerced_date_first_issue_online', 'lineage_active_years_bin', 'lineage_active_years']]).mark_bar().encode(
    x='coerced_date_first_issue_online:T',
    y='count()',
    color='lineage_active_years_bin:N'
).properties(
    width=300,
    height=150
)

# Number of titles by lineage active years, coloured by the same bins.
lineage_years_pubs = alt.Chart(global_txt_file[['title_id', 'coerced_date_first_issue_online', 'lineage_active_years_bin', 'lineage_active_years']]).mark_bar().encode(
    x='lineage_active_years:Q',
    y='count()',
    color='lineage_active_years_bin:N'
).properties(
    width=300,
    height=150
)
# Show both charts side by side.
lineage_years_pubs | lineage_numb_pubs_by_first_issue


###### By Title

In [None]:
# Split title_active_years into (up to) four quantile bins; labels=False yields the
# integer bin number and duplicates='drop' merges bins whose edges coincide.
global_txt_file['title_active_years_bin'] = pd.qcut(global_txt_file['title_active_years'], q=4, duplicates='drop', labels=False)
# Turn the bin number into a string label such as "binned_0".
global_txt_file['title_active_years_bin'] = "binned_" + global_txt_file['title_active_years_bin'].astype(str) 
global_txt_file.title_active_years_bin.value_counts()

In [None]:
# Number of titles by first-issue date, coloured by title active-years bin.
title_numb_pubs_by_first_issue = alt.Chart(global_txt_file[['title_id', 'coerced_date_first_issue_online', 'title_active_years_bin', 'title_active_years']]).mark_bar().encode(
    x='coerced_date_first_issue_online:T',
    y='count()',
    color='title_active_years_bin:N'
).properties(
    width=300,
    height=150
)

# Number of titles by title active years, coloured by the same bins.
title_years_pubs = alt.Chart(global_txt_file[['title_id', 'coerced_date_first_issue_online', 'title_active_years_bin', 'title_active_years']]).mark_bar().encode(
    x='title_active_years:Q',
    y='count()',
    color='title_active_years_bin:N'
).properties(
    width=300,
    height=150
)
# Show both charts side by side.
title_years_pubs | title_numb_pubs_by_first_issue


In [None]:
exploded_global_txt_file.columns

In [None]:
# aggregate_disciplines_categories writes suffixed columns
# ('aggregated_discipline_categories_lineage' / '..._title'), so the unsuffixed name does
# not exist on a fresh run. Select the lineage-level column and rename it to the
# unsuffixed name that the timeline chart built from this frame refers to.
subset_exploded_global_txt_file = (
    exploded_global_txt_file[['lineage', 'discipline_category', 'discipline_category_count_per_lineage', 'aggregated_discipline_categories_lineage', 'aggregated_disciplines', 'lineage_active_years', 'date_first_issue_online', 'date_last_issue_online']]
    .rename(columns={'aggregated_discipline_categories_lineage': 'aggregated_discipline_categories'})
    .drop_duplicates()
)
# global_txt_file = global_txt_file.merge(subset_exploded_global_txt_file, on=['lineage', 'aggregated_disciplines'], how='left')

In [None]:
subset_exploded_global_txt_file

In [None]:
# Create a base chart: one horizontal span per row, from first to last online issue date.
# .isin(lineage_counts[lineage_counts['title_count'] > 3].lineage.unique())
base = alt.Chart(subset_exploded_global_txt_file).encode(
    y=alt.Y('lineage:N', sort='-x', title='Publication Title'),
    x=alt.X('date_first_issue_online:T', title='Start Date'),
    x2='date_last_issue_online:T',
    color='aggregated_discipline_categories:N'
)

# Create the bars representing the duration of each publication.
# The colour encoding set here replaces the category colour declared on the base chart.
bars = base.mark_bar().encode(
    color=alt.Color('lineage_active_years:Q', scale=alt.Scale(scheme='blueorange'), legend=None)
)

# Text to show the duration on the bar (currently disabled)
# text = base.mark_text(
#     align='left',
#     baseline='middle',
#     dx=3  # Nudges text to right so it doesn't appear on top of the bar
# ).encode(
#     text=alt.Text('lineage_active_years:Q', format='.1f')
# )

# Combine the bars and text (only the bars while the text layer is disabled)
chart = (bars 
        #  + text
         ).properties(
    width=800,
    height=alt.Step(20)  # Controls the space between bars
)

chart


In [None]:
# # Define a function to propagate non-null disciplines within each group
# def fill_missing_disciplines(rows):
#     # If there are non-null disciplines within the rows, use the first one
#     if rows['discipline'].notnull().any():
#         # Fill missing values with the first non-null discipline in the rows
#         rows['title_discipline'] = rows['discipline'].fillna(method='bfill').fillna(method='ffill')
#     return rows

# tqdm.pandas(desc="Fill in Disciplines")
# global_txt_file = global_txt_file.sort_values(by="processed_date_first_issue_online")
# global_txt_file = global_txt_file.groupby('title_id').progress_apply(fill_missing_disciplines)

In [None]:
# unique_disicipline_lineages = global_txt_file[global_txt_file['unique_discipline'] == False].lineage.unique().tolist()

# for lineage in unique_disicipline_lineages:
#     print(lineage)
#     print(global_txt_file[global_txt_file['lineage'] == lineage].discipline.unique())
#     print()

In [None]:
# global_txt_file.to_csv(f"{data_directory_path}/processed_jstor_files/cleaned_jstor_titles.csv", index=False)

### Load Wiki Data for Journals

In [None]:
wiki_global_txt_file = pd.read_csv(f"{data_directory_path}/processed_jstor_files/cleaned_jstor_titles_wiki.csv")
len(wiki_global_txt_file), len(global_txt_file)

In [None]:
global_txt_file['updated_discipline_count_per_lineage'] = global_txt_file.discipline_count_per_lineage

In [None]:
def update_wiki_dataframe(wiki_global_txt_file, global_txt_file, output_path=None):
    """Bring columns that exist only in ``global_txt_file`` into the wiki frame.

    If ``global_txt_file`` has columns the wiki frame lacks, they are left-merged
    onto the wiki frame using the identifying columns ``title_id``,
    ``parent_publication_title_id``, ``preceding_publication_title_id``,
    ``date_first_issue_online``, ``lineage`` and ``parent``, and the merged frame
    is written to CSV. Otherwise the wiki frame is returned unchanged and nothing
    is written.

    Args:
        wiki_global_txt_file: Frame holding the wiki-enriched titles.
        global_txt_file: Freshly processed titles frame.
        output_path: Where to write the merged CSV. Defaults to
            ``cleaned_jstor_titles_wiki.csv`` under
            ``{data_directory_path}/processed_jstor_files``.

    Returns:
        The merged frame, or ``wiki_global_txt_file`` itself when there is nothing to add.
    """
    cols = global_txt_file.columns.tolist()
    wiki_cols = wiki_global_txt_file.columns.tolist()

    non_existing_cols = [col for col in cols if col not in wiki_cols]
    if not non_existing_cols:
        return wiki_global_txt_file

    non_existing_wiki_cols = [col for col in wiki_cols if col not in cols]

    print(non_existing_cols)
    print(non_existing_wiki_cols)

    subset_cols = ['title_id', 'parent_publication_title_id', 'preceding_publication_title_id', 'date_first_issue_online', 'lineage', 'parent']
    # Keep one right-hand row per key: duplicate keys on both sides would otherwise
    # multiply rows and make the merged frame longer than the wiki frame.
    # NOTE(review): when duplicated keys carry different values in the new columns,
    # the first occurrence wins — confirm that is acceptable.
    new_columns = global_txt_file[subset_cols + non_existing_cols].drop_duplicates(subset=subset_cols)
    merged_df = wiki_global_txt_file.merge(new_columns, on=subset_cols, how='left')

    if output_path is None:
        output_path = f"{data_directory_path}/processed_jstor_files/cleaned_jstor_titles_wiki.csv"
    merged_df.to_csv(output_path, index=False)
    return merged_df

In [None]:
wiki_global_txt_file = update_wiki_dataframe(wiki_global_txt_file, global_txt_file)

In [None]:
len(wiki_global_txt_file), len(global_txt_file)

In [None]:
wiki_global_txt_file['qid'] = wiki_global_txt_file.wikidata_url.str.split('/').str[-1]

In [None]:
wiki_global_txt_file.qid.nunique()

#### Infer Wiki Data Based on Lineage

In [None]:
def propagate_wiki_data_within_lineage(group):
    """Carry wiki fields forward in time within one lineage group.

    The rows are sorted by ``processed_date_first_issue_online``; then, for each
    of ``wikipedia_url``, ``wikidata_url`` and ``wikidata_title``, a row whose
    value is null receives the most recent non-null value seen on an earlier
    row. Rows before the first known value stay null.

    Returns:
        The sorted copy of ``group`` with the gaps filled; the input frame is
        not modified.
    """
    ordered = group.sort_values(by='processed_date_first_issue_online')

    wiki_fields = ('wikipedia_url', 'wikidata_url', 'wikidata_title')
    # Last non-null value seen so far for each field (None until one appears).
    carried = dict.fromkeys(wiki_fields)

    for row_label, record in ordered.iterrows():
        for field in wiki_fields:
            current = record[field]
            if pd.notnull(current):
                carried[field] = current
            elif carried[field] is not None:
                ordered.at[row_label, field] = carried[field]

    return ordered

# Work on a copy so the loaded wiki frame stays available unchanged.
inferred_wiki_global_txt_file = wiki_global_txt_file.copy()
# Apply the function to each lineage group. group_keys=False keeps 'lineage' out of the
# result index, so the later groupby('lineage') is not ambiguous between an index level
# and the column.
tqdm.pandas(desc="Propagate Wiki Data")
inferred_wiki_global_txt_file = inferred_wiki_global_txt_file.groupby('lineage', group_keys=False).progress_apply(propagate_wiki_data_within_lineage)



In [None]:
print(f"So we have {len(inferred_wiki_global_txt_file[(inferred_wiki_global_txt_file.wikidata_title.notna()) | (inferred_wiki_global_txt_file.wikidata_url.notna()) | (inferred_wiki_global_txt_file.wikipedia_url.notna())])} titles with some form of Wiki data and {len(inferred_wiki_global_txt_file[(inferred_wiki_global_txt_file.wikidata_title.isna()) & (inferred_wiki_global_txt_file.wikidata_url.isna()) & (inferred_wiki_global_txt_file.wikipedia_url.isna())])} titles with no Wiki data.")

In [None]:
def has_wiki(group, column_type):
    """Flag whether any row of the group carries any wiki information.

    Adds a boolean column named ``<column_type>_has_wiki`` that is ``True`` on
    every row when at least one of ``wikidata_title``, ``wikidata_url`` or
    ``wikipedia_url`` is non-null somewhere in ``group``, else ``False``.
    ``group`` is modified in place and returned.
    """
    wiki_fields = ('wikidata_title', 'wikidata_url', 'wikipedia_url')
    found_any = any(group[field].notnull().any() for field in wiki_fields)
    group[f'{column_type}_has_wiki'] = found_any
    return group

tqdm.pandas(desc="Has Wiki")
# Flag wiki coverage once per lineage and once per title. group_keys=False keeps the
# grouping column out of the result index; since pandas 2.0 apply() would otherwise add
# it as an index level alongside the column of the same name.
inferred_wiki_global_txt_file = inferred_wiki_global_txt_file.groupby('lineage', group_keys=False).progress_apply(has_wiki, column_type='lineage')
inferred_wiki_global_txt_file = inferred_wiki_global_txt_file.groupby('title_id', group_keys=False).progress_apply(has_wiki, column_type='title')

In [None]:
print(f"So we have {len(inferred_wiki_global_txt_file[(inferred_wiki_global_txt_file.title_has_wiki == True) | (inferred_wiki_global_txt_file.lineage_has_wiki == True)])} titles with some form of Wiki data and {len(inferred_wiki_global_txt_file[(inferred_wiki_global_txt_file.title_has_wiki == False) & (inferred_wiki_global_txt_file.lineage_has_wiki == False)])} titles with no Wiki data.")

In [None]:
inferred_wiki_global_txt_file.to_csv(f"{data_directory_path}/processed_jstor_files/cleaned_jstor_titles_inferred_wiki.csv", index=False)