# JSTOR Journal and Discipline EDA

This notebook explores some of the differences between disciplines based on both title and lineage. It is intended to help inform how disciplines differ and whether using title or lineage for assessing journals and disciplines is more effective.

### Load Libraries and Data

In [1]:
import pandas as pd
import numpy as np
import altair as alt
alt.renderers.enable("mimetype")
alt.data_transformers.enable('default', max_rows=None)
import warnings
warnings.filterwarnings('ignore')
import statsmodels.api as sm
from rich.table import Table
from rich.console import Console
from tqdm import tqdm
import ast
import sys
sys.path.append("../")
from scripts.utils import *
# Create a console object
console = Console()

In [20]:
data_directory_path = get_data_directory_path()
inferred_wiki_global_txt_file = pd.read_csv(f"{data_directory_path}/processed_jstor_files/cleaned_jstor_titles_inferred_wiki.csv")
discipline_data = inferred_wiki_global_txt_file.copy()

### Shared Functions

In [21]:
def pretty_print_data(df, numb_rows=3):
    # Create a table
    table = Table(show_header=True, header_style="bold magenta")

    # Add columns
    for column in df.columns:
        table.add_column(column)

    # Add rows
    for _, row in df.head(numb_rows).iterrows():
        table.add_row(*row.astype(str))

    # Print the table
    console.print(table)

In [22]:
def group_columns_by_data_type(df): 
    data_types = df.dtypes.reset_index().rename(columns={0: 'identified_dtype', 'index': 'column_name'})

    # Get columns that are not strings
    data_columns = data_types[data_types['identified_dtype'] != 'object']['column_name'].tolist()
    # Calculate range for each column
    ranges = df[data_columns].agg(['min', 'max']).diff().loc['max']

    # Define bins for grouping
    bins = [-np.inf, 1, 10, 100, 1000, np.inf]

    # Group columns by range
    grouped_columns = {}
    for col, range_ in ranges.items():
        for i in range(len(bins) - 1):
            if bins[i] < range_ <= bins[i + 1]:
                if bins[i] not in grouped_columns:
                    grouped_columns[bins[i]] = [col]
                else:
                    grouped_columns[bins[i]].append(col)
                break

    column_grouping = pd.DataFrame(grouped_columns.items(), columns=['range_values', 'column_name']).explode('column_name')
    return column_grouping

In [23]:
def calculate_relative_difference(df):
    # Calculate the mean of each statistic across all disciplines
    collective_means = df.mean()
    # For each discipline, calculate the difference from the collective mean
    comparison = df.copy()
    for column in df.columns:
        if 'discipline' not in column:
            comparison[column] = df[column] - collective_means[column]
    
    return comparison

def create_statistic_striplot(df, column_grouping, x_column, color_column):
    strip_charts = []
    column_grouping = column_grouping.sort_values(by=['range_values'], ascending=False)
    groupings = column_grouping.range_values.unique().tolist()
    numb_colors = df[color_column].nunique()
    color_scheme = 'spectral' if numb_colors > 10 else 'category10'
    for grouping in groupings:
        selected_columns = column_grouping[column_grouping.range_values == grouping].column_name.tolist()
        derived_height = len(selected_columns) * 50
        # Create a selection that chooses the nearest point & selects based on discipline
        selection = alt.selection_point(fields=['discipline'], bind='legend')
        strip_chart = alt.Chart(df[df.statistic.isin(selected_columns)]).mark_tick().encode(
            x=f'{x_column}:Q',
            y='statistic:N',
            color=alt.Color(f'{color_column}:N', scale=alt.Scale(scheme=color_scheme)),
            tooltip=[f'{color_column}', 'statistic', f'{x_column}'],
            opacity=alt.condition(selection, alt.value(1), alt.value(0.2))
        ).add_params(selection).properties(
            width=600,
            height=derived_height
        )
        strip_charts.append(strip_chart)



    strip_plot = alt.vconcat(*strip_charts).configure_legend(
        columns=5,  # Set number of columns in legend
        orient='bottom',  # Position the legend at the bottom
        symbolLimit=1000  # Increase symbol limit to ensure all disciplines are shown
    )

    return strip_plot

In [24]:
def check_for_nulls(df, columns_to_check):
    print("Checking for any null values by title...")
    # Check if any of the values in these columns are null
    null_values = df[columns_to_check].isnull()
    combined_null_dfs = []
    # For each discipline, print out the discipline and the column names with null values
    for idx, row in null_values.iterrows():
        null_columns = row[row].index.tolist()
        if null_columns:
            print(f"Discipline: {df.loc[idx, 'discipline']}, Null columns: {null_columns}")
            print(f"Unique number of publications in this discipline: {df.loc[idx, 'publication_count']}\n")
            null_df = pd.DataFrame()
            null_df['null_columns'] = null_columns
            null_df = null_df.explode('null_columns')
            null_df['publication_count'] = df.loc[idx, 'publication_count']
            null_df['discipline'] = df.loc[idx, 'discipline']
            combined_null_dfs.append(null_df)

    finalized_null_df = pd.concat(combined_null_dfs)
    finalized_null_df = finalized_null_df.drop_duplicates()
    return finalized_null_df

In [25]:
def create_correlation_chart(stats, df, x_column, y_column, color_column):
    # Create a selection that chooses the nearest point & selects based on discipline
    selection = alt.selection_point(fields=[color_column], bind='legend')

    # Define a base chart with common properties
    base = alt.Chart().properties(
        width=200,
        height=200
    ).add_selection(
        selection
    )
    correlation_dfs = []
    # Create scatter plot and regression line for each statistic
    charts = []
    numb_colors = df[color_column].nunique()
    color_scheme = 'spectral' if numb_colors > 10 else 'category10'
    for stat in stats:
        data = df[df.statistic == stat]
        # Calculate adjusted R-squared
        X = sm.add_constant(data[x_column])
        y = data[y_column]
        model = sm.OLS(y, X).fit()
        adj_r_squared = model.rsquared_adj
        correlation_dfs.append(pd.DataFrame({'statistic': [stat], 'adj_r_squared': [adj_r_squared]}))
        x_column_name = x_column.replace('_', ' ').title()
        y_column_name = y_column.replace('_', ' ').title()
        scatter_plot = base.mark_circle().encode(
            x=alt.X(f'{x_column}:Q', axis=alt.Axis(title=x_column_name)),
            y=alt.Y(f'{y_column}:Q', axis=alt.Axis(title=y_column_name)),
            color=alt.Color(f'{color_column}:N', scale=alt.Scale(scheme=color_scheme)),
            opacity=alt.condition(selection, alt.value(1), alt.value(0.2)),  # Adjust opacity based on selection
            tooltip=[color_column, 'statistic', x_column, y_column]
        ).add_params(
            selection
        ).properties(
            title={
                "text": f"{x_column_name} vs {y_column_name} {stat} Comparison",
                "subtitle": f"Adjusted R-squared: {adj_r_squared:.2f}",
                "color": "black",
                "subtitleColor": "gray"
            }
        )

        regression_line = base.transform_regression(
            x_column, y_column, method="linear"
        ).mark_line().encode(
            x=f'{x_column}:Q',
            y=f'{y_column}:Q',
        )

        combined_chart = alt.layer(scatter_plot, regression_line, data=data)
        charts.append(combined_chart)
    correlation_df = pd.concat(correlation_dfs)
    # Concatenate all charts
    len_stats = len(stats)
    length_row = round(len_stats / 2)
    final_chart = alt.vconcat(*[alt.hconcat(*charts[i:i+length_row]) for i in range(0, len(charts), length_row)])

    # Apply configuration to the concatenated chart
    final_chart = final_chart.configure_legend(
        columns=5,  # Set number of columns in legend
        orient='bottom',  # Position the legend at the bottom
        symbolLimit=1000  # Increase symbol limit to ensure all disciplines are shown
    )
    return final_chart, correlation_df


In [26]:
def select_most_average_discipline(df, statistic):
    stat_mean = df[statistic].mean()
    # Identify the discipline with the statistic closest to the mean
    closest_statistic = df.loc[(df[statistic] - stat_mean).abs().argsort()[:1]]
    closest_statistic = closest_statistic.iloc[0]
    # Get the discipline name
    discipline = closest_statistic.discipline
    print(f"Discipline with the most average {statistic}: {discipline}")

In [27]:
def calculate_discipline_stats(df, grouping_column, statistic_column, publication_count_column):
    # Now, group by 'discipline' to calculate the distribution statistics of publication lifespans
    discipline_stats = df.groupby(grouping_column).agg(
        mean_lifespan=pd.NamedAgg(column=statistic_column, aggfunc='mean'),
        median_lifespan=pd.NamedAgg(column=statistic_column, aggfunc='median'),
        std_lifespan=pd.NamedAgg(column=statistic_column, aggfunc='std'),
        min_lifespan=pd.NamedAgg(column=statistic_column, aggfunc='min'),
        max_lifespan=pd.NamedAgg(column=statistic_column, aggfunc='max'),
        percentile_25=pd.NamedAgg(column=statistic_column, aggfunc=lambda x: x.quantile(0.25)),
        percentile_75=pd.NamedAgg(column=statistic_column, aggfunc=lambda x: x.quantile(0.75)),
        skewness=pd.NamedAgg(column=statistic_column, aggfunc='skew'),
        kurtosis=pd.NamedAgg(column=statistic_column, aggfunc=lambda x: pd.Series(x).kurt()),  
        publication_count=pd.NamedAgg(column=publication_count_column, aggfunc='nunique')  
    ).reset_index()
    # Sort the results to find the top disciplines with the most robust lifespan metrics
    discipline_stats.sort_values(by=['publication_count', 'mean_lifespan'], ascending=[False, False], inplace=True)

    print(f"When determining discipline grouped by {grouping_column} and calculating publication length by {statistic_column} and  {publication_count_column}, then sorting by (`publication_count`) and mean length of publication run (`mean_lifespan`) the top 3 disciplines are:")
    pretty_print_data(discipline_stats, 3)
    # Calculating Z-scores for 'mean_lifespan' in title
    total_mean = discipline_stats['mean_lifespan'].mean()
    total_std = discipline_stats['mean_lifespan'].std()
    discipline_stats['mean_lifespan_zscore'] = discipline_stats['mean_lifespan'].apply(lambda x: (x - total_mean) / total_std)
    discipline_stats['mean_lifespan_cv'] = discipline_stats['std_lifespan'] / discipline_stats['mean_lifespan']

    discipline_stats.sort_values(by=['mean_lifespan_zscore'], ascending=False, inplace=True)
    print("----------------------------------------")
    print(f"We can also calculate the z-score for the mean lifespan of each discipline. The z-score is a measure of how many standard deviations a discipline's mean lifespan is from the mean lifespan of all disciplines. A high z-score indicates that the mean lifespan of the discipline is far from the mean lifespan of all disciplines, in terms of standard deviations. This could suggest that the discipline is particularly unique or different in some way. The top 3 disciplines by z-score are:\n{discipline_stats[grouping_column].head(3).values}. These disciplines have mean lifespans that are significantly different from the average.")

    discipline_stats.sort_values(by=['mean_lifespan_cv'], ascending=False, inplace=True)
    print("----------------------------------------")
    print(f"We can also calculate the coefficient of variation for the mean lifespan of each discipline. The top 3 disciplines by coefficient of variation are:\n{discipline_stats[grouping_column].head(3).values}. A high coefficient of variation indicates that the lifespans of journals within these disciplines are highly varied, with some journals having lifespans much longer or shorter than the average.")
    return discipline_stats

In [28]:
def create_correlation_matrix_and_pivot_table(df, column_index, column_columns):
    if column_index not in df.columns or column_columns not in df.columns:
        raise ValueError(f"Columns {column_index} or {column_columns} not found in DataFrame")

    matrix_df = df[[column_index, column_columns]]
    pivot_df = pd.pivot_table(matrix_df, index=column_index, columns=column_columns, aggfunc=len, fill_value=0)
    
    correlation_matrix = pivot_df.corr()
    correlation_matrix.reset_index(inplace=True)
    correlation_matrix.index.name = None
    return correlation_matrix, pivot_df

def generate_matrix_heatmap(correlation_matrix, column_index, title=None):
    # Melt the correlation matrix into a long-form DataFrame
    long_form = correlation_matrix.melt(id_vars=[f'{column_index}'], var_name='column', value_name='correlation')
    finalized_title = f'{title.capitalize()} Correlation Heatmap' if title else f'{column_index.capitalize()} Correlation Heatmap'
    # Create the heatmap with Altair
    heatmap = alt.Chart(long_form).mark_rect().encode(
        x=alt.X(f'{column_index}:O', axis=alt.Axis(labelAngle=-45)),  # This will sort the x-axis based on the y-axis values in descending order
        y=alt.Y('column:O', sort=alt.EncodingSortField('column', order='descending')),
        color='correlation:Q',
        tooltip=[f'{column_index}', 'column', 'correlation']
    ).properties(
        title=finalized_title,
    ).configure_axis(
        labelFontSize=10,  # Adjust font size if needed
        labelPadding=10,  # Adjust label padding if needed
    )

    return heatmap, long_form

### Exploratory Data Analysis

#### Disciplinary Statistics

First, let's take a look athe consistency of the number of journals per discipline over time using a number of statistics and comparing both if we calculate by individual titles or combined lineage.

The metrics represent the following values:

- `mean_lifespan`: The average (mean) number of active years for publications within a given discipline. This metric gives an overall idea of how long publications typically remain active in that discipline.
- `median_lifespan`: The median lifespan of publications. This is the middle value in the range of lifespans and can be more representative than the mean, especially if the data contains outliers.
- `std_lifespan`: The standard deviation of the lifespan. This metric gives an idea of how much the lifespan varies within a discipline. A high standard deviation indicates that publications within a discipline have a wide range of lifespans, while a low standard deviation indicates that publications within a discipline have similar lifespans.
- `min_lifespan`: The minimum lifespan of publications. This is the shortest lifespan of any publication within a discipline.
- `max_lifespan`: The maximum lifespan of publications. This is the longest lifespan of any publication within a discipline.
- `percentile_25`: The 25th percentile of lifespans. This is the lifespan that 25% of publications fall below.
- `percentile_75`: The 75th percentile of lifespans. This is the lifespan that 75% of publications fall below.
- `skewness`: The skewness of the lifespan distribution. This metric gives an idea of how symmetrical the lifespan distribution is. A positive skewness indicates that the lifespan distribution is skewed to the right, meaning that there are more publications with shorter lifespans than longer lifespans. A negative skewness indicates that the lifespan distribution is skewed to the left, meaning that there are more publications with longer lifespans than shorter lifespans. A skewness of 0 indicates that the lifespan distribution is symmetrical.
- `kurtosis`: The kurtosis of the lifespan distribution. This metric gives an idea of how peaked the lifespan distribution is. A positive kurtosis indicates that the lifespan distribution is more peaked than a normal distribution, meaning that there are more publications with lifespans close to the mean lifespan than a normal distribution. A negative kurtosis indicates that the lifespan distribution is less peaked than a normal distribution, meaning that there are fewer publications with lifespans close to the mean lifespan than a normal distribution. A kurtosis of 0 indicates that the lifespan distribution is similar to a normal distribution.
- `publication_count`: The number of unique publications within a discipline.
- `mean_lifespan_zscore`: The mean lifespan z-score of publications within a discipline. This metric gives an idea of how long publications within a discipline remain active compared to the average lifespan of all publications. A positive z-score indicates that publications within a discipline remain active longer than the average lifespan of all publications. A negative z-score indicates that publications within a discipline remain active shorter than the average lifespan of all publications. A z-score of 0 indicates that publications within a discipline remain active for the same amount of time as the average lifespan of all publications.
- `mean_lifespan_cv`: The coefficient of variation of the mean lifespan of publications within a discipline. This metric gives an idea of how much the mean lifespan of publications within a discipline varies compared to the average lifespan of all publications. A high coefficient of variation indicates that the lifespans of journals within these disciplines are highly varied, with some journals having lifespans much longer or shorter than the average. A low coefficient of variation indicates that the lifespans of journals within these disciplines are similar to the average lifespan of all publications.

#### Disciplinary Stats By Unique Publication Title

In [29]:
# Sort the DataFrame by 'date_first_issue_online'
discipline_data = discipline_data.sort_values(by=['date_first_issue_online'])

# Create 'unique_id' by appending a count to 'title_id'
discipline_data['unique_id'] = discipline_data.groupby('title_id').cumcount() + 1
discipline_data['unique_id'] = discipline_data['title_id'] + '_' + discipline_data['unique_id'].astype(str)

discipline_data.discipline = discipline_data.discipline.str.split(";")
exploded_titles_disciplines = discipline_data.explode("discipline")
exploded_titles_disciplines.discipline = exploded_titles_disciplines.discipline.str.strip()

In [30]:
# Now, group by 'discipline' to calculate the distribution statistics of publication lifespans
title_discipline_stats = calculate_discipline_stats(exploded_titles_disciplines, 'discipline', 'title_active_years', 'publication_title')

When determining discipline grouped by discipline and calculating publication length by title_active_years and  publication_title, then sorting by (`publication_count`) and mean length of publication run (`mean_lifespan`) the top 3 disciplines are:


----------------------------------------
We can also calculate the z-score for the mean lifespan of each discipline. The z-score is a measure of how many standard deviations a discipline's mean lifespan is from the mean lifespan of all disciplines. A high z-score indicates that the mean lifespan of the discipline is far from the mean lifespan of all disciplines, in terms of standard deviations. This could suggest that the discipline is particularly unique or different in some way. The top 3 disciplines by z-score are:
['Astronomy' 'Geology' 'Classical Studies']. These disciplines have mean lifespans that are significantly different from the average.
----------------------------------------
We can also calculate the coefficient of variation for the mean lifespan of each discipline. The top 3 disciplines by coefficient of variation are:
['Garden & Landscape' 'Folklore' 'Irish Studies']. A high coefficient of variation indicates that the lifespans of journals within these disciplines are 

In [31]:
melted_title_discipline_stats = pd.melt(title_discipline_stats, id_vars=['discipline',], value_vars=['mean_lifespan', 'median_lifespan', 'std_lifespan', 'min_lifespan', 'max_lifespan', 'percentile_25', 'percentile_75', 'skewness', 'kurtosis', 'publication_count', 'mean_lifespan_zscore', 'mean_lifespan_cv'], var_name='statistic', value_name='title_statistic_value')

title_grouped_columns = group_columns_by_data_type(title_discipline_stats)

title_strip_plot = create_statistic_striplot(melted_title_discipline_stats, title_grouped_columns, 'title_statistic_value', 'discipline')
title_strip_plot

<VegaLite 5 object>

If you see this message, it means the renderer has not been properly enabled
for the frontend that you are using. For more information, see
https://altair-viz.github.io/user_guide/troubleshooting.html


Visualizing this data we can start to see some of the overall trends by discipline. For example, `Social Sciences` has the highest number of publications, but `History` has the max length of publication. `History` also has the highest number of publications per year, but `Social Sciences` has the highest number of publications per year per journal. `Geology` has the highest standard deviation of publications lifespan, which indicates that it has the most variation in the length of time that journals are published. We can also see that `Astronomy` is an outlier in `mean_lifespan` (and across a number of other statistics) which indicates that it is a very different discipline than the others (or that some thing is off in terms of number of publications).

In [32]:
for index, row in title_grouped_columns.iterrows():
    col = row.column_name
    select_most_average_discipline(title_discipline_stats, col)

Discipline with the most average mean_lifespan: Political Science
Discipline with the most average median_lifespan: History
Discipline with the most average min_lifespan: Technology
Discipline with the most average max_lifespan: American Studies
Discipline with the most average percentile_25: Security Studies
Discipline with the most average percentile_75: Environmental Science
Discipline with the most average std_lifespan: History
Discipline with the most average kurtosis: Asian Studies
Discipline with the most average skewness: Public Health
Discipline with the most average mean_lifespan_zscore: Political Science
Discipline with the most average publication_count: Arts
Discipline with the most average mean_lifespan_cv: Health Policy


We can also calculate how different these values are for each discipline (i.e. if we have the idea of a standard disicipline based on this data, then how do all other disciplines compare).

In [33]:
title_comparison = calculate_relative_difference(title_discipline_stats)

melted_title_comparison = pd.melt(title_comparison, id_vars=['discipline',], value_vars=['mean_lifespan', 'median_lifespan', 'std_lifespan', 'min_lifespan', 'max_lifespan', 'percentile_25', 'percentile_75', 'skewness', 'kurtosis', 'publication_count', 'mean_lifespan_zscore', 'mean_lifespan_cv'], var_name='statistic', value_name='difference_to_disciplines_mean')

In [34]:

diff_title_strip_plot = create_statistic_striplot(melted_title_comparison, title_grouped_columns, 'difference_to_disciplines_mean', 'discipline')
diff_title_strip_plot

<VegaLite 5 object>

If you see this message, it means the renderer has not been properly enabled
for the frontend that you are using. For more information, see
https://altair-viz.github.io/user_guide/troubleshooting.html


Now the key thing to understand with this chart is that it is the relative difference of a discipline compared the mean for that statistic. The choice to visualize the relative difference is helping us get a sense of not just the distribution, but specifically, how much that differs relative to all disciplines combined. Many of our top outliers remain steady, we can see a much wider spread in our distribution which intuitively makes sense considering that we are comparing the difference of each discipline to the mean of all disciplines.

##### Disciplinary Stats by Unique Lineage

In [35]:
discipline_data.aggregated_disciplines = discipline_data.aggregated_disciplines.str.split(";")
exploded_lineage_disciplines = discipline_data.explode("aggregated_disciplines")
exploded_lineage_disciplines.aggregated_disciplines = exploded_lineage_disciplines.aggregated_disciplines.str.strip()


In [36]:
# Now, group by 'discipline' to calculate the distribution statistics of publication lifespans
lineage_discipline_stats = calculate_discipline_stats(exploded_lineage_disciplines, 'aggregated_disciplines', 'lineage_active_years', 'publication_title')


When determining discipline grouped by aggregated_disciplines and calculating publication length by lineage_active_years and  publication_title, then sorting by (`publication_count`) and mean length of publication run (`mean_lifespan`) the top 3 disciplines are:


----------------------------------------
We can also calculate the z-score for the mean lifespan of each discipline. The z-score is a measure of how many standard deviations a discipline's mean lifespan is from the mean lifespan of all disciplines. A high z-score indicates that the mean lifespan of the discipline is far from the mean lifespan of all disciplines, in terms of standard deviations. This could suggest that the discipline is particularly unique or different in some way. The top 3 disciplines by z-score are:
['General Science' 'Mathematics' 'Astronomy']. These disciplines have mean lifespans that are significantly different from the average.
----------------------------------------
We can also calculate the coefficient of variation for the mean lifespan of each discipline. The top 3 disciplines by coefficient of variation are:
['Environmental Science' 'Irish Studies' 'Biological Sciences']. A high coefficient of variation indicates that the lifespans of journals within these 

In [37]:
lineage_discipline_stats = lineage_discipline_stats.rename(columns={'aggregated_disciplines': 'discipline'})

melted_lineage_discipline_stats = pd.melt(lineage_discipline_stats, id_vars=['discipline',], value_vars=['mean_lifespan', 'median_lifespan', 'std_lifespan', 'min_lifespan', 'max_lifespan', 'percentile_25', 'percentile_75', 'skewness', 'kurtosis', 'publication_count', 'mean_lifespan_zscore', 'mean_lifespan_cv'], var_name='statistic', value_name='lineage_statistic_value')

lineage_grouped_columns = group_columns_by_data_type(lineage_discipline_stats)

lineage_strip_plot = create_statistic_striplot(melted_lineage_discipline_stats, lineage_grouped_columns, 'lineage_statistic_value', 'discipline')
lineage_strip_plot

<VegaLite 5 object>

If you see this message, it means the renderer has not been properly enabled
for the frontend that you are using. For more information, see
https://altair-viz.github.io/user_guide/troubleshooting.html


In [38]:
for index, row in lineage_grouped_columns.iterrows():
    col = row.column_name
    select_most_average_discipline(lineage_discipline_stats, col)

Discipline with the most average mean_lifespan: Military Studies
Discipline with the most average median_lifespan: Social Work
Discipline with the most average std_lifespan: Education
Discipline with the most average min_lifespan: Medicine & Allied Health
Discipline with the most average max_lifespan: Communication Studies
Discipline with the most average percentile_25: Law
Discipline with the most average percentile_75: Classical Studies
Discipline with the most average skewness: Feminist & Women's Studies
Discipline with the most average mean_lifespan_zscore: Military Studies
Discipline with the most average mean_lifespan_cv: General Science
Discipline with the most average kurtosis: Ecology & Evolutionary Biology
Discipline with the most average publication_count: Arts


Again unsurprisingly, using lineage instead of publication_title for calculating length leads to a differing categorization of disciplines. Whereas PoliSci was the most average discipline previously, now we are seeing `Military Studies`.

##### Comparing Disciplinary Stats by Unique Title and Lineage

###### Visualize Comparison

In [39]:
merged_discipline_stats = melted_lineage_discipline_stats.merge(melted_title_discipline_stats, on=['discipline', 'statistic'], how='left')
stats = merged_discipline_stats.statistic.unique().tolist()
# subset merged_discipline_stats to only include rows where both lineage_statistic_value and title_statistic_value are not null
subset_merged_discipline_stats = merged_discipline_stats[(merged_discipline_stats.lineage_statistic_value.notna()) & (merged_discipline_stats.title_statistic_value.notna())]
print("The values that are null in the merged DataFrame are:")
merged_discipline_stats[(merged_discipline_stats.lineage_statistic_value.isna()) | (merged_discipline_stats.title_statistic_value.isna())]

The values that are null in the merged DataFrame are:


Unnamed: 0,discipline,statistic,lineage_statistic_value,title_statistic_value
269,Astronomy,std_lifespan,,
719,Astronomy,skewness,,
809,Astronomy,kurtosis,,
1079,Astronomy,mean_lifespan_cv,,


In [40]:
merged_correlation_chart, merged_correlation_df = create_correlation_chart(stats, subset_merged_discipline_stats, 'lineage_statistic_value', 'title_statistic_value', 'discipline')
merged_correlation_chart

<VegaLite 5 object>

If you see this message, it means the renderer has not been properly enabled
for the frontend that you are using. For more information, see
https://altair-viz.github.io/user_guide/troubleshooting.html


##### Publication Counts

Our only value with almost a perfect correlation is publication_count so let's combine the two and see if we can get a better sense of publications per discipline, as well as if disciplines with fewer publications are skewing our results.

In [41]:
pub_counts = merged_discipline_stats[merged_discipline_stats.statistic == "publication_count"]

pub_counts['finalized_publication_counts'] = np.maximum(pub_counts['lineage_statistic_value'], pub_counts['title_statistic_value'])

# Calculate mean of finalized_publication_counts
mean_value = pub_counts['finalized_publication_counts'].mean()

# Create base chart
base = alt.Chart(pub_counts).mark_bar().encode(
    x=alt.X('finalized_publication_counts:Q', bin=alt.Bin(maxbins=100), axis=alt.Axis(title='Finalized Publication Count')),
    y=alt.Y('count():Q', axis=alt.Axis(title='Number of Disciplines')),
    tooltip=['finalized_publication_counts', 'count()']
).properties(
    title={
        "text": "Finalized Publication Count Distribution",
        "subtitle": ["The finalized publication count is the maximum of the lineage and title publication counts.", f"The mean of the finalized publication counts is {mean_value:.2f}"],
        "color": "black",
        "subtitleColor": "gray"
    }
)

# Create mean line
mean_line = alt.Chart(pd.DataFrame({'mean_value': [mean_value]})).mark_rule(color='red').encode(
    x='mean_value:Q'
)

# Combine base chart and mean line
chart = alt.layer(base, mean_line)

chart

<VegaLite 5 object>

If you see this message, it means the renderer has not been properly enabled
for the frontend that you are using. For more information, see
https://altair-viz.github.io/user_guide/troubleshooting.html


We can see that our data is pretty unevenly distributed, with a few disciplines having a large number of publications and the majority having a small number. Let's remove the lowest 10% of disciplines by publication count and see how this impacts our evaluation of disciplines.

In [42]:
lineage_less_than_10 = pub_counts.lineage_statistic_value.describe(percentiles=[0.1])['10%']
title_less_than_10 = pub_counts.title_statistic_value.describe(percentiles=[0.1])['10%']
less_than_10_value = np.maximum(lineage_less_than_10, title_less_than_10)

too_few_pub_counts = pub_counts[(pub_counts.statistic == "publication_count") & (pub_counts.finalized_publication_counts < less_than_10_value)]

print(f"The disciplines with a finalized publication count less than {less_than_10_value:.2f} are:\n{too_few_pub_counts.discipline.values}")

subset_pub_counts = pub_counts[(pub_counts.statistic == "publication_count") & (pub_counts.finalized_publication_counts > less_than_10_value)]

The disciplines with a finalized publication count less than 12.90 are:
['Geology' 'Food Studies' 'Transportation Studies' 'Garden & Landscape'
 'American Indian Studies' 'Horticulture' 'Computer Science'
 'Developmental & Cell Biology' 'Astronomy']


You'll notice that many of these disciplines were top values for our zscore and cv calculations. This is because disciplines with a small number of publications will have a high zscore and cv because the mean lifespan of publications within that discipline will be highly varied compared to the average lifespan of all publications.

Let's recalculate but remove the lowest 10% of disciplines by publication count.

###### Recalculate Disciplinary Stats by Unique Title and Lineage

In [43]:
subset_lineage_discipline_stats = lineage_discipline_stats[lineage_discipline_stats.discipline.isin(subset_pub_counts.discipline)]

# recalculate z-score for mean lifespan
subset_lineage_mean = subset_lineage_discipline_stats['mean_lifespan'].mean()
subset_lineage_std = subset_lineage_discipline_stats['mean_lifespan'].std()
subset_lineage_discipline_stats['mean_lifespan_zscore'] = subset_lineage_discipline_stats['mean_lifespan'].apply(lambda x: (x - subset_lineage_mean) / subset_lineage_std)

# recalculate coefficient of variation for mean lifespan
subset_lineage_discipline_stats['mean_lifespan_cv'] = subset_lineage_discipline_stats['std_lifespan'] / subset_lineage_discipline_stats['mean_lifespan']


subset_melted_lineage_discipline_stats = pd.melt(subset_lineage_discipline_stats, id_vars=['discipline'], value_vars=['mean_lifespan', 'median_lifespan', 'std_lifespan', 'min_lifespan', 'max_lifespan', 'percentile_25', 'percentile_75', 'skewness', 'kurtosis', 'publication_count', 'mean_lifespan_zscore', 'mean_lifespan_cv'], var_name='statistic', value_name='lineage_statistic_value')

subset_title_discipline_stats = title_discipline_stats[title_discipline_stats.discipline.isin(subset_pub_counts.discipline)]

# recalculate z-score for mean lifespan
subset_title_mean = subset_title_discipline_stats['mean_lifespan'].mean()
subset_title_std = subset_title_discipline_stats['mean_lifespan'].std()
subset_title_discipline_stats['mean_lifespan_zscore'] = subset_title_discipline_stats['mean_lifespan'].apply(lambda x: (x - subset_title_mean) / subset_title_std)

# recalculate coefficient of variation for mean lifespan
subset_title_discipline_stats['mean_lifespan_cv'] = subset_title_discipline_stats['std_lifespan'] / subset_title_discipline_stats['mean_lifespan']

subset_melted_title_discipline_stats = pd.melt(subset_title_discipline_stats, id_vars=['discipline'], value_vars=['mean_lifespan', 'median_lifespan', 'std_lifespan', 'min_lifespan', 'max_lifespan', 'percentile_25', 'percentile_75', 'skewness', 'kurtosis', 'publication_count', 'mean_lifespan_zscore', 'mean_lifespan_cv'], var_name='statistic', value_name='title_statistic_value')

combined_subset_merged_discipline_stats = subset_melted_lineage_discipline_stats.merge(subset_melted_title_discipline_stats, on=['discipline', 'statistic'], how='left')
combined_subset_merged_discipline_stats = combined_subset_merged_discipline_stats[(combined_subset_merged_discipline_stats.lineage_statistic_value.notna()) & (combined_subset_merged_discipline_stats.title_statistic_value.notna())]

updated_stats = combined_subset_merged_discipline_stats.statistic.unique().tolist()
print("The values that are null in the merged DataFrame are:")
combined_subset_merged_discipline_stats[(combined_subset_merged_discipline_stats.lineage_statistic_value.isna()) | (combined_subset_merged_discipline_stats.title_statistic_value.isna())]

The values that are null in the merged DataFrame are:


Unnamed: 0,discipline,statistic,lineage_statistic_value,title_statistic_value


In [44]:
subset_merged_correlation_chart, subset_merged_correlation_df = create_correlation_chart(updated_stats, combined_subset_merged_discipline_stats, 'lineage_statistic_value', 'title_statistic_value', 'discipline')
subset_merged_correlation_chart

<VegaLite 5 object>

If you see this message, it means the renderer has not been properly enabled
for the frontend that you are using. For more information, see
https://altair-viz.github.io/user_guide/troubleshooting.html


##### Interpret Results

While these graphs are helpful, I want to compare across these experiments so let's combined them into a single dataframe.

In [45]:
full_dataset = 'All Disciplines'
subset_dataset = 'Disciplines with > 10 Publications'
merged_correlation_df['dataset'] = full_dataset
subset_merged_correlation_df['dataset'] = subset_dataset

comparison_df = pd.concat([merged_correlation_df, subset_merged_correlation_df])
pivoted_comparison_df = pd.pivot_table(comparison_df, index=['statistic'], columns=['dataset'], values=['adj_r_squared']).reset_index()

pivoted_comparison_df.columns = ['statistic', full_dataset, subset_dataset]
pivoted_comparison_df['difference'] = pivoted_comparison_df[full_dataset] - pivoted_comparison_df[subset_dataset]
pivoted_comparison_df['abs_difference'] = abs(pivoted_comparison_df[full_dataset] - pivoted_comparison_df[subset_dataset])
print("The difference in adjusted R-squared values between the full dataset and the subset dataset is:")
pivoted_comparison_df.sort_values(by=['difference'], ascending=False)

The difference in adjusted R-squared values between the full dataset and the subset dataset is:


Unnamed: 0,statistic,All Disciplines,Disciplines with > 10 Publications,difference,abs_difference
6,min_lifespan,0.746553,0.005354,0.741199,0.741199
7,percentile_25,0.453505,0.169585,0.283921,0.283921
5,median_lifespan,0.199436,0.139632,0.059804,0.059804
10,skewness,0.144905,0.101836,0.043069,0.043069
0,kurtosis,0.009784,-0.003266,0.01305,0.01305
1,max_lifespan,0.520694,0.509999,0.010694,0.010694
9,publication_count,0.999906,0.999904,2e-06,2e-06
3,mean_lifespan_cv,0.108101,0.11394,-0.005839,0.005839
8,percentile_75,0.094818,0.118504,-0.023686,0.023686
11,std_lifespan,0.168001,0.200613,-0.032612,0.032612


We can see from the `difference` column that subsetting disciplines to those with more than 12 publications does influence a number of our metrics. Overall, any difference that is positive, indicates that title and lineage for calculating disciplines are more strongly correlated vs when we remove that bottom 10% of disciplines. This is likely because the disciplines with fewer publications are skewing our results. But regardless, it seems clear that using title vs lineage has a significant impact on how disciplines are calculated and potentially characterized.

As a general rule, it seems like removing the lowest 10% of disciplines by publication count is a good way to remove some of the variability from our data. However, it's important to note that this is a somewhat arbitrary cutoff and we could do more analysis to determine the best cutoff for removing disciplines before it starts radically changing our results (though the min_lifespan does change singificantly, likely because Astronomy's sole journal is ~120 years in length).

However, it is worth noting that the mean_lifespan_cv does not change significantly between the two experiments, which may indicate that removing the lowest 10% of disciplines by publication count does not significantly impact how disciplines vary compared to the average lifespan of all publications.

##### Derived Disciplines Groupings

So even if we can leave in all the data (though seems like we should subset), we still need to determine the impact of lineage vs publication title length on characterizing discipline. We've seen that there are some differences between disciplines based on title and lineage, but what about when we group disciplines together? Using ChatGPT, I categorized the disciplines into 3 groups: STEM, Social Sciences, and Humanities. Let's see how these groups compare to the full dataset.

##### Discipline Categorization

In [46]:
discipline_categories = {
    'Social Sciences': 'Social Sciences',
    'Humanities': 'Humanities',
    'Area Studies': 'Social Sciences',
    'Science & Mathematics': 'STEM',
    'Language & Literature': 'Humanities',
    'History': 'Humanities',
    'Arts': 'Humanities',
    'Business & Economics': 'Social Sciences',
    'Political Science': 'Social Sciences',
    'Business': 'Social Sciences',
    'Biological Sciences': 'STEM',
    'Education': 'Social Sciences',
    'Art & Art History': 'Humanities',
    'Economics': 'Social Sciences',
    'Law': 'Social Sciences',
    'Sociology': 'Social Sciences',
    'Security Studies': 'Social Sciences',
    'Religion': 'Humanities',
    'Medicine & Allied Health': 'STEM',
    'Asian Studies': 'Social Sciences',
    'Sustainability': 'STEM',
    'Archaeology': 'Social Sciences',
    'Philosophy': 'Humanities',
    'Botany & Plant Sciences': 'STEM',
    'International Relations': 'Social Sciences',
    'Anthropology': 'Social Sciences',
    'American Studies': 'Social Sciences',
    'Mathematics': 'STEM',
    'Music': 'Humanities',
    'Ecology & Evolutionary Biology': 'STEM',
    'Peace & Conflict Studies': 'Social Sciences',
    'Classical Studies': 'Humanities',
    'Health Sciences': 'STEM',
    'Public Health': 'Social Sciences',
    'Zoology': 'STEM',
    'Middle East Studies': 'Social Sciences',
    'African Studies': 'Social Sciences',
    'Linguistics': 'Social Sciences',
    'Latin American Studies': 'Social Sciences',
    'Jewish Studies': 'Humanities',
    'General Science': 'STEM',
    'Architecture & Architectural History': 'Humanities',
    'Environmental Science': 'STEM',
    'Irish Studies': 'Humanities',
    'Health Policy': 'Social Sciences',
    'Management & Organizational Behavior': 'Social Sciences',
    'Statistics': 'STEM',
    'Finance': 'Social Sciences',
    'History of Science & Technology': 'STEM',
    'Public Policy & Administration': 'Social Sciences',
    'Geography': 'Social Sciences',
    'Psychology': 'Social Sciences',
    'Population Studies': 'Social Sciences',
    "Feminist & Women's Studies": 'Social Sciences',
    'Development Studies': 'Social Sciences',
    'Military Studies': 'Social Sciences',
    'Performing Arts': 'Humanities',
    'Social Work': 'Social Sciences',
    'Labor & Employment Relations': 'Social Sciences',
    'Bibliography': 'Humanities',
    'Slavic Studies': 'Humanities',
    'Science & Technology Studies': 'STEM',
    'Folklore': 'Humanities',
    'African American Studies': 'Social Sciences',
    'Gender Studies': 'Social Sciences',
    'European Studies': 'Social Sciences',
    'Technology': 'STEM',
    'Film Studies': 'Humanities',
    'Library Science': 'Social Sciences',
    'Agriculture': 'STEM',
    'Aquatic Sciences': 'STEM',
    'Museum Studies': 'Humanities',
    'Urban Studies': 'Social Sciences',
    'Engineering': 'STEM',
    'Environmental Studies': 'STEM',
    'Cultural Studies': 'Social Sciences',
    'Criminology & Criminal Justice': 'Social Sciences',
    'British Studies': 'Humanities',
    'Marketing & Advertising': 'Social Sciences',
    'Paleontology': 'STEM',
    'Communication Studies': 'Social Sciences',
    'Horticulture': 'STEM',
    'Garden & Landscape': 'Humanities',
    'Computer Science': 'STEM',
    'Transportation Studies': 'STEM',
    'American Indian Studies': 'Social Sciences',
    'Developmental & Cell Biology': 'STEM',
    'Geology': 'STEM',
    'Food Studies': 'Social Sciences',
    'Astronomy': 'STEM'
}
disciplinary_designations = pd.DataFrame.from_dict(discipline_categories, orient='index', columns=['discipline_category']).reset_index().rename(columns={'index': 'discipline'})


In [47]:
updated_subset_merged_discipline_stats = subset_merged_discipline_stats.merge(disciplinary_designations, on='discipline', how='left')
category_stats = updated_subset_merged_discipline_stats.statistic.unique().tolist()

In [48]:
updated_correlation_chart, updated_correlation_df = create_correlation_chart(category_stats, updated_subset_merged_discipline_stats, 'lineage_statistic_value', 'title_statistic_value', 'discipline_category')
updated_correlation_chart

<VegaLite 5 object>

If you see this message, it means the renderer has not been properly enabled
for the frontend that you are using. For more information, see
https://altair-viz.github.io/user_guide/troubleshooting.html


Using discipline_category does not seem to show any clear distinct clusters, so let's see if we recalculate our data by disciplinary category if we find those distinctions.

In [49]:
exploded_titles_disciplines['discipline_category'] = exploded_titles_disciplines.discipline.map(discipline_categories)
exploded_lineage_disciplines['discipline_category'] = exploded_lineage_disciplines.aggregated_disciplines.map(discipline_categories)

In [50]:
# Now, group by 'discipline' to calculate the distribution statistics of publication lifespans
title_discipline_category_stats = calculate_discipline_stats(exploded_titles_disciplines, 'discipline_category', 'title_active_years', 'publication_title')


When determining discipline grouped by discipline_category and calculating publication length by title_active_years and  publication_title, then sorting by (`publication_count`) and mean length of publication run (`mean_lifespan`) the top 3 disciplines are:


----------------------------------------
We can also calculate the z-score for the mean lifespan of each discipline. The z-score is a measure of how many standard deviations a discipline's mean lifespan is from the mean lifespan of all disciplines. A high z-score indicates that the mean lifespan of the discipline is far from the mean lifespan of all disciplines, in terms of standard deviations. This could suggest that the discipline is particularly unique or different in some way. The top 3 disciplines by z-score are:
['Humanities' 'STEM' 'Social Sciences']. These disciplines have mean lifespans that are significantly different from the average.
----------------------------------------
We can also calculate the coefficient of variation for the mean lifespan of each discipline. The top 3 disciplines by coefficient of variation are:
['STEM' 'Humanities' 'Social Sciences']. A high coefficient of variation indicates that the lifespans of journals within these disciplines are highly varied,

In [51]:

melted_title_discipline_category_stats = pd.melt(title_discipline_category_stats, id_vars=['discipline_category',], value_vars=['mean_lifespan', 'median_lifespan', 'std_lifespan', 'min_lifespan', 'max_lifespan', 'percentile_25', 'percentile_75', 'skewness', 'kurtosis', 'publication_count', 'mean_lifespan_zscore', 'mean_lifespan_cv'], var_name='statistic', value_name='title_statistic_value')

title_discipline_category_grouped_columns = group_columns_by_data_type(title_discipline_category_stats)

title_discipline_category_strip_plot = create_statistic_striplot(melted_title_discipline_category_stats, title_discipline_category_grouped_columns, 'title_statistic_value', 'discipline_category')
title_discipline_category_strip_plot

<VegaLite 5 object>

If you see this message, it means the renderer has not been properly enabled
for the frontend that you are using. For more information, see
https://altair-viz.github.io/user_guide/troubleshooting.html


In [34]:
lineage_discipline_category_stats = calculate_discipline_stats(exploded_lineage_disciplines, 'discipline_category', 'lineage_active_years', 'publication_title')

When determining discipline grouped by discipline_category and calculating publication length by lineage_active_years and  publication_title, then sorting by (`publication_count`) and mean length of publication run (`mean_lifespan`) the top 3 disciplines are:


----------------------------------------
We can also calculate the z-score for the mean lifespan of each discipline. The z-score is a measure of how many standard deviations a discipline's mean lifespan is from the mean lifespan of all disciplines. A high z-score indicates that the mean lifespan of the discipline is far from the mean lifespan of all disciplines, in terms of standard deviations. This could suggest that the discipline is particularly unique or different in some way. The top 3 disciplines by z-score are:
['STEM' 'Humanities' 'Social Sciences']. These disciplines have mean lifespans that are significantly different from the average.
----------------------------------------
We can also calculate the coefficient of variation for the mean lifespan of each discipline. The top 3 disciplines by coefficient of variation are:
['STEM' 'Social Sciences' 'Humanities']. A high coefficient of variation indicates that the lifespans of journals within these disciplines are highly varied,

In [35]:

melted_lineage_discipline_category_stats = pd.melt(lineage_discipline_category_stats, id_vars=['discipline_category',], value_vars=['mean_lifespan', 'median_lifespan', 'std_lifespan', 'min_lifespan', 'max_lifespan', 'percentile_25', 'percentile_75', 'skewness', 'kurtosis', 'publication_count', 'mean_lifespan_zscore', 'mean_lifespan_cv'], var_name='statistic', value_name='lineage_statistic_value')

lineage_discipline_category_grouped_columns = group_columns_by_data_type(lineage_discipline_category_stats)

lineage_discipline_category_strip_plot = create_statistic_striplot(melted_lineage_discipline_category_stats, lineage_discipline_category_grouped_columns, 'lineage_statistic_value', 'discipline_category')
lineage_discipline_category_strip_plot

<VegaLite 5 object>

If you see this message, it means the renderer has not been properly enabled
for the frontend that you are using. For more information, see
https://altair-viz.github.io/user_guide/troubleshooting.html


In [36]:
merged_discipline_category_stats = melted_lineage_discipline_category_stats.merge(melted_title_discipline_category_stats, on=['discipline_category', 'statistic'], how='left')
discipline_category_stats = merged_discipline_category_stats.statistic.unique().tolist()
# subset merged_discipline_stats to only include rows where both lineage_statistic_value and title_statistic_value are not null
subset_merged_discipline_category_stats = merged_discipline_category_stats[(merged_discipline_category_stats.lineage_statistic_value.notna()) & (merged_discipline_category_stats.title_statistic_value.notna())]
print("The values that are null in the merged DataFrame are:")
merged_discipline_category_stats[(merged_discipline_category_stats.lineage_statistic_value.isna()) | (merged_discipline_category_stats.title_statistic_value.isna())]

The values that are null in the merged DataFrame are:


Unnamed: 0,discipline_category,statistic,lineage_statistic_value,title_statistic_value


In [37]:
merged_discipline_category_correlation_chart, merged_discipline_category_correlation_df = create_correlation_chart(discipline_category_stats, subset_merged_discipline_category_stats, 'lineage_statistic_value', 'title_statistic_value', 'discipline_category')
merged_discipline_category_correlation_chart

<VegaLite 5 object>

If you see this message, it means the renderer has not been properly enabled
for the frontend that you are using. For more information, see
https://altair-viz.github.io/user_guide/troubleshooting.html


In [38]:
full_dataset = 'All Disciplines'
subset_dataset = 'Discipline Categories'
merged_correlation_df['dataset'] = full_dataset
merged_discipline_category_correlation_df['dataset'] = subset_dataset

category_comparison_df = pd.concat([merged_correlation_df, subset_merged_correlation_df])
pivoted_category_comparison_df = pd.pivot_table(category_comparison_df, index=['statistic'], columns=['dataset'], values=['adj_r_squared']).reset_index()

pivoted_category_comparison_df.columns = ['statistic', full_dataset, subset_dataset]
pivoted_category_comparison_df['difference'] = pivoted_category_comparison_df[full_dataset] - pivoted_category_comparison_df[subset_dataset]
pivoted_category_comparison_df['abs_difference'] = abs(pivoted_category_comparison_df[full_dataset] - pivoted_category_comparison_df[subset_dataset])
print("The difference in adjusted R-squared values between the full dataset and the subset dataset is:")
pivoted_category_comparison_df.sort_values(by=['difference'], ascending=False)

The difference in adjusted R-squared values between the full dataset and the subset dataset is:


Unnamed: 0,statistic,All Disciplines,Discipline Categories,difference,abs_difference
6,min_lifespan,0.746553,0.005354,0.741199,0.741199
7,percentile_25,0.453505,0.169585,0.283921,0.283921
5,median_lifespan,0.199436,0.139632,0.059804,0.059804
10,skewness,0.144905,0.101836,0.043069,0.043069
0,kurtosis,0.009784,-0.003266,0.01305,0.01305
1,max_lifespan,0.520694,0.509999,0.010694,0.010694
9,publication_count,0.999906,0.999904,2e-06,2e-06
3,mean_lifespan_cv,0.108101,0.11394,-0.005839,0.005839
8,percentile_75,0.094818,0.118504,-0.023686,0.023686
11,std_lifespan,0.168001,0.200613,-0.032612,0.032612


In [39]:
pivoted_comparison_df.sort_values(by=['difference'], ascending=False)

Unnamed: 0,statistic,All Disciplines,Disciplines with > 10 Publications,difference,abs_difference
6,min_lifespan,0.746553,0.005354,0.741199,0.741199
7,percentile_25,0.453505,0.169585,0.283921,0.283921
5,median_lifespan,0.199436,0.139632,0.059804,0.059804
10,skewness,0.144905,0.101836,0.043069,0.043069
0,kurtosis,0.009784,-0.003266,0.01305,0.01305
1,max_lifespan,0.520694,0.509999,0.010694,0.010694
9,publication_count,0.999906,0.999904,2e-06,2e-06
3,mean_lifespan_cv,0.108101,0.11394,-0.005839,0.005839
8,percentile_75,0.094818,0.118504,-0.023686,0.023686
11,std_lifespan,0.168001,0.200613,-0.032612,0.032612


Perhaps unsurprisingly, grouping by disicplinary category actually gives us similar scores to when we subsetted by publication count. This is likely because the disciplines that are most different are those with the fewest publications.

#### Correlation Matrices

#### General Conclusions and Final Datasets

1. Either subsetting or using disciplinary categories seems to be a good way to remove some of the variability from our data.
2. Using lineage vs publication title has a significant impact on how disciplines are calculated and potentially characterized.
3. Generating metadata for each publication title therefore should use a combination of both title and lineage to ensure that we are capturing the most accurate discipline for each publication, and then either subset the data or also utilize discipline categories for generating this data.

As our final step, let's generate this derived data for each discipline and publication title.

For disciplines, we want the following data:

- overall statistics for each discipline derived by lineage and publication title
- relative statistics for each discipline based on the mean of all disciplines derived by lineage and publication title
- relative statistics for each discipline based on discipline category derived by lineage and publication title

For publication titles, we want to use this disciplinary data to generate a profile for each publication title that includes:

- how this publication title compares to the mean of all disciplines derived by lineage and publication title
- how this publication title compares to discipline categories derived by lineage and publication title

In [40]:
title_comparison = calculate_relative_difference(title_discipline_stats)

subset_columns = ['mean_lifespan', 'mean_lifespan_cv', 'mean_lifespan_zscore', 'publication_count']
subset_title_discipline_stats = title_discipline_stats[['discipline'] + subset_columns]
subset_title_comparison = title_comparison[['discipline'] + subset_columns]
#rename subset_columns to include 'title_' prefix and 'relative_' suffix
subset_title_discipline_stats.columns = ['discipline'] + ['title_' + col for col in subset_columns]
subset_title_comparison.columns = ['discipline'] + ['title_' + col + '_relative' for col in subset_columns]

subset_title_disciplines = pd.concat([subset_title_discipline_stats, subset_title_comparison], axis=1)

lineage_comparison = calculate_relative_difference(lineage_discipline_stats)

subset_lineage_discipline_stats = lineage_discipline_stats[['discipline'] + subset_columns]
subset_lineage_comparison = lineage_comparison[['discipline'] + subset_columns]
#rename subset_columns to include 'lineage_' prefix and 'relative_' suffix
subset_lineage_discipline_stats.columns = ['discipline'] + ['lineage_' + col for col in subset_columns]
subset_lineage_comparison.columns = ['discipline'] + ['lineage_' + col + '_relative' for col in subset_columns]

subset_lineage_disciplines = pd.concat([subset_lineage_discipline_stats, subset_lineage_comparison], axis=1)

# Ensure there's only one 'discipline' column in each dataframe
subset_title_disciplines = subset_title_disciplines.loc[:,~subset_title_disciplines.columns.duplicated()]
subset_lineage_disciplines = subset_lineage_disciplines.loc[:,~subset_lineage_disciplines.columns.duplicated()]

# Merge the dataframes
merged_disciplines = pd.merge(subset_title_disciplines, subset_lineage_disciplines, on='discipline')

In [41]:
merged_disciplines['discipline_category'] = merged_disciplines.discipline.map(discipline_categories)

In [42]:
title_discipline_category_comparison = calculate_relative_difference(title_discipline_category_stats)

subset_columns = ['mean_lifespan', 'mean_lifespan_cv', 'mean_lifespan_zscore', 'publication_count']
subset_title_discipline_category_stats = title_discipline_category_stats[['discipline_category'] + subset_columns]
subset_title_discipline_category_comparison = title_discipline_category_comparison[['discipline_category'] + subset_columns]
#rename subset_columns to include 'title_' prefix and 'relative_' suffix
subset_title_discipline_category_stats.columns = ['discipline_category'] + ['discipline_category_title_' + col for col in subset_columns]
subset_title_discipline_category_comparison.columns = ['discipline_category'] + ['discipline_category_title_' + col + '_relative' for col in subset_columns]

subset_title_discipline_category_disciplines = pd.concat([subset_title_discipline_category_stats, subset_title_discipline_category_comparison], axis=1)

lineage_discipline_category_comparison = calculate_relative_difference(lineage_discipline_category_stats)

subset_lineage_discipline_category_stats = lineage_discipline_category_stats[['discipline_category'] + subset_columns]

subset_lineage_discipline_category_comparison = lineage_discipline_category_comparison[['discipline_category'] + subset_columns]

#rename subset_columns to include 'lineage_' prefix and 'relative_' suffix
subset_lineage_discipline_category_stats.columns = ['discipline_category'] + ['discipline_category_lineage_' + col for col in subset_columns]
subset_lineage_discipline_category_comparison.columns = ['discipline_category'] + ['discipline_category_lineage_' + col + '_relative' for col in subset_columns]

subset_lineage_discipline_category_disciplines = pd.concat([subset_lineage_discipline_category_stats, subset_lineage_discipline_category_comparison], axis=1)

# Ensure there's only one 'discipline' column in each dataframe
subset_title_discipline_category_disciplines = subset_title_discipline_category_disciplines.loc[:,~subset_title_discipline_category_disciplines.columns.duplicated()]
subset_lineage_discipline_category_disciplines = subset_lineage_discipline_category_disciplines.loc[:,~subset_lineage_discipline_category_disciplines.columns.duplicated()]

# Merge the dataframes
merged_discipline_category_disciplines = pd.merge(subset_title_discipline_category_disciplines, subset_lineage_discipline_category_disciplines, on='discipline_category')


In [43]:
finalized_disciplines = merged_disciplines.merge(merged_discipline_category_disciplines, on=['discipline_category'], how="outer")
finalized_disciplines.to_csv('../data/derived_datasets/finalized_disciplines_metadata.csv', index=False)

In [44]:
total_lineage_values = exploded_lineage_disciplines.agg({
    'lineage_active_years': ['mean', 'std'],
}).reset_index().rename(columns={'index': 'lineage_statistic'})
total_title_values = exploded_titles_disciplines.agg({
    'title_active_years': ['mean', 'std'],
}).reset_index().rename(columns={'index': 'title_statistic'})

In [45]:
finalized_disciplines[finalized_disciplines.discipline.isin(['Garden & Landscape', 'Horticulture'])].title_mean_lifespan.mean()

32.07954545454545

In [46]:


def get_disciplinary_data(row, list_of_disciplines, finalized_disciplines, active_years, length_type):
    list_of_disciplines = [x.strip() for x in list_of_disciplines]
    subset_finalized_disciplines = finalized_disciplines[finalized_disciplines.discipline.isin(list_of_disciplines)]
    avg_disciplinary_lifespan = subset_finalized_disciplines[f'{length_type}_mean_lifespan'].mean()
    std_disciplinary_lifespan = subset_finalized_disciplines[f'{length_type}_mean_lifespan'].std()
    cv_disciplinary_lifespan = std_disciplinary_lifespan / avg_disciplinary_lifespan
    individual_compared_disciplinary_lifespan = active_years / avg_disciplinary_lifespan
    individual_std_disciplinary_lifespan = active_years / std_disciplinary_lifespan
    individual_cv_disciplinary_lifespan = individual_std_disciplinary_lifespan / individual_compared_disciplinary_lifespan
    individual_compared_cv_disciplinary_lifespan = individual_cv_disciplinary_lifespan / cv_disciplinary_lifespan

    avg_disciplinary_category_lifespan = subset_finalized_disciplines[f'discipline_category_{length_type}_mean_lifespan'].mean()
    std_disciplinary_category_lifespan = subset_finalized_disciplines[f'discipline_category_{length_type}_mean_lifespan'].std()
    cv_disciplinary_category_lifespan = std_disciplinary_category_lifespan / avg_disciplinary_category_lifespan
    individual_compared_disciplinary_category_lifespan = active_years / avg_disciplinary_category_lifespan
    individual_std_disciplinary_category_lifespan = active_years / std_disciplinary_category_lifespan
    individual_cv_disciplinary_category_lifespan = individual_std_disciplinary_category_lifespan / individual_compared_disciplinary_category_lifespan
    individual_compared_cv_disciplinary_category_lifespan = individual_cv_disciplinary_category_lifespan / cv_disciplinary_category_lifespan

    row[f"{length_type}_compared_disciplinary_lifespan"] = individual_compared_disciplinary_lifespan
    row[f"{length_type}_std_disciplinary_lifespan"] = individual_std_disciplinary_lifespan
    row[f"{length_type}_compared_cv_disciplinary_lifespan"] = individual_compared_cv_disciplinary_lifespan
    row[f"{length_type}_compared_disciplinary_category_lifespan"] = individual_compared_disciplinary_category_lifespan
    row[f"{length_type}_std_disciplinary_category_lifespan"] = individual_std_disciplinary_category_lifespan
    row[f"{length_type}_compared_cv_disciplinary_category_lifespan"] = individual_compared_cv_disciplinary_category_lifespan

    return row



def calculate_publication_profile(row, finalized_disciplines, total_values, length_type, discipline_column):
    list_disciplines = row[discipline_column]
    if isinstance(list_disciplines, str):
        list_disciplines = ast.literal_eval(list_disciplines)
    
    if type(list_disciplines) != list:
        return row
    individual_active_years = row[f'{length_type}_active_years']

    # Calculate z-score, CV, and relative difference for lineage_active_years
    total_values_mean = total_values[total_values[f"{length_type}_statistic"] == "mean"][f'{length_type}_active_years'].values[0]
    total_values_std = total_values[total_values[f"{length_type}_statistic"] == "std"][f'{length_type}_active_years'].values[0]
    total_zscore = (individual_active_years - total_values_mean) / total_values_std
    total_cv = total_values_std / total_values_mean
    total_relative_difference = individual_active_years / total_values_mean

    row[f'total_{length_type}_zscore'] = total_zscore
    row[f'total_{length_type}_cv'] = total_cv
    row[f'total_{length_type}_relative_difference'] = total_relative_difference

    row = get_disciplinary_data(row, list_disciplines, finalized_disciplines, individual_active_years, length_type)

    return row

copied_discipline_data = discipline_data.copy()
tqdm.pandas(desc="Calculating publication profiles")
copied_discipline_data = copied_discipline_data.progress_apply(lambda row: calculate_publication_profile(row, finalized_disciplines,total_title_values, 'title', 'discipline'), axis=1)
copied_discipline_data = copied_discipline_data.progress_apply(lambda row: calculate_publication_profile(row, finalized_disciplines, total_lineage_values, 'lineage', 'aggregated_disciplines'), axis=1)

Calculating publication profiles:   0%|          | 0/4521 [00:00<?, ?it/s]

Calculating publication profiles: 100%|██████████| 4521/4521 [00:14<00:00, 318.68it/s]
Calculating publication profiles: 100%|██████████| 4521/4521 [00:12<00:00, 362.55it/s]


In [86]:
# copied_discipline_data.to_csv('../data/derived_datasets/finalized_publication_metadata.csv', index=False)