# %% [markdown]

 # Compilation of All Counts for <font color=DodgerBlue><u>*Original Corpora*</u></font> & <font color=Brown><u>***BiRC***</u></font>
 ___`BiRC`___ $=\text{``bigram restricted corpus'' or }\texttt{bigram\_subset}$

# %% [markdown]

 ### *imports*

In [None]:
from pathlib import Path
import pandas as pd
from am_notebooks import *

#! Toggle TESTING
# When True, `retrieve_count_data` works on a sample of the meta info and
# does not write newly compiled counts to disk.
TESTING = False
if TESTING:
    # Adjacent string literals are concatenated verbatim, so each piece must
    # carry its own line break/indent.
    print(
        '🚨⚠️ WARNING!\n  This is a **TEST** run!\n'
        '  Full count data will not be loaded or saved\n'
    )

# // from utils import file_size_round
# Root directory that is searched recursively for subset `.conllu` files.
SUBSET_DATA_DIR = Path('/share/compling/data/sanpi/subsets')
print(
    '==============================\n'
    f'Date: {pd.Timestamp.now().ctime()}\n'
    '=============================='
)

# Output locations: CSV caches under BIRC_INFO_DIR, LaTeX tables under
# BIRC_TEX_DIR. `SANPI_HOME`, `LATEX_TABLES` and `confirm_dir` come from the
# `am_notebooks` star import.
BIRC_INFO_DIR = SANPI_HOME.joinpath('info/BiRC')
confirm_dir(BIRC_INFO_DIR)
BIRC_META_CSV = BIRC_INFO_DIR.joinpath('meta-info-full.csv')
REL_BIRC_TEX_DIR = 'ch6/BiRC'
BIRC_TEX_DIR = LATEX_TABLES.joinpath(REL_BIRC_TEX_DIR)
confirm_dir(BIRC_TEX_DIR)

Date: Mon Feb 17 00:48:14 2025


# %% [markdown]

 ## **Define functions** to compile all the disparate meta info...

In [None]:
def collect_subset_path_index():
    """Collect path information for every subset ``.conllu`` file.

    Recursively searches ``SUBSET_DATA_DIR`` for ``*.conllu`` files (skipping
    any whose parent directory name starts with ``bigram-DEMO``) and records,
    for each one, its parent directory name, file name, full path, symlink
    target, BiRC subcorpus (name of the grandparent directory), symlink flag,
    and size in bytes. From the symlink target it derives 'corpus_part',
    'corpus' and 'subset_info_dir', and for each info directory it picks the
    'path_index_csv' with the most recent ``st_ctime``.

    As a sanity check, one path index csv is read and up to three of its
    entries are printed.

    Returns:
        pd.DataFrame: One row per subset file, indexed by 'data_key' (the file
            stem with the 'BIGRAM.' prefix removed); the original stem is kept
            in the 'subset_stem' column.
    """
    # NOTE(review): `readlink()` raises for a path that is not a symlink, so
    # every matched file is assumed to be a link into the source corpus tree.
    data = {
        p.stem: {
            'parent': p.parent.name,
            'name': p.name,
            'path': p,
            'source_path': p.readlink(),
            'birc_subcorpus': p.parent.parent.stem,
            'is_link': p.is_symlink(),
            # 'size': file_size_round(p.stat().st_size),
            'size': p.stat().st_size,  # ! no longer forcing into different units
        }
        for p in SUBSET_DATA_DIR.rglob('*.conllu')
        #! to prevent *DEMO* files being included:
        if not p.parent.name.startswith('bigram-DEMO')
    }
    bp_df = pd.DataFrame.from_dict(data, orient='index').convert_dtypes()
    bp_df['init_conllu'] = bp_df['name'].str.replace(
        'BIGRAM.', '', regex=False)
    bp_df['data_key'] = bp_df.index.str.replace('BIGRAM.', '', regex=False)

    # Everything below is derived from where the symlink points, not from
    # where the link itself lives.
    bp_df['corpus_part'], bp_df['corpus'], bp_df['subset_info_dir'] = zip(
        *bp_df.source_path.apply(
            lambda sp: (sp.parent.parent.stem,
                        sp.parent.parent.parent.stem,
                        sp.parent.joinpath('info')))
    )
    bp_df = bp_df.convert_dtypes()

    # NOTE(review): the styled sample is not displayed or returned here; the
    # call is retained from the original cell.
    set_my_style(bp_df.sample(1).T)

    # Most recently changed path index csv in each info directory.
    bp_df['path_index_csv'] = bp_df.subset_info_dir.apply(
        lambda i: max(i.glob('subset-bigram_path-index*csv'),
                      key=lambda file: file.stat().st_ctime))

    # Preview one path index file. The original referenced an undefined name
    # (`X`) here; the first row's index file is used instead.
    preview_csv = bp_df.path_index_csv.iloc[0]
    try:
        # Some index files have a leading space in the count column headers...
        path_info = pd.read_csv(preview_csv,
                                usecols=['STEM', ' INPUT_COUNTS',
                                         ' SUBSET_COUNTS']
                                ).set_index('STEM')
    except ValueError:
        # ...and some do not (`usecols` raises ValueError on a mismatch).
        path_info = pd.read_csv(preview_csv,
                                usecols=['STEM', 'INPUT_COUNTS',
                                         'SUBSET_COUNTS']
                                ).set_index('STEM')

    path_info.columns = path_info.columns.str.strip().str.lower()

    # Cap the preview at the number of rows available so that short index
    # files do not make `sample` raise.
    for stem in path_info.sample(min(3, len(path_info))).index:
        print(f'- {stem}')
        print(f'  - input_counts = {path_info.input_counts[stem]}')
        print(f'  - subset_counts = {path_info.subset_counts[stem]}')

    bp_df.index.name = 'subset_stem'
    return bp_df.reset_index().set_index('data_key')


def collect_meta_info(_bp_df):
    """Attach the input/subset count paths to the subset path index.

    Rows of ``_bp_df`` are grouped by their 'path_index_csv'. Each index csv
    is read (columns STEM, INPUT_COUNTS, SUBSET_COUNTS; STEM becomes the
    index) and joined onto its group of rows. Groups whose csv path contains
    'DEMO' are skipped. The joined groups are concatenated, the values of the
    last two columns are stripped of surrounding whitespace, and all column
    names are stripped and lower-cased.

    Args:
        _bp_df (pd.DataFrame): Path index with a 'path_index_csv' column, as
            returned by `collect_subset_path_index`.

    Returns:
        pd.DataFrame: The combined meta information.
    """
    joined_groups = []
    for index_csv, group_df in _bp_df.groupby('path_index_csv'):
        print(index_csv)
        if 'DEMO' in str(index_csv):
            continue

        # The count headers may or may not carry a leading space; `usecols`
        # raises ValueError when the requested names are not in the file.
        try:
            counts_by_stem = pd.read_csv(
                index_csv,
                usecols=['STEM', ' INPUT_COUNTS', ' SUBSET_COUNTS'],
            ).set_index('STEM')
        except ValueError:
            counts_by_stem = pd.read_csv(
                index_csv,
                usecols=['STEM', 'INPUT_COUNTS', 'SUBSET_COUNTS'],
            ).set_index('STEM')

        group_caption = str(index_csv.relative_to('/share/compling/data'))
        display(set_my_style(group_df.head(1).T, caption=group_caption))
        joined_groups.append(group_df.join(counts_by_stem))

    meta_df = pd.concat(joined_groups)
    # The two joined count columns are last; their values are stripped here.
    meta_df.iloc[:, -2:] = meta_df.iloc[:, -2:].apply(
        lambda col: col.str.strip())
    meta_df.columns = meta_df.columns.str.strip().str.lower()
    return meta_df

# %% [markdown]

 ## **Create or read meta info**

In [None]:
# Compile the meta info only when no cached csv exists; otherwise reuse it.
if not BIRC_META_CSV.is_file():
    bp_df = collect_subset_path_index()
    meta_df = collect_meta_info(bp_df)
    meta_df.to_csv(BIRC_META_CSV)
else:
    meta_df = pd.read_csv(
        BIRC_META_CSV, index_col='data_key'
    ).convert_dtypes()
    print(f'Meta Info DataFrame loaded from "{BIRC_META_CSV}"')

set_my_style(meta_df.head(1).T, caption='First Line of Meta DataFrame')

Meta Info DataFrame loaded from "/share/compling/projects/sanpi/info/BiRC/meta-info-full.csv"


data_key,apw_eng_199411
subset_stem,BIGRAM.apw_eng_199411
parent,bigram-Apw
name,BIGRAM.apw_eng_199411.conllu
path,/share/compling/data/sanpi/subsets/bigram_news/bigram-Apw/BIGRAM.apw_eng_199411.conllu
source_path,/share/compling/data/news/Apw.conll/subset_bigram/BIGRAM.apw_eng_199411.conllu
birc_subcorpus,bigram_news
is_link,True
size,11162354
init_conllu,apw_eng_199411.conllu
corpus_part,Apw


# %% [markdown]

 ## *Define functions* to retrieve count data

In [None]:
def load_totals(counts_path):
    """Read the 'total' column of a counts JSON file as a dictionary.

    A relative ``counts_path`` is resolved against
    ``/share/compling/data/sanpi/``; an absolute one is used as given.

    Args:
        counts_path (str or Path): Location of the counts JSON file.

    Returns:
        dict: The non-missing entries of the 'total' column, keyed by the
            row labels of the loaded table.
    """
    json_path = (
        counts_path
        if Path(counts_path).is_absolute()
        else f"/share/compling/data/sanpi/{counts_path}"
    )
    totals = pd.read_json(json_path).loc[:, 'total']
    return totals.dropna().to_dict()


def generate_counts(_meta_df, retrieval_key='input'):
    """Lazily yield the total counts for every row of the meta info.

    For each index label of ``_meta_df``, the counts file path is looked up in
    the ``'<retrieval_key>_counts'`` column and passed to `load_totals`.

    Args:
        _meta_df (pd.DataFrame): Meta info holding the counts file paths.
        retrieval_key (str, optional): Prefix of the counts-path column to
            read. Defaults to 'input'.

    Yields:
        tuple: ``(data_key, totals)`` where ``totals`` is whatever
            `load_totals` returns for that row's counts path.
    """
    path_column = f'{retrieval_key}_counts'
    for data_key in _meta_df.index:
        yield data_key, load_totals(_meta_df.at[data_key, path_column])


def retrieve_count_data(_counts_csv_path, _meta_df, test=False):
    """Load compiled counts from a csv cache, or compile (and cache) them.

    Whether BiRC (subset) or original (input) counts are wanted is decided by
    the file name: a name containing 'birc' (case-insensitive) selects the
    'subset' counts, anything else the 'input' counts.

    If ``_counts_csv_path`` exists it is read, with 'corpus_slice' as index
    when that column is present. Otherwise the counts are compiled with
    `generate_counts` and, unless ``test`` is set, written to
    ``_counts_csv_path``.

    Args:
        _counts_csv_path (Path): Location of the csv cache; must lie under
            ``SANPI_HOME`` (the path is reported relative to it).
        _meta_df (pd.DataFrame): Meta info holding the counts file paths.
        test (bool, optional): If True, compile from a random sample of at
            most 100 rows of ``_meta_df`` and do not write the csv.
            Defaults to False.

    Returns:
        pd.DataFrame: The counts, without any columns whose name starts with
            'NR_' (those columns are still written to the csv cache).
    """
    seek_birc = "birc" in _counts_csv_path.name.lower()
    _counts_label = "BiRC" if seek_birc else "Original"
    if _counts_csv_path.is_file():
        try:
            _counts_df = pd.read_csv(_counts_csv_path,
                                     index_col='corpus_slice').convert_dtypes()
        except ValueError:
            # No 'corpus_slice' column in the file: fall back to a plain read
            # (the result then keeps a default integer index).
            _counts_df = pd.read_csv(_counts_csv_path).convert_dtypes()

        print(
            f'**{_counts_label} Counts** read from csv',
            f'> Path: {_counts_csv_path.relative_to(SANPI_HOME)}',
            sep='\n')
    else:
        if test:
            # Cap the sample size so small meta frames do not make
            # `sample` raise.
            _meta_selection = _meta_df.sample(min(100, len(_meta_df)))
        else:
            _meta_selection = _meta_df.copy()
        _counts_df = pd.DataFrame.from_dict(
            dict(generate_counts(
                _meta_df=_meta_selection,
                retrieval_key='subset' if seek_birc else 'input')),
            orient='index')
        _counts_df.index.name = 'corpus_slice'
        _counts_df = _counts_df.reset_index().convert_dtypes().set_index('corpus_slice')
        # Write first, so the "saved" message is only printed after a
        # successful save.
        if not test:
            _counts_df.to_csv(_counts_csv_path)
        print(
            f'**{_counts_label} Counts** saved as csv',
            f'> Path: {_counts_csv_path.relative_to(SANPI_HOME)}',
            sep='\n')
        if test:
            print('TESTING (so, not really)')
    return _counts_df.loc[:, ~_counts_df.columns.str.startswith('NR_')]

# %% [markdown]

 ## **Load <font color=DodgerBlue><u>Original</u></font> counts** (if not previously collected)

In [None]:
# Reuse the cached csv if present; otherwise compile the counts from the
# per-file counts listed in the meta info.
orig_counts_df = retrieve_count_data(
    _meta_df=meta_df,
    _counts_csv_path=BIRC_INFO_DIR.joinpath('original-count-data.csv'),
    test=TESTING,
)

orig_counts_df.info()

**Original Counts** read from csv
> Path: info/BiRC/original-count-data.csv
<class 'pandas.core.frame.DataFrame'>
Index: 3617 entries, apw_eng_199411 to pcc_eng_val-03
Data columns (total 15 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   file_MB     3617 non-null   Float64
 1   sentences   3617 non-null   Int64  
 2   tokens      3617 non-null   Int64  
 3   ADV_tokens  3617 non-null   Int64  
 4   ADV_xpos    3617 non-null   Int64  
 5   ADV_lemmas  3617 non-null   Int64  
 6   ADV_forms   3617 non-null   Int64  
 7   ADJ_tokens  3617 non-null   Int64  
 8   ADJ_xpos    3617 non-null   Int64  
 9   ADJ_lemmas  3617 non-null   Int64  
 10  ADJ_forms   3617 non-null   Int64  
 11  NEG_tokens  3617 non-null   Int64  
 12  NEG_xpos    3617 non-null   Int64  
 13  NEG_lemmas  3617 non-null   Int64  
 14  NEG_forms   3617 non-null   Int64  
dtypes: Float64(1), Int64(14)
memory usage: 505.1+ KB


# %% [markdown]

 ## **...and <font color=Brown><u>BiRC</u></font> counts** (likewise)

In [None]:
# Same cache-or-compile routine as for the original counts; 'birc' in the
# file name makes `retrieve_count_data` select the subset counts.
birc_counts_df = retrieve_count_data(
    _meta_df=meta_df,
    _counts_csv_path=BIRC_INFO_DIR.joinpath('birc-count-data.csv'),
    test=TESTING,
)

birc_counts_df.info()

**BiRC Counts** read from csv
> Path: info/BiRC/birc-count-data.csv
<class 'pandas.core.frame.DataFrame'>
Index: 3617 entries, apw_eng_199411 to pcc_eng_val-03
Data columns (total 15 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   file_MB     3617 non-null   Float64
 1   sentences   3617 non-null   Int64  
 2   tokens      3617 non-null   Int64  
 3   ADV_tokens  3617 non-null   Int64  
 4   ADV_xpos    3617 non-null   Int64  
 5   ADV_lemmas  3617 non-null   Int64  
 6   ADV_forms   3617 non-null   Int64  
 7   ADJ_tokens  3617 non-null   Int64  
 8   ADJ_xpos    3617 non-null   Int64  
 9   ADJ_lemmas  3617 non-null   Int64  
 10  ADJ_forms   3617 non-null   Int64  
 11  NEG_tokens  3617 non-null   Int64  
 12  NEG_xpos    3617 non-null   Int64  
 13  NEG_lemmas  3617 non-null   Int64  
 14  NEG_forms   3617 non-null   Int64  
dtypes: Float64(1), Int64(14)
memory usage: 505.1+ KB


# %% [markdown]

 ## *Describe Count Collections*

In [None]:
# Descriptive statistics per count column, plus a column of grand totals.
# `.iloc[:, 1:]` leaves out the first (`count`) column of the transposed
# `describe()` output.
save_latex_table(
    orig_counts_df.describe().T.iloc[:, 1:]
    .assign(Total=orig_counts_df.sum())
    .convert_dtypes(),
    caption='Original Corpora: Descriptive Stats',
    verbose=True,
    latex_subdir=REL_BIRC_TEX_DIR,
    latex_stem='init-descrip-stats-orig',
)

save_latex_table(
    birc_counts_df.describe().T.iloc[:, 1:]
    .assign(Total=birc_counts_df.sum())
    .convert_dtypes(),
    caption='BiRC: Descriptive Stats',
    verbose=True,
    latex_subdir=REL_BIRC_TEX_DIR,
    latex_stem='init-descrip-stats-birc',
)

Caption: Original Corpora: Descriptive Stats


Unnamed: 0,mean,std,min,25\%,50\%,75\%,max,Total
,,,,,,,,
file\_MB,376.642759,48.076895,1.16,376.87,381.31,385.88,693.62,1362316.86
sentences,264905.37,67510.85,1147.0,249191.0,252229.0,255672.0,820724.0,958162724.0
tokens,7963980.62,1390308.39,27017.0,7746686.0,7844694.0,7950766.0,19532928.0,28805717900.0
ADV\_tokens,211630.73431,39606.499583,840.0,204460.0,207234.0,210390.0,570729.0,765468366.0
ADV\_xpos,3.0,0.0,3.0,3.0,3.0,3.0,3.0,10851.0
ADV\_lemmas,2373.409732,229.140218,185.0,2347.0,2380.0,2416.0,3473.0,8584623.0
ADV\_forms,3022.908211,305.797531,195.0,3036.0,3076.0,3117.0,3808.0,10933859.0
ADJ\_tokens,344358.620957,89210.573882,1809.0,321416.0,325172.0,329259.0,1076937.0,1245545132.0
ADJ\_xpos,3.0,0.0,3.0,3.0,3.0,3.0,3.0,10851.0


Stylized latex table saved as:
  OverleafDissertex/assets/tables/ch6/BiRC/init-descrip-stats-orig.2025-02-17.tex

Caption: BiRC: Descriptive Stats


Unnamed: 0,mean,std,min,25\%,50\%,75\%,max,Total
,,,,,,,,
file\_MB,48.411269,7.827544,0.09,49.26,50.12,50.93,76.52,175103.56
sentences,24564.02,4727.44,78.0,23867.0,24232.0,24603.0,66997.0,88848070.0
tokens,744482.66,115699.45,1836.0,741653.0,753252.0,765219.0,1591526.0,2692793781.0
ADV\_tokens,51777.230301,9319.391049,142.0,50814.0,51673.0,52575.0,128678.0,187278242.0
ADV\_xpos,3.0,0.0,3.0,3.0,3.0,3.0,3.0,10851.0
ADV\_lemmas,1345.696434,150.913612,63.0,1338.0,1361.0,1384.0,1963.0,4867384.0
ADV\_forms,1653.0564,197.147092,66.0,1665.0,1692.0,1718.0,2107.0,5979105.0
ADJ\_tokens,63886.013547,12921.907138,213.0,61758.0,62800.0,63867.0,177632.0,231075711.0
ADJ\_xpos,3.0,0.0,3.0,3.0,3.0,3.0,3.0,10851.0


Stylized latex table saved as:
  OverleafDissertex/assets/tables/ch6/BiRC/init-descrip-stats-birc.2025-02-17.tex



PosixPath('/share/compling/projects/arh234/OverleafDissertex/assets/tables/ch6/BiRC/init-descrip-stats-birc.2025-02-17.tex')

In [None]:
# %%

def add_rate_cols(_df):
    """Return a copy of ``_df`` with derived "rate" columns appended.

    Added columns, in order:
      * tokens per sentence, overall and for ADV/ADJ/NEG tokens;
      * NEG/ADV/ADJ tokens per million tokens;
      * for ADV and ADJ: forms per lemma, tokens per lemma, tokens per form;
      * NEG forms per lemma.

    Args:
        _df (pd.DataFrame): Counts with 'sentences', 'tokens' and, for each
            of ADV/ADJ/NEG, '<POS>_tokens', '<POS>_lemmas', '<POS>_forms'.

    Returns:
        pd.DataFrame: New frame; ``_df`` itself is not modified.
    """
    sentences = _df['sentences']
    tokens = _df['tokens']

    rates = {'tok_per_sent': tokens / sentences}
    for pos in ('ADV', 'ADJ', 'NEG'):
        rates[f'{pos}_tok_per_sent'] = _df[f'{pos}_tokens'] / sentences
    for pos in ('NEG', 'ADV', 'ADJ'):
        rates[f'{pos}_tok_per_mill'] = (
            _df[f'{pos}_tokens'] / tokens) * (10**6)
    # (The inverse `lemma_per_form` ratios are deliberately not included.)
    for pos in ('ADV', 'ADJ'):
        pos_tokens = _df[f'{pos}_tokens']
        pos_lemmas = _df[f'{pos}_lemmas']
        pos_forms = _df[f'{pos}_forms']
        rates[f'{pos}_form_per_lemma'] = pos_forms / pos_lemmas
        rates[f'{pos}_tok_per_lemma'] = pos_tokens / pos_lemmas
        rates[f'{pos}_tok_per_form'] = pos_tokens / pos_forms
    rates['NEG_form_per_lemma'] = _df['NEG_forms'] / _df['NEG_lemmas']

    # Keyword order determines the order of the appended columns.
    return _df.assign(**rates)


orig_counts_df = add_rate_cols(orig_counts_df)
birc_counts_df = add_rate_cols(birc_counts_df)

# Four randomly chosen corpus slices, reused below so that the same slices
# are shown for every table.
samix = orig_counts_df.sample(4).sort_index().index

nb_display(
    set_my_style(
        orig_counts_df.filter(like='per').loc[samix, :].T,
        caption='"Rate" columns for Original Counts',
    )
)
nb_display(
    set_my_style(
        birc_counts_df.filter(like='per').loc[samix, :].T,
        caption='"Rate" columns for BiRC Counts',
    )
)

corpus_slice,apw_eng_200808,pcc_eng_07-019,pcc_eng_20-020,pcc_eng_23-094
tok_per_sent,23.73,31.12,31.15,31.11
ADV_tok_per_sent,0.6,0.82,0.81,0.8
ADJ_tok_per_sent,1.36,1.29,1.28,1.28
NEG_tok_per_sent,0.14,0.17,0.17,0.16
NEG_tok_per_mill,5715.24,5425.61,5323.12,5246.17
ADV_tok_per_mill,25319.36,26479.48,25975.99,25830.36
ADJ_tok_per_mill,57441.8,41434.51,40947.24,41186.52
ADV_form_per_lemma,1.1,1.29,1.3,1.3
ADV_tok_per_lemma,139.44,86.89,87.78,87.24
ADV_tok_per_form,126.69,67.12,67.44,67.04


corpus_slice,apw_eng_200808,pcc_eng_07-019,pcc_eng_20-020,pcc_eng_23-094
tok_per_sent,23.64,31.14,31.13,31.13
ADV_tok_per_sent,1.8,2.14,2.15,2.12
ADJ_tok_per_sent,2.59,2.59,2.62,2.58
NEG_tok_per_sent,0.21,0.32,0.32,0.31
NEG_tok_per_mill,9026.42,10389.59,10416.47,10038.99
ADV_tok_per_mill,76281.28,68704.96,69034.44,67957.2
ADJ_tok_per_mill,109338.03,83200.09,84216.69,82974.7
ADV_form_per_lemma,1.08,1.25,1.26,1.26
ADV_tok_per_lemma,48.93,37.32,38.04,38.32
ADV_tok_per_form,45.38,29.95,30.29,30.5


In [None]:
# %%

# Descriptive statistics for counts *and* rates (all columns)...
save_latex_table(
    orig_counts_df.describe().T.iloc[:, 1:]
    .assign(Total=orig_counts_df.sum())
    .convert_dtypes(),
    caption='Original Corpora Counts & Rates: Descriptive Stats',
    verbose=True,
    latex_subdir=REL_BIRC_TEX_DIR,
    latex_stem='plus-rates-descrip-stats-orig',
)

save_latex_table(
    birc_counts_df.describe().T.iloc[:, 1:]
    .assign(Total=birc_counts_df.sum())
    .convert_dtypes(),
    caption='BiRC Counts & Rates: Descriptive Stats',
    verbose=True,
    latex_subdir=REL_BIRC_TEX_DIR,
    latex_stem='plus-rates-descrip-stats-birc',
)

# ...and for the rate columns only (those with 'per' in their name).
# NOTE(review): `Total` here is the sum of the per-slice rates, not a rate
# computed from the summed counts -- confirm that this is what is wanted.
save_latex_table(
    orig_counts_df.filter(like='per').describe().T.iloc[:, 1:]
    .assign(Total=orig_counts_df.sum())
    .convert_dtypes(),
    caption='Original Corpora Rates: Descriptive Stats',
    verbose=True,
    latex_subdir=REL_BIRC_TEX_DIR,
    latex_stem='only-rates-descrip-stats-orig',
)

save_latex_table(
    birc_counts_df.filter(like='per').describe().T.iloc[:, 1:]
    .assign(Total=birc_counts_df.sum())
    .convert_dtypes(),
    caption='BiRC Rates: Descriptive Stats',
    verbose=True,
    latex_subdir=REL_BIRC_TEX_DIR,
    latex_stem='only-rates-descrip-stats-birc',
)

Caption: Original Corpora Counts & Rates: Descriptive Stats


Unnamed: 0,mean,std,min,25\%,50\%,75\%,max,Total
,,,,,,,,
file\_MB,376.642759,48.076895,1.16,376.87,381.31,385.88,693.62,1362316.86
sentences,264905.37,67510.85,1147.0,249191.0,252229.0,255672.0,820724.0,958162724.0
tokens,7963980.62,1390308.39,27017.0,7746686.0,7844694.0,7950766.0,19532928.0,28805717900.0
ADV\_tokens,211630.73431,39606.499583,840.0,204460.0,207234.0,210390.0,570729.0,765468366.0
ADV\_xpos,3.0,0.0,3.0,3.0,3.0,3.0,3.0,10851.0
ADV\_lemmas,2373.409732,229.140218,185.0,2347.0,2380.0,2416.0,3473.0,8584623.0
ADV\_forms,3022.908211,305.797531,195.0,3036.0,3076.0,3117.0,3808.0,10933859.0
ADJ\_tokens,344358.620957,89210.573882,1809.0,321416.0,325172.0,329259.0,1076937.0,1245545132.0
ADJ\_xpos,3.0,0.0,3.0,3.0,3.0,3.0,3.0,10851.0


Stylized latex table saved as:
  OverleafDissertex/assets/tables/ch6/BiRC/plus-rates-descrip-stats-orig.2025-02-17.tex

Caption: BiRC Counts & Rates: Descriptive Stats


Unnamed: 0,mean,std,min,25\%,50\%,75\%,max,Total
,,,,,,,,
file\_MB,48.411269,7.827544,0.09,49.26,50.12,50.93,76.52,175103.56
sentences,24564.02,4727.44,78.0,23867.0,24232.0,24603.0,66997.0,88848070.0
tokens,744482.66,115699.45,1836.0,741653.0,753252.0,765219.0,1591526.0,2692793781.0
ADV\_tokens,51777.230301,9319.391049,142.0,50814.0,51673.0,52575.0,128678.0,187278242.0
ADV\_xpos,3.0,0.0,3.0,3.0,3.0,3.0,3.0,10851.0
ADV\_lemmas,1345.696434,150.913612,63.0,1338.0,1361.0,1384.0,1963.0,4867384.0
ADV\_forms,1653.0564,197.147092,66.0,1665.0,1692.0,1718.0,2107.0,5979105.0
ADJ\_tokens,63886.013547,12921.907138,213.0,61758.0,62800.0,63867.0,177632.0,231075711.0
ADJ\_xpos,3.0,0.0,3.0,3.0,3.0,3.0,3.0,10851.0


Stylized latex table saved as:
  OverleafDissertex/assets/tables/ch6/BiRC/plus-rates-descrip-stats-birc.2025-02-17.tex

Caption: Original Corpora Rates: Descriptive Stats


Unnamed: 0,mean,std,min,25\%,50\%,75\%,max,Total
,,,,,,,,
tok\_per\_sent,30.375293,2.297509,1.744827,31.112055,31.125939,31.139396,32.176999,109867.434803
ADV\_tok\_per\_sent,0.806727,0.059577,0.389156,0.815377,0.821356,0.826835,0.886415,2917.932666
ADJ\_tok\_per\_sent,1.299893,0.052521,0.952974,1.281407,1.289703,1.298582,1.648766,4701.714696
NEG\_tok\_per\_sent,0.167069,0.010113,0.092644,0.166405,0.16797,0.1696,0.223678,604.289089
NEG\_tok\_per\_mill,5551.801189,1609.759242,3929.536285,5353.658904,5402.466263,5459.674683,96553.464313,20080864.900171
ADV\_tok\_per\_mill,26718.41561,7713.049844,16506.162419,26229.799103,26415.459711,26606.826809,479278.349333,96640509.261629
ADJ\_tok\_per\_mill,43374.635341,12844.330364,37574.922799,41162.635743,41442.631915,41755.357843,737301.723841,156886056.027471
ADV\_form\_per\_lemma,1.272915,0.059846,1.054054,1.284886,1.291615,1.297678,1.327456,4604.135303
ADV\_tok\_per\_lemma,88.876853,11.642171,4.540541,85.791439,87.1162,88.454077,176.096575,321467.575759


Stylized latex table saved as:
  OverleafDissertex/assets/tables/ch6/BiRC/only-rates-descrip-stats-orig.2025-02-17.tex

Caption: BiRC Rates: Descriptive Stats


Unnamed: 0,mean,std,min,25\%,50\%,75\%,max,Total
,,,,,,,,
tok\_per\_sent,30.390398,2.264841,23.524093,31.125878,31.139457,31.152291,32.195978,109922.067876
ADV\_tok\_per\_sent,2.107402,0.093244,1.581904,2.121257,2.1333,2.143543,2.228142,7622.473013
ADJ\_tok\_per\_sent,2.598741,0.043692,2.271796,2.580042,2.592927,2.607428,3.075629,9399.644986
NEG\_tok\_per\_sent,0.311711,0.030635,0.163388,0.316392,0.320608,0.324286,0.363451,1127.45942
NEG\_tok\_per\_mill,10238.704921,448.389591,6901.384613,10184.950732,10311.474769,10426.156677,13071.895425,37033395.700265
ADV\_tok\_per\_mill,69566.598296,3391.553055,65676.41422,68286.559501,68617.616136,69000.780287,84889.384072,251622386.03512
ADJ\_tok\_per\_mill,86166.7605,8924.42402,79799.685689,82862.948134,83283.729315,83784.047329,120159.50458,311665172.727584
ADV\_form\_per\_lemma,1.226534,0.052351,1.034682,1.234364,1.242447,1.249267,1.297749,4436.37484
ADV\_tok\_per\_lemma,38.281514,4.180145,2.253968,37.363436,38.022828,38.710156,73.029512,138464.237438


Stylized latex table saved as:
  OverleafDissertex/assets/tables/ch6/BiRC/only-rates-descrip-stats-birc.2025-02-17.tex



PosixPath('/share/compling/projects/arh234/OverleafDissertex/assets/tables/ch6/BiRC/only-rates-descrip-stats-birc.2025-02-17.tex')

# %% [markdown]

 ## *Calculate **Reduction*** (<code>BiRC - Full</code>)

In [None]:
# Element-wise difference, aligned on corpus slice and column name.
reduction_df = birc_counts_df - orig_counts_df

# `.iloc[:, 1:]` drops the uninformative `count` column
save_latex_table(
    reduction_df.describe().T.iloc[:, 1:]
    .assign(Total=reduction_df.sum())
    .convert_dtypes(),
    caption='Reduction (BiRC -- Full) Counts & Rates: Descriptive Stats',
    verbose=True,
    latex_subdir=REL_BIRC_TEX_DIR,
    latex_stem='plus-rates-birc-minus-full-descrip-stats',
)
save_latex_table(
    reduction_df.filter(like='per').describe().T.iloc[:, 1:]
    .assign(Total=reduction_df.sum())
    .convert_dtypes(),
    caption='Reduction (BiRC -- Full) Rates: Descriptive Stats',
    verbose=True,
    latex_subdir=REL_BIRC_TEX_DIR,
    latex_stem='only-rates-birc-minus-full-descrip-stats',
)
# nb_display(set_my_style(reduction_df.describe().T.assign(Total=reduction_df.sum()).convert_dtypes(),
#                         caption='BiRC <i>Reduction</i>: Descriptive Stats', precision=1))
nb_display(
    set_my_style(
        reduction_df.filter(like='per').loc[samix, :].T,
        caption='"Rate" columns for <code>BiRC - Full</code> counts',
    )
)

Caption: Reduction (BiRC -- Full) Counts & Rates: Descriptive Stats


Unnamed: 0,mean,std,min,25\%,50\%,75\%,max,Total
,,,,,,,,
file\_MB,-328.23149,42.014842,-617.1,-335.07,-331.15,-327.31,-1.07,-1187213.3
sentences,-240341.35,63751.01,-753727.0,-231181.0,-228003.0,-225298.0,-1069.0,-869314654.0
tokens,-7219497.96,1307642.67,-17941402.0,-7188027.0,-7090899.0,-7001426.0,54335.0,-26112924119.0
ADV\_tokens,-159853.504009,31201.99475,-442051.0,-157847.0,-155494.0,-153469.0,-698.0,-578190124.0
ADV\_xpos,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
ADV\_lemmas,-1027.713298,95.083899,-1627.0,-1048.0,-1020.0,-994.0,-122.0,-3717239.0
ADV\_forms,-1369.851811,118.837295,-1816.0,-1417.0,-1384.0,-1354.0,-129.0,-4954754.0
ADJ\_tokens,-280472.607409,79264.142497,-899305.0,-265507.0,-262280.0,-259415.0,-1596.0,-1014469421.0
ADJ\_xpos,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


Stylized latex table saved as:
  OverleafDissertex/assets/tables/ch6/BiRC/plus-rates-birc-minus-full-descrip-stats.2025-02-17.tex

Caption: Reduction (BiRC -- Full) Rates: Descriptive Stats


Unnamed: 0,mean,std,min,25\%,50\%,75\%,max,Total
,,,,,,,,
tok\_per\_sent,0.015105,0.489739,-0.207146,0.00346,0.012637,0.019983,29.417698,54.633073
ADV\_tok\_per\_sent,1.300675,0.044423,1.064536,1.301333,1.311478,1.319974,1.387743,4704.540347
ADJ\_tok\_per\_sent,1.298847,0.045074,0.975717,1.291274,1.302782,1.314372,1.73009,4697.93029
NEG\_tok\_per\_sent,0.144642,0.026795,0.031708,0.149225,0.152656,0.155844,0.18823,523.170331
NEG\_tok\_per\_mill,4686.903732,1654.886587,-86285.862127,4784.877305,4898.73027,5000.881611,6033.782075,16952530.800093
ADV\_tok\_per\_mill,42848.182686,7973.780074,-410894.976566,41916.697892,42204.343297,42540.810184,55763.223206,154981876.773492
ADJ\_tok\_per\_mill,42792.125159,12211.102686,-654685.66771,41526.856564,41888.583624,42320.838324,63876.205028,154779116.700113
ADV\_form\_per\_lemma,-0.046381,0.01233,-0.082414,-0.054937,-0.047559,-0.040157,0.002536,-167.760463
ADV\_tok\_per\_lemma,-50.595338,8.04972,-104.071833,-50.176603,-49.099362,-48.120528,-2.286572,-183003.338321


Stylized latex table saved as:
  OverleafDissertex/assets/tables/ch6/BiRC/only-rates-birc-minus-full-descrip-stats.2025-02-17.tex



corpus_slice,apw_eng_200808,pcc_eng_07-019,pcc_eng_20-020,pcc_eng_23-094
tok_per_sent,-0.09,0.02,-0.02,0.01
ADV_tok_per_sent,1.2,1.32,1.34,1.31
ADJ_tok_per_sent,1.22,1.3,1.35,1.3
NEG_tok_per_sent,0.08,0.15,0.16,0.15
NEG_tok_per_mill,3311.18,4963.98,5093.35,4792.82
ADV_tok_per_mill,50961.92,42225.48,43058.45,42126.84
ADJ_tok_per_mill,51896.24,41765.58,43269.45,41788.18
ADV_form_per_lemma,-0.02,-0.05,-0.05,-0.04
ADV_tok_per_lemma,-90.51,-49.57,-49.74,-48.92
ADV_tok_per_form,-81.31,-37.17,-37.15,-36.54


In [None]:
# %%

def reconfigure_counts(_counts_df, count_kind: str):
    """Label a counts frame with its kind as a second column level.

    Every row is tagged with ``count_kind`` in a 'kind' column, which is then
    unstacked into the columns, so each original column ``c`` becomes
    ``(c, count_kind)`` while the rows stay keyed by 'corpus_slice'.

    Args:
        _counts_df (pd.DataFrame): Counts whose index is named 'corpus_slice'.
        count_kind (str): Label for this set of counts (e.g. 'Full', 'BiRC').

    Returns:
        pd.DataFrame: Frame with two-level columns; the second level is
            named 'kind'.
    """
    labeled = _counts_df.assign(kind=count_kind).reset_index()
    keyed = labeled.set_index(['kind', 'corpus_slice'])
    return keyed.unstack('kind')


# Give each counts frame a second column level ('kind') so that the three
# can be joined side by side without column-name clashes.
orig_s = reconfigure_counts(orig_counts_df, count_kind='Full')
birc_s = reconfigure_counts(birc_counts_df, 'BiRC')
diff_s = reconfigure_counts(reduction_df, 'diff')
# After sorting the columns, the BiRC/Full/diff versions of each measure
# sit next to one another.
juxta = orig_s.join(birc_s).join(diff_s).convert_dtypes().sort_index(axis=1)

# juxta = orig_counts_df.join(birc_counts_df, rsuffix=':BiRC', lsuffix=':Full').join(reduction_df.add_suffix(':diff')).convert_dtypes()
# `.iloc[1:, :]` drops the first (`count`) row of `describe()`.
juxta_desc = juxta.describe().iloc[1:, :]
juxta_desc.loc['TOTAL', :] = juxta.sum()
# Upper-case the statistic labels ('mean' -> 'MEAN', ...); the CV% row below
# relies on the upper-cased 'STD' and 'MEAN' labels.
juxta_desc.index = juxta_desc.index.str.upper()
# Coefficient of variation in percent: STD / MEAN * 100 (missing -> 0).
juxta_desc.loc['CV%', :] = (
    (juxta_desc.T.STD / juxta_desc.T.MEAN).fillna(0) * 100)
save_latex_table(juxta_desc.T.convert_dtypes(),
                 caption='Juxtaposed BiRC: Descriptive Stats', verbose=True,
                 latex_subdir=REL_BIRC_TEX_DIR,
                 latex_stem='birc-juxtaposed-descrip-stats-complete')

# Append the summary rows to the per-slice rows; from here on `juxta` holds
# both the corpus slices and the statistics.
juxta = pd.concat([juxta, juxta_desc])
# juxtaT = juxta.T.assign(TOTAL=juxta.sum(), MEAN=juxta.mean(), MEDIAN=juxta.median(),
#                         MAX=juxta.max(), MIN=juxta.min(),
#                         STD=juxta.std())
# juxta = juxtaT.sort_index().T.sort_index()
juxta = juxta.sort_index().sort_index(axis=1)

Caption: Juxtaposed BiRC: Descriptive Stats


Unnamed: 0_level_0,Unnamed: 1_level_0,MEAN,STD,MIN,25\%,50\%,75\%,MAX,TOTAL,CV\%
Unnamed: 0_level_1,Kind,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
ADJ\_form\_per\_lemma,BiRC,1.1,0.03,1.0,1.1,1.11,1.11,1.13,3969.33,2.43
ADJ\_form\_per\_lemma,Full,1.14,0.04,1.01,1.15,1.15,1.15,1.21,4107.78,3.19
ADJ\_form\_per\_lemma,diff,-0.04,0.01,-0.09,-0.04,-0.04,-0.04,-0.0,-138.45,-26.94
ADJ\_forms,BiRC,6928.6,1027.29,145.0,6788.0,6867.0,6955.0,12550.0,25060731.0,14.83
ADJ\_forms,Full,18132.96,3441.84,653.0,17249.0,17393.0,17553.0,40339.0,65586917.0,18.98
ADJ\_forms,diff,-11204.36,2636.3,-27789.0,-10649.0,-10531.0,-10431.0,-508.0,-40526186.0,-23.53
ADJ\_lemmas,BiRC,6321.67,1047.61,145.0,6135.0,6205.0,6287.0,12298.0,22865464.0,16.57
ADJ\_lemmas,Full,16051.86,3710.77,648.0,15030.0,15164.0,15316.0,39164.0,58059580.0,23.12
ADJ\_lemmas,diff,-9730.2,2852.52,-26866.0,-9058.0,-8951.0,-8862.0,-503.0,-35194116.0,-29.32
ADJ\_tok\_per\_form,BiRC,9.18,0.81,1.47,9.05,9.14,9.24,15.08,33199.51,8.82


Stylized latex table saved as:
  OverleafDissertex/assets/tables/ch6/BiRC/birc-juxtaposed-descrip-stats-complete.2025-02-17.tex



In [None]:
# %%

save_latex_table(
    pd.concat([
        # 👇 to ensure at least one `apw` and one `nyt` slice are included
        juxta.filter(like='apw', axis=0).sample(1),
        juxta.filter(like='nyt', axis=0).sample(1),
        # Random rows from position 9 onward.
        # NOTE(review): presumably 9 skips the summary rows (MEAN ... CV%)
        # that sort ahead of the corpus slice names -- confirm.
        juxta.iloc[9:, :].sample(15),
    ]).drop_duplicates().sort_index().T,
    caption='Sample of Juxtaposed Counts',
    verbose=True,
    latex_subdir=REL_BIRC_TEX_DIR,
    latex_stem='birc-juxtaposed-sample',
)

Caption: Sample of Juxtaposed Counts


Unnamed: 0,Unnamed: 1,apw\_eng\_200604,nyt\_eng\_199409,pcc\_eng\_01-052,pcc\_eng\_03-098,pcc\_eng\_05-024,pcc\_eng\_07-011,pcc\_eng\_08-055,pcc\_eng\_14-017,pcc\_eng\_15-069,pcc\_eng\_16-029,pcc\_eng\_17-095,pcc\_eng\_18-072,pcc\_eng\_22-092,pcc\_eng\_24-052,pcc\_eng\_26-003,pcc\_eng\_26-078,pcc\_eng\_28-087
,,,,,,,,,,,,,,,,,,
ADJ\_form\_per\_lemma,BiRC,1.02,1.02,1.1,1.11,1.1,1.11,1.11,1.11,1.11,1.1,1.11,1.11,1.1,1.11,1.1,1.11,1.11
ADJ\_form\_per\_lemma,Full,1.03,1.03,1.15,1.15,1.14,1.15,1.15,1.15,1.15,1.14,1.15,1.15,1.15,1.15,1.15,1.15,1.14
ADJ\_form\_per\_lemma,diff,-0.0,-0.01,-0.05,-0.05,-0.04,-0.05,-0.04,-0.04,-0.04,-0.04,-0.04,-0.04,-0.04,-0.04,-0.05,-0.04,-0.04
ADJ\_forms,BiRC,5044.0,9064.0,6897.0,6870.0,6965.0,6834.0,6756.0,6964.0,6961.0,6932.0,7143.0,6829.0,6825.0,6967.0,6780.0,6723.0,7088.0
ADJ\_forms,Full,19007.0,25874.0,17600.0,17277.0,17684.0,17337.0,17422.0,17413.0,17354.0,17362.0,17628.0,17417.0,17163.0,17229.0,17308.0,17172.0,17562.0
ADJ\_forms,diff,-13963.0,-16810.0,-10703.0,-10407.0,-10719.0,-10503.0,-10666.0,-10449.0,-10393.0,-10430.0,-10485.0,-10588.0,-10338.0,-10262.0,-10528.0,-10449.0,-10474.0
ADJ\_lemmas,BiRC,4924.0,8907.0,6258.0,6211.0,6330.0,6173.0,6081.0,6283.0,6278.0,6275.0,6431.0,6165.0,6180.0,6303.0,6146.0,6052.0,6410.0
ADJ\_lemmas,Full,18469.0,25233.0,15310.0,15003.0,15445.0,15045.0,15153.0,15171.0,15136.0,15164.0,15331.0,15211.0,14964.0,15031.0,15018.0,14935.0,15341.0
ADJ\_lemmas,diff,-13545.0,-16326.0,-9052.0,-8792.0,-9115.0,-8872.0,-9072.0,-8888.0,-8858.0,-8889.0,-8900.0,-9046.0,-8784.0,-8728.0,-8872.0,-8883.0,-8931.0


Stylized latex table saved as:
  OverleafDissertex/assets/tables/ch6/BiRC/birc-juxtaposed-sample.2025-02-17.tex



PosixPath('/share/compling/projects/arh234/OverleafDissertex/assets/tables/ch6/BiRC/birc-juxtaposed-sample.2025-02-17.tex')