# Compilation of All Counts for <font color=DodgerBlue><u>*Original Corpora*</u></font> & <font color=Brown><u>***BiRC***</u></font>
___`BiRC`___ $=\text{``bigram restricted corpus'' or }\texttt{bigram\_subset}$

### *imports*

In [1]:
from pathlib import Path
import pandas as pd
from am_notebooks import *

#! Toggle TESTING
# When True, this flag is passed as `test=` to `retrieve_count_data`, which
# then works on a sample of the meta info and does not write the counts csv.
TESTING = False
if TESTING:
    # Adjacent string literals are concatenated verbatim, so every line of
    # the message carries its own explicit newline/indent.
    print(
        '🚨⚠️ WARNING!\n'
        '  This is a **TEST** run!\n'
        '  Full count data will not be loaded or saved\n'
    )

# // from utils import file_size_round
# Directory that is searched recursively for the subset `*.conllu` files.
SUBSET_DATA_DIR = Path('/share/compling/data/sanpi/subsets')
# Timestamp banner so the saved output records when this cell last ran.
print(
    '==============================\n'
    f'Date: {pd.Timestamp.now().ctime()}\n'
    '=============================='
)

# `SANPI_HOME`, `LATEX_TABLES` and `confirm_dir` are not defined in this
# notebook; presumably they come from the `am_notebooks` star import.
# NOTE(review): `confirm_dir` presumably creates the directory when it is
# missing -- confirm against `am_notebooks`.
BIRC_INFO_DIR = SANPI_HOME.joinpath('info/BiRC')
confirm_dir(BIRC_INFO_DIR)
# Cache file for the compiled meta info (read back in the "Create or read
# meta info" cell when it already exists).
BIRC_META_CSV = BIRC_INFO_DIR.joinpath('meta-info-full.csv')
# Sub-directory for this notebook's tables; passed as `latex_subdir` to
# every `save_latex_table` call below.
REL_BIRC_TEX_DIR ='ch6/BiRC'
BIRC_TEX_DIR = LATEX_TABLES.joinpath(REL_BIRC_TEX_DIR)
confirm_dir(BIRC_TEX_DIR)

Date: Tue Feb 18 00:19:32 2025




 ## **Define functions** to compile all the disparate meta info...

In [2]:
def _load_path_index_preview(index_csv):
    """Read the count-file columns of one path-index csv, keyed by ``STEM``.

    The header may spell the count columns with or without a leading blank
    (``' INPUT_COUNTS'`` vs ``'INPUT_COUNTS'``); the padded spelling is tried
    first. Column names are stripped and lower-cased before returning.
    """
    try:
        preview = pd.read_csv(
            index_csv,
            usecols=['STEM', ' INPUT_COUNTS', ' SUBSET_COUNTS'],
        ).set_index('STEM')
    except ValueError:
        # pandas raises ValueError when a `usecols` name is not in the header
        preview = pd.read_csv(
            index_csv,
            usecols=['STEM', 'INPUT_COUNTS', 'SUBSET_COUNTS'],
        ).set_index('STEM')
    preview.columns = preview.columns.str.strip().str.lower()
    return preview


def collect_subset_path_index():
    """Collect subset path index information.

    Every ``*.conllu`` file found recursively under ``SUBSET_DATA_DIR`` (except
    those whose parent directory starts with ``bigram-DEMO``) becomes one row
    holding: parent directory name, file name, full path, symlink target
    (``source_path``), BiRC subcorpus, ``is_link`` and size in bytes.

    Derived columns are then added: ``init_conllu`` and ``data_key`` (file name
    and stem with the ``BIGRAM.`` prefix removed), ``corpus_part``, ``corpus``
    and ``subset_info_dir`` (all taken from ``source_path``), and
    ``path_index_csv`` (the most recently changed
    ``subset-bigram_path-index*csv`` in ``subset_info_dir``).

    As a sanity check, up to three random entries of the first row's path
    index are printed with their input/subset count paths.

    Returns:
        pd.DataFrame: The collected subset path index info, indexed by
            ``data_key`` (the original stem is kept as ``subset_stem``).

    Raises:
        OSError: If a matched file is not a symlink (from ``Path.readlink``).
        ValueError: If a ``subset_info_dir`` holds no path-index csv.
    """
    data = {
        p.stem: {
            'parent': p.parent.name,
            'name': p.name,
            'path': p,
            # NOTE: `readlink` raises for regular files, so every collected
            # file is expected to be a symlink into the source corpus
            'source_path': p.readlink(),
            'birc_subcorpus': p.parent.parent.stem,
            'is_link': p.is_symlink(),
            # ! raw byte count; no longer forcing into different units
            'size': p.stat().st_size,
        }
        for p in SUBSET_DATA_DIR.rglob('*.conllu')
        #! to prevent *DEMO* files being included:
        if not p.parent.name.startswith('bigram-DEMO')
    }
    bp_df = pd.DataFrame.from_dict(data, orient='index').convert_dtypes()
    bp_df['init_conllu'] = bp_df['name'].str.replace(
        'BIGRAM.', '', regex=False)
    bp_df['data_key'] = bp_df.index.str.replace('BIGRAM.', '', regex=False)

    # all three derived values come from the location of the symlink target
    bp_df['corpus_part'], bp_df['corpus'], bp_df['subset_info_dir'] = zip(
        *bp_df.source_path.apply(
            lambda sp: (sp.parent.parent.stem,
                        sp.parent.parent.parent.stem,
                        sp.parent.joinpath('info')))
    )
    bp_df = bp_df.convert_dtypes()

    # NOTE(review): the styled sample is discarded here; presumably it was
    # meant to be displayed -- confirm before removing or wrapping it.
    set_my_style(bp_df.sample(1).T)

    # newest (by ctime) path-index csv in each subset's info directory
    bp_df['path_index_csv'] = bp_df.subset_info_dir.apply(
        lambda i: max(i.glob('subset-bigram_path-index*csv'),
                      key=lambda file: file.stat().st_ctime))

    # Preview one path index. (The original read `X.path_index_csv`, but no
    # `X` is defined anywhere in scope, so the call raised NameError.)
    path_info = _load_path_index_preview(bp_df.path_index_csv.iloc[0])
    n_preview = min(3, len(path_info))  # `sample` fails if asked for too many
    for stem in path_info.sample(n_preview).index:
        print(f'- {stem}')
        print(f'  - input_counts = {path_info.input_counts[stem]}')
        print(f'  - subset_counts = {path_info.subset_counts[stem]}')

    bp_df.index.name = 'subset_stem'
    return bp_df.reset_index().set_index('data_key')


def collect_meta_info(_bp_df):
    """Attach the count-file paths from each path-index csv to the path info.

    Rows of ``_bp_df`` are grouped by their ``path_index_csv``. For every group
    (groups whose csv path contains ``DEMO`` are skipped) the ``STEM``,
    ``INPUT_COUNTS`` and ``SUBSET_COUNTS`` columns of that csv are read and
    joined onto the group by index. The joined groups are concatenated, the
    string values of the last two columns are stripped, and all column names
    are stripped and lower-cased.

    Args:
        _bp_df (pd.DataFrame): DataFrame containing file paths and other info;
            must have a ``path_index_csv`` column.

    Returns:
        pd.DataFrame: The combined metadata information.
    """
    # the header may or may not carry a blank before the count column names
    padded_header = ['STEM', ' INPUT_COUNTS', ' SUBSET_COUNTS']
    plain_header = ['STEM', 'INPUT_COUNTS', 'SUBSET_COUNTS']

    joined_groups = []
    for index_csv, group_df in _bp_df.groupby('path_index_csv'):
        print(index_csv)
        if 'DEMO' in str(index_csv):
            continue
        try:
            counts_by_stem = pd.read_csv(
                index_csv, usecols=padded_header).set_index('STEM')
        except ValueError:
            counts_by_stem = pd.read_csv(
                index_csv, usecols=plain_header).set_index('STEM')
        # show the first row of the group, captioned with the csv it uses
        first_row_view = set_my_style(
            group_df.head(1).T,
            caption=str(index_csv.relative_to('/share/compling/data')))
        display(first_row_view)
        joined_groups.append(group_df.join(counts_by_stem))

    meta_df = pd.concat(joined_groups)
    # the two joined count-path columns are the last two columns
    trailing_cols = meta_df.iloc[:, -2:]
    meta_df.iloc[:, -2:] = trailing_cols.apply(lambda col: col.str.strip())
    meta_df.columns = meta_df.columns.str.strip().str.lower()
    return meta_df



 ## **Create or read meta info**

In [3]:
# Build the meta info from scratch only when no cached csv exists;
# otherwise reuse the cached file.
if not BIRC_META_CSV.is_file():
    bp_df = collect_subset_path_index()
    meta_df = collect_meta_info(bp_df)
    meta_df.to_csv(BIRC_META_CSV)
else:
    meta_df = pd.read_csv(BIRC_META_CSV, index_col='data_key')
    meta_df = meta_df.convert_dtypes()
    print(f'Meta Info DataFrame loaded from "{BIRC_META_CSV}"')

# last expression of the cell: styled view of the first meta row
set_my_style(meta_df.head(1).T, caption='First Line of Meta DataFrame')

Meta Info DataFrame loaded from "/share/compling/projects/sanpi/info/BiRC/meta-info-full.csv"


data_key,apw_eng_199411
subset_stem,BIGRAM.apw_eng_199411
parent,bigram-Apw
name,BIGRAM.apw_eng_199411.conllu
path,/share/compling/data/sanpi/subsets/bigram_news/bigram-Apw/BIGRAM.apw_eng_199411.conllu
source_path,/share/compling/data/news/Apw.conll/subset_bigram/BIGRAM.apw_eng_199411.conllu
birc_subcorpus,bigram_news
is_link,True
size,11162354
init_conllu,apw_eng_199411.conllu
corpus_part,Apw




 ## *Define functions* to retrieve count data

In [4]:
def load_totals(counts_path, data_dir='/share/compling/data/sanpi'):
    """Load total counts from a JSON file.

    This function reads a JSON file containing counts data, extracts the 'total'
    column, drops any missing values (NaN), and returns the result as a dictionary.
    If the provided path is not absolute, it is resolved against `data_dir`.

    Args:
        counts_path (str or Path): Path to the JSON file.
        data_dir (str or Path, optional): Directory that relative paths are
            resolved against. Defaults to the sanpi data directory,
            '/share/compling/data/sanpi'.

    Returns:
        dict: A dictionary containing the total counts, with keys corresponding to
            the original index of the 'total' column.

    Raises:
        KeyError: If the JSON data has no 'total' column.
    """
    counts_path = Path(counts_path)
    if not counts_path.is_absolute():
        counts_path = Path(data_dir) / counts_path
    counts_df = pd.read_json(counts_path)
    return counts_df.loc[:, 'total'].dropna().to_dict()


def generate_counts(_meta_df, retrieval_key='input'):
    """Lazily yield ``(data_key, totals)`` for every row of the meta info.

    The counts file for a row is taken from its ``'<retrieval_key>_counts'``
    column and loaded with `load_totals`.

    Args:
        _meta_df (pd.DataFrame): DataFrame containing metadata information.
        retrieval_key (str, optional): Prefix of the ``*_counts`` column to
            read the counts path from. Defaults to 'input'.

    Yields:
        tuple: The data key (index label) and its corresponding totals.
    """
    counts_column = f'{retrieval_key}_counts'
    yield from (
        (key, load_totals(_meta_df.at[key, counts_column]))
        for key in _meta_df.index
    )


def retrieve_count_data(_counts_csv_path, _meta_df, test=False):
    """Read a compiled counts table from csv, or build (and save) it.

    Whether BiRC (subset) or original (input) counts are wanted is decided
    by the file name: a name containing "birc" (case-insensitive) selects the
    ``subset_counts`` files, anything else the ``input_counts`` files.

    If the csv exists it is read; otherwise the totals of every row in
    `_meta_df` are loaded via `generate_counts` and written to the csv.

    Args:
        _counts_csv_path (Path): Location of the compiled counts csv.
        _meta_df (pd.DataFrame): Meta info with ``input_counts`` and
            ``subset_counts`` columns, indexed by data key.
        test (bool, optional): If True and the csv must be built, only a
            sample of at most 100 rows is used and nothing is saved.
            Defaults to False.

    Returns:
        pd.DataFrame: Counts indexed by ``corpus_slice``, without any
            columns whose name starts with ``NR_``.
    """
    seek_birc = "birc" in _counts_csv_path.name.lower()
    _counts_label = "BiRC" if seek_birc else "Original"
    # the relative path is only used for log messages, so a csv outside of
    # SANPI_HOME should not abort the retrieval
    try:
        _shown_path = _counts_csv_path.relative_to(SANPI_HOME)
    except ValueError:
        _shown_path = _counts_csv_path

    if _counts_csv_path.is_file():
        try:
            _counts_df = pd.read_csv(_counts_csv_path,
                                     index_col='corpus_slice')
        except ValueError:
            # No `corpus_slice` header. NOTE(review): assumes such files
            # store the slice names as their (unnamed) first column, so the
            # result is indexed like every other return path -- confirm.
            _counts_df = pd.read_csv(_counts_csv_path, index_col=0)
            _counts_df.index.name = 'corpus_slice'
        _counts_df = _counts_df.convert_dtypes()

        print(
            f'**{_counts_label} Counts** read from csv',
            f'> Path: {_shown_path}',
            sep='\n')
    else:
        # `sample` raises when asked for more rows than there are
        _meta_selection = (
            _meta_df.sample(min(100, len(_meta_df))) if test else _meta_df)
        _counts_df = pd.DataFrame.from_dict(
            dict(generate_counts(
                _meta_df=_meta_selection,
                retrieval_key='subset' if seek_birc else 'input')),
            orient='index')
        _counts_df.index.name = 'corpus_slice'
        # round trip through a column so the index labels are converted too
        _counts_df = (_counts_df.reset_index()
                      .convert_dtypes()
                      .set_index('corpus_slice'))
        # write first, so "saved" is only announced after a successful save
        if not test:
            _counts_df.to_csv(_counts_csv_path)
        print(
            f'**{_counts_label} Counts** saved as csv',
            f'> Path: {_shown_path}',
            sep='\n')
        if test:
            print('TESTING (so, not really)')
    return _counts_df.loc[:, ~_counts_df.columns.str.startswith('NR_')]



 ## **Load <font color=DodgerBlue><u>Original</u></font> counts** (if not previously collected)

In [5]:
# Per-file counts for the original (full) corpora. The file name does not
# contain "birc", so `retrieve_count_data` uses the `input_counts` files
# when the csv has to be built.
orig_counts_df = retrieve_count_data(
    _counts_csv_path=BIRC_INFO_DIR.joinpath('original-count-data.csv'),
    _meta_df=meta_df,
    test=TESTING
)

# quick check of row count, columns and dtypes
orig_counts_df.info()

**Original Counts** read from csv
> Path: info/BiRC/original-count-data.csv
<class 'pandas.core.frame.DataFrame'>
Index: 3617 entries, apw_eng_199411 to pcc_eng_val-03
Data columns (total 15 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   file_MB     3617 non-null   Float64
 1   sentences   3617 non-null   Int64  
 2   tokens      3617 non-null   Int64  
 3   ADV_tokens  3617 non-null   Int64  
 4   ADV_xpos    3617 non-null   Int64  
 5   ADV_lemmas  3617 non-null   Int64  
 6   ADV_forms   3617 non-null   Int64  
 7   ADJ_tokens  3617 non-null   Int64  
 8   ADJ_xpos    3617 non-null   Int64  
 9   ADJ_lemmas  3617 non-null   Int64  
 10  ADJ_forms   3617 non-null   Int64  
 11  NEG_tokens  3617 non-null   Int64  
 12  NEG_xpos    3617 non-null   Int64  
 13  NEG_lemmas  3617 non-null   Int64  
 14  NEG_forms   3617 non-null   Int64  
dtypes: Float64(1), Int64(14)
memory usage: 505.1+ KB




 ## **...and <font color=Brown><u>BiRC</u></font> counts** (likewise)

In [6]:
# Per-file counts for the BiRC subsets. "birc" in the file name makes
# `retrieve_count_data` use the `subset_counts` files when the csv has to
# be built.
birc_counts_df = retrieve_count_data(
    _counts_csv_path=BIRC_INFO_DIR.joinpath('birc-count-data.csv'),
    _meta_df=meta_df,
    test=TESTING
)

# quick check of row count, columns and dtypes
birc_counts_df.info()
# NOTE: an earlier inline read-or-build version of this cell (using a
# `birc_counts_composite_csv` path) was superseded by `retrieve_count_data`
# and has been dropped from the comments here.

**BiRC Counts** read from csv
> Path: info/BiRC/birc-count-data.csv
<class 'pandas.core.frame.DataFrame'>
Index: 3617 entries, apw_eng_199411 to pcc_eng_val-03
Data columns (total 15 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   file_MB     3617 non-null   Float64
 1   sentences   3617 non-null   Int64  
 2   tokens      3617 non-null   Int64  
 3   ADV_tokens  3617 non-null   Int64  
 4   ADV_xpos    3617 non-null   Int64  
 5   ADV_lemmas  3617 non-null   Int64  
 6   ADV_forms   3617 non-null   Int64  
 7   ADJ_tokens  3617 non-null   Int64  
 8   ADJ_xpos    3617 non-null   Int64  
 9   ADJ_lemmas  3617 non-null   Int64  
 10  ADJ_forms   3617 non-null   Int64  
 11  NEG_tokens  3617 non-null   Int64  
 12  NEG_xpos    3617 non-null   Int64  
 13  NEG_lemmas  3617 non-null   Int64  
 14  NEG_forms   3617 non-null   Int64  
dtypes: Float64(1), Int64(14)
memory usage: 505.1+ KB




 ## *Describe Count Collections*

In [7]:
# Mean, median and grand total of every count column, one table per
# collection. (`iloc[:, 1:]` drops the `count` statistic.)
save_latex_table(
    (
        orig_counts_df.describe().T.iloc[:, 1:]
        .assign(Total=orig_counts_df.sum())
        .filter(['mean', '50%', 'Total'])
        .convert_dtypes()
    ),
    caption='Original Corpora: Descriptive Stats',
    latex_stem='only-counts-descrip-stats-orig',
    latex_subdir=REL_BIRC_TEX_DIR,
    default_SI=7.1,
    verbose=True,
)

save_latex_table(
    (
        birc_counts_df.describe().T.iloc[:, 1:]
        .assign(Total=birc_counts_df.sum())
        .filter(['mean', '50%', 'Total'])
        .convert_dtypes()
    ),
    caption='BiRC: Descriptive Stats',
    latex_stem='only-counts-descrip-stats-birc',
    latex_subdir=REL_BIRC_TEX_DIR,
    default_SI=7.1,
    verbose=True,
)

Caption: Original Corpora: Descriptive Stats
*{1}{l}S[table-auto-round, table-format=7.1, drop-zero-decimal]
    S[table-auto-round, table-format=7.1, drop-zero-decimal]
    S[table-auto-round, table-format=7.1, drop-zero-decimal]


Unnamed: 0,mean,50\%,Total
,,,
file\_MB,376.642759,381.31,1362316.86
sentences,264905.37,252229.0,958162724.0
tokens,7963980.62,7844694.0,28805717900.0
ADV\_tokens,211630.73431,207234.0,765468366.0
ADV\_xpos,3.0,3.0,10851.0
ADV\_lemmas,2373.409732,2380.0,8584623.0
ADV\_forms,3022.908211,3076.0,10933859.0
ADJ\_tokens,344358.620957,325172.0,1245545132.0
ADJ\_xpos,3.0,3.0,10851.0


Stylized latex table saved as:
  OverleafDissertex/assets/tables/ch6/BiRC/only-counts-descrip-stats-orig.2025-02-18.tex

Caption: BiRC: Descriptive Stats
*{1}{l}S[table-auto-round, table-format=7.1, drop-zero-decimal]
    S[table-auto-round, table-format=7.1, drop-zero-decimal]
    S[table-auto-round, table-format=7.1, drop-zero-decimal]


Unnamed: 0,mean,50\%,Total
,,,
file\_MB,48.411269,50.12,175103.56
sentences,24564.02,24232.0,88848070.0
tokens,744482.66,753252.0,2692793781.0
ADV\_tokens,51777.230301,51673.0,187278242.0
ADV\_xpos,3.0,3.0,10851.0
ADV\_lemmas,1345.696434,1361.0,4867384.0
ADV\_forms,1653.0564,1692.0,5979105.0
ADJ\_tokens,63886.013547,62800.0,231075711.0
ADJ\_xpos,3.0,3.0,10851.0


Stylized latex table saved as:
  OverleafDissertex/assets/tables/ch6/BiRC/only-counts-descrip-stats-birc.2025-02-18.tex



PosixPath('/share/compling/projects/arh234/OverleafDissertex/assets/tables/ch6/BiRC/only-counts-descrip-stats-birc.2025-02-18.tex')

In [8]:
def add_rate_cols(_df):
    """Return a copy of `_df` with derived "rate" columns appended.

    Added columns (all plain ratios of existing count columns):
      * tokens per sentence, overall and for ADV / ADJ / NEG tokens
      * NEG / ADV / ADJ tokens per million tokens
      * forms per lemma, tokens per lemma and tokens per form for ADV and ADJ
      * forms per lemma and tokens per lemma for NEG

    The input frame itself is not modified.
    """
    sentences, tokens = _df.sentences, _df.tokens
    adv_tokens, adj_tokens, neg_tokens = (
        _df.ADV_tokens, _df.ADJ_tokens, _df.NEG_tokens)
    adv_lemmas, adv_forms = _df.ADV_lemmas, _df.ADV_forms
    adj_lemmas, adj_forms = _df.ADJ_lemmas, _df.ADJ_forms
    neg_lemmas, neg_forms = _df.NEG_lemmas, _df.NEG_forms
    per_million = 10**6

    # insertion order of this dict is the order of the new columns
    rate_columns = {
        'tok_per_sent': tokens / sentences,
        'ADV_tok_per_sent': adv_tokens / sentences,
        'ADJ_tok_per_sent': adj_tokens / sentences,
        'NEG_tok_per_sent': neg_tokens / sentences,
        'NEG_tok_per_mill': (neg_tokens / tokens) * per_million,
        'ADV_tok_per_mill': (adv_tokens / tokens) * per_million,
        'ADJ_tok_per_mill': (adj_tokens / tokens) * per_million,
        'ADV_form_per_lemma': adv_forms / adv_lemmas,
        'ADV_tok_per_lemma': adv_tokens / adv_lemmas,
        'ADV_tok_per_form': adv_tokens / adv_forms,
        'ADJ_form_per_lemma': adj_forms / adj_lemmas,
        'ADJ_tok_per_lemma': adj_tokens / adj_lemmas,
        'ADJ_tok_per_form': adj_tokens / adj_forms,
        'NEG_form_per_lemma': neg_forms / neg_lemmas,
        'NEG_tok_per_lemma': neg_tokens / neg_lemmas,
    }
    return _df.assign(**rate_columns)


# From here on both count frames also carry the derived "rate" columns.
orig_counts_df = add_rate_cols(orig_counts_df)
birc_counts_df = add_rate_cols(birc_counts_df)
# Four randomly chosen corpus slices; the same labels are reused for every
# side-by-side preview below (including the reduction preview).
samix = orig_counts_df.sample(4).sort_index().index
# Show only the columns whose name contains "per", one slice per column.
nb_display(set_my_style(orig_counts_df.filter(
    like='per').loc[samix, :].T, caption='"Rate" columns for Original Counts'))
nb_display(set_my_style(birc_counts_df.filter(
    like='per').loc[samix, :].T, caption='"Rate" columns for BiRC Counts'))

corpus_slice,apw_eng_200703,pcc_eng_00-041,pcc_eng_04-100,pcc_eng_14-056
tok_per_sent,23.78,31.11,31.12,31.13
ADV_tok_per_sent,0.53,0.83,0.81,0.82
ADJ_tok_per_sent,1.38,1.29,1.28,1.28
NEG_tok_per_sent,0.13,0.17,0.16,0.17
NEG_tok_per_mill,5266.44,5400.16,5287.51,5321.69
ADV_tok_per_mill,22403.64,26583.96,26037.86,26302.53
ADJ_tok_per_mill,57915.99,41436.97,41018.66,41187.93
ADV_form_per_lemma,1.11,1.28,1.3,1.29
ADV_tok_per_lemma,152.02,86.53,85.95,89.44
ADV_tok_per_form,136.85,67.54,66.32,69.16


corpus_slice,apw_eng_200703,pcc_eng_00-041,pcc_eng_04-100,pcc_eng_14-056
tok_per_sent,23.6,31.14,31.13,31.14
ADV_tok_per_sent,1.76,2.14,2.11,2.16
ADJ_tok_per_sent,2.7,2.59,2.59,2.6
NEG_tok_per_sent,0.21,0.32,0.31,0.32
NEG_tok_per_mill,8840.34,10168.94,10118.83,10287.75
ADV_tok_per_mill,74735.43,68590.78,67883.27,69244.24
ADJ_tok_per_mill,114217.96,83296.03,83255.13,83528.52
ADV_form_per_lemma,1.1,1.23,1.24,1.25
ADV_tok_per_lemma,55.07,37.37,37.93,39.34
ADV_tok_per_form,50.24,30.48,30.49,31.59


In [9]:
# %%

# Four tables with the same layout: per column the mean, median, min and
# max, plus `Total` (column sum) and `CVx100` (std / mean * 100, i.e. the
# coefficient of variation as a percentage).
# NOTE: the count frames already include the rate columns at this point, so
# for "per" rows `Total` is the sum of the per-file ratios.

# (1) original corpora: counts and rates
save_latex_table(orig_counts_df.describe().T.filter(['mean', '50%','min','max']).assign(
    Total=orig_counts_df.sum(), 
    CVx100=(orig_counts_df.std()/orig_counts_df.mean()) * 100
).convert_dtypes(),
                 caption='Original Corpora Counts & Rates: Descriptive Stats', verbose=True,
                 latex_subdir=REL_BIRC_TEX_DIR, default_SI=10.1,
                 latex_stem='plus-rates-main-stats-orig')

# (2) BiRC: counts and rates
save_latex_table(birc_counts_df.describe().T.filter(['mean', '50%','min','max']).assign(
    Total=birc_counts_df.sum(), 
    CVx100=(birc_counts_df.std()/birc_counts_df.mean()) * 100
).convert_dtypes(),
                 caption='BiRC Counts & Rates: Descriptive Stats', verbose=True,
                 latex_subdir=REL_BIRC_TEX_DIR, default_SI=10.1,
                 latex_stem='plus-rates-main-stats-birc')

# (3) original corpora: rate columns only. `Total` and `CVx100` are computed
# over all columns; `assign` aligns them by label, so only the rows for the
# "per" columns are kept.
save_latex_table(orig_counts_df.filter(like='per').describe().T.filter(['mean', '50%','min','max']).assign(
    Total=orig_counts_df.sum(), 
    CVx100=(orig_counts_df.std()/orig_counts_df.mean()) * 100
).convert_dtypes(),
                 caption='Original Corpora Rates: Descriptive Stats', verbose=True,
                 latex_subdir=REL_BIRC_TEX_DIR, default_SI=10.1,
                 latex_stem='only-rates-main-stats-orig')

# (4) BiRC: rate columns only (same alignment as in (3))
save_latex_table(birc_counts_df.filter(like='per').describe().T.filter(['mean', '50%','min','max']).assign(
    Total=birc_counts_df.sum(), 
    CVx100=(birc_counts_df.std()/birc_counts_df.mean()) * 100
).convert_dtypes(),
                 caption='BiRC Rates: Descriptive Stats', verbose=True,
                 latex_subdir=REL_BIRC_TEX_DIR, default_SI=10.1,
                 latex_stem='only-rates-main-stats-birc')

Caption: Original Corpora Counts & Rates: Descriptive Stats
*{1}{l}S[table-auto-round, table-format=10.1, drop-zero-decimal]
    S[table-auto-round, table-format=10.1, drop-zero-decimal]
    S[table-auto-round, table-format=10.1, drop-zero-decimal]
    S[table-auto-round, table-format=10.1, drop-zero-decimal]
    S[table-auto-round, table-format=10.1, drop-zero-decimal]
    S[table-auto-round, table-format=10.1, drop-zero-decimal]


Unnamed: 0,mean,50\%,min,max,Total,CVx100
,,,,,,
file\_MB,376.642759,381.31,1.16,693.62,1362316.86,12.764588
sentences,264905.37,252229.0,1147.0,820724.0,958162724.0,25.48
tokens,7963980.62,7844694.0,27017.0,19532928.0,28805717900.0,17.46
ADV\_tokens,211630.73431,207234.0,840.0,570729.0,765468366.0,18.714909
ADV\_xpos,3.0,3.0,3.0,3.0,10851.0,0.0
ADV\_lemmas,2373.409732,2380.0,185.0,3473.0,8584623.0,9.654474
ADV\_forms,3022.908211,3076.0,195.0,3808.0,10933859.0,10.116005
ADJ\_tokens,344358.620957,325172.0,1809.0,1076937.0,1245545132.0,25.906299
ADJ\_xpos,3.0,3.0,3.0,3.0,10851.0,0.0


Stylized latex table saved as:
  OverleafDissertex/assets/tables/ch6/BiRC/plus-rates-main-stats-orig.2025-02-18.tex

Caption: BiRC Counts & Rates: Descriptive Stats
*{1}{l}S[table-auto-round, table-format=10.1, drop-zero-decimal]
    S[table-auto-round, table-format=10.1, drop-zero-decimal]
    S[table-auto-round, table-format=10.1, drop-zero-decimal]
    S[table-auto-round, table-format=10.1, drop-zero-decimal]
    S[table-auto-round, table-format=10.1, drop-zero-decimal]
    S[table-auto-round, table-format=10.1, drop-zero-decimal]


Unnamed: 0,mean,50\%,min,max,Total,CVx100
,,,,,,
file\_MB,48.411269,50.12,0.09,76.52,175103.56,16.168846
sentences,24564.02,24232.0,78.0,66997.0,88848070.0,19.25
tokens,744482.66,753252.0,1836.0,1591526.0,2692793781.0,15.54
ADV\_tokens,51777.230301,51673.0,142.0,128678.0,187278242.0,17.999014
ADV\_xpos,3.0,3.0,3.0,3.0,10851.0,0.0
ADV\_lemmas,1345.696434,1361.0,63.0,1963.0,4867384.0,11.214536
ADV\_forms,1653.0564,1692.0,66.0,2107.0,5979105.0,11.926217
ADJ\_tokens,63886.013547,62800.0,213.0,177632.0,231075711.0,20.226504
ADJ\_xpos,3.0,3.0,3.0,3.0,10851.0,0.0


Stylized latex table saved as:
  OverleafDissertex/assets/tables/ch6/BiRC/plus-rates-main-stats-birc.2025-02-18.tex

Caption: Original Corpora Rates: Descriptive Stats
*{1}{l}S[table-auto-round, table-format=10.1, drop-zero-decimal]
    S[table-auto-round, table-format=10.1, drop-zero-decimal]
    S[table-auto-round, table-format=10.1, drop-zero-decimal]
    S[table-auto-round, table-format=10.1, drop-zero-decimal]
    S[table-auto-round, table-format=10.1, drop-zero-decimal]
    S[table-auto-round, table-format=10.1, drop-zero-decimal]


Unnamed: 0,mean,50\%,min,max,Total,CVx100
,,,,,,
tok\_per\_sent,30.375293,31.125939,1.744827,32.176999,109867.434803,7.563742
ADV\_tok\_per\_sent,0.806727,0.821356,0.389156,0.886415,2917.932666,7.384976
ADJ\_tok\_per\_sent,1.299893,1.289703,0.952974,1.648766,4701.714696,4.040385
NEG\_tok\_per\_sent,0.167069,0.16797,0.092644,0.223678,604.289089,6.053056
NEG\_tok\_per\_mill,5551.801189,5402.466263,3929.536285,96553.464313,20080864.900171,28.995261
ADV\_tok\_per\_mill,26718.41561,26415.459711,16506.162419,479278.349333,96640509.261629,28.867916
ADJ\_tok\_per\_mill,43374.635341,41442.631915,37574.922799,737301.723841,156886056.027471,29.612538
ADV\_form\_per\_lemma,1.272915,1.291615,1.054054,1.327456,4604.135303,4.701489
ADV\_tok\_per\_lemma,88.876853,87.1162,4.540541,176.096575,321467.575759,13.099216


Stylized latex table saved as:
  OverleafDissertex/assets/tables/ch6/BiRC/only-rates-main-stats-orig.2025-02-18.tex

Caption: BiRC Rates: Descriptive Stats
*{1}{l}S[table-auto-round, table-format=10.1, drop-zero-decimal]
    S[table-auto-round, table-format=10.1, drop-zero-decimal]
    S[table-auto-round, table-format=10.1, drop-zero-decimal]
    S[table-auto-round, table-format=10.1, drop-zero-decimal]
    S[table-auto-round, table-format=10.1, drop-zero-decimal]
    S[table-auto-round, table-format=10.1, drop-zero-decimal]


Unnamed: 0,mean,50\%,min,max,Total,CVx100
,,,,,,
tok\_per\_sent,30.390398,31.139457,23.524093,32.195978,109922.067876,7.45249
ADV\_tok\_per\_sent,2.107402,2.1333,1.581904,2.228142,7622.473013,4.424594
ADJ\_tok\_per\_sent,2.598741,2.592927,2.271796,3.075629,9399.644986,1.681286
NEG\_tok\_per\_sent,0.311711,0.320608,0.163388,0.363451,1127.45942,9.827857
NEG\_tok\_per\_mill,10238.704921,10311.474769,6901.384613,13071.895425,37033395.700265,4.379358
ADV\_tok\_per\_mill,69566.598296,68617.616136,65676.41422,84889.384072,251622386.03512,4.875261
ADJ\_tok\_per\_mill,86166.7605,83283.729315,79799.685689,120159.50458,311665172.727584,10.357154
ADV\_form\_per\_lemma,1.226534,1.242447,1.034682,1.297749,4436.37484,4.268204
ADV\_tok\_per\_lemma,38.281514,38.022828,2.253968,73.029512,138464.237438,10.919486


Stylized latex table saved as:
  OverleafDissertex/assets/tables/ch6/BiRC/only-rates-main-stats-birc.2025-02-18.tex



PosixPath('/share/compling/projects/arh234/OverleafDissertex/assets/tables/ch6/BiRC/only-rates-main-stats-birc.2025-02-18.tex')



 ## *Calculate **Reduction*** (<code>BiRC - Full</code>)

In [10]:
# Element-wise difference, aligned on `corpus_slice` and column labels;
# negative values mean the BiRC value is smaller than the original one.
# NOTE: both frames already contain the rate columns, so the "per" columns
# here are differences of rates, not rates recomputed from count differences.
reduction_df = birc_counts_df - orig_counts_df
# Disabled table exports, kept for reference:
# iloc[:, 1:] drops the uninformative `count` column
# save_latex_table(reduction_df.describe().T.iloc[:, 1:].assign(Total=reduction_df.sum()).convert_dtypes(),
#                  caption='Reduction (BiRC -- Full) Counts & Rates: Descriptive Stats', verbose=True,
#                  latex_subdir=REL_BIRC_TEX_DIR,
#                  latex_stem='plus-rates-birc-minus-full-descrip-stats')
# save_latex_table(reduction_df.filter(like='per').describe().T.iloc[:, 1:].assign(Total=reduction_df.sum()).convert_dtypes(),
#                  caption='Reduction (BiRC -- Full) Rates: Descriptive Stats', verbose=True,
#                  latex_subdir=REL_BIRC_TEX_DIR,
#                  latex_stem='only-rates-birc-minus-full-descrip-stats')
# nb_display(set_my_style(reduction_df.describe().T.assign(Total=reduction_df.sum()).convert_dtypes(),
#                         caption='BiRC <i>Reduction</i>: Descriptive Stats', precision=1))
# Preview the rate differences for the same sampled slices (`samix`) that
# were shown for the original and BiRC counts.
nb_display(set_my_style(reduction_df.filter(
    like='per').loc[samix, :].T, caption='"Rate" columns for <code>BiRC - Full</code> counts'))

corpus_slice,apw_eng_200703,pcc_eng_00-041,pcc_eng_04-100,pcc_eng_14-056
tok_per_sent,-0.18,0.03,0.0,0.01
ADV_tok_per_sent,1.23,1.31,1.3,1.34
ADJ_tok_per_sent,1.32,1.3,1.31,1.32
NEG_tok_per_sent,0.08,0.15,0.15,0.15
NEG_tok_per_mill,3573.89,4768.78,4831.32,4966.05
ADV_tok_per_mill,52331.79,42006.82,41845.41,42941.71
ADJ_tok_per_mill,56301.97,41859.05,42236.47,42340.58
ADV_form_per_lemma,-0.01,-0.05,-0.05,-0.05
ADV_tok_per_lemma,-96.95,-49.16,-48.02,-50.1
ADV_tok_per_form,-86.61,-37.07,-35.83,-37.57


In [11]:
# %%

def reconfigure_counts(_counts_df, count_kind: str):
    """Tag every column of a counts frame with its kind.

    A constant ``kind`` column holding `count_kind` is added and then moved
    into the columns, so the result is still indexed by ``corpus_slice`` but
    has two column levels: the original column name and ``kind``.

    Args:
        _counts_df (pd.DataFrame): Counts indexed by ``corpus_slice``.
        count_kind (str): Label for this collection (e.g. 'Full', 'BiRC').

    Returns:
        pd.DataFrame: The same values with ``(column, kind)`` columns.
    """
    labeled = _counts_df.assign(kind=count_kind)
    keyed = labeled.reset_index().set_index(['kind', 'corpus_slice'])
    return keyed.unstack('kind')


# Give each collection a second column level (`kind`) so the three can sit
# side by side in one frame.
orig_s = reconfigure_counts(orig_counts_df, count_kind='Full')
birc_s = reconfigure_counts(birc_counts_df, 'BiRC')
diff_s = reconfigure_counts(reduction_df, 'diff')
# Join on `corpus_slice`; sorting the columns puts the kinds of the same
# measure next to each other.
juxta = orig_s.join(birc_s).join(diff_s).convert_dtypes().sort_index(axis=1)

# juxta = orig_counts_df.join(birc_counts_df, rsuffix=':BiRC', lsuffix=':Full').join(reduction_df.add_suffix(':diff')).convert_dtypes()
# `iloc[1:]` drops the first `describe()` row (`count`).
juxta_desc = juxta.describe().iloc[1:, :]
juxta_desc.loc['TOTAL', :] = juxta.sum()
# Upper-case the row labels *before* computing CV%, because the next
# statement reads them as the attributes `STD` and `MEAN`.
juxta_desc.index = juxta_desc.index.str.upper()
# CV% = STD / MEAN * 100, with undefined ratios (NaN) shown as 0.
juxta_desc.loc['CV%', :] = (
    (juxta_desc.T.STD / juxta_desc.T.MEAN).fillna(0) * 100)

# NOTE: in the recorded run `save_latex_table` refused this table
# ("more than 50 rows") and nothing was saved.
save_path = save_latex_table(juxta_desc.T.convert_dtypes(),
                 caption='Juxtaposed BiRC: Descriptive Stats', verbose=True,
                 latex_subdir=REL_BIRC_TEX_DIR,
                 latex_stem='birc-juxtaposed-descrip-stats-complete')
# if save_path is None: 
#     nb_display(set_my_style(juxta_desc.T.convert_dtypes()))
    


! Error: Table has more than 50 rows---too long. Reconsider what you want to include.
  NO TABLE SAVED.


Unnamed: 0_level_0,Unnamed: 1_level_0,MEAN,STD,MIN,25%,50%,75%,MAX,TOTAL,CV%
Unnamed: 0_level_1,kind,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
ADJ_form_per_lemma,BiRC,1.1,0.03,1.0,1.1,1.11,1.11,1.13,3969.33,2.43
ADJ_form_per_lemma,Full,1.14,0.04,1.01,1.15,1.15,1.15,1.21,4107.78,3.19
ADJ_form_per_lemma,diff,-0.04,0.01,-0.09,-0.04,-0.04,-0.04,-0.0,-138.45,-26.94
ADJ_forms,BiRC,6928.6,1027.29,145.0,6788.0,6867.0,6955.0,12550.0,25060731.0,14.83
ADJ_forms,Full,18132.96,3441.84,653.0,17249.0,17393.0,17553.0,40339.0,65586917.0,18.98
ADJ_forms,diff,-11204.36,2636.3,-27789.0,-10649.0,-10531.0,-10431.0,-508.0,-40526186.0,-23.53
ADJ_lemmas,BiRC,6321.67,1047.61,145.0,6135.0,6205.0,6287.0,12298.0,22865464.0,16.57
ADJ_lemmas,Full,16051.86,3710.77,648.0,15030.0,15164.0,15316.0,39164.0,58059580.0,23.12
ADJ_lemmas,diff,-9730.2,2852.52,-26866.0,-9058.0,-8951.0,-8862.0,-503.0,-35194116.0,-29.32
ADJ_tok_per_form,BiRC,9.18,0.81,1.47,9.05,9.14,9.24,15.08,33199.51,8.82


In [12]:
# Derive the corpus from the first three characters of the slice name:
# `apw` and `nyt` belong to News, `pcc` to Puddin.
juxta = juxta.assign(
    corpus=juxta.index.to_series().str[:3].map(
        {'apw': 'News', 'nyt': 'News', 'pcc': 'Puddin'}).astype('category'))
# Make `corpus` the outer index level, ahead of the existing index level(s).
juxta = juxta.reset_index().set_index(['corpus']+juxta.index.names)
# Sort rows and columns so later label-based selections are well ordered.
juxta = juxta.sort_index().sort_index(axis=1)

In [13]:
# Grand totals of every column whose name contains "tokens"; `unstack(0)`
# turns the column names into columns, leaving one row per `kind`.
juxta.filter(like='tokens').sum().unstack(0)

Unnamed: 0_level_0,ADJ_tokens,ADV_tokens,NEG_tokens,tokens
kind,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
BiRC,231075711,187278242,27663509,2692793781
Full,1245545132,765468366,159335611,28805717900
diff,-1014469421,-578190124,-131672102,-26112924119


In [14]:
# Long format of the token counts: first move column level 1 (`kind`) into
# the index and keep only the "tokens" columns ...
corpus_tokens_stack = juxta.stack(1).filter(like='tokens')
corpus_tokens_stack.columns.name = 'token_type'
# ... then stack the token types as well, giving one `count` value per
# (corpus, corpus_slice, kind, token_type); `corpus`, `kind` and `token_type`
# become ordinary columns.
corpus_tokens_stack = corpus_tokens_stack.stack().to_frame('count').reset_index(level=['corpus','kind', 'token_type']).sort_values(['corpus','kind', 'token_type'])
# Sanity check: number of rows per (corpus, kind, token_type) combination.
corpus_tokens_stack.value_counts(['corpus', 'kind', 'token_type'])


corpus  kind  token_type
Puddin  Full  ADJ_tokens    3247
              ADV_tokens    3247
              NEG_tokens    3247
              tokens        3247
        BiRC  ADJ_tokens    3247
              ADV_tokens    3247
              NEG_tokens    3247
              tokens        3247
        diff  ADJ_tokens    3247
              ADV_tokens    3247
              NEG_tokens    3247
              tokens        3247
News    BiRC  tokens         370
              NEG_tokens     370
              ADV_tokens     370
              ADJ_tokens     370
        Full  tokens         370
              NEG_tokens     370
              ADV_tokens     370
              ADJ_tokens     370
        diff  ADJ_tokens     370
              ADV_tokens     370
              NEG_tokens     370
              tokens         370
Name: count, dtype: int64

In [17]:
# Token totals per (corpus, kind), followed by the same totals over both
# corpora together, labeled 'News+Puddin'.
corpus_kind_token_totals = pd.concat(
    [
        # summed counts for every corpus/kind pair
        style_crosstab(corpus_tokens_stack,
                       ['corpus', 'kind'], ['token_type'],
                       value_col='count', aggfunc='sum',
                       return_cross_df=True
                       ).sort_index().sort_index(axis=1),
        # summed counts per kind across corpora, given a matching
        # (corpus, kind) index so the two parts stack cleanly
        style_crosstab(corpus_tokens_stack,
                       ['kind'], ['token_type'],
                       value_col='count', aggfunc='sum',
                       return_cross_df=True
                       ).assign(corpus='News+Puddin')
        .reset_index().set_index(['corpus', 'kind']).sort_index(),
    ]
).sort_index(axis=1)

# Raw string: `\&` is not a valid Python escape sequence, so the plain
# literal triggered an invalid-escape warning; the caption text is unchanged.
save_latex_table(corpus_kind_token_totals,
                 caption=r'BiRC Token Count Comparisons by Corpus \& Overall',
                 latex_stem='birc-token-counts-by-corpus-kind',
                 latex_subdir=REL_BIRC_TEX_DIR, verbose=True, default_SI=-11.1,
                 neg_color='BrickRed'
                 )

Caption: BiRC Token Count Comparisons by Corpus \& Overall
*{2}{l}S[table-auto-round, table-format=-11.1, drop-zero-decimal, negative-color={BrickRed}]
    S[table-auto-round, table-format=-11.1, drop-zero-decimal, negative-color={BrickRed}]
    S[table-auto-round, table-format=-11.1, drop-zero-decimal, negative-color={BrickRed}]
    S[table-auto-round, table-format=-11.1, drop-zero-decimal, negative-color={BrickRed}]


Unnamed: 0_level_0,token_type,ADJ\_tokens,ADV\_tokens,NEG\_tokens,tokens
Corpus,Kind,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
News,BiRC,27043166,19147268,2351756,240357823
News,Full,190371676,91955223,21611437,3300216537
News,diff,-163328510,-72807955,-19259681,-3059858714
Puddin,BiRC,204032545,168130974,25311753,2452435958
Puddin,Full,1055173456,673513143,137724174,25505501363
Puddin,diff,-851140911,-505382169,-112412421,-23053065405
News+Puddin,BiRC,231075711,187278242,27663509,2692793781
News+Puddin,Full,1245545132,765468366,159335611,28805717900
News+Puddin,diff,-1014469421,-578190124,-131672102,-26112924119


Stylized latex table saved as:
  OverleafDissertex/assets/tables/ch6/BiRC/birc-token-counts-by-corpus-kind.2025-02-18.tex



PosixPath('/share/compling/projects/arh234/OverleafDissertex/assets/tables/ch6/BiRC/birc-token-counts-by-corpus-kind.2025-02-18.tex')

In [None]:
# %%

# Small illustrative table: randomly sampled rows of `juxta`, transposed so
# each sampled slice becomes a column.
# NOTE(review): `juxta` has a (corpus, corpus_slice) row index at this point;
# confirm that `filter(like=..., axis=0)` matches the slice names as intended.
save_latex_table(
    pd.concat([
        juxta.filter(like='apw', axis=0).sample(1), 
        juxta.filter(like='nyt', axis=0).sample(1),
        # 👆 one sample of apw  and nyt each
        # and then 2 puddin samples
        juxta.filter(like='pcc', axis=0).sample(2)
    ]).drop_duplicates().sort_index().T,
    caption=('Sample of Juxtaposed Counts'),
    verbose=True,
    latex_subdir=REL_BIRC_TEX_DIR,
    latex_stem='birc-juxtaposed-sample-2x2')

In [18]:
def describe_and_total(_df):
    """Return `_df.describe()` with an extra ``total`` row of column sums.

    The result has one column per described column of `_df` and the usual
    `describe` statistics as rows, followed by ``total``.
    """
    stats_by_column = _df.describe().T
    stats_by_column = stats_by_column.assign(total=_df.sum())
    return stats_by_column.T


# Descriptive statistics (plus totals) computed separately for each corpus.
by_corpus_descrip = juxta.groupby(
    'corpus', observed=True).apply(describe_and_total)
# Name the levels so the reshaping below can refer to them by name.
by_corpus_descrip.index.names = ['corpus', 'stat']
by_corpus_descrip.columns.names = ['obs', 'kind']

# Reshape to rows = (obs, stat) and columns = (corpus, kind):
# everything is stacked into one long Series first, then corpus/kind/stat
# are spread into columns and `stat` is moved back into the rows.
by_corpus_descrip = (
    by_corpus_descrip
    .stack(['obs', 'kind'])
    .unstack(['corpus', 'kind', 'stat'])
    .stack('stat')
    # friendlier labels: `count` is the number of files, `50%` the median
    .rename(index={
            'count': '# files',
            '50%': 'median'}))
nb_display(by_corpus_descrip)


Unnamed: 0_level_0,corpus,News,News,News,Puddin,Puddin,Puddin
Unnamed: 0_level_1,kind,BiRC,Full,diff,BiRC,Full,diff
obs,stat,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
ADJ_form_per_lemma,25%,1.02,1.03,-0.01,1.1,1.15,-0.04
ADJ_form_per_lemma,median,1.02,1.03,-0.01,1.11,1.15,-0.04
ADJ_form_per_lemma,75%,1.02,1.03,-0.01,1.11,1.15,-0.04
ADJ_form_per_lemma,# files,370.0,370.0,370.0,3247.0,3247.0,3247.0
ADJ_form_per_lemma,max,1.03,1.04,-0.0,1.13,1.21,-0.02
ADJ_form_per_lemma,mean,1.02,1.03,-0.01,1.11,1.15,-0.04
ADJ_form_per_lemma,min,1.0,1.01,-0.02,1.09,1.14,-0.09
ADJ_form_per_lemma,std,0.0,0.0,0.0,0.0,0.0,0.0
ADJ_form_per_lemma,total,377.06,380.61,-3.55,3592.27,3727.18,-134.9
ADJ_forms,25%,4347.75,17381.75,-22292.0,6798.0,17248.0,-10611.0


In [None]:
# save_latex_table(
#     by_corpus_descrip.xs('file_MB').style,
#     caption=('BiRC Storage Size (MB) Comparison by Corpus '),
#     verbose=True,
#     latex_subdir=REL_BIRC_TEX_DIR,
#     latex_stem='birc-by-corpus-size-compare-summary'
# )
# save_latex_table(
#     by_corpus_descrip.xs('tokens').style,
#     caption=('BiRC Token Comparison by Corpus '),
#     verbose=True,
#     latex_subdir=REL_BIRC_TEX_DIR,
#     latex_stem='birc-by-corpus-token-compare-summary'
# )

# save_latex_table(
#     by_corpus_descrip.xs('sentences').style,
#     caption=('BiRC Sentence Comparison by Corpus '),
#     verbose=True,
#     latex_subdir=REL_BIRC_TEX_DIR,
#     latex_stem='birc-by-corpus-sentence-compare-summary'
# )
