# Compilation of All Counts for <font color=DodgerBlue><u>*Original Corpora*</u></font> & <font color=Brown><u>***BiRC***</u></font>
___`BiRC`___ $=\text{``bigram restricted corpus'' or }\texttt{bigram\_subset}$

### *imports*

In [11]:
from pathlib import Path
import pandas as pd
from am_notebooks import *

#! Toggle TESTING
# Passed as `test=` to `retrieve_count_data`: when counts have to be collected
# (no cached csv), only a sample of the meta rows is used and no csv is written.
TESTING = False
if TESTING:
    # NOTE: adjacent string literals are concatenated as-is, so each piece
    # must carry its own line break / indentation.
    print(
        '🚨⚠️ WARNING!\n  This is a **TEST** run!\n'
        '  Full count data will not be loaded or saved\n'
    )

# // from utils import file_size_round

# Banner recording when this notebook was run
print(
    '==============================',
    f'Date: {pd.Timestamp.now().ctime()}',
    '==============================',
    sep='\n',
)

# --- input locations ---
SUBSET_DATA_DIR = Path('/share/compling/data/sanpi/subsets')
BIGRAMS_PER_CONLLU_CSV = SANPI_HOME / 'info/raw_bigram_counts.csv'

# --- output locations (csv info + LaTeX tables) ---
BIRC_INFO_DIR = SANPI_HOME / 'info/BiRC'
BIRC_META_CSV = BIRC_INFO_DIR / 'meta-info-full.csv'
REL_BIRC_TEX_DIR = 'ch6/BiRC'
BIRC_TEX_DIR = LATEX_TABLES / REL_BIRC_TEX_DIR

confirm_dir(BIRC_INFO_DIR)
confirm_dir(BIRC_TEX_DIR)

Date: Tue Feb 18 22:56:09 2025


## **<font color=grey>Meta Info</font>**

### **Define functions** to compile all the disparate meta info...

In [12]:
def _read_path_index(csv_path):
    """Read STEM / INPUT_COUNTS / SUBSET_COUNTS from a path-index csv.

    The count column headers may or may not carry a leading space, so both
    spellings are tried. Returned column names are stripped and lower-cased;
    the frame is indexed by ``STEM``.
    """
    try:
        path_info = pd.read_csv(
            csv_path, usecols=['STEM', ' INPUT_COUNTS', ' SUBSET_COUNTS'])
    except ValueError:
        # `read_csv` raises ValueError when `usecols` names are not in the header
        path_info = pd.read_csv(
            csv_path, usecols=['STEM', 'INPUT_COUNTS', 'SUBSET_COUNTS'])
    path_info = path_info.set_index('STEM')
    path_info.columns = path_info.columns.str.strip().str.lower()
    return path_info


def collect_subset_path_index():
    """Collect subset path index information.

    Walks ``SUBSET_DATA_DIR`` for ``*.conllu`` files (skipping any whose parent
    directory name starts with ``bigram-DEMO``) and records, per file: parent
    directory name, file name, full path, symlink target (``source_path``),
    BiRC subcorpus, whether it is a symlink, and size in bytes. Derived columns
    are then added: 'init_conllu', 'data_key', 'corpus_part', 'corpus',
    'subset_info_dir' and 'path_index_csv' (the most recently changed
    ``subset-bigram_path-index*csv`` in the subset's ``info`` directory).

    A few rows of one path-index csv are printed as a sanity check.

    Returns:
        pd.DataFrame: One row per subset file, indexed by 'data_key', with the
            original file stem kept in the 'subset_stem' column.

    Raises:
        FileNotFoundError: If no matching ``.conllu`` file is found.
        OSError: From ``Path.readlink`` if a matched file is not a symlink.
        ValueError: If a subset's ``info`` directory holds no path-index csv.
    """
    data = {
        p.stem: {
            'parent': p.parent.name,
            'name': p.name,
            'path': p,
            'source_path': p.readlink(),
            'birc_subcorpus': p.parent.parent.stem,
            'is_link': p.is_symlink(),
            # 'size': file_size_round(p.stat().st_size),
            'size': p.stat().st_size,  # ! no longer forcing into different units
        }
        for p in SUBSET_DATA_DIR.rglob('*.conllu')
        #! to prevent *DEMO* files being included:
        if not p.parent.name.startswith('bigram-DEMO')
    }
    if not data:
        # an empty frame has none of the columns used below
        raise FileNotFoundError(
            f'No ".conllu" subset files found under "{SUBSET_DATA_DIR}"')

    bp_df = pd.DataFrame.from_dict(data, orient='index').convert_dtypes()
    bp_df['init_conllu'] = bp_df['name'].str.replace('BIGRAM.', '', regex=False)
    bp_df['data_key'] = bp_df.index.str.replace('BIGRAM.', '', regex=False)

    # everything below is derived from the location of the symlink *target*
    bp_df['corpus_part'], bp_df['corpus'], bp_df['subset_info_dir'] = zip(
        *bp_df.source_path.apply(
            lambda sp: (sp.parent.parent.stem,
                        sp.parent.parent.parent.stem,
                        sp.parent.joinpath('info')))
    )
    bp_df = bp_df.convert_dtypes()

    # newest path index per info dir (by ctime)
    bp_df['path_index_csv'] = bp_df.subset_info_dir.apply(
        lambda i: max(i.glob('subset-bigram_path-index*csv'),
                      key=lambda file: file.stat().st_ctime))

    # Sanity-check preview of one path index. (The previous version read from
    # an undefined name here, which raised NameError.)
    path_info = _read_path_index(bp_df.path_index_csv.iloc[0])
    preview = path_info.sample(min(3, len(path_info)))
    for stem, row in preview.iterrows():
        print(f'- {stem}')
        print(f'  - input_counts = {row.input_counts}')
        print(f'  - subset_counts = {row.subset_counts}')

    bp_df.index.name = 'subset_stem'
    return bp_df.reset_index().set_index('data_key')


def collect_meta_info(_bp_df):
    """Attach path-index count columns to the subset path table.

    Rows of ``_bp_df`` are grouped by their 'path_index_csv' value. For every
    group (except those whose csv path contains "DEMO") the csv is read, the
    first row of the group is displayed, and the group is joined with the
    csv's STEM-indexed count columns. All joined groups are concatenated; the
    last two columns get their string values stripped and all column names
    are stripped and lower-cased.

    Args:
        _bp_df (pd.DataFrame): DataFrame containing file paths and other info;
            must have a 'path_index_csv' column.

    Returns:
        pd.DataFrame: The combined metadata.
    """
    def _read_index(csv_path, header):
        return pd.read_csv(csv_path, usecols=header).set_index('STEM')

    joined_groups = []
    for path_ix, group in _bp_df.groupby('path_index_csv'):
        print(path_ix)
        if 'DEMO' in str(path_ix):
            continue
        try:
            # header names as written with a leading space ...
            path_info = _read_index(
                path_ix, ['STEM', ' INPUT_COUNTS', ' SUBSET_COUNTS'])
        except ValueError:
            # ... or without one
            path_info = _read_index(
                path_ix, ['STEM', 'INPUT_COUNTS', 'SUBSET_COUNTS'])
        caption = str(path_ix.relative_to('/share/compling/data'))
        display(set_my_style(group.head(1).T, caption=caption))
        joined_groups.append(group.join(path_info))

    meta_df = pd.concat(joined_groups)
    # the two joined count columns come last; trim whitespace in their values
    count_cols = meta_df.iloc[:, -2:]
    meta_df.iloc[:, -2:] = count_cols.apply(lambda col: col.str.strip())
    meta_df.columns = meta_df.columns.str.strip().str.lower()
    return meta_df

### **Create or read meta info**

In [13]:
if not BIRC_META_CSV.is_file():
    # nothing cached yet: compile the meta info from disk and cache it
    bp_df = collect_subset_path_index()
    meta_df = collect_meta_info(bp_df)
    meta_df.to_csv(BIRC_META_CSV)
else:
    meta_df = pd.read_csv(BIRC_META_CSV, index_col='data_key').convert_dtypes()
    print(f'Meta Info DataFrame loaded from "{BIRC_META_CSV}"')

# show the first record, one field per row
set_my_style(meta_df.head(1).T, caption='First Line of Meta DataFrame')

Meta Info DataFrame loaded from "/share/compling/projects/sanpi/info/BiRC/meta-info-full.csv"


data_key,apw_eng_199411
subset_stem,BIGRAM.apw_eng_199411
parent,bigram-Apw
name,BIGRAM.apw_eng_199411.conllu
path,/share/compling/data/sanpi/subsets/bigram_news/bigram-Apw/BIGRAM.apw_eng_199411.conllu
source_path,/share/compling/data/news/Apw.conll/subset_bigram/BIGRAM.apw_eng_199411.conllu
birc_subcorpus,bigram_news
is_link,True
size,11162354
init_conllu,apw_eng_199411.conllu
corpus_part,Apw


## **<font color=violet>Frequency Info</font>**

### Total Bigram *hits* per file

Total bigram hits—not *sentences* or *tokens*—are loaded from `BIGRAMS_PER_CONLLU_CSV` (i.e. `info/raw_bigram_counts.csv`). 
It looks like this: 
> ```csv
> raw_hits_path,total
> bigram-Apw.rb-bigram/BIGRAM.apw_eng_199411.raw.json,9770
> bigram-Apw.rb-bigram/BIGRAM.apw_eng_199412.raw.json,15134
> bigram-Apw.rb-bigram/BIGRAM.apw_eng_199501.raw.json,15909
> ```
First column indicates the `.raw.json` collection of `rb-bigram` matches found in the associated `.conllu` file, e.g.: \

- first line: `bigram-Apw.rb-bigram/BIGRAM.apw_eng_199411.raw.json,9770`

  indicates that

- `BIGRAM.apw_eng_199411.conllu` matched the `rb-bigram.pat` pattern specs $9,770$ times

Since this is the `rb-bigram` pattern which was used for *creating* `BIGRAM.apw_eng_199411.conllu`, the original, unrestricted `apw_eng_199411.conllu` file also contains $9,770$ hits. 

In [14]:
def load_bigram_totals(counts_csv=None):
    """Load the total number of bigram hits per corpus slice.

    The csv is expected to have the columns ``raw_hits_path`` and ``total``.
    The corpus slice label is the second dot-separated field of the file name
    in ``raw_hits_path``, e.g.
    ``bigram-Apw.rb-bigram/BIGRAM.apw_eng_199411.raw.json`` -> ``apw_eng_199411``.

    Args:
        counts_csv (str or Path, optional): csv to read. Defaults to
            ``BIGRAMS_PER_CONLLU_CSV``.

    Returns:
        pd.Series: Hit totals named 'bigrams', indexed by 'corpus_slice'.
    """
    if counts_csv is None:
        counts_csv = BIGRAMS_PER_CONLLU_CSV
    bg_totals = (
        pd.read_csv(counts_csv)
        .convert_dtypes()
        .rename(columns={'total': 'bigrams'})
    )
    bg_totals['corpus_slice'] = bg_totals.raw_hits_path.apply(
        lambda p: Path(p).name.split('.')[1])
    return bg_totals.set_index('corpus_slice').bigrams

# Bigram hit totals keyed by corpus slice; later assigned as the `bigrams`
# column of both the original and the BiRC count tables.
BIRC_HIT_COUNTS = load_bigram_totals()
BIRC_HIT_COUNTS

corpus_slice
apw_eng_199411      9770
apw_eng_199412     15134
apw_eng_199501     15909
apw_eng_199502     13613
apw_eng_199503     16157
                   ...  
pcc_eng_test-02    29949
pcc_eng_test-03    29787
pcc_eng_val-01     26376
pcc_eng_val-02     29839
pcc_eng_val-03     29050
Name: bigrams, Length: 3617, dtype: Int64

### *Define functions* to retrieve count data

In [15]:
def load_totals(counts_path):
    """Return the non-missing entries of the 'total' column of a counts JSON.

    A relative ``counts_path`` is taken to be relative to the sanpi data
    directory (``/share/compling/data/sanpi/``).

    Args:
        counts_path (str or Path): Path to the JSON file.

    Returns:
        dict: Maps each index label of the 'total' column to its value,
            omitting labels whose value is missing (NaN).
    """
    source = counts_path
    if not Path(source).is_absolute():
        source = f"/share/compling/data/sanpi/{source}"
    totals = pd.read_json(source)['total']
    return totals.dropna().to_dict()


def generate_counts(_meta_df, retrieval_key='input'):
    """Lazily yield ``(data_key, totals)`` for every row of the meta table.

    For each index label of ``_meta_df`` the path stored in the
    ``'<retrieval_key>_counts'`` column is passed to ``load_totals``.

    Args:
        _meta_df (pd.DataFrame): DataFrame containing metadata information.
        retrieval_key (str, optional): Prefix of the counts-path column to use.
            Defaults to 'input'.

    Yields:
        tuple: The data key and the dict of totals loaded for it.
    """
    path_column = f'{retrieval_key}_counts'
    for key in _meta_df.index:
        yield key, load_totals(_meta_df.at[key, path_column])


def retrieve_count_data(_counts_csv_path, _meta_df, test=False):
    """Load a composite counts table from csv, or compile it and save it.

    Which counts are wanted is inferred from the csv file name: if it contains
    "birc" (case-insensitive), the subset (BiRC) counts are retrieved,
    otherwise the original (input) counts.

    Args:
        _counts_csv_path (Path): csv to read if it exists, else to write.
            Must be located under ``SANPI_HOME``.
        _meta_df (pd.DataFrame): Meta info passed to ``generate_counts``
            (i.e. with 'input_counts' / 'subset_counts' path columns).
        test (bool, optional): If True and the csv does not exist, counts are
            compiled for a random sample of at most 100 meta rows and nothing
            is written. Defaults to False.

    Returns:
        pd.DataFrame: Counts per corpus slice, without any column whose name
            starts with 'NR_'.

    Raises:
        ValueError: If ``_counts_csv_path`` is not under ``SANPI_HOME``.
    """
    seek_birc = "birc" in _counts_csv_path.name.lower()
    _counts_label = "BiRC" if seek_birc else "Original"
    # resolved up front so a misplaced path fails before any expensive loading
    _rel_path = _counts_csv_path.relative_to(SANPI_HOME)

    if _counts_csv_path.is_file():
        try:
            _counts_df = pd.read_csv(_counts_csv_path,
                                     index_col='corpus_slice').convert_dtypes()
        except ValueError:
            # no 'corpus_slice' column in the file: load it without an index
            _counts_df = pd.read_csv(_counts_csv_path).convert_dtypes()

        print(
            f'**{_counts_label} Counts** read from csv',
            f'> Path: {_rel_path}',
            sep='\n')
    else:
        # `sample` raises when asked for more rows than exist -> cap the size
        _meta_selection = (_meta_df.sample(min(100, len(_meta_df)))
                           if test else _meta_df)
        _counts_df = pd.DataFrame.from_dict(
            dict(generate_counts(
                _meta_df=_meta_selection,
                retrieval_key='subset' if seek_birc else 'input')),
            orient='index')
        _counts_df.index.name = 'corpus_slice'
        _counts_df = (_counts_df.reset_index()
                      .convert_dtypes()
                      .set_index('corpus_slice'))
        # write first, so the message below is only shown after a successful save
        if not test:
            _counts_df.to_csv(_counts_csv_path)
        print(
            f'**{_counts_label} Counts** saved as csv',
            f'> Path: {_rel_path}',
            sep='\n')
        if test:
            print('TESTING (so, not really)')
    return _counts_df.loc[:, ~_counts_df.columns.str.startswith('NR_')]

### **Load <font color=DodgerBlue><u>Original</u></font> counts** (if not previously collected)

 (and add *bigrams* column from `BIRC_HIT_COUNTS`)

In [16]:
# Original-corpora counts, ordered by corpus slice ...
orig_counts_df = retrieve_count_data(
    BIRC_INFO_DIR.joinpath('original-count-data.csv'), meta_df, test=TESTING)
orig_counts_df = orig_counts_df.sort_index()
# ... plus the bigram hit totals (aligned on the corpus slice index)
orig_counts_df = orig_counts_df.assign(bigrams=BIRC_HIT_COUNTS.sort_index())

orig_counts_df.info()

**Original Counts** read from csv
> Path: info/BiRC/original-count-data.csv
<class 'pandas.core.frame.DataFrame'>
Index: 3617 entries, apw_eng_199411 to pcc_eng_val-03
Data columns (total 16 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   file_MB     3617 non-null   Float64
 1   sentences   3617 non-null   Int64  
 2   tokens      3617 non-null   Int64  
 3   ADV_tokens  3617 non-null   Int64  
 4   ADV_xpos    3617 non-null   Int64  
 5   ADV_lemmas  3617 non-null   Int64  
 6   ADV_forms   3617 non-null   Int64  
 7   ADJ_tokens  3617 non-null   Int64  
 8   ADJ_xpos    3617 non-null   Int64  
 9   ADJ_lemmas  3617 non-null   Int64  
 10  ADJ_forms   3617 non-null   Int64  
 11  NEG_tokens  3617 non-null   Int64  
 12  NEG_xpos    3617 non-null   Int64  
 13  NEG_lemmas  3617 non-null   Int64  
 14  NEG_forms   3617 non-null   Int64  
 15  bigrams     3617 non-null   Int64  
dtypes: Float64(1), Int64(15)
memory usage: 536.9+ KB


### **...and <font color=Brown><u>BiRC</u></font> counts** (likewise)

(and add *bigrams* column from `BIRC_HIT_COUNTS`)

In [17]:
# BiRC counts, ordered by corpus slice ...
birc_counts_df = retrieve_count_data(
    BIRC_INFO_DIR.joinpath('birc-count-data.csv'), meta_df, test=TESTING)
birc_counts_df = birc_counts_df.sort_index()
# ... plus the bigram hit totals (aligned on the corpus slice index)
birc_counts_df = birc_counts_df.assign(bigrams=BIRC_HIT_COUNTS.sort_index())

birc_counts_df.info()
# (the earlier inline read-or-compile logic for the BiRC csv now lives in
#  `retrieve_count_data`)

**BiRC Counts** read from csv
> Path: info/BiRC/birc-count-data.csv
<class 'pandas.core.frame.DataFrame'>
Index: 3617 entries, apw_eng_199411 to pcc_eng_val-03
Data columns (total 16 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   file_MB     3617 non-null   Float64
 1   sentences   3617 non-null   Int64  
 2   tokens      3617 non-null   Int64  
 3   ADV_tokens  3617 non-null   Int64  
 4   ADV_xpos    3617 non-null   Int64  
 5   ADV_lemmas  3617 non-null   Int64  
 6   ADV_forms   3617 non-null   Int64  
 7   ADJ_tokens  3617 non-null   Int64  
 8   ADJ_xpos    3617 non-null   Int64  
 9   ADJ_lemmas  3617 non-null   Int64  
 10  ADJ_forms   3617 non-null   Int64  
 11  NEG_tokens  3617 non-null   Int64  
 12  NEG_xpos    3617 non-null   Int64  
 13  NEG_lemmas  3617 non-null   Int64  
 14  NEG_forms   3617 non-null   Int64  
 15  bigrams     3617 non-null   Int64  
dtypes: Float64(1), Int64(15)
memory usage: 536.9+ KB


## *Describe Count Collections*

In [18]:
# Mean, median and grand total of every count column: original corpora ...
save_latex_table(
    (orig_counts_df
     .describe().T
     .iloc[:, 1:]  # drop the `count` column
     .assign(Total=orig_counts_df.sum())
     .filter(['mean', '50%', 'Total'])
     .convert_dtypes()),
    latex_stem='only-counts-descrip-stats-orig',
    latex_subdir=REL_BIRC_TEX_DIR,
    caption='Original Corpora: Descriptive Stats',
    default_SI=7.1,
    verbose=True)

# ... and the same for BiRC
save_latex_table(
    (birc_counts_df
     .describe().T
     .iloc[:, 1:]  # drop the `count` column
     .assign(Total=birc_counts_df.sum())
     .filter(['mean', '50%', 'Total'])
     .convert_dtypes()),
    latex_stem='only-counts-descrip-stats-birc',
    latex_subdir=REL_BIRC_TEX_DIR,
    caption='BiRC: Descriptive Stats',
    default_SI=7.1,
    verbose=True)

Caption: Original Corpora: Descriptive Stats
*{1}{l}S[table-auto-round, table-format=7.1, drop-zero-decimal]
    S[table-auto-round, table-format=7.1, drop-zero-decimal]
    S[table-auto-round, table-format=7.1, drop-zero-decimal]


Unnamed: 0,mean,50\%,Total
,,,
file\_MB,376.642759,381.31,1362316.86
sentences,264905.37,252229.0,958162724.0
tokens,7963980.62,7844694.0,28805717900.0
ADV\_tokens,211630.73431,207234.0,765468366.0
ADV\_xpos,3.0,3.0,10851.0
ADV\_lemmas,2373.409732,2380.0,8584623.0
ADV\_forms,3022.908211,3076.0,10933859.0
ADJ\_tokens,344358.620957,325172.0,1245545132.0
ADJ\_xpos,3.0,3.0,10851.0


Stylized latex table saved as:
  OverleafDissertex/assets/tables/ch6/BiRC/only-counts-descrip-stats-orig.2025-02-18.tex

Caption: BiRC: Descriptive Stats
*{1}{l}S[table-auto-round, table-format=7.1, drop-zero-decimal]
    S[table-auto-round, table-format=7.1, drop-zero-decimal]
    S[table-auto-round, table-format=7.1, drop-zero-decimal]


Unnamed: 0,mean,50\%,Total
,,,
file\_MB,48.411269,50.12,175103.56
sentences,24564.02,24232.0,88848070.0
tokens,744482.66,753252.0,2692793781.0
ADV\_tokens,51777.230301,51673.0,187278242.0
ADV\_xpos,3.0,3.0,10851.0
ADV\_lemmas,1345.696434,1361.0,4867384.0
ADV\_forms,1653.0564,1692.0,5979105.0
ADJ\_tokens,63886.013547,62800.0,231075711.0
ADJ\_xpos,3.0,3.0,10851.0


Stylized latex table saved as:
  OverleafDissertex/assets/tables/ch6/BiRC/only-counts-descrip-stats-birc.2025-02-18.tex



PosixPath('/share/compling/projects/arh234/OverleafDissertex/assets/tables/ch6/BiRC/only-counts-descrip-stats-birc.2025-02-18.tex')

## Add Rates

In [19]:
def add_rate_cols(_df):
    """Return a copy of ``_df`` extended with derived "rate" columns.

    Added, in this order: tokens per bigram; bigrams and ADV/ADJ/NEG tokens
    per million tokens; tokens, bigrams and ADV/ADJ/NEG tokens per thousand
    sentences; ADV and ADJ tokens per bigram; ADV/ADJ/NEG tokens per lemma and
    per form; ADV/ADJ/NEG forms per lemma.

    Args:
        _df (pd.DataFrame): Counts with the columns 'tokens', 'sentences',
            'bigrams' and '<POS>_tokens', '<POS>_lemmas', '<POS>_forms' for
            POS in ADV, ADJ, NEG.

    Returns:
        pd.DataFrame: New frame with the rate columns appended.
    """
    pos_tags = ('ADV', 'ADJ', 'NEG')
    tokens, sentences, bigrams = _df['tokens'], _df['sentences'], _df['bigrams']

    rates = {'tok_per_bigram': tokens / bigrams}

    # per million tokens
    rates['bigrams_per_mill'] = (bigrams / tokens) * (10**6)
    for pos in pos_tags:
        rates[f'{pos}_tok_per_mill'] = (_df[f'{pos}_tokens'] / tokens) * (10**6)

    # per thousand sentences
    rates['tok_per_ksent'] = tokens / sentences * 1000
    rates['bigrams_per_ksent'] = bigrams / sentences * 1000
    for pos in pos_tags:
        rates[f'{pos}_tok_per_ksent'] = _df[f'{pos}_tokens'] / sentences * 1000

    # per bigram (no NEG variant)
    for pos in ('ADV', 'ADJ'):
        rates[f'{pos}_tok_per_bigram'] = _df[f'{pos}_tokens'] / bigrams

    # tokens per distinct lemma, then per distinct form
    for unit in ('lemma', 'form'):
        for pos in pos_tags:
            rates[f'{pos}_tok_per_{unit}'] = (
                _df[f'{pos}_tokens'] / _df[f'{pos}_{unit}s'])

    for pos in pos_tags:
        rates[f'{pos}_form_per_lemma'] = _df[f'{pos}_forms'] / _df[f'{pos}_lemmas']

    return _df.assign(**rates)


# add the rate columns to both collections
orig_counts_df, birc_counts_df = (
    add_rate_cols(counts) for counts in (orig_counts_df, birc_counts_df))

# preview the same 4 randomly chosen corpus slices in both tables
samix = orig_counts_df.sample(4).sort_index().index
nb_display(set_my_style(
    orig_counts_df.filter(like='per').loc[samix, :].T,
    caption='"Rate" columns for Original Counts'))
nb_display(set_my_style(
    birc_counts_df.filter(like='per').loc[samix, :].T,
    caption='"Rate" columns for BiRC Counts'))

corpus_slice,apw_eng_199908,pcc_eng_03-017,pcc_eng_20-025,pcc_eng_21-007
tok_per_bigram,429.34,298.72,302.5,297.72
bigrams_per_mill,2329.14,3347.66,3305.76,3358.82
ADV_tok_per_mill,24833.16,26483.94,26443.46,26713.15
ADJ_tok_per_mill,63589.23,41431.46,41994.34,41595.78
NEG_tok_per_mill,5490.35,5404.38,5363.95,5457.62
tok_per_ksent,23607.64,31112.97,31107.91,31151.75
bigrams_per_ksent,54.99,104.16,102.84,104.63
ADV_tok_per_ksent,586.25,823.99,822.6,832.16
ADJ_tok_per_ksent,1501.19,1289.06,1306.36,1295.78
NEG_tok_per_ksent,129.61,168.15,166.86,170.01


corpus_slice,apw_eng_199908,pcc_eng_03-017,pcc_eng_20-025,pcc_eng_21-007
tok_per_bigram,23.2,28.85,28.8,28.69
bigrams_per_mill,43095.1,34664.39,34719.06,34854.12
ADV_tok_per_mill,73676.39,68134.61,68268.62,69519.07
ADJ_tok_per_mill,108298.08,82962.09,83120.05,84636.58
NEG_tok_per_mill,8614.45,10227.64,10177.57,10447.29
tok_per_ksent,23592.63,31132.81,31129.38,31152.86
bigrams_per_ksent,1016.73,1079.2,1080.78,1085.81
ADV_tok_per_ksent,1738.22,2121.22,2125.16,2165.72
ADJ_tok_per_ksent,2555.04,2582.84,2587.48,2636.67
NEG_tok_per_ksent,203.24,318.42,316.82,325.46


### *save latex tables*

In [20]:
def _stat(cdf):
    stat_df = cdf.describe().T.filter(['mean', '50%','min','max']).assign(
    Total=cdf.sum()
).convert_dtypes()
    stat_df['CV%']=(cdf.std()/cdf.mean()) * 100
    return stat_df

# summary stats of all count + rate columns (original corpora)
orig_stats = _stat(orig_counts_df)

save_latex_table(
    orig_stats,
    latex_stem='plus-rates-main-stats-orig',
    latex_subdir=REL_BIRC_TEX_DIR,
    caption='Original Corpora Counts & Rates: Descriptive Stats',
    default_SI=10.1,
    verbose=True)


Caption: Original Corpora Counts & Rates: Descriptive Stats
*{1}{l}S[table-auto-round, table-format=10.1, drop-zero-decimal]
    S[table-auto-round, table-format=10.1, drop-zero-decimal]
    S[table-auto-round, table-format=10.1, drop-zero-decimal]
    S[table-auto-round, table-format=10.1, drop-zero-decimal]
    S[table-auto-round, table-format=10.1, drop-zero-decimal]
    S[table-auto-round, table-format=10.1, drop-zero-decimal]


Unnamed: 0,mean,50\%,min,max,Total,CV\%
,,,,,,
file\_MB,376.642759,381.31,1.16,693.62,1362316.86,12.764588
sentences,264905.37,252229.0,1147.0,820724.0,958162724.0,25.48
tokens,7963980.62,7844694.0,27017.0,19532928.0,28805717900.0,17.46
ADV\_tokens,211630.73431,207234.0,840.0,570729.0,765468366.0,18.714909
ADV\_xpos,3.0,3.0,3.0,3.0,10851.0,0.0
ADV\_lemmas,2373.409732,2380.0,185.0,3473.0,8584623.0,9.654474
ADV\_forms,3022.908211,3076.0,195.0,3808.0,10933859.0,10.116005
ADJ\_tokens,344358.620957,325172.0,1809.0,1076937.0,1245545132.0,25.906299
ADJ\_xpos,3.0,3.0,3.0,3.0,10851.0,0.0


Stylized latex table saved as:
  OverleafDissertex/assets/tables/ch6/BiRC/plus-rates-main-stats-orig.2025-02-18.tex



PosixPath('/share/compling/projects/arh234/OverleafDissertex/assets/tables/ch6/BiRC/plus-rates-main-stats-orig.2025-02-18.tex')

In [21]:
# summary stats of all count + rate columns (BiRC)
birc_stats = _stat(birc_counts_df)

save_latex_table(
    birc_stats,
    latex_stem='plus-rates-main-stats-birc',
    latex_subdir=REL_BIRC_TEX_DIR,
    caption='BiRC Counts & Rates: Descriptive Stats',
    default_SI=10.1,
    verbose=True)


Caption: BiRC Counts & Rates: Descriptive Stats
*{1}{l}S[table-auto-round, table-format=10.1, drop-zero-decimal]
    S[table-auto-round, table-format=10.1, drop-zero-decimal]
    S[table-auto-round, table-format=10.1, drop-zero-decimal]
    S[table-auto-round, table-format=10.1, drop-zero-decimal]
    S[table-auto-round, table-format=10.1, drop-zero-decimal]
    S[table-auto-round, table-format=10.1, drop-zero-decimal]


Unnamed: 0,mean,50\%,min,max,Total,CV\%
,,,,,,
file\_MB,48.411269,50.12,0.09,76.52,175103.56,16.168846
sentences,24564.02,24232.0,78.0,66997.0,88848070.0,19.25
tokens,744482.66,753252.0,1836.0,1591526.0,2692793781.0,15.54
ADV\_tokens,51777.230301,51673.0,142.0,128678.0,187278242.0,17.999014
ADV\_xpos,3.0,3.0,3.0,3.0,10851.0,0.0
ADV\_lemmas,1345.696434,1361.0,63.0,1963.0,4867384.0,11.214536
ADV\_forms,1653.0564,1692.0,66.0,2107.0,5979105.0,11.926217
ADJ\_tokens,63886.013547,62800.0,213.0,177632.0,231075711.0,20.226504
ADJ\_xpos,3.0,3.0,3.0,3.0,10851.0,0.0


Stylized latex table saved as:
  OverleafDissertex/assets/tables/ch6/BiRC/plus-rates-main-stats-birc.2025-02-18.tex



PosixPath('/share/compling/projects/arh234/OverleafDissertex/assets/tables/ch6/BiRC/plus-rates-main-stats-birc.2025-02-18.tex')

In [22]:

# only the rows for rate columns (labels containing 'per')
save_latex_table(
    orig_stats.filter(like='per', axis=0),
    latex_stem='only-rates-main-stats-orig',
    latex_subdir=REL_BIRC_TEX_DIR,
    caption='Original Corpora Rates: Descriptive Stats',
    default_SI=10.1,
    verbose=True)


Caption: Original Corpora Rates: Descriptive Stats
*{1}{l}S[table-auto-round, table-format=10.1, drop-zero-decimal]
    S[table-auto-round, table-format=10.1, drop-zero-decimal]
    S[table-auto-round, table-format=10.1, drop-zero-decimal]
    S[table-auto-round, table-format=10.1, drop-zero-decimal]
    S[table-auto-round, table-format=10.1, drop-zero-decimal]
    S[table-auto-round, table-format=10.1, drop-zero-decimal]


Unnamed: 0,mean,50\%,min,max,Total,CV\%
,,,,,,
tok\_per\_bigram,303.960314,299.901924,25.517736,639.502205,1099424.456601,11.101998
bigrams\_per\_mill,3328.143801,3334.423423,1563.716266,39188.429733,12037896.128706,19.754245
ADV\_tok\_per\_mill,26718.41561,26415.459711,16506.162419,479278.349333,96640509.261629,28.867916
ADJ\_tok\_per\_mill,43374.635341,41442.631915,37574.922799,737301.723841,156886056.027471,29.612538
NEG\_tok\_per\_mill,5551.801189,5402.466263,3929.536285,96553.464313,20080864.900171,28.995261
tok\_per\_ksent,30375.293006,31125.938572,1744.826896,32176.998737,109867434.802504,7.563742
bigrams\_per\_ksent,100.919081,103.634969,36.866788,109.912579,365024.317581,10.733552
ADV\_tok\_per\_ksent,806.727306,821.356413,389.155764,886.41465,2917932.665806,7.384976
ADJ\_tok\_per\_ksent,1299.893474,1289.702735,952.974036,1648.765967,4701714.696015,4.040385


Stylized latex table saved as:
  OverleafDissertex/assets/tables/ch6/BiRC/only-rates-main-stats-orig.2025-02-18.tex



PosixPath('/share/compling/projects/arh234/OverleafDissertex/assets/tables/ch6/BiRC/only-rates-main-stats-orig.2025-02-18.tex')

In [23]:

# only the rows for rate columns (labels containing 'per')
save_latex_table(
    birc_stats.filter(like='per', axis=0),
    latex_stem='only-rates-main-stats-birc',
    latex_subdir=REL_BIRC_TEX_DIR,
    caption='BiRC Rates: Descriptive Stats',
    default_SI=10.1,
    verbose=True)

Caption: BiRC Rates: Descriptive Stats
*{1}{l}S[table-auto-round, table-format=10.1, drop-zero-decimal]
    S[table-auto-round, table-format=10.1, drop-zero-decimal]
    S[table-auto-round, table-format=10.1, drop-zero-decimal]
    S[table-auto-round, table-format=10.1, drop-zero-decimal]
    S[table-auto-round, table-format=10.1, drop-zero-decimal]
    S[table-auto-round, table-format=10.1, drop-zero-decimal]


Unnamed: 0,mean,50\%,min,max,Total,CV\%
,,,,,,
tok\_per\_bigram,28.245656,28.817545,22.639803,29.896107,102164.53619,6.212876
bigrams\_per\_mill,35567.413546,34701.082847,33449.171241,44169.99542,128647334.797221,7.413079
ADV\_tok\_per\_mill,69566.598296,68617.616136,65676.41422,84889.384072,251622386.03512,4.875261
ADJ\_tok\_per\_mill,86166.7605,83283.729315,79799.685689,120159.50458,311665172.727584,10.357154
NEG\_tok\_per\_mill,10238.704921,10311.474769,6901.384613,13071.895425,37033395.700265,4.379358
tok\_per\_ksent,30390.397533,31139.456758,23524.093124,32195.978249,109922067.875727,7.45249
bigrams\_per\_ksent,1074.949182,1080.0,987.179487,1095.911693,3888091.193028,1.554945
ADV\_tok\_per\_ksent,2107.401994,2133.300071,1581.904052,2228.141808,7622473.012947,4.424594
ADJ\_tok\_per\_ksent,2598.740665,2592.926666,2271.795521,3075.628588,9399644.985559,1.681286


Stylized latex table saved as:
  OverleafDissertex/assets/tables/ch6/BiRC/only-rates-main-stats-birc.2025-02-18.tex



PosixPath('/share/compling/projects/arh234/OverleafDissertex/assets/tables/ch6/BiRC/only-rates-main-stats-birc.2025-02-18.tex')



 ## *Calculate **Reduction*** (<code>BiRC - Full</code>)

In [24]:
# Per-slice difference BiRC - Full (negative = smaller in BiRC).
# The rate columns are subtracted as well, so they hold differences of rates,
# not rates recomputed from the differenced counts.
reduction_df = birc_counts_df.sub(orig_counts_df)
# iloc[:, 1:] drops the uninformative `count` column
# save_latex_table(reduction_df.describe().T.iloc[:, 1:].assign(Total=reduction_df.sum()).convert_dtypes(),
#                  caption='Reduction (BiRC -- Full) Counts & Rates: Descriptive Stats', verbose=True,
#                  latex_subdir=REL_BIRC_TEX_DIR,
#                  latex_stem='plus-rates-birc-minus-full-descrip-stats')
# save_latex_table(reduction_df.filter(like='per').describe().T.iloc[:, 1:].assign(Total=reduction_df.sum()).convert_dtypes(),
#                  caption='Reduction (BiRC -- Full) Rates: Descriptive Stats', verbose=True,
#                  latex_subdir=REL_BIRC_TEX_DIR,
#                  latex_stem='only-rates-birc-minus-full-descrip-stats')
# nb_display(set_my_style(reduction_df.describe().T.assign(Total=reduction_df.sum()).convert_dtypes(),
#                         caption='BiRC <i>Reduction</i>: Descriptive Stats', precision=1))
nb_display(set_my_style(
    reduction_df.filter(like='per').loc[samix, :].T,
    caption='"Rate" columns for <code>BiRC - Full</code> counts'))

corpus_slice,apw_eng_199908,pcc_eng_03-017,pcc_eng_20-025,pcc_eng_21-007
tok_per_bigram,-406.14,-269.87,-273.7,-269.03
bigrams_per_mill,40765.96,31316.74,31413.3,31495.3
ADV_tok_per_mill,48843.23,41650.67,41825.16,42805.93
ADJ_tok_per_mill,44708.84,41530.64,41125.7,43040.8
NEG_tok_per_mill,3124.1,4823.26,4813.62,4989.67
tok_per_ksent,-15.01,19.84,21.47,1.11
bigrams_per_ksent,961.74,975.04,977.95,981.17
ADV_tok_per_ksent,1151.97,1297.23,1302.56,1333.56
ADJ_tok_per_ksent,1053.84,1293.79,1281.12,1340.89
NEG_tok_per_ksent,73.62,150.27,149.96,155.45


In [25]:
# Summary stats of the per-slice differences (stats of `BiRC - Full`)
reduction_stats = _stat(reduction_df)
# show only rows whose label mentions token / sentence / bigram / MB
nb_display(reduction_stats.filter(regex=r'token|sentence|bigram|MB', axis=0))


Unnamed: 0,mean,50%,min,max,Total,CV%
file_MB,-328.23,-331.15,-617.1,-1.07,-1187213.3,-12.8
sentences,-240341.35,-228003.0,-753727.0,-1069.0,-869314654.0,-26.53
tokens,-7219497.96,-7090899.0,-17941402.0,54335.0,-26112924119.0,-18.11
ADV_tokens,-159853.5,-155494.0,-442051.0,-698.0,-578190124.0,-19.52
ADJ_tokens,-280472.61,-262280.0,-899305.0,-1596.0,-1014469421.0,-28.26
NEG_tokens,-36403.68,-34595.0,-115331.0,-190.0,-131672102.0,-25.63
bigrams,0.0,0.0,0.0,0.0,0.0,
tok_per_bigram,-275.71,-271.1,-615.98,3.18,-997259.92,-12.46
bigrams_per_mill,32239.27,31366.82,-4343.04,41552.13,116609438.67,8.55
bigrams_per_ksent,974.03,976.23,920.05,1017.49,3523066.88,0.91


In [26]:
# Difference of the two summary tables (stat(BiRC) - stat(Full)); unlike
# `reduction_stats`, each statistic is computed per collection *before* subtracting.
stats_diff = birc_stats.sub(orig_stats)
nb_display(stats_diff.filter(regex=r'token|sentence|bigram|MB', axis=0))

Unnamed: 0,mean,50%,min,max,Total,CV%
,,,,,,
file_MB,-328.23,-331.19,-1.07,-617.1,-1187213.3,3.4
sentences,-240341.35,-227997.0,-1069.0,-753727.0,-869314654.0,-6.24
tokens,-7219497.96,-7091442.0,-25181.0,-17941402.0,-26112924119.0,-1.92
ADV_tokens,-159853.5,-155561.0,-698.0,-442051.0,-578190124.0,-0.72
ADJ_tokens,-280472.61,-262372.0,-1596.0,-899305.0,-1014469421.0,-5.68
NEG_tokens,-36403.68,-34618.0,-190.0,-115331.0,-131672102.0,-6.0
bigrams,0.0,0.0,0.0,0.0,0.0,0.0
tok_per_bigram,-275.71,-271.08,-2.88,-609.61,-997259.92,-4.89
bigrams_per_mill,32239.27,31366.66,31885.45,4981.57,116609438.67,-12.34


In [27]:
# How far "stats of the differences" deviate from "differences of the stats"
stats_rediff = reduction_stats.sub(stats_diff)
nb_display(stats_rediff.filter(regex=r'token|sentence|bigram|MB', axis=0))


Unnamed: 0,mean,50%,min,max,Total,CV%
file_MB,-0.0,0.04,-616.03,616.03,-0.0,-16.2
sentences,0.0,-6.0,-752658.0,752658.0,0.0,-20.29
tokens,0.0,543.0,-17916221.0,17995737.0,0.0,-16.2
ADV_tokens,0.0,67.0,-441353.0,441353.0,0.0,-18.8
ADJ_tokens,0.0,92.0,-897709.0,897709.0,0.0,-22.58
NEG_tokens,0.0,23.0,-115141.0,115141.0,0.0,-19.63
bigrams,0.0,0.0,0.0,0.0,0.0,
tok_per_bigram,0.0,-0.02,-613.1,612.79,0.0,-7.57
bigrams_per_mill,-0.0,0.16,-36228.49,36570.56,-0.0,20.89
bigrams_per_ksent,0.0,-0.14,-30.26,31.49,0.0,10.09


In [28]:
# %%

def reconfigure_counts(_counts_df, count_kind: str):
    """Tag a counts table with its kind and move that tag into the columns.

    Args:
        _counts_df: Counts table; after `reset_index()` it must expose a
            `corpus_slice` column (e.g. an index named `corpus_slice`).
        count_kind: Label stored in the new `kind` column level.

    Returns:
        DataFrame indexed by `corpus_slice` whose columns are
        (original column, kind) pairs.
    """
    tagged = _counts_df.assign(kind=count_kind)
    flat = tagged.reset_index()
    keyed = flat.set_index(['kind', 'corpus_slice'])
    return keyed.unstack('kind')


# Tag each counts table with its kind ('Full', 'BiRC', 'diff') and join them
# side by side, so every measure has one column per kind.
orig_s = reconfigure_counts(orig_counts_df, count_kind='Full')
birc_s = reconfigure_counts(birc_counts_df, 'BiRC')
diff_s = reconfigure_counts(reduction_df, 'diff')
juxta = orig_s.join(birc_s).join(diff_s).convert_dtypes().sort_index(axis=1)

# juxta = orig_counts_df.join(birc_counts_df, rsuffix=':BiRC', lsuffix=':Full').join(reduction_df.add_suffix(':diff')).convert_dtypes()
# Descriptive statistics per (measure, kind) column; `.iloc[1:]` drops the
# first `describe()` row (the count).
juxta_desc = juxta.describe().iloc[1:, :]
# Append the column sums as an extra row, then upper-case all row labels.
juxta_desc.loc['TOTAL', :] = juxta.sum()
juxta_desc.index = juxta_desc.index.str.upper()
# Coefficient of variation: STD as a percentage of MEAN; missing ratios
# are replaced with 0.
juxta_desc.loc['CV%', :] = (
    (juxta_desc.T.STD / juxta_desc.T.MEAN).fillna(0) * 100)

# save_path = save_latex_table(juxta_desc.T.convert_dtypes(),
#                  caption='Juxtaposed BiRC: Descriptive Stats', verbose=True,
#                  latex_subdir=REL_BIRC_TEX_DIR,
#                  latex_stem='birc-juxtaposed-descrip-stats-complete')
# if save_path is None: 
#     nb_display(set_my_style(juxta_desc.T.convert_dtypes()))
# Display with `kind` moved from the columns into the row index.
juxta_desc.stack('kind')


Unnamed: 0_level_0,Unnamed: 1_level_0,ADJ_form_per_lemma,ADJ_forms,ADJ_lemmas,ADJ_tok_per_bigram,ADJ_tok_per_form,ADJ_tok_per_ksent,ADJ_tok_per_lemma,ADJ_tok_per_mill,ADJ_tokens,ADJ_xpos,...,NEG_tokens,NEG_xpos,bigrams,bigrams_per_ksent,bigrams_per_mill,file_MB,sentences,tok_per_bigram,tok_per_ksent,tokens
Unnamed: 0_level_1,kind,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
MEAN,BiRC,1.09741,6928.595798,6321.665469,2.418362,9.178742,2598.740665,10.070426,86166.7605,63886.013547,3.0,...,7648.191595,9.828864,26399.329278,1074.949182,35567.413546,48.411269,24564.022671,28.245656,30390.397533,744482.659939
MEAN,Full,1.135688,18132.960188,16051.860658,13.180262,18.901968,1299.893474,21.445473,43374.635341,344358.620957,3.0,...,44051.869229,14.020182,26399.329278,100.919081,3328.143801,376.642759,264905.370196,303.960314,30375.293006,7963980.619298
MEAN,diff,-0.038278,-11204.36439,-9730.195189,-10.7619,-9.723226,1298.847191,-11.375047,42792.125159,-280472.607409,0.0,...,-36403.677633,-4.191319,0.0,974.030101,32239.269745,-328.23149,-240341.347526,-275.714659,15.104527,-7219497.959359
STD,BiRC,0.02669,1027.290358,1047.608208,0.065775,0.809496,43.692255,0.83423,8924.42402,12921.907138,0.0,...,1292.408428,1.380076,4871.131447,16.714873,2636.640591,7.827544,4727.443967,1.754868,2264.841388,115699.454022
STD,Full,0.036256,3441.836231,3710.774135,2.863657,2.194204,52.520707,2.180164,12844.330364,89210.573882,0.0,...,10086.446223,2.00618,4871.131447,10.832203,657.449687,48.076895,67510.849923,33.745667,2297.508874,1390308.391637
STD,diff,0.010314,2636.304786,2852.517447,2.824321,1.552152,45.074167,1.526052,12211.102686,79264.142497,0.0,...,9328.879327,1.70466,0.0,8.877768,2756.152844,42.014842,63751.013185,34.36072,489.738945,1307642.670797
MIN,BiRC,1.0,145.0,145.0,2.23856,1.468966,2271.795521,1.468966,79799.685689,213.0,3.0,...,24.0,5.0,77.0,987.179487,33449.171241,0.09,78.0,22.639803,23524.093124,1836.0
MIN,Full,1.007716,653.0,648.0,11.86092,2.770291,952.974036,2.791667,37574.922799,1809.0,3.0,...,214.0,7.0,77.0,36.866788,1563.716266,1.16,1147.0,25.517736,1744.826896,27017.0
MIN,diff,-0.092716,-27789.0,-26866.0,-28.298004,-21.992691,975.717422,-23.004653,-654685.66771,-899305.0,0.0,...,-115331.0,-11.0,0.0,920.047839,-4343.035265,-617.1,-753727.0,-615.97697,-207.146414,-17941402.0
25%,BiRC,1.102833,6788.0,6135.0,2.389707,9.047001,2580.042182,10.000319,82862.948134,61758.0,3.0,...,7608.0,9.0,25764.0,1077.736448,34638.081403,49.26,23867.0,28.757182,31125.878488,741653.0


In [52]:
# Display the summary-statistics table `orig_stats` (one row per measure).
orig_stats

Unnamed: 0,mean,50%,min,max,Total,CV%
,,,,,,
file_MB,376.642759,381.31,1.16,693.62,1362316.86,12.764588
sentences,264905.370196,252229.0,1147.0,820724.0,958162724.0,25.484893
tokens,7963980.619298,7844694.0,27017.0,19532928.0,28805717900.0,17.457456
ADV_tokens,211630.73431,207234.0,840.0,570729.0,765468366.0,18.714909
ADV_xpos,3.0,3.0,3.0,3.0,10851.0,0.0
ADV_lemmas,2373.409732,2380.0,185.0,3473.0,8584623.0,9.654474
ADV_forms,3022.908211,3076.0,195.0,3808.0,10933859.0,10.116005
ADJ_tokens,344358.620957,325172.0,1809.0,1076937.0,1245545132.0,25.906299
ADJ_xpos,3.0,3.0,3.0,3.0,10851.0,0.0


In [29]:
# Label every row with its source corpus, derived from the first three
# characters of the index label (apw/nyt -> News, pcc -> Puddin); any other
# prefix is left unmapped (NaN).
juxta = juxta.assign(
    corpus=juxta.index.to_series().str[:3].map(
        {'apw': 'News', 'nyt': 'News', 'pcc': 'Puddin'}).astype('category'))
# Make `corpus` the outermost index level, ahead of the existing level(s).
# NOTE: this cell rebinds `juxta`, so it is meant to be run once per session.
juxta = juxta.reset_index().set_index(['corpus']+juxta.index.names)
# Sort rows and columns by label.
juxta = juxta.sort_index().sort_index(axis=1)

In [30]:
# Sum every column whose label contains 'tokens' over all rows; `unstack(0)`
# moves the measure names into the columns, leaving one row per kind.
juxta.filter(like='tokens').sum().unstack(0)

Unnamed: 0_level_0,ADJ_tokens,ADV_tokens,NEG_tokens,tokens
kind,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
BiRC,231075711,187278242,27663509,2692793781
Full,1245545132,765468366,159335611,28805717900
diff,-1014469421,-578190124,-131672102,-26112924119


In [31]:
# News rows only, restricted to the 'Full' columns; the `kind` level is
# dropped so columns are plain measure names again.
news_orig = (
    juxta.xs('News')
    .filter(like='Full')
    .droplevel('kind', axis=1)
)
news_stats = _stat(news_orig)
# Headline measures only, with one row per statistic.
news_summary = news_stats.T.filter(
    items=['file_MB', 'sentences', 'tokens',
           'ADV_tokens', 'ADJ_tokens', 'bigrams']
)
nb_display(news_summary.style.set_caption('News Summary'))


Unnamed: 0,file_MB,sentences,tokens,ADV_tokens,ADJ_tokens,bigrams
mean,329.63,375423.08,8919504.15,248527.63,514518.04,28213.03
50%,322.2,345973.5,8227311.0,246215.5,507421.5,26735.5
min,1.16,1147.0,27017.0,840.0,1809.0,77.0
max,693.62,820724.0,19532928.0,570729.0,1076937.0,69407.0
Total,121962.44,138906541.0,3300216537.0,91955223.0,190371676.0,10438820.0
CV%,42.51,46.73,46.88,47.01,41.36,53.18


In [None]:
# News rows whose label contains 'apw', limited to the headline measures.
apw_orig = news_orig.filter(like='apw', axis=0)
apw_brief = apw_orig.filter(['file_MB','sentences','tokens','ADV_tokens','ADJ_tokens', 'bigrams'])

# Stack `describe()` on top of the `_stat` summary; rows repeating identical
# values (statistics present in both tables) are dropped.
apw_stats = pd.concat([apw_brief.describe(), _stat(apw_brief).T]).drop_duplicates()
# `DataFrame.to_latex` documents `escape` as a boolean (the 'latex' string is
# a `Styler.format` option), so request escaping with `True`.
print(apw_stats.to_latex(escape=True))

Unnamed: 0,file_MB,sentences,tokens,ADV_tokens,ADJ_tokens,bigrams
count,178.0,178.0,178.0,178.0,178.0,178.0
mean,301.714382,343115.561798,8121211.213483,189195.05618,470498.426966,18762.146067
std,115.137775,150303.679155,3571605.268637,68871.040949,172235.443565,7180.196888
min,1.16,1147.0,27017.0,840.0,1809.0,77.0
25%,214.525,234476.5,5533844.5,135132.25,346318.0,12910.0
50%,284.41,305206.5,7204914.0,175909.0,458215.0,17008.0
75%,402.9425,479051.75,11387051.5,256488.5,633997.75,25997.5
max,525.86,751043.0,17745454.0,317547.0,775489.0,32508.0


In [51]:
# NOTE(review): `like='nyt_eng_1'` keeps only rows whose label contains that
# exact string, not every 'nyt' row — confirm this restriction is intended.
nyt_orig = news_orig.filter(like='nyt_eng_1', axis=0)
nyt_brief = nyt_orig.filter(['file_MB','sentences','tokens','ADV_tokens','ADJ_tokens', 'bigrams'])

# Stack `describe()` on top of the `_stat` summary; rows repeating identical
# values (statistics present in both tables) are dropped.
nyt_stats = pd.concat([nyt_brief.describe(), _stat(nyt_brief).T]).drop_duplicates()
# `DataFrame.to_latex` documents `escape` as a boolean (the 'latex' string is
# a `Styler.format` option), so request escaping with `True`.
print(nyt_stats.to_latex(escape=True))

\begin{tabular}{lrrrrrr}
\toprule
 & file\_MB & sentences & tokens & ADV\_tokens & ADJ\_tokens & bigrams \\
\midrule
count & 65.000000 & 65.000000 & 65.000000 & 65.000000 & 65.000000 & 65.000000 \\
mean & 490.418462 & 580425.600000 & 13825478.261538 & 408590.246154 & 757593.353846 & 49383.969231 \\
std & 123.421318 & 153231.098662 & 3651586.751842 & 86518.760808 & 186439.351356 & 10112.051827 \\
min & 241.240000 & 260787.000000 & 6194247.000000 & 219127.000000 & 386135.000000 & 27368.000000 \\
25\% & 427.600000 & 513593.000000 & 12250599.000000 & 369495.000000 & 653601.000000 & 45831.000000 \\
50\% & 474.040000 & 569300.000000 & 13582626.000000 & 406979.000000 & 730488.000000 & 49716.000000 \\
75\% & 598.020000 & 713551.000000 & 16990496.000000 & 475242.000000 & 916196.000000 & 56292.000000 \\
max & 693.620000 & 820724.000000 & 19532928.000000 & 570729.000000 & 1076937.000000 & 69407.000000 \\
Total & 31877.200000 & 37727664.000000 & 898656087.000000 & 26558366.000000 & 49243568.000000

In [None]:

# Puddin rows only, restricted to the 'Full' columns; the `kind` level is
# dropped so columns are plain measure names again.
pudd_orig = (
    juxta.xs('Puddin')
    .filter(like='Full')
    .droplevel('kind', axis=1)
)
pudd_stats = _stat(pudd_orig)
# Headline measures only, with one row per statistic.
pudd_summary = pudd_stats.T.filter(
    items=['file_MB', 'sentences', 'tokens',
           'ADV_tokens', 'ADJ_tokens', 'bigrams']
)
nb_display(pudd_summary.style.set_caption('Puddin Summary'))

# Element-wise gap between the two corpus summaries.
nb_display(
    (pudd_summary - news_summary)
    .style.set_caption('Summary Discrepancy<br/>(<code>Puddin - News</code>)')
)

In [38]:
# Measures reported in every corpus summary table below.
_summary_cols = ['file_MB', 'sentences', 'tokens',
                 'ADV_tokens', 'ADJ_tokens', 'bigrams']
# siunitx treats a comma inside `\num{...}` as a decimal marker, so the file
# counts are written without a thousands separator.
_news_caption = 'News Summary Statistics (\\num{370} files)'
_pudd_caption = 'Puddin Summary Statistics (\\num{3247} files)'

# "full" tables: `describe()` without its count row, plus a `total` row.
save_latex_table(
    news_orig.describe().T.assign(total=news_orig.sum())
    .iloc[:, 1:].T.filter(_summary_cols),
    caption=_news_caption, verbose=True,
    latex_subdir='describe_corpora', latex_stem='news-summary-stats-full',
    default_SI='10.0'
)
save_latex_table(
    pudd_orig.describe().T.assign(total=pudd_orig.sum())
    .iloc[:, 1:].T.filter(_summary_cols),
    caption=_pudd_caption, verbose=True,
    latex_subdir='describe_corpora', latex_stem='puddin-summary-stats-full',
    default_SI='11.0'
)
# "main" tables: the shorter `_stat` summaries of the same measures.
save_latex_table(
    news_stats.T.filter(_summary_cols),
    caption=_news_caption, verbose=True,
    latex_subdir='describe_corpora', latex_stem='news-summary-stats-main',
    default_SI='10.0'
)
save_latex_table(
    pudd_stats.T.filter(_summary_cols),
    caption=_pudd_caption, verbose=True,
    latex_subdir='describe_corpora', latex_stem='puddin-summary-stats-main',
    default_SI='11.0'
)

Caption: News Summary Statistics (\num{370} files)
*{1}{l}S[table-auto-round, table-format=10.0, drop-zero-decimal]
    S[table-auto-round, table-format=10.0, drop-zero-decimal]
    S[table-auto-round, table-format=10.0, drop-zero-decimal]
    S[table-auto-round, table-format=10.0, drop-zero-decimal]
    S[table-auto-round, table-format=10.0, drop-zero-decimal]
    S[table-auto-round, table-format=10.0, drop-zero-decimal]


Unnamed: 0,file\_MB,sentences,tokens,ADV\_tokens,ADJ\_tokens,bigrams
,,,,,,
mean,329.63,375423.08,8919504.15,248527.63,514518.04,28213.03
std,140.14,175417.41,4181676.07,116829.05,212779.55,15003.57
min,1.16,1147.0,27017.0,840.0,1809.0,77.0
25\%,226.54,241787.25,5716480.0,159126.0,361820.75,15906.0
50\%,322.2,345973.5,8227311.0,246215.5,507421.5,26735.5
75\%,432.06,503977.0,11997737.75,313945.0,671683.75,38220.25
max,693.62,820724.0,19532928.0,570729.0,1076937.0,69407.0
total,121962.44,138906541.0,3300216537.0,91955223.0,190371676.0,10438820.0


Stylized latex table saved as:
  OverleafDissertex/assets/tables/describe_corpora/news-summary-stats-full.2025-02-18.tex

Caption: Puddin Summary Statistics (\num{3,247} files)
*{1}{l}S[table-auto-round, table-format=11.0, drop-zero-decimal]
    S[table-auto-round, table-format=11.0, drop-zero-decimal]
    S[table-auto-round, table-format=11.0, drop-zero-decimal]
    S[table-auto-round, table-format=11.0, drop-zero-decimal]
    S[table-auto-round, table-format=11.0, drop-zero-decimal]
    S[table-auto-round, table-format=11.0, drop-zero-decimal]


Unnamed: 0,file\_MB,sentences,tokens,ADV\_tokens,ADJ\_tokens,bigrams
,,,,,,
mean,382.0,252311.73,7855097.43,207426.28,324968.73,26192.66
std,7.85,5320.44,222552.2,4798.07,6512.56,652.0
min,260.26,173016.0,435945.0,138338.0,221985.0,17084.0
25\%,377.56,249219.0,7755111.5,204621.5,321259.0,25806.5
50\%,381.5,251996.0,7844206.0,207094.0,324679.0,26164.0
75\%,385.61,254840.0,7936449.5,209803.5,328024.5,26539.0
max,437.54,290337.0,9334512.0,238165.0,373124.0,30002.0
total,1240354.42,819256183.0,25505501363.0,673513143.0,1055173456.0,85047554.0


Stylized latex table saved as:
  OverleafDissertex/assets/tables/describe_corpora/puddin-summary-stats-full.2025-02-18.tex

Caption: News Summary Statistics (\num{370} files)
*{1}{l}S[table-auto-round, table-format=10.0, drop-zero-decimal]
    S[table-auto-round, table-format=10.0, drop-zero-decimal]
    S[table-auto-round, table-format=10.0, drop-zero-decimal]
    S[table-auto-round, table-format=10.0, drop-zero-decimal]
    S[table-auto-round, table-format=10.0, drop-zero-decimal]
    S[table-auto-round, table-format=10.0, drop-zero-decimal]


Unnamed: 0,file\_MB,sentences,tokens,ADV\_tokens,ADJ\_tokens,bigrams
,,,,,,
mean,329.63,375423.08,8919504.15,248527.63,514518.04,28213.03
50\%,322.2,345973.5,8227311.0,246215.5,507421.5,26735.5
min,1.16,1147.0,27017.0,840.0,1809.0,77.0
max,693.62,820724.0,19532928.0,570729.0,1076937.0,69407.0
Total,121962.44,138906541.0,3300216537.0,91955223.0,190371676.0,10438820.0
CV\%,42.51,46.73,46.88,47.01,41.36,53.18


Stylized latex table saved as:
  OverleafDissertex/assets/tables/describe_corpora/news-summary-stats-main.2025-02-18.tex

Caption: Puddin Summary Statistics (\num{3,247} files)
*{1}{l}S[table-auto-round, table-format=11.0, drop-zero-decimal]
    S[table-auto-round, table-format=11.0, drop-zero-decimal]
    S[table-auto-round, table-format=11.0, drop-zero-decimal]
    S[table-auto-round, table-format=11.0, drop-zero-decimal]
    S[table-auto-round, table-format=11.0, drop-zero-decimal]
    S[table-auto-round, table-format=11.0, drop-zero-decimal]


Unnamed: 0,file\_MB,sentences,tokens,ADV\_tokens,ADJ\_tokens,bigrams
,,,,,,
mean,382.0,252311.73,7855097.43,207426.28,324968.73,26192.66
50\%,381.5,251996.0,7844206.0,207094.0,324679.0,26164.0
min,260.26,173016.0,435945.0,138338.0,221985.0,17084.0
max,437.54,290337.0,9334512.0,238165.0,373124.0,30002.0
Total,1240354.42,819256183.0,25505501363.0,673513143.0,1055173456.0,85047554.0
CV\%,2.06,2.11,2.83,2.31,2.0,2.49


Stylized latex table saved as:
  OverleafDissertex/assets/tables/describe_corpora/puddin-summary-stats-main.2025-02-18.tex



PosixPath('/share/compling/projects/arh234/OverleafDissertex/assets/tables/describe_corpora/puddin-summary-stats-main.2025-02-18.tex')

In [None]:
# Grand totals of the headline News measures.
# `_stat` output has one row per measure and one column per statistic, so
# after keeping the 'Total' column the measure names must be matched against
# the rows (`axis=0`). `DataFrame.filter` defaults to the columns, which
# would leave an empty table here.
nb_display(
    _stat(
        news_orig.filter(['file_MB', 'sentences', 'tokens',
                          'ADV_tokens', 'ADJ_tokens', 'NEG_tokens', 'bigrams'
                          ])
    )
    .filter(['Total'])
    .filter(['file_MB', 'sentences', 'tokens',
             'ADV_tokens', 'ADJ_tokens', 'NEG_tokens', 'bigrams',
             'ADV_tok_per_mill', 'ADJ_tok_per_mill', 'bigrams_per_mill',
             'bigrams_per_ksent', 'tokens_per_ksent'], axis=0)
    .style.set_caption('News'))


Unnamed: 0,Total
file_MB,121962.44
sentences,138906541.0
tokens,3300216537.0
ADV_tokens,91955223.0
ADJ_tokens,190371676.0
NEG_tokens,21611437.0
bigrams,10438820.0


In [72]:
def save_totals_by_corpus(corp_stats, corp_name:str):
    """Save the 'Total' column of a stats table, plus rate columns, as LaTeX.

    Args:
        corp_stats: Stats table with a 'Total' column and one row per measure.
        corp_name: Corpus label used in the caption; lower-cased with spaces
            replaced by hyphens to build the output file stem.
    """
    # NOTE(review): 'tokens_per_ksent' is requested here, while the counts
    # tables elsewhere in this notebook name that measure 'tok_per_ksent';
    # labels missing from the table are silently skipped by `filter`.
    wanted = ['file_MB', 'sentences', 'tokens',
            'ADV_tokens', 'ADJ_tokens', 'NEG_tokens', 'bigrams',
            'ADV_tok_per_mill', 'ADJ_tok_per_mill', 'bigrams_per_mill',
            'bigrams_per_ksent', 'tokens_per_ksent']
    # One 'Total' row with the measures as columns, extended by add_rate_cols.
    totals_row = corp_stats.filter(['Total']).T
    totals = add_rate_cols(totals_row).filter(wanted).T
    save_latex_table(
        totals,
        verbose=True, default_SI=11.0,
        caption=f'{corp_name} Totals', latex_subdir='describe_corpora',
        latex_stem=f"{corp_name.lower().replace(' ','-')}-totals-full")


save_totals_by_corpus(pudd_stats, 'CoCra Puddin')

Caption: CoCra Puddin Totals
*{1}{l}S[table-auto-round, table-format=11.0, drop-zero-decimal]


Unnamed: 0,Total
,
file\_MB,1240354.42
sentences,819256183.0
tokens,25505501363.0
ADV\_tokens,673513143.0
ADJ\_tokens,1055173456.0
NEG\_tokens,137724174.0
bigrams,85047554.0
ADV\_tok\_per\_mill,26406.583169
ADJ\_tok\_per\_mill,41370.425971


Stylized latex table saved as:
  OverleafDissertex/assets/tables/describe_corpora/cocra-puddin-totals-full.2025-02-18.tex



In [73]:
# Save the News totals table.
save_totals_by_corpus(news_stats, 'News')

Caption: News Totals
*{1}{l}S[table-auto-round, table-format=11.0, drop-zero-decimal]


Unnamed: 0,Total
,
file\_MB,121962.44
sentences,138906541.0
tokens,3300216537.0
ADV\_tokens,91955223.0
ADJ\_tokens,190371676.0
NEG\_tokens,21611437.0
bigrams,10438820.0
ADV\_tok\_per\_mill,27863.390771
ADJ\_tok\_per\_mill,57684.60156


Stylized latex table saved as:
  OverleafDissertex/assets/tables/describe_corpora/news-totals-full.2025-02-18.tex



In [76]:
# Save the element-wise difference of the two stats tables (Puddin - News).
# The rate rows are produced by `add_rate_cols` from the differenced totals.
save_totals_by_corpus(pudd_stats - news_stats, 'Puddin Minus News')


Caption: Puddin Minus News Totals
*{1}{l}S[table-auto-round, table-format=11.0, drop-zero-decimal]


Unnamed: 0,Total
,
file\_MB,1118391.98
sentences,680349642.0
tokens,22205284826.0
ADV\_tokens,581557920.0
ADJ\_tokens,864801780.0
NEG\_tokens,116112737.0
bigrams,74608734.0
ADV\_tok\_per\_mill,26190.06802
ADJ\_tok\_per\_mill,38945.763893


Stylized latex table saved as:
  OverleafDissertex/assets/tables/describe_corpora/puddin-minus-news-totals-full.2025-02-18.tex



In [78]:
# Save the totals from `orig_stats`, captioned as Puddin and News combined.
save_totals_by_corpus(orig_stats, 'Puddin and News Combined')

Caption: Puddin and News Combined Totals
*{1}{l}S[table-auto-round, table-format=11.0, drop-zero-decimal]


Unnamed: 0,Total
,
file\_MB,1362316.86
sentences,958162724.0
tokens,28805717900.0
ADV\_tokens,765468366.0
ADJ\_tokens,1245545132.0
NEG\_tokens,159335611.0
bigrams,95486374.0
ADV\_tok\_per\_mill,26573.486856
ADJ\_tok\_per\_mill,43239.510167


Stylized latex table saved as:
  OverleafDissertex/assets/tables/describe_corpora/puddin-and-news-combined-totals-full.2025-02-18.tex



In [None]:
# Move `kind` into the rows and keep only the columns containing 'tokens'.
corpus_tokens_stack = juxta.stack(1).filter(like='tokens')
corpus_tokens_stack.columns.name = 'token_type'
# Long format: one `count` row per (row label, kind, token_type), with the
# three grouping keys turned back into ordinary columns.
corpus_tokens_stack = (
    corpus_tokens_stack
    .stack()
    .to_frame('count')
    .reset_index(level=['corpus', 'kind', 'token_type'])
    .sort_values(['corpus', 'kind', 'token_type'])
)
# Number of rows available for each key combination.
corpus_tokens_stack.value_counts(['corpus', 'kind', 'token_type'])


In [None]:
# Summed counts per (corpus, kind) and token type ...
_totals_by_corpus = style_crosstab(
    corpus_tokens_stack,
    ['corpus', 'kind'], ['token_type'],
    value_col='count', aggfunc='sum', return_cross_df=True,
).sort_index().sort_index(axis=1)
# ... and the same sums per kind only, labelled as the 'News+Puddin' corpus.
_totals_pooled = (
    style_crosstab(
        corpus_tokens_stack,
        ['kind'], ['token_type'],
        value_col='count', aggfunc='sum', return_cross_df=True,
    )
    .assign(corpus='News+Puddin')
    .reset_index()
    .set_index(['corpus', 'kind'])
    .sort_index()
)
corpus_kind_token_totals = pd.concat(
    [_totals_by_corpus, _totals_pooled]
).sort_index(axis=1)
# save_latex_table(corpus_kind_token_totals.style.format(escape='latex'), 
#                  caption='BiRC Token Count Comparisons by Corpus \& Overall', 
#                  latex_stem='birc-token-counts-by-corpus-kind',
#                  latex_subdir=REL_BIRC_TEX_DIR, verbose=True, default_SI=-11.0, 
#                  neg_color='BrickRed'
#                                         )

In [None]:
col = 'file_MB'
def build_agg_table(col:str, _juxta):
    """Aggregate one measure per corpus and overall, for every kind.

    Args:
        col: Measure to aggregate; columns are selected by substring match
            (`filter(like=col)`), so it should identify a single measure.
        _juxta: Juxtaposed counts with a `corpus` index level and
            (measure, kind) column pairs.

    Returns:
        DataFrame indexed by (corpus, kind) with sum/mean/min/max columns;
        the per-corpus rows come first, followed by the 'Overall' rows.
    """
    agg_funcs = ['sum', 'mean', 'min', 'max']
    # Columns for the requested measure only, one per kind.
    per_file = _juxta.stack(1).filter(like=col).unstack(['kind'])

    # Aggregates over every row, labelled as the 'Overall' corpus.
    overall = per_file.aggregate(agg_funcs).T
    overall = overall.droplevel(0, axis=0).assign(corpus='Overall')
    overall = overall.reset_index().set_index(['corpus', 'kind'])

    # The same aggregates computed within each corpus.
    grouped = per_file.groupby('corpus', observed=True).aggregate(agg_funcs)
    per_corpus = grouped.droplevel(0, axis=1).stack('kind')
    return pd.concat((per_corpus, overall))

# File-size aggregates per corpus and overall, saved as a LaTeX table.
agg_df = build_agg_table('file_MB', juxta)
save_latex_table(agg_df,
                 # `\\&` gives the LaTeX-escaped ampersand; a bare `\&` is an
                 # invalid Python escape sequence and triggers a warning.
                 caption='BiRC File Size (MB) Comparisons by Corpus \\& Overall',
                 latex_stem='birc-file-MB-by-corpus-and-overall',
                 latex_subdir=REL_BIRC_TEX_DIR, verbose=True, default_SI=-7.1,
                 neg_color='BrickRed'
                 )

In [None]:
# Sentence-count aggregates per corpus and overall, saved as a LaTeX table.
save_latex_table(build_agg_table('sentences', juxta),
                 # `\\&` gives the LaTeX-escaped ampersand; a bare `\&` is an
                 # invalid Python escape sequence and triggers a warning.
                 caption='BiRC Sentence Count Comparisons by Corpus \\& Overall',
                 latex_stem='birc-sent-by-corpus-and-overall',
                 latex_subdir=REL_BIRC_TEX_DIR, verbose=True, default_SI=-9.0,
                 neg_color='BrickRed'
                 )

In [None]:
# %%
save_latex_table(
    pd.concat(
        # One random 'apw' row and one random 'nyt' row, then two random
        # 'pcc' rows. The draw is unseeded, so it differs between runs.
        [juxta.filter(like=prefix, axis=0).sample(n_rows)
         for prefix, n_rows in (('apw', 1), ('nyt', 1), ('pcc', 2))]
    ).drop_duplicates().sort_index().T,
    caption=('Sample of Juxtaposed Counts'),
    verbose=True,
    latex_subdir=REL_BIRC_TEX_DIR,
    latex_stem='birc-juxtaposed-sample-2x2')

In [None]:
def describe_and_total(_df):
    """Return `_df.describe()` with an extra `total` row of column sums."""
    # Work on the transposed summary so the sums can be attached as a column,
    # then flip back so statistics are rows again.
    by_column = _df.describe().T
    by_column = by_column.assign(total=_df.sum())
    return by_column.T


# Descriptive statistics (plus totals) computed separately for each corpus.
by_corpus_descrip = juxta.groupby(
    'corpus', observed=True).apply(describe_and_total)
# Name the axes so the reshaping below can refer to levels by name.
by_corpus_descrip.index.names = ['corpus', 'stat']
by_corpus_descrip.columns.names = ['obs', 'kind']

# Reshape to rows = (obs, stat) and columns = (corpus, kind), then give the
# count and median rows friendlier labels.
by_corpus_descrip = (
    by_corpus_descrip
    .stack(['obs', 'kind'])
    .unstack(['corpus', 'kind', 'stat'])
    .stack('stat')
    .rename(index={
            'count': '# files',
            '50%': 'median'}))
nb_display(by_corpus_descrip)


In [None]:
# save_latex_table(
#     by_corpus_descrip.xs('file_MB').style,
#     caption=('BiRC Storage Size (MB) Comparison by Corpus '),
#     verbose=True,
#     latex_subdir=REL_BIRC_TEX_DIR,
#     latex_stem='birc-by-corpus-size-compare-summary'
# )
# save_latex_table(
#     by_corpus_descrip.xs('tokens').style,
#     caption=('BiRC Token Comparison by Corpus '),
#     verbose=True,
#     latex_subdir=REL_BIRC_TEX_DIR,
#     latex_stem='birc-by-corpus-token-compare-summary'
# )

# save_latex_table(
#     by_corpus_descrip.xs('sentences').style,
#     caption=('BiRC Sentence Comparison by Corpus '),
#     verbose=True,
#     latex_subdir=REL_BIRC_TEX_DIR,
#     latex_stem='birc-by-corpus-sentence-compare-summary'
# )
