In [1]:
import datasets
import itertools
import copy
import pandas as pd
import numpy as np
import math
import os
import sys

# Prefer the notebook (widget) progress bar. Fall back to the plain tqdm bar
# only when the notebook submodule cannot be imported; any other error
# (including KeyboardInterrupt) is allowed to propagate.
try:
    from tqdm.notebook import tqdm
except ImportError:
    from tqdm import tqdm

from utils import ProjectManager
PM = ProjectManager.ProjectManager()  # project helper; used below to load the datasets
from utils.my_latex_tools import *

# Column-name groups: per-word features, per-sentence features, and the
# remaining annotation keys that are not treated as features.
word_level_feature_keys = [
    'words',
    'pos_tags',
    'predicate_lemmas',
    'predicate_framenet_ids',
    'word_senses',
    'named_entities',
]
sentence_level_feature_keys = [
    'part_id',
    'parse_tree',
    'speaker',
]
non_feature_keys = [
    'srl_frames',
    'coref_spans',
]

# Fetch the sentence-level and word-level feature tables through the ProjectManager.
sentence_features = PM.load_dataset("sentence_features")
word_features = PM.load_dataset("word_features")

# Preview the first rows of each table.
sentence_features.head()
word_features.head()

Unnamed: 0,sentence_idx,part_idx,document_idx,document_id,part_id,parse_tree,speaker,sentences
0,0,0,0,bc/cctv/00/cctv_0001,0,(TOP(SBARQ(WHNP(WHNP (WP What) (NN kind) )(PP...,Speaker#1,"['What', 'kind', 'of', 'memory', '?']"
1,1,1,0,bc/cctv/00/cctv_0001,0,(TOP(S(NP (PRP We) )(ADVP (RB respectfully) )(...,Speaker#1,"['We', 'respectfully', 'invite', 'you', 'to', ..."
2,2,2,0,bc/cctv/00/cctv_0001,0,(TOP(NP(NP(NP (NNP WW) (NNP II) (NNPS Landma...,Speaker#1,"['WW', 'II', 'Landmarks', 'on', 'the', 'Great'..."
3,3,3,0,bc/cctv/00/cctv_0001,0,(TOP(SINV(VP (VBG Standing) (S(ADJP (JJ tall) ...,Speaker#1,"['Standing', 'tall', 'on', 'Taihang', 'Mountai..."
4,4,4,0,bc/cctv/00/cctv_0001,0,(TOP(S(NP (PRP It) )(VP (VBZ is) (VP (VBN comp...,Speaker#1,"['It', 'is', 'composed', 'of', 'a', 'primary',..."


Unnamed: 0,word_order,word_idx,sentence_idx,words,pos_tags,predicate_lemmas,predicate_framenet_ids,word_senses,named_entities,pos_names,...,POS_7,POS_51_id,POS_12_id,POS_7_id,is_in_POS_6,function,tree_depth,unigram_probs,bigram_probs,trigram_probs
0,0,0,0,What,48,,,,0,WP,...,Noun,48,2,0,True,0,5,7.784044,14.373151,14.373151
1,1,1,0,kind,25,,,,0,NN,...,Noun,25,1,0,True,0,5,8.019463,11.305098,14.635171
2,2,2,0,of,18,,,,0,IN,...,Adposition,18,5,2,True,1,5,3.770704,8.346078,11.567118
3,3,3,0,memory,25,memory,,1.0,0,NN,...,Noun,25,1,0,True,0,6,10.404441,12.986857,14.635171
4,4,4,0,?,8,,,,0,.,...,X,8,11,6,False,2,3,6.262053,14.373151,14.635171


In [2]:
# Pull out the POS tag columns at three granularities (51, 12 and 7 categories).
POS_51, POS_12, POS_7 = (
    word_features[column] for column in ("POS_51", "POS_12", "POS_7")
)

# POS-6: the POS_7 column restricted to rows flagged `is_in_POS_6`,
# exposed under the name "POS_6".
POS_6 = (
    word_features
    .loc[word_features["is_in_POS_6"] == True]
    .rename(columns={"POS_7": "POS_6"})
    ["POS_6"]
)

def create_tex_pos_info_for_raw_data_appendix(series, title="title", fn='test_df', label="", v=0):
    """
    Create a LaTeX table for the distribution of POS categories in the dataset.

    Args:
        series (pd.Series): The series containing POS data.
        title (str): Caption text for the table. The number of non-null
            observations is appended as " (n=...)", formatted by human_format.
        fn (str): The filename handed to save_df_as_tex.
        label (str): The label handed to save_df_as_tex. When empty (the
            default), `fn` is used as the label.
        v (int): Verbosity level, forwarded to save_df_as_tex.

    Returns:
        pd.DataFrame: Two columns, "POS" (category) and "Proportion"
        (percentage of non-null observations), ordered from most to least
        frequent.

    Side effects:
        Calls save_df_as_tex with the table (presumably writing the LaTeX
        file to disk; see that helper for details).
    """
    # count() ignores missing values, matching value_counts' default dropna=True,
    # so the reported n is the denominator of the proportions.
    total = series.count()

    # Percentage share of each category, most frequent first.
    proportions = series.value_counts(normalize=True) * 100
    df = proportions.to_frame().reset_index()
    df.columns = ["POS", "Proportion"]

    # Caption = caller-supplied title plus the sample size.
    caption = f"{title} (n={human_format(total)})"

    # Forward `label` and `v` instead of hard-coding them; an empty label
    # falls back to the filename, which is what was always passed before.
    save_df_as_tex(
        df,
        fn=fn,
        caption=caption,
        label=label or fn,
        v=v,
        index=False,
        float_format="%.2f",
        escape=True,
    )

    return df

# Emit one LaTeX distribution table per POS tag set (51, 12, 7 and 6 categories).
# Each call stays a top-level expression so its returned DataFrame can be shown.
create_tex_pos_info_for_raw_data_appendix(
    POS_51,
    title="Categorical statistics for POS-51",
    fn='pos_51_stats',
)
create_tex_pos_info_for_raw_data_appendix(
    POS_12,
    title="Categorical statistics for POS-12",
    fn='pos_12_stats',
)
create_tex_pos_info_for_raw_data_appendix(
    POS_7,
    title="Categorical statistics for POS-7",
    fn='pos_7_stats',
)
create_tex_pos_info_for_raw_data_appendix(
    POS_6,
    title="Categorical statistics for POS-6",
    fn='pos_6_stats',
)


\begin{table}
    \centering
    \input{../inputs/tbl/pos_51_stats}
    \caption{caption for tbl:pos 51 stats}
    \label{tbl:pos_51_stats}
    \end{table} % \ref{tbl:pos_51_stats}


Unnamed: 0,POS,Proportion
0,NN,12.131993
1,IN,10.324328
2,DT,8.439498
3,NNP,7.502767
4,JJ,5.413271
5,NNS,5.038776
6,.,4.646407
7,XX,4.618881
8,",",4.227572
9,RB,4.113852


\begin{table}
    \centering
    \input{../inputs/tbl/pos_12_stats}
    \caption{caption for tbl:pos 12 stats}
    \label{tbl:pos_12_stats}
    \end{table} % \ref{tbl:pos_12_stats}


Unnamed: 0,POS,Proportion
0,NOUN,24.932412
1,VERB,15.380212
2,.,10.816527
3,ADP,10.324328
4,DET,9.138336
5,X,6.200497
6,ADJ,5.814414
7,PRON,5.605362
8,ADV,4.666182
9,CONJ,2.848502


\begin{table}
    \centering
    \input{../inputs/tbl/pos_7_stats}
    \caption{caption for tbl:pos 7 stats}
    \label{tbl:pos_7_stats}
    \end{table} % \ref{tbl:pos_7_stats}


Unnamed: 0,POS,Proportion
0,Noun,30.537774
1,X,24.138753
2,Verb,15.380212
3,Adposition,10.324328
4,Determiner,9.138336
5,Adjective,5.814414
6,Adverb,4.666182


\begin{table}
    \centering
    \input{../inputs/tbl/pos_6_stats}
    \caption{caption for tbl:pos 6 stats}
    \label{tbl:pos_6_stats}
    \end{table} % \ref{tbl:pos_6_stats}


Unnamed: 0,POS,Proportion
0,Noun,40.254775
1,Verb,20.274136
2,Adposition,13.609489
3,Determiner,12.046119
4,Adjective,7.664538
5,Adverb,6.150943
