In [1]:
try:
    import davos
except:
    %pip install davos
    import davos
davos.config.suppress_stdout = True

In [2]:
smuggle requests                                            # pip: requests==2.28.2
from tqdm smuggle tqdm                                      # pip: tqdm==4.65.0
smuggle pandas as pd                                        # pip: pandas==2.0.1
smuggle numpy as np                                         # pip: numpy==1.25.2
smuggle seaborn as sns                                      # pip: seaborn==0.12.2
from matplotlib smuggle pyplot as plt                       # pip: matplotlib==3.7.1
from IPython.display smuggle Markdown
smuggle openpyxl                                            # pip: openpyxl==3.1.2
smuggle contractions                                        # pip: contractions==0.1.73
from nltk.tokenize smuggle word_tokenize, sent_tokenize     # pip: nltk==3.8.1
from nltk smuggle pos_tag
from scipy.stats import ttest_1samp                         # pip: scipy==1.11.2

smuggle nltk
nltk.download('averaged_perceptron_tagger', quiet=True)

smuggle re
smuggle os
smuggle urllib
smuggle json
smuggle string
smuggle warnings
smuggle pickle
smuggle zipfile
from glob smuggle glob as lsdir
from collections import defaultdict
from pathlib smuggle Path

# Predicting references to past and future events

At a high level, the goal of this meta analysis is to predict in-text references to past and future events.  Manually identifying these references is labor and time intensive, so it is impractical to scale up manual tagging to millions of documents.  Instead, we've defined some heuristics for *predicting* when text is referring to real or hypothetical past or future events.  Our approach comprises four main steps:

1. First we use the `nltk` package to segment each document into individual sentences. Each sentence is processed independently of the others.
2. Next, we handle contractions using the `contractions` package (e.g., "we'll" gets split into "we will," and so on).
3. Third, we define two sets of "keywords" (words and phrases) that tend to be indicative of referring to the past (`past_keywords.txt`) or future (`future_keywords.txt`).  We used ChatGPT (`gpt-4`) to generate each list, requesting 50 keywords or phrases per list, using the following prompt:
```
I'm designing a heuristic algorithm for identifying references (in text) to past and future events. Part of the algorithm will involve looking for specific keywords or phrases that suggest that the text is referring to something that happened (or will happen) in the past and/or future. Could you help me generate a list of 50 keywords or phrases to include in each list (one list for identifying references to the past and a second list for identifying references to the future)? I'd like to be able to paste the lists you generate into two plain text documents with one row per keyword or phrase, and no other content. Please output the lists as a "code" block (enclosed by ```...```).
```
4. Finally, we use part-of-speech tagging (using the `nltk` package) to look for verbs or verb phrases that are in past or future tenses. After the words are tagged with their predicted parts of speech, we use regular expressions (applied to the sequences of tags) to label each verb or verb phrase with a human readable verb form (e.g., "future perfect continuous passive," "conditional perfect continuous passive," and so on).

Within each sentence, we treat the presence of one or more past (or future) keyword matches as a single "reference" to a past (or future) event, and if any past or future verb forms are detected we count (up to) one additional reference.  Each sentence therefore contributes between 0 and 2 past references and between 0 and 2 future references.  We then tally up the numbers of past and/or future references across sentences within the document.

The `process_folder` function returns two things:
  - `df_results` is a DataFrame with one row per document (index), and the following columns:
    - `Past`: the number of references to past events identified in the document
    - `Future`: the number of references to future events identified in the document
  - `sent_results` is a dictionary whose keys are filenames of .txt files in the given folder, and whose values are DataFrames with one row per sentence in the given document.  The per-document DataFrames have the following columns:
    - `content`: the text of the given sentence
    - `past`: the number of references to past events identified in the given sentence
    - `future`: the number of references to future events identified in the given sentence

In the metaanalysis reported in our paper, we only use results from the `df_results` DataFrames.  However, the `sent_results` dictionaries are useful for spot-checking how the heuristics are working, and for digging into results for any given document(s).

Running the `process_folder` function can take a long time if there are many documents to process.  We save out the results as pickle files after running the function for the first time on a given directory so that the analysis only needs to be run one time per folder.

In [59]:
def load_keywords(filename):
    """Read a keyword file (one keyword or phrase per line) into a set.

    Leading/trailing whitespace is stripped from every line.  Blank lines are
    skipped: an empty-string keyword would otherwise match *every* sentence,
    because ``'' in sentence`` is always true.

    Parameters
    ----------
    filename : str or path-like
        Path to a UTF-8 text file with one keyword or phrase per line.

    Returns
    -------
    set of str
        The unique, non-empty keywords found in the file.
    """
    # Explicit encoding so the result does not depend on the platform default
    # (the document reader in this notebook also opens files as UTF-8).
    with open(filename, 'r', encoding='utf-8') as f:
        stripped_lines = (line.strip() for line in f)
        return {keyword for keyword in stripped_lines if keyword}


def handle_contractions(sentence):
    """Return ``sentence`` after passing it through ``contractions.fix``."""
    expanded = contractions.fix(sentence)
    return expanded


def sentence_tense(x):
  """Return the set of verb-form labels detected in the text ``x``.

  ``x`` is tokenized with ``word_tokenize`` and tagged with ``pos_tag``; the
  tagged tokens are then handed to ``tense_detect`` (defined below), whose
  result is returned unchanged.

  Returns a ``set`` of grammar rule names (e.g. ``'past indefinite'``); the
  set is empty when no rule matches.
  """
  # source: https://stackoverflow.com/questions/30016904/determining-tense-of-a-sentence-python
  def tense_detect(tagged_sentence):        
    """Chunk the verb-like tags of ``tagged_sentence`` and collect rule labels.

    ``tagged_sentence`` is a sequence of ``(token, tag)`` pairs.  Only pairs
    whose tag appears in ``verb_tags`` are kept, so the chunk grammar is
    matched against the filtered sequence (tokens with other tags that sat
    between two kept tokens are no longer between them).
    """
    # NOTE(review): several of these tags (MDF, BE*, DO*, HV*, SH) look like
    # Brown-corpus style tags, while nltk's default `pos_tag` presumably emits
    # Penn Treebank tags -- if so, only MD/VB*/TO/JJ can ever match. Confirm.
    # NOTE(review): 'JJ' is kept by the filter but is not referenced by any
    # rule in the grammar below.
    verb_tags = ['MD','MDF',
                'BE','BEG','BEN','BED','BEDZ','BEZ','BEM','BER',
                'DO','DOD','DOZ',
                'HV','HVG','HVN','HVD','HVZ',
                'VB','VBG','VBN','VBD','VBZ',
                'SH',
                'TO',                
                'JJ']
    
    # Keep only the (token, tag) pairs whose tag is in verb_tags.
    verb_phrase = []
    for item in tagged_sentence:
        if item[1] in verb_tags:
            verb_phrase.append(item)

    # One chunk rule per line: "<label>: {<tag pattern>}".  The label of each
    # matched chunk is what ends up in the returned set.
    grammar = r'''
            future perfect continuous passive:     {<MDF><HV><BEN><BEG><VBN|VBD>+}
            conditional perfect continuous passive:{<MD><HV><BEN><BEG><VBN|VBD>+}
            future continuous passive:             {<MDF><BE><BEG><VBN|VBD>+}   
            conditional continuous passive:        {<MD><BE><BEG><VBN|VBD>+}    
            future perfect continuous:             {<MDF><HV><BEN><VBG|HVG|BEG>+}   
            conditional perfect continuous:        {<MD><HV><BEN><VBG|HVG|BEG>+}
            past perfect continuous passive:       {<HVD><BEN><BEG><VBN|VBD>+}
            present perfect continuous passive:    {<HV|HVZ><BEN><BEG><VBN|VBD>+}
            future perfect passive:                {<MDF><HV><BEN><VBN|VBD>+}   
            conditional perfect passive:           {<MD><HV><BEN><VBN|VBD>+}    
            future continuous:                     {<MDF><BE><VBG|HVG|BEG>+ }   
            conditional continuous:                {<MD><BE><VBG|HVG|BEG>+  }   
            future indefinite passive:             {<MDF><BE><VBN|VBD>+ }
            conditional indefinite passive:        {<MD><BE><VBN|VBD>+  }
            future perfect:                        {<MDF><HV><HVN|BEN|VBN|VBD>+ }   
            conditional perfect:                   {<MD><HV><HVN|BEN|VBN|VBD>+  }   
            past continuous passive:               {<BED|BEDZ><BEG><VBN|VBD>+}  
            past perfect continuous:               {<HVD><BEN><HVG|BEG|VBG>+}   
            past perfect passive:                  {<HVD><BEN><VBN|VBD>+}
            present continuous passive:            {<BEM|BER|BEZ><BEG><VBN|VBD>+}   
            present perfect continuous:            {<HV|HVZ><BEN><VBG|BEG|HVG>+}    
            present perfect passive:               {<HV|HVZ><BEN><VBN|VBD>+}
            future indefinite:                     {<MDF><BE|DO|VB|HV>+ }       
            conditional indefinite:                {<MD><BE|DO|VB|HV>+  }   
            past continuous:                       {<BED|BEDZ><VBG|HVG|BEG>+}           
            past perfect:                          {<HVD><BEN|VBN|HVD|HVN>+}
            past indefinite passive:               {<BED|BEDZ><VBN|VBD>+}   
            present indefinite passive:            {<BEM|BER|BEZ><VBN|VBD>+}            
            present continuous:                    {<BEM|BER|BEZ><BEG|VBG|HVG>+}            
            present perfect:                       {<HV|HVZ><BEN|HVD|VBN|VBD>+  }       
            past indefinite:                       {<DOD><VB|HV|DO>|<BEDZ|BED|HVD|VBN|VBD>+}        
            infinitive:                            {<TO><BE|HV|VB>+}
            present indefinite:                    {<DO|DOZ><DO|HV|VB>+|<DO|HV|VB|BEZ|DOZ|BER|HVZ|BEM|VBZ>+}    
            '''

    # The parser is only built and run when at least one verb-like token was
    # kept; otherwise there is nothing to iterate over below.
    if len(verb_phrase) > 0:
      cp = nltk.RegexpParser(grammar)
      result = cp.parse(verb_phrase)
    else:
      result = []
    
    # Matched chunks appear as Tree nodes in the parse result; collect their
    # labels (duplicates collapse because a set is used).
    tenses_set = set()
    for node in result:
      if type(node) is nltk.tree.Tree:
        tenses_set.add(node.label())
    
    return tenses_set
    
  text = word_tokenize(x)
  tagged = pos_tag(text)
  return tense_detect(tagged)


def analyze_sentence(sentence, past_keywords, future_keywords):
    """Count past and future references in a single sentence.

    The sentence is first passed through ``handle_contractions``.  Each
    direction (past / future) can then earn at most two points:

    * one point if *any* keyword of that direction occurs in the sentence
      (plain, case-sensitive substring test -- so a keyword also matches
      inside longer words);
    * one more point if ``sentence_tense`` reports a matching verb form
      (a label containing ``'past'``; or a label containing ``'future'`` or
      ``'conditional indefinite'``).

    Returns
    -------
    tuple of (int, int)
        ``(past_count, future_count)``, each between 0 and 2.
    """
    expanded = handle_contractions(sentence)

    # Keyword evidence: at most one reference per direction
    past_count = 1 if any(kw in expanded for kw in past_keywords) else 0
    future_count = 1 if any(kw in expanded for kw in future_keywords) else 0

    # Verb-form evidence: at most one additional reference per direction
    tense_labels = sentence_tense(expanded)
    if any('past' in label for label in tense_labels):
        past_count += 1
    if any('future' in label or 'conditional indefinite' in label
           for label in tense_labels):
        future_count += 1

    return past_count, future_count


def process_folder(folder_path, past_keywords, future_keywords):
    """Run the past/future heuristics on every ``.txt`` file in a folder.

    Parameters
    ----------
    folder_path : str or path-like
        Directory whose ``.txt`` files (read as UTF-8) are analyzed.  Other
        files are ignored.
    past_keywords, future_keywords : collection of str
        Keyword sets forwarded to ``analyze_sentence``.

    Returns
    -------
    df_results : pandas.DataFrame
        One row per document (index = file name) with columns ``Past`` and
        ``Future`` holding the per-document totals.  As before, documents in
        which no sentence was found get no row here.
    sentence_dfs : dict of str -> pandas.DataFrame
        For every ``.txt`` file, a frame with one row per sentence and the
        columns ``content``, ``past`` and ``future``.
    """
    totals = {}        # file name -> {"Past": n, "Future": n}
    sentence_dfs = {}

    for file_name in tqdm(os.listdir(folder_path)):
        if not file_name.endswith(".txt"):
            continue

        with open(os.path.join(folder_path, file_name), 'r', encoding="utf-8") as f:
            content = f.read()

        # Collect plain records and build the frame once per file; growing a
        # DataFrame row-by-row (via the private `_append`) copies the whole
        # frame on every sentence.
        records = []
        for sentence in nltk.sent_tokenize(content):
            past_count, future_count = analyze_sentence(sentence, past_keywords, future_keywords)
            records.append({"content": sentence, "past": past_count, "future": future_count})

        sentence_dfs[file_name] = pd.DataFrame(records, columns=["content", "past", "future"])

        # Only documents with at least one sentence contribute a totals row.
        if records:
            totals[file_name] = {
                "Past": sum(rec["past"] for rec in records),
                "Future": sum(rec["future"] for rec in records),
            }

    # Explicit columns keep `Past`/`Future` present even when nothing was processed.
    df_results = pd.DataFrame(list(totals.values()),
                              index=list(totals.keys()),
                              columns=["Past", "Future"])

    return df_results, sentence_dfs

## Keywords that reflect past events

In [4]:
# Load the past-oriented keywords from <parent of cwd>/data and list them.
# `load_keywords` returns a set, so the display order is not stable across runs.
past_keywords = load_keywords(str(Path.cwd().parent / 'data' / 'past_keywords.txt'))
Markdown('**Past keywords and phrases:**\n\n' + ' | '.join(past_keywords))

**Past keywords and phrases:**

final | so far | used to be | to date | had | made | last night | long ago | already | last season | concluded | were | once | previously | last month | ceased | earlier | in the past | before | said | up to now | heretofore | last year | wrote | terminated | last semester | yesteryear | was | antiquity | last time | since | in those days | did | thus far | back when | last quarter | ago | formerly | elapsed | olden days | yesterday | recently | once upon a time | then | expired | hitherto | used to | historically | last week | bygone

## Keywords that reflect future events

In [5]:
# Load the future-oriented keywords from <parent of cwd>/data and list them.
# `load_keywords` returns a set, so the display order is not stable across runs.
future_keywords = load_keywords(str(Path.cwd().parent / 'data' / 'future_keywords.txt'))
Markdown('**Future keywords and phrases:**\n\n' + ' | '.join(future_keywords))

**Future keywords and phrases:**

to be | prospective | futuristic | next time | tomorrow | on the horizon | imminently | next quarter | forthcoming | soon | next year | next season | could | subsequent | impending | can | down the line | in time | eventual | later on | going to | predicted | may | in the future | some day | might | succeeding | anticipated | shall | next week | looming | scheduled to | later | in the cards | intend to | eventually | hereafter | upcoming | after | will | next semester | shortly | in the works | next month | plan to

# Download the data

We'll examine 12 datasets that span several broad categories of documents: *film* (transcripts of movies or excerpts of transcripts from movies), *television* (transcripts of television shows or excerpts of transcripts from television shows), *speech* (transcripts of spoken communication), and *text* (written works or conversations that took place using text-based media).

The datasets are summarized in the DataFrame below:

In [6]:
# Spreadsheet with one row per dataset (names, URLs, category, sizes).
data_list = Path.cwd().parent / 'data' / 'metaanalysis-datasets.xlsx'
data = pd.read_excel(data_list)
data

Unnamed: 0,Dataset,Short name,Data URL,Source URL,Results URL,Description,Category,Number of observations,Observation type,Number of words
0,Internet Movie Script Database,IMSDb,https://www.dropbox.com/scl/fi/ct39vqqq9sjqyyh...,https://imsdb.com,https://www.dropbox.com/scl/fi/3gq5ieq7l25719i...,A collection of transcripts from roughly 1000 ...,Film,1091,Transcript,26023348
1,Movie Dialogues Dataset,Movies,https://www.dropbox.com/s/881yuhil48v6q1n/movi...,https://convokit.cornell.edu/documentation/mov...,https://www.dropbox.com/scl/fi/arxkyhub2fi6qh5...,A large collection of fictional conversations ...,Film,304713,Utterance,3209921
2,Switchboard Dialog Act Corpus,Switchboard,https://www.dropbox.com/s/qvx4211u41l2ex4/swit...,https://convokit.cornell.edu/documentation/swi...,https://www.dropbox.com/scl/fi/1o7wqdlc1oo26y6...,A collection of five-minute telephone conversa...,Speech,122646,Utterance,2052779
3,Supreme Court Corpus,SCOTUS,https://www.dropbox.com/s/icxk3ubo2u2brzq/supr...,https://convokit.cornell.edu/documentation/sup...,https://www.dropbox.com/scl/fi/zxkvlrg4lfxcv7c...,A collection of cases from the U.S. Supreme Co...,Speech,1700789,Utterance,71889094
4,Tennis Interviews,Tennis,https://www.dropbox.com/s/q7bfirllnu32mao/tenn...,https://convokit.cornell.edu/documentation/ten...,https://www.dropbox.com/scl/fi/d3g83mtz4mqhbpm...,Transcripts for tennis singles post-match pres...,Speech,163948,Utterance,7043118
5,Persuasion for Good Corpus,PfG,https://www.dropbox.com/scl/fi/ei7uxv9husg9noj...,https://convokit.cornell.edu/documentation/per...,https://www.dropbox.com/scl/fi/zmumd8uno58cqzo...,A collection of online conversations generated...,Speech,20932,Utterance,351759
6,Intelligence Squared Debates Corpus,IQ2,https://www.dropbox.com/scl/fi/srg1j0m4rhgoqhl...,https://convokit.cornell.edu/documentation/iq2...,https://www.dropbox.com/scl/fi/3d4eha6r6xop7h0...,This dataset contains transcripts of debates h...,Speech,26562,Utterance,1898509
7,Group Affect and Performance Corpus,GAP,https://www.dropbox.com/scl/fi/j1zh1pey7m8kcyr...,https://convokit.cornell.edu/documentation/gap...,https://www.dropbox.com/scl/fi/prk03sodn4pg895...,Group members completed a Winter Survival Task...,Speech,8009,Utterance,45989
8,The Chair,Chair,https://www.dropbox.com/scl/fi/9cpj3t1n1ktxghu...,https://scrapsfromtheloft.com/?s=THE+CHAIR,https://www.dropbox.com/scl/fi/to0642t939pvrtz...,"Scraped transcripts from The Chair, Season 1.",Television,6,Transcript,19197
9,Friends Corpus,Friends,https://www.dropbox.com/s/nfaa6ap0ws1rqjy/frie...,https://convokit.cornell.edu/documentation/fri...,https://www.dropbox.com/scl/fi/mkxc114g90rifsm...,A collection of all the conversations that occ...,Television,67373,Utterance,622894


# Detect past and future events

For each dataset in the `data` DataFrame, we'll:
  - Download and extract the dataset if it doesn't already exist locally
  - Check to see whether the metaanalysis has already been run on that folder.  If not, we'll run the `process_folder` function on the dataset's directory and save the results

In [7]:
def get_folder_name(url):
    """Derive a folder name from the first ``.zip`` path segment of ``url``.

    The URL is split on ``/``; the first segment containing ``.zip`` is taken,
    anything from the first ``?`` onward is discarded, and the last four
    characters (the ``.zip`` suffix) are dropped.

    Raises ``IndexError`` when no segment of the URL contains ``.zip``.
    """
    zip_segments = [segment for segment in url.split('/') if '.zip' in segment]
    archive_name, _, _ = zip_segments[0].partition('?')
    return archive_name[:-len('.zip')]

def download_dataset(url, outdir):
    """Download a zipped dataset and extract it into ``outdir``.

    The archive is saved in the current working directory under the name
    ``get_folder_name(url) + '.zip'``, extracted, and then deleted.

    Parameters
    ----------
    url : str
        Direct download URL of the ``.zip`` archive.
    outdir : str or path-like
        Directory the archive contents are extracted into.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status.  Without this check an
        HTML error page would be written to disk and only fail later as a
        confusing ``zipfile.BadZipFile``.
    """
    # Download dataset
    filename = get_folder_name(url) + '.zip'
    response = requests.get(url)
    response.raise_for_status()

    with open(filename, 'wb') as f:
        f.write(response.content)

    try:
        # Unzip dataset
        with zipfile.ZipFile(filename, 'r') as zip_ref:
            zip_ref.extractall(outdir)
    finally:
        # Delete zip file, even if extraction failed, so no stale or corrupt
        # archive is left behind in the working directory.
        os.remove(filename)

In [115]:
datadir = Path.cwd().parent.joinpath('data')
results = []    # per-dataset long-format frames; concatenated into one frame below
sentence = []   # per-dataset dicts of per-sentence frames, in the row order of `data`

# should we just download the already-completed results or compute them from scratch?
# NOTE: this only matters for datasets whose *_results.pkl is not yet in `datadir`;
# an existing results file is always reused.
force_rerun = False

for i, row in data.iterrows():
    print('Processing dataset: ' + row['Dataset'])
    results_fname = datadir.joinpath(row['Short name'].lower() + '_results.pkl')
    if not results_fname.exists():
        if force_rerun:
            # check whether the dataset exists locally and has at least 5 .txt files
            next_datadir = datadir.joinpath(get_folder_name(row['Data URL']))
            if not (next_datadir.exists() and len(lsdir(str(next_datadir.joinpath('*.txt')))) >= 5):
                # download the dataset
                download_dataset(row['Data URL'], datadir)

            # process the dataset
            df_results, sentence_dfs = process_folder(next_datadir, past_keywords, future_keywords)
            with open(results_fname, 'wb') as f:
                pickle.dump([df_results, sentence_dfs], f)
        else:
            x = requests.get(row['Results URL'])
            # Fail before touching the cache: otherwise an error page would be
            # saved as *_results.pkl and silently block every later run.
            x.raise_for_status()
            with open(results_fname, 'wb') as f:
                f.write(x.content)

    # SECURITY: unpickling executes arbitrary code.  Only load results that were
    # produced locally or downloaded from a URL you trust.
    with open(results_fname, 'rb') as f:
        next_results, sentence_dfs = pickle.load(f)

    # Long format: one row per (document, tense) with the raw count and the
    # share of that document's references falling into the tense.
    next_results = next_results.reset_index().rename(columns={"index": "filename"}).melt(
        id_vars=["filename"], var_name="tense", value_name="count")
    next_results['proportion'] = next_results['count'] / next_results.groupby('filename')['count'].transform('sum')
    next_results['Dataset'] = row['Short name']

    results.append(next_results)
    sentence.append(sentence_dfs)
results = pd.concat(results)

Processing dataset: Internet Movie Script Database
Processing dataset: Movie Dialogues Dataset
Processing dataset: Switchboard Dialog Act Corpus
Processing dataset: Supreme Court Corpus
Processing dataset: Tennis Interviews
Processing dataset: Persuasion for Good Corpus
Processing dataset: Intelligence Squared Debates Corpus
Processing dataset: Group Affect and Performance Corpus
Processing dataset: The Chair
Processing dataset: Friends Corpus
Processing dataset: Gutenberg Dialogue Dataset
Processing dataset: Reddit Corpus


## The Chair

In [None]:
results = []
# `sentence` holds one entry per row of `data`, in the same order, so look the
# dataset up by its short name instead of hard-coding a position (position 0
# is the first dataset in the table, not The Chair).
d = data['Short name'].tolist().index('Chair')

for key in sentence[d].keys():
    res = {}
    res['episode'] = key
    res['n_sentences'] = len(sentence[d][key])
    # `.astype(bool).sum()` counts sentences with at least one reference,
    # not the total number of references.
    res['n_past_refs_c'] = sentence[d][key]['past'].astype(bool).sum()
    res['n_future_refs_c'] = sentence[d][key]['future'].astype(bool).sum()
    print(res)
    results.append(res)

# pd.DataFrame(results).to_csv("../data/the_chair/the_chair_auto_reference_counts.csv", index=False)

## All datasets

In [None]:
results = []

for d in range(len(sentence)):
    doc_frames = list(sentence[d].values())   # one per-sentence frame per document
    res = {}
    # `sentence` was filled in the row order of `data`, so position d in
    # `sentence` corresponds to positional row d of `data`.  (The previously
    # referenced `data_filter` is not defined anywhere in this notebook.)
    res['dataset'] = data['Short name'].iloc[d]
    res['n_files'] = len(doc_frames)
    res['n_sentences'] = sum(len(frame) for frame in doc_frames)
    # Sentences with at least one past / future reference
    res['n_past_refs_c'] = sum(frame['past'].astype(bool).sum() for frame in doc_frames)
    res['n_future_refs_c'] = sum(frame['future'].astype(bool).sum() for frame in doc_frames)
    results.append(res)

# pd.DataFrame(results).to_csv("../data/ref_counts_summary.csv", index=False)

# Figure S12

In [None]:
from bokeh.palettes import Category20c

In [None]:
# Per-episode reference counts for The Chair from two sources
# (presumably heuristic "auto" counts vs. hand-tagged "manual" counts -- confirm).
auto = pd.read_csv("../data/the_chair/the_chair_auto_reference_counts.csv")
manual = pd.read_csv("../data/the_chair/the_chair_manual_reference_counts.csv")

# Past-to-future ratio per episode
auto['auto_ratio'] = auto['Past'].div(auto['Future'])
manual['manual_ratio'] = manual['Past'].div(manual['Future'])

# Long format: one row per (Episode, Direction)
auto_long = auto[['Episode','Past','Future']].melt(
    id_vars=['Episode'], var_name='Direction', value_name='auto_count')
manual_long = manual[['Episode','Past','Future']].melt(
    id_vars=['Episode'], var_name='Direction', value_name='manual_count')

# Share of each episode's references that fall into each direction
auto_long['auto_proportion'] = auto_long['auto_count'].div(
    auto_long.groupby('Episode')['auto_count'].transform('sum'))
manual_long['manual_proportion'] = manual_long['manual_count'].div(
    manual_long.groupby('Episode')['manual_count'].transform('sum'))

# Side-by-side tables; in `count_all` the `_x` columns come from `manual`
# (left frame) and the `_y` columns from `auto` (right frame).
count_all_long = pd.merge(manual_long, auto_long, on=['Episode','Direction'])
count_all = pd.merge(manual, auto, on=['Episode'])
count_all['manual_prop'] = count_all['Past_x'].div(count_all['Past_x'].add(count_all['Future_x']))
count_all['auto_prop'] = count_all['Past_y'].div(count_all['Past_y'].add(count_all['Future_y']))

In [None]:
# NOTE(review): `alt` is not imported in any visible cell of this notebook;
# presumably `import altair as alt` is required before this cell -- confirm.

# Panel A: raw counts -- bars use `auto_count`, ticks use `manual_count`,
# faceted by direction.
bar_count = alt.Chart().mark_bar(color=Category20c[20][18]).encode(
    x='Episode:O',
    y=alt.Y('auto_count', title="Number of references"),
    # column='direction'
).properties(
    width=alt.Step(30)  # controls width of bar.
)
tick_count = alt.Chart().mark_tick(color='#8C6238', thickness=2,).encode(
    x='Episode:O',
    y='manual_count',
    # column='Direction',
)
# Layer the two charts defined just above (the names `bar` and `tick` used
# previously are not defined anywhere in this notebook).
count_plot = alt.layer(bar_count, tick_count, data=count_all_long).facet(column=alt.Column('Direction', sort="descending",  title='', header=alt.Header(labelOrient='top', titleFontSize=14, labelFontSize=14, titleFontWeight='normal', titlePadding=0))).properties(title='A')

# Panel B: Past / (Past + Future) per episode
bar_prop = alt.Chart(count_all).mark_bar(color=Category20c[20][18]).encode(
    x='Episode:O',
    y=alt.Y('auto_prop', scale=alt.Scale(domain=[0,1]), axis=alt.Axis(format='%'), title="Past / (Past + Future) %"),
).properties(
    width=alt.Step(30)  # controls width of bar.
)

tick_prop = alt.Chart(count_all).mark_tick(color='#8C6238', thickness=2,).encode(
    x='Episode:O',
    y=alt.Y('manual_prop', scale=alt.Scale(domain=[0,1]), axis=alt.Axis(format='%')),
)
prop_plot = (bar_prop+tick_prop).properties(title='B')

# Panel C: Past / Future ratio per episode on a log2 axis
bar_ratio = alt.Chart(count_all).mark_bar(color=Category20c[20][18]).encode(
    x='Episode:O',
    y=alt.Y('auto_ratio', scale=alt.Scale(domain=[1,6], type="log", base=2), title="Past / Future Ratio (log scale)"),
).properties(
    width=alt.Step(30)  # controls width of bar.
)

tick_ratio = alt.Chart(count_all).mark_tick(color='#8C6238', thickness=2,).encode(
    x='Episode:O',
    y=alt.Y('manual_ratio', scale=alt.Scale(domain=[1,6], type="log", base=2)),
)

ratio_plot = (bar_ratio+tick_ratio).properties(title='C')

# Final figure: the three panels side by side with shared font settings
(count_plot | prop_plot | ratio_plot
).configure_legend(

).configure_axis(
    titleFontSize=14,
    labelFontSize=14,
    titleFontWeight='normal',
    labelFontWeight='normal',
).configure_concat(
    spacing=50
).configure_title(
    fontSize=20,
    anchor='start',
#     offset=20
)

# Stats

In [None]:
import numpy as np
from scipy.stats import chi2_contingency

In [None]:
# Per-dataset summary table, read from the current working directory.
# NOTE(review): the (commented-out) export earlier in this notebook targets
# "../data/ref_counts_summary.csv" and produces different columns than the ones
# used below, so this file was presumably assembled separately -- confirm.
df = pd.read_csv("ref_counts_summary.csv")
df

Unnamed: 0,dataset,type,source,full,non-empty,is_equal,past,future,total,corrected_past,corrected_future,past_prop,future_prop,RR,non_past,non_future
0,IMSDb,Scripted,,1091,1091,True,833026,472519,3080674,657475,316525,0.213419,0.102745,2.077166,2423199,2764149
1,Movies,Scripted,ConvoKit,304713,304446,False,179729,129622,516163,127744,85937,0.247488,0.166492,1.486484,388419,430226
2,Switchboard,Spontaneous,ConvoKit,122646,122646,True,62464,32372,245461,41488,22079,0.169021,0.089949,1.879071,203973,223382
3,SCOTUS,Constrained,ConvoKit,1700789,1700789,True,3089509,1802239,3880259,1963578,1207377,0.506043,0.311159,1.626317,1916681,2672882
4,Tennis,Constrained,ConvoKit,163948,163948,True,448444,193802,599172,281669,134638,0.470097,0.224707,2.092047,317503,464534
5,PfG,Constrained,ConvoKit,20932,20932,True,9695,15520,37184,7408,9771,0.199225,0.262774,0.758162,29776,27413
6,IQ2,Constrained,ConvoKit,26562,26317,False,67626,51780,122925,46630,34811,0.379337,0.283189,1.339519,76295,88114
7,GAP,Constrained,ConvoKit,8009,8009,True,2739,1958,8009,1800,1338,0.224747,0.167062,1.345291,6209,6671
8,Chair,Scripted,,6,6,True,909,663,2900,660,460,0.227586,0.158621,1.434783,2240,2440
9,Friends,Scripted,ConvoKit,67373,61310,False,32105,23931,107082,22067,16356,0.206076,0.152743,1.349169,85015,90726


In [None]:
# Sum of the `corrected_past` and `corrected_future` columns over all rows of `df`
total = df['corrected_past'].sum() + df['corrected_future'].sum()
total

24040006

In [None]:
# Sum of the `corrected_past` column over all rows of `df`
df['corrected_past'].sum()

13471984

In [None]:
# Share of `total` contributed by the `corrected_past` column
df['corrected_past'].sum() / total

0.5603985290186699

In [None]:
# Sum of the `corrected_future` column over all rows of `df`
df['corrected_future'].sum()

10568022

In [None]:
# Share of `total` contributed by the `corrected_future` column
df['corrected_future'].sum() / total

0.43960147098133

In [None]:
# For every row of `df`, run a chi-square test on the 2x2 table
#   [[corrected_past,   non_past],
#    [corrected_future, non_future]]
# and print the dataset name followed by the p-value.
for idx, row in df.iterrows():
    print(row['dataset'])
    contingency = [
        [row['corrected_past'], row['non_past']],
        [row['corrected_future'], row['non_future']],
    ]
    stat, p, dof, expected = chi2_contingency(contingency)
    print(p)

IMSDb
0.0
Movies
0.0
Switchboard
0.0
SCOTUS
0.0
Tennis
0.0
PfG
7.651987244822489e-94
IQ2
0.0
GAP
4.415379561347529e-20
Chair
3.6004633077240275e-11
Friends
7.023120502011673e-227
Gutenberg
0.0
Reddit
0.0
