In [1]:
#Exploratory notebook

In [2]:
# TODO: point DATA_ROOT at your local copy of the dataset.
# Every directory used below is derived from this single root, so it is the only
# value that has to be edited. Keep "/" as separator and do not add a trailing
# slash: the path-parsing code further down splits the paths on "/".
DATA_ROOT = './1789to1824_DebatesAndProceedings'

text_volume_path = DATA_ROOT + '/text_volumes'
speeches_path = DATA_ROOT + '/speeches'
tuples_path = DATA_ROOT + '/df_tuples'
bigrams_path = DATA_ROOT + '/bigrams'


In [7]:
import os
import warnings

import numpy as np
import pandas as pd

def list_all_extension_files(directory_path, extension='.txt'):
    """
    Recursively list the files under directory_path whose name ends with extension.

    :param directory_path: str, root directory to walk (sub-directories are included)
    :param extension: str, file-name suffix to keep, dot included (default '.txt')
    :return: list of str, one path per matching file, built by joining the walked
             directory with the file name. The list is empty when nothing matches
             or when directory_path does not exist.
    """
    files_paths = []
    for root, _dirs, file_names in os.walk(directory_path):
        # endswith() keeps only real suffix matches; a substring search would also
        # accept names such as "page_1.txt.bak".
        files_paths.extend(
            os.path.join(root, file_name)
            for file_name in file_names
            if file_name.endswith(extension)
        )
    return files_paths


def parse_path(df, path_name, extension, page=True):
    """
    Derive page / appendix / volume columns from the file paths stored in df.

    Each path is split on "/" from the right into at most four pieces. The last
    piece is the file name; the two pieces before it are the directories that
    are compared with 'Appendix' and stripped of the 'Volume_' prefix, i.e. the
    paths are expected to end with either
        Volume_<n>/Appendix/<file>   or   Volume_<n>/<file>

    The frame is modified in place and also returned.

    :param df: pandas.DataFrame holding the paths
    :param path_name: str, name of the column of df that contains the paths
    :param extension: str, file extension removed (as literal text) from the file
                      name before the page number is read, e.g. '.txt'
    :param page: bool, when True also add an integer 'page' column obtained from
                 file names of the form page_<number><extension>
    :return: df, with the columns 'page' (optional), 'isAppendix' (bool) and
             'volume' (int) added or overwritten
    """
    # Split only once; the original code repeated this split for every column.
    parts = df[path_name].str.rsplit("/", n=3, expand=True)

    if page:
        # regex=False: the extension and prefix are literal text, so the "." of
        # ".txt" must not act as a regex wildcard (the default differs between
        # pandas versions).
        page_ids = (
            parts[3]
            .str.replace(extension, "", regex=False)
            .str.replace("page_", "", regex=False)
        )
        df['page'] = page_ids.astype(int).to_numpy()

    is_appendix = (parts[2] == 'Appendix').to_numpy()
    df['isAppendix'] = is_appendix

    # For appendix files the volume directory sits one level higher.
    # The values are assigned positionally (plain arrays), so the result does
    # not depend on index alignment and also works with duplicate index labels.
    volume_dirs = pd.Series(np.where(is_appendix, parts[1], parts[2]))
    df['volume'] = (
        volume_dirs.str.replace("Volume_", "", regex=False).astype(int).to_numpy()
    )

    return df


In [10]:
def read_text_volumes(text_volume_path, encoding=None):
    """
    Load every .txt file found under text_volume_path into a DataFrame.

    :param text_volume_path: str, directory that is searched recursively for .txt files
    :param encoding: str or None, encoding passed to open(); None (the default)
                     keeps the platform default, as before
    :return: pandas.DataFrame with one row per file and the columns 'path',
             the columns added by parse_path ('page', 'isAppendix', 'volume')
             and 'text' (the full content of the file)
    """
    def _read_file(file_path):
        # The context manager closes the handle right after reading; a bare
        # open(...).read() leaves one open file per row until garbage collection.
        with open(file_path, "r", encoding=encoding) as handle:
            return handle.read()

    text_volumes = list_all_extension_files(text_volume_path, extension='.txt')
    if len(text_volumes) == 0:
        # Nothing to parse: return an empty frame with the usual columns instead
        # of failing inside parse_path.
        return pd.DataFrame(columns=['path', 'page', 'isAppendix', 'volume', 'text'])

    df_text = pd.DataFrame(data=text_volumes, columns=['path'])
    df_text = parse_path(df_text, path_name='path', extension='.txt')
    df_text['text'] = df_text.path.apply(_read_file)

    return df_text


# The directory <text_volumes> contains all the OCRed text: one .txt file per page
# that was scraped from the web.

# The dataframe "df_text" has one row per file found under "text_volumes". Each
# path corresponds to a specific Volume/Appendix/page file; the file is read and its
# content is stored in the column "text".

df_text = read_text_volumes(text_volume_path)

In [None]:
def read_speeches(speeches_path, n_max):
    """
    Load at most n_max speeches CSV files and stack them into one DataFrame.

    Each file is read on its own, tagged with its own path and passed through
    parse_path before being stacked, so every row keeps the path / page /
    volume of the file it came from.

    :param speeches_path: str, directory that is searched recursively for .csv files
    :param n_max: int, maximum number of files to load (the first n_max found)
    :return: pandas.DataFrame with the rows of all files that could be loaded,
             re-indexed from 0. Columns missing from a file are filled with NaN
             by the concatenation. An empty DataFrame is returned when no file
             could be loaded.
    """
    speeches_paths = list_all_extension_files(speeches_path, extension='.csv')[:n_max]

    frames = []
    for path in speeches_paths:
        try:
            page_speeches = pd.read_csv(path)
            # Tag only this file's rows; assigning the path after concatenating
            # would overwrite the path of every file loaded so far.
            page_speeches['path'] = path
            frames.append(parse_path(page_speeches, 'path', extension='.csv'))
        except Exception as error:
            # Best effort: one unreadable file must not abort the whole load,
            # but the skip is reported instead of being silently swallowed.
            warnings.warn(f"File {path} not parsed: {error!r}")

    if not frames:
        return pd.DataFrame()

    # A single concat at the end avoids re-copying the accumulated frame on
    # every iteration.
    return pd.concat(frames, ignore_index=True)

# The directory <speeches_path> contains the csv files with the speeches given
# by each congressman on a given page. It has as many files as the number of OCRed pages.
# Since loading all of them produces a huge dataframe, n_max limits the load to a few pages.

# In the dataframe "speeches" there is one row per congressman in a particular
# Volume/Appendix/page combination.
# The column 'name' holds the congressman who gave the speeches; the speeches are in the columns
# labelled with an integer. Some values are NaN because the function above
# concatenates dataframes that may have a different number of columns.
speeches = read_speeches(speeches_path, n_max=50)
speeches.head()

In [25]:
def read_tuple_file(path):
    """
    Read a tuples CSV file and give its first two columns readable names.

    :param path: str, path of the CSV file
    :return: pandas.DataFrame in which the columns pandas labelled
             'Unnamed: 0' and 'Unnamed: 1' are renamed 'word0' and 'word1'
    """
    word_columns = {'Unnamed: 0': 'word0', 'Unnamed: 1': 'word1'}
    return pd.read_csv(path).rename(columns=word_columns)

# The directory <tuples_path> contains one file per Volume/Appendix combination, which means
# that there are fewer files but bigger dataframes; hence, in this cell, only one of them is loaded.
paths = list_all_extension_files(tuples_path, extension='.csv')


# The dataframe df_tuples has one row per tuples file, i.e. per Volume/Appendix combination.
df_tuples = pd.DataFrame(data=paths, columns=['path'])
# Pre-create the 'volume' column; parse_path overwrites it with the volume number.
df_tuples['volume'] = df_tuples.path
# parse_path modifies df_tuples in place and returns it, so df_tuple is the same frame.
df_tuple = parse_path(df_tuples, path_name='path', extension='.csv', page=False)

# Path of the tuples file of Volume 6 (main text, not the Appendix).
# .iloc[0] takes the first matching row by position; .path[0] would look up the
# index label 0 and raise a KeyError unless the match happens to carry that label.
tuple_path_1 = df_tuple[((df_tuple.volume==6)&(df_tuple.isAppendix==False))].path.iloc[0]

# In the dataframe "df_tuple_1", the columns word0 and word1 are the bigram extracted from the text,
# the other columns represent the congressmen, and each value is the number of times
# that congressman used the bigram (w0,w1) in the Volume/Appendix associated with tuple_path_1
df_tuple_1 = read_tuple_file(tuple_path_1)


In [26]:
def read_congress(path_c):
    """
    Read a per-congress bigrams CSV file.

    :param path_c: str, path of the CSV file
    :return: pandas.DataFrame in which the column pandas labelled 'Unnamed: 0'
             is renamed 'congressmen'
    """
    return pd.read_csv(path_c).rename(columns={'Unnamed: 0': 'congressmen'})

# The directory <bigrams_path> contains one file per congress.
# There are fewer congresses than volumes because one congress usually spans 2-3 volumes.
bigrams_paths = list_all_extension_files(bigrams_path, extension='.csv')

# The dataframe "bigrams_df" contains one row per congress number. It also contains the path of the
# associated bigrams .csv file.
bigrams_df = pd.DataFrame(data=bigrams_paths, columns=['path'])
# Take the file name (last path component) so that the congress number is found
# regardless of how many directories bigrams_path is made of.
bigrams_df['congress'] = bigrams_df.path.str.rsplit("/", n=1).str[-1]
# regex=False: ".csv" and "congress_" are literal text, not regular expressions.
bigrams_df['congress'] = bigrams_df['congress'].str.replace(".csv", "", regex=False)
bigrams_df['congress'] = bigrams_df['congress'].str.replace("congress_", "", regex=False).astype(int)

# The dataframe congress_18 contains the occurrences of each bigram in that congress.
# One row holds the bigrams said by one congressman, whose name is in the column "congressmen".
# The columns are the name of the congressman, all the possible bigrams and the party affiliation.
# .iloc[0] takes the first matching row by position; .path[0] would look up the
# index label 0 and raise a KeyError unless the match happens to carry that label.
p_congress18 = bigrams_df[bigrams_df.congress==18].path.iloc[0]

congress_18 = read_congress(p_congress18)

In [31]:
# Column labels of congress_18: 'congressmen', one column per bigram, then 'party_code'.
congress_18.columns

Index(['congressmen', '('00', '10900')', '('00', '20213')', '('00', '30')',
       '('1', '12')', '('1', '1823')', '('1', '1824')', '('1', '2')',
       '('1', 'believe')', '('1', 'from')',
       ...
       '('мarch', '1824')', '('мay', '1824')', '('н', 'оf')', '('н', 'оr')',
       '('не', 'wa')', '('о', 'а')', '('оf', 'r')', '('оr', 'r')',
       '('соmmerce', 'with')', 'party_code'],
      dtype='object', length=22926)

In [32]:
# Preview the first rows of congress_18 (one row per congressman; values are bigram counts).
congress_18.head()

Unnamed: 0,congressmen,"('00', '10900')","('00', '20213')","('00', '30')","('1', '12')","('1', '1823')","('1', '1824')","('1', '2')","('1', 'believe')","('1', 'from')",...,"('мarch', '1824')","('мay', '1824')","('н', 'оf')","('н', 'оr')","('не', 'wa')","('о', 'а')","('оf', 'r')","('оr', 'r')","('соmmerce', 'with')",party_code
0,abbot,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,7777
1,adams,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,8888
2,alexander,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1346
3,alexander,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,7777
4,allen,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,8000
