In [17]:
# imports
import numpy as np
import pandas as pd
import json
from tqdm import tqdm
import os
import bz2

In [29]:
def generate_speaker_affiliations(parquet_path, out_path, remove_raw=False):
    """Extract one party affiliation per speaker and pickle the result.

    Reads the ``id``, ``label`` and ``party`` columns of the parquet file,
    keeps the rows where all three are present and ``party`` is non-empty,
    converts ``id`` and the first ``party`` entry to integers by dropping
    their first character (presumably the "Q" prefix of a Wikidata-style
    identifier - confirm against the data source), prints the head of the
    resulting frame and writes it to ``out_path`` with ``DataFrame.to_pickle``.

    Args:
        parquet_path: Path of the speaker-attributes parquet file.
        out_path: Destination path of the pickled DataFrame.
        remove_raw: If True, delete ``parquet_path`` once the pickle is written.
    """
    # Read only the columns that are used instead of loading the whole file.
    speaker_info = pd.read_parquet(parquet_path, columns=["id", "label", "party"])

    # Keep the speakers that have an assigned political affiliation.
    # NOTE: dropna() also drops rows whose id or label is missing.
    speaker_info = speaker_info.dropna()

    # An empty party sequence has no first element; skip those rows rather
    # than crashing with IndexError below.
    has_party = speaker_info["party"].map(len) > 0
    speaker_info = speaker_info[has_party]

    # Take the first affiliation only.
    # TODO: this is most likely NOT the best method, think what to do when
    # someone has multiple affiliations.
    # assign() builds a new frame, so no column is written into a filtered view.
    speaker_info = speaker_info.assign(
        party=speaker_info["party"].map(lambda parties: int(parties[0][1:])),
        id=speaker_info["id"].map(lambda raw_id: int(raw_id[1:])),
    )

    print(f"Speaker affiliation DF:\n {speaker_info.head()}")

    speaker_info.to_pickle(out_path)

    if remove_raw:
        os.remove(parquet_path)

In [43]:
def save_pickle(json_path_bz2, pickle_path, remove_raw=False):
    """Convert a bz2-compressed JSON-lines quotation file into a pickled DataFrame.

    Each non-blank line is parsed as one JSON object. Records whose ``qids``
    list is empty are skipped. For the others, the first ``qids`` entry with
    its first character dropped is parsed as an integer ``speaker_id``, and
    ``quoteID`` / ``quotation`` are copied to ``quote_id`` / ``quotation``.

    The pickled frame always has the columns ``speaker_id``, ``quote_id`` and
    ``quotation``, even when no record was kept.

    Args:
        json_path_bz2: Path of the bz2-compressed JSON-lines input file.
        pickle_path: Destination path of the pickled DataFrame.
        remove_raw: If True, delete ``json_path_bz2`` once the pickle is written.
    """
    columns = ["speaker_id", "quote_id", "quotation"]
    rows = []
    with bz2.open(json_path_bz2, 'rb') as s_file:
        print("Quotation file opened...")
        for line in tqdm(s_file):
            # A blank line (e.g. trailing newline) is not a record.
            if not line.strip():
                continue

            instance = json.loads(line)  # loading a sample

            # if there is no speaker, skip current row
            if not instance['qids']:
                continue

            rows.append({
                'speaker_id': int(instance['qids'][0][1:]),
                'quote_id': instance['quoteID'],
                'quotation': instance['quotation'],
            })

    # Build and write the frame after the input handle is closed. Passing the
    # columns explicitly keeps the schema intact when `rows` is empty, so a
    # later merge on "speaker_id" still finds its key.
    df = pd.DataFrame(rows, columns=columns)
    df.to_pickle(pickle_path)

    if remove_raw:
        os.remove(json_path_bz2)

In [49]:
def join_quotes_with_speaker_affiliations(df_quotes, df_affiliations, out_path):
    """Attach speaker name and party to every quote and pickle the result.

    Quotes are matched to affiliations where ``df_quotes.speaker_id`` equals
    ``df_affiliations.id`` (default ``merge`` join type). The redundant ``id``
    column is dropped and ``label`` is renamed to ``speaker``. The head of the
    merged frame is printed and the full frame is written to ``out_path``.

    Args:
        df_quotes: DataFrame with a ``speaker_id`` column.
        df_affiliations: DataFrame with ``id`` and ``label`` columns.
        out_path: Destination path of the pickled merged DataFrame.
    """
    quotes_with_party = (
        df_quotes
        .merge(df_affiliations, left_on="speaker_id", right_on="id")
        .drop(columns="id")
        .rename(columns={"label": "speaker"})
    )
    preview = quotes_with_party.head()
    print(f"Merged DF: \n{preview}")
    quotes_with_party.to_pickle(out_path)

In [51]:
# one time operation - generate a pickle file containing speaker's affiliations
PARQUET_PATH = "../data/raw/speaker_attributes.parquet"
SPEAKER_AFFILIATIONS_OUT_PATH = "../data/binary/speaker_attributes.pickle"

# Every pickle written by this cell lands in the same directory; create it up
# front so a run on a fresh checkout does not fail inside to_pickle.
os.makedirs(os.path.dirname(SPEAKER_AFFILIATIONS_OUT_PATH), exist_ok=True)

# Each stage below is skipped when its output file already exists, which keeps
# re-running the cell cheap.
print("Generating speaker affiliations DF...")
if not os.path.exists(SPEAKER_AFFILIATIONS_OUT_PATH):
    generate_speaker_affiliations(PARQUET_PATH, SPEAKER_AFFILIATIONS_OUT_PATH)
print("Done.\n")

# The affiliations frame does not depend on the year, so load it once here
# instead of on every loop iteration.
df_affiliations = pd.read_pickle(SPEAKER_AFFILIATIONS_OUT_PATH)

# dataset loading - perform for each batch of the data (2015, 2016, ..., 2020)
years = [2019]
for year in years:
    DATASET_PATH_JSON_BZ2 = f"../data/raw/quotes-{year}.json.bz2"
    DATASET_PATH_PICKLE = f"../data/binary/quotes-{year}.pickle"
    MERGED_OUT_PATH = f"../data/binary/data-{year}.pickle"

    print("Generating quotes DF...")
    if not os.path.exists(DATASET_PATH_PICKLE):
        save_pickle(DATASET_PATH_JSON_BZ2, DATASET_PATH_PICKLE)
    print("Done.\n")

    # NOTE(review): df_quotes is loaded even when the merged file already
    # exists; kept that way in case later cells rely on it - confirm.
    df_quotes = pd.read_pickle(DATASET_PATH_PICKLE)

    print("Generating merged df...")
    if not os.path.exists(MERGED_OUT_PATH):
        join_quotes_with_speaker_affiliations(df_quotes, df_affiliations, MERGED_OUT_PATH)
    print("Done.\n")

Generating speaker affiliations DF...
Done.

Generating quotes DF...
Quotation file opened...


21763302it [20:45, 17478.68it/s]


Done.

Generating merged df...
   speaker_id           quote_id  \
0       22686  2019-04-08-048753   
1    42336656  2019-05-15-053302   
2    16672061  2019-02-27-055406   
3      809063  2019-12-08-023053   
4     1971786  2019-02-21-000088   

                                           quotation  
0       It is immoral. It is harmful. It is hurtful.  
1  It is important for our equine science student...  
2  It is important to many Native American tribes...  
3  It is impossible, biologically, truly to `rest...  
4  [ Chilton ] put it on a little tape recorder a...  
     id                    label    party
0    23        George Washington   327591
3   207           George W. Bush    29468
5   368         Augusto Pinochet   327591
11  815  Gabriel Gonzáles Videla  1759368
14  873             Meryl Streep    29552
Merged DF: 
   speaker_id           quote_id  \
0       22686  2019-04-08-048753   
1       22686  2019-05-26-025817   
2       22686  2019-03-20-001438   
3       22686 