# Data Processing

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Root folder for all project data; processed csv files are written here.
data_loc = 'data/'
# Raw hein-daily Congressional Record files live in a sub-folder of data_loc.
daily_loc = data_loc + 'hein-daily/'

In [3]:
def process_data(session_num, daily_dir=None, out_dir=None):
    """
    Process raw data from specified session number.

    Reads the session's description, speech and speaker-map files (all
    pipe-delimited), attaches a date and speaker details to every speech,
    writes the result to 'processed_data_<session_num>.csv' and returns it.

        Parameters:
            session_num (string): A valid Congressional Record session id
            daily_dir (string, optional): Folder holding the raw hein-daily
                files. Defaults to the module-level `daily_loc`.
            out_dir (string, optional): Folder the processed csv is written
                to. Defaults to the module-level `data_loc`.

        Returns:
            df (DataFrame): A DataFrame of processed data from specified session number
    """
    import os  # local import so the function does not rely on another cell

    # Fall back to the notebook-level folders when none are given explicitly.
    if daily_dir is None:
        daily_dir = daily_loc
    if out_dir is None:
        out_dir = data_loc

    print('processing session ' + session_num)

    # file names; os.path.join works with or without a trailing slash
    descr_path = os.path.join(daily_dir, 'descr_' + session_num + '.txt')
    speeches_path = os.path.join(daily_dir, 'speeches_' + session_num + '.txt')
    speaker_path = os.path.join(daily_dir, session_num + '_SpeakerMap.txt')

    # read in 3 relevant files; speech_id is kept as text so that leading
    # zeros survive and the merge keys have the same dtype in every frame
    metadata = pd.read_csv(descr_path, sep='|', dtype={'speech_id': object})
    # NOTE(review): 'unicode_escape' is presumably used to get past bytes that
    # are not valid UTF-8; it also interprets backslash escapes -- confirm.
    df_speakers = pd.read_csv(speaker_path, sep='|', encoding='unicode_escape',
                              dtype={'speech_id': object})
    # undecodable bytes are ignored and malformed rows are skipped, so the
    # speech table may contain fewer rows than the raw file
    df_speech = pd.read_csv(speeches_path, sep='|', encoding='unicode_escape',
                            encoding_errors='ignore', on_bad_lines='skip',
                            dtype={'speech_id': object, 'speech': object})

    # convert date column to datetime
    metadata['date'] = pd.to_datetime(metadata['date'], format='%Y%m%d')

    # add date to df_speech from metadata (default inner merge: speeches
    # without a metadata row are dropped)
    df_speech = df_speech.merge(metadata[['speech_id', 'date']], on='speech_id')

    # merge to get necessary columns only (inner merge again: speeches with
    # no entry in the speaker map are dropped)
    speaker_cols = ['speakerid', 'speech_id', 'lastname', 'firstname', 'state', 'party']
    df_all = df_speakers[speaker_cols].merge(df_speech, on='speech_id')

    # write to csv; the DataFrame index is written as the first column
    file_name = os.path.join(out_dir, 'processed_data_' + session_num + '.csv')
    df_all.to_csv(file_name)

    # return df
    return df_all

In [4]:
# Run the processing step for every session from the 97th through the 114th.
# Session ids are zero-padded to three digits to match the raw file names.
ids = ['{:03d}'.format(number) for number in range(97, 115)]
for i in ids:
    process_data(i)

processing session 097
processing session 098
processing session 099
processing session 100
processing session 101
processing session 102
processing session 103
processing session 104
processing session 105
processing session 106
processing session 107
processing session 108
processing session 109
processing session 110
processing session 111
processing session 112
processing session 113
processing session 114
