In [1]:
import pandas as pd
import time
import numpy as np
import bs4
import re
import warnings
warnings.filterwarnings("error")

In [172]:
# create separate column for created year, month, day, and time. Standardize to local time 
def time_clean(df, to_struct_time=time.localtime):
    """Add formatted date/time columns derived from ``df['created_utc']``.

    The frame is modified in place and also returned, so calls can be chained.

    Columns added (all strings produced by ``time.strftime``):
        created_year     '%Y'
        created_month    '%m'
        created_day      '%d'
        created_time_gm  '%H:%M:%S'
        created_date     '%d-%m-%Y'

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``created_utc`` column of epoch seconds.
    to_struct_time : callable, optional
        Converts one epoch value to a ``time.struct_time``. Defaults to
        ``time.localtime``, which keeps the original behaviour but makes the
        output depend on the timezone of the machine running the notebook.
        Pass ``time.gmtime`` for timezone-independent (UTC) output.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with the five columns added.

    NOTE(review): the column is called ``created_time_gm`` although the default
    conversion is local time -- confirm which one downstream code expects.
    """
    column_formats = (
        ('created_year', '%Y'),
        ('created_month', '%m'),
        ('created_day', '%d'),
        ('created_time_gm', '%H:%M:%S'),
        ('created_date', '%d-%m-%Y'),
    )

    # Break each timestamp down once and reuse it for all five columns,
    # instead of repeating the epoch -> struct_time conversion per column.
    broken_down = df['created_utc'].map(to_struct_time)

    for column, fmt in column_formats:
        # fmt is bound as a default argument so each lambda keeps its own format.
        df[column] = broken_down.map(lambda parts, fmt=fmt: time.strftime(fmt, parts))
    return df

# remove URLs from comment body
# source: https://stackoverflow.com/questions/11331982/how-to-remove-any-url-within-a-string-in-python
def remove_urls(df):
    """Add a ``no_url_body`` column: ``body`` with http/https URLs stripped.

    A URL is anything matching ``https?:`` followed by a run of
    non-whitespace characters; each match is replaced by the empty string.
    The frame is modified in place and returned.
    """
    url_pattern = re.compile(r'https?:\S+')

    def strip_urls(text):
        return url_pattern.sub('', text)

    df['no_url_body'] = df['body'].map(strip_urls)
    return df

# remove quoted text from comment body
def remove_comments(df):
    """Add a ``no_comment_body`` column: ``no_url_body`` without quoted lines.

    A quoted line is one that starts (after at most three spaces, as in
    Markdown block quotes) with the HTML-escaped quote marker ``&gt;``. The
    whole line is removed, including its trailing newline when there is one.

    Compared with the previous unanchored pattern ``(&gt;)+?.*?\\n``:
      * a ``&gt;`` in the middle of a line (e.g. ``a &gt; b``) no longer
        deletes the remainder of that line;
      * a quoted line at the very end of the body, with no trailing newline,
        is now removed as well.

    The frame is modified in place and returned.
    """
    # MULTILINE makes ^ match at the start of every line, not just the body.
    quoted_line = re.compile(r'^ {0,3}&gt;[^\n]*\n?', re.MULTILINE)
    df['no_comment_body'] = df['no_url_body'].map(lambda body: quoted_line.sub('', body))
    return df

# parse mark-up language in comment body
def markup_convert(df):
    """Add a ``clean_body`` column: plain text parsed from ``no_comment_body``.

    Each body is parsed with BeautifulSoup (lxml parser), reduced to its text,
    and has newlines replaced by single spaces. The frame is modified in place
    and returned.

    This notebook turns every warning into an exception
    (``warnings.filterwarnings("error")``). BeautifulSoup issues a UserWarning
    when the markup looks like a file name or path rather than markup, so
    UserWarnings are ignored for the duration of the parse only.
    NOTE(review): presumably this is the "BeautifulSoup error" that the calling
    cells avoid by dropping '.' bodies -- confirm against the installed bs4.
    """
    def to_text(body):
        return bs4.BeautifulSoup(body, "lxml").text.replace("\n", " ")

    # One context manager around the whole map, so the warning filters are
    # saved and restored once rather than once per row.
    with warnings.catch_warnings():
        warnings.simplefilter("ignore", category=UserWarning)
        clean_body = df['no_comment_body'].map(to_text)

    # Assign outside the context so pandas warnings keep the notebook's filter.
    df['clean_body'] = clean_body
    return df

In [173]:
# Read the raw 2018 comment pickles, one frame per subreddit file.
# NOTE(review): each cleaning cell further down reads its pickle from disk again
# and rebinds the same name, so these frames are replaced before being used.
progun, liberal, guns_a, guns_b = (
    pd.read_pickle(raw_path)
    for raw_path in (
        '../Data/progun2018comments.pkl',
        '../Data/liberalgunowners2018comments.pkl',
        '../Data/guns2018comments_a.pkl',
        '../Data/guns2018comments_b.pkl',
    )
)

In [174]:
# Read the progun comments, add the date/time columns, then strip URLs and quoted text.
progun = pd.read_pickle("../Data/progun2018comments.pkl")
progun = time_clean(progun).reset_index()
progun = remove_comments(remove_urls(progun))

# Drop comments whose cleaned body is just '.' (avoids BeautifulSoup error).
# Rows are selected by value, as in the guns_a / guns_b cells, rather than by
# hard-coded index labels, which are only valid for one exact copy of the data.
progun = progun.drop(progun.index[progun['no_comment_body'] == '.'])

# Convert the remaining markup to plain text.
progun = markup_convert(progun)

In [175]:
# Read the liberalgunowners comments, add the date/time columns, then strip URLs and quoted text.
liberal = pd.read_pickle("../Data/liberalgunowners2018comments.pkl")
liberal = time_clean(liberal).reset_index()
liberal = remove_comments(remove_urls(liberal))

# Drop comments whose cleaned body is just '.' (avoids BeautifulSoup error).
# Rows are selected by value, as in the guns_a / guns_b cells, rather than by
# hard-coded index labels, which are only valid for one exact copy of the data.
liberal = liberal.drop(liberal.index[liberal['no_comment_body'] == '.'])

# Convert the remaining markup to plain text.
liberal = markup_convert(liberal)


In [176]:
# First half of the guns comments: load, add date/time columns, strip URLs and quoted text.
guns_a = pd.read_pickle('../Data/guns2018comments_a.pkl')
guns_a = time_clean(guns_a).reset_index()
guns_a = remove_urls(guns_a)
guns_a = remove_comments(guns_a)

# Discard rows whose cleaned body is exactly '.'.
guns_a = guns_a.drop(guns_a.index[guns_a['no_comment_body'] == '.'])

# Convert the remaining markup to plain text.
guns_a = markup_convert(guns_a)

In [177]:
# Second half of the guns comments: load, add date/time columns, strip URLs and quoted text.
guns_b = pd.read_pickle('../Data/guns2018comments_b.pkl')
guns_b = time_clean(guns_b).reset_index()
guns_b = remove_urls(guns_b)
guns_b = remove_comments(guns_b)

# Discard rows whose cleaned body is exactly '.'.
guns_b = guns_b.drop(guns_b.index[guns_b['no_comment_body'] == '.'])

# Convert the remaining markup to plain text.
guns_b = markup_convert(guns_b)

In [178]:
# Save every cleaned frame to its own CSV file under ../ProcessedData.
for _frame, _csv_path in (
    (progun, '../ProcessedData/progunAug7.csv'),
    (liberal, '../ProcessedData/liberalAug7.csv'),
    (guns_a, '../ProcessedData/guns_aAug7.csv'),
    (guns_b, '../ProcessedData/guns_bAug7.csv'),
):
    _frame.to_csv(_csv_path)