## Preprocessing notebook

In [1]:
import numpy as np
import pandas as pd
import time
import sys

from sklearn.model_selection import train_test_split

In [2]:
# Scores at which a comment is considered "non-neutral". Pos score is higher since comments tend to be upvoted
#NEGATIVE_SCORE_CUTOFF = -5

def filterComments(df, filter_automod, filter_deleted_posts, filter_neutral_comments, NEG_SCORE_CUTOFF=-sys.maxsize, POS_SCORE_CUTOFF=15):

    inputs = (filter_automod, filter_deleted_posts, filter_neutral_comments)
    if not all(x in range(2) for x in inputs):
        raise Exception("All inputs must be in range [0, 1]")

    if filter_automod:
        """#removing rows with AutoMod as author. ( ''AutoModerator is a system built into reddit that allows moderators 
        #to define "rules" (consisting of checks and actions) to be automatically applied to posts in their subreddit. 
        #It supports a wide range of functions with a flexible rule-definition syntax, and can be set up to handle many 
        #common moderation tasks automatically.'')

        #Every post usually contains a Pinned comment from Automod, hence why we are removing those. """

        indexAutomod = df[(df['author'] == 'AutoModerator')].index
        df.drop(indexAutomod, inplace=True)


    if filter_deleted_posts:
        #removing comments that were deleted or removed
        indexDeleted = df[(df['body'] == '[deleted]') | (df['body'] == '[removed]')].index

        df.drop(indexDeleted, inplace=True)
        
    if filter_neutral_comments:
        #removing comments with insufficient engagement
        index_low_score = df[(df['score']>NEG_SCORE_CUTOFF) & (df['score']<POS_SCORE_CUTOFF)].index
        
        df.drop(index_low_score, inplace=True)

In [3]:
df_reps = pd.read_csv('republican_comments_raw.csv')
df_dems1 = pd.read_csv('democrat_comments_raw.csv')
df_dems2 = pd.read_csv('democrat_comments_raw2.csv')


# filtering 
# removing comments from automod, removed comments, comments with a neutral score 
# filterComments(df, filter_automod, filter_deleted_posts, filter_neutral_comments):
# cutoff scores were manually tuned to have a similar proportion of rep/dem comments
filterComments(df_dems1, 1, 1, 1, POS_SCORE_CUTOFF=7)
filterComments(df_dems2, 1, 1, 1, POS_SCORE_CUTOFF=7)
filterComments(df_reps, 1, 1, 1, POS_SCORE_CUTOFF=5)


# merging republican and democrat comments
frames = [df_reps, df_dems1, df_dems2]
df = pd.concat(frames)

#shuffling df
df = df.sample(frac=1).reset_index(drop=True)

df.head(5)

Unnamed: 0,author,body,score,subreddit,created,id,post_title,post_id
0,agutema,Georgia is Perdue for a change!,15,politics,1604861243,gbn0bmr,Georgia’s twin January runoffs are set to dete...,jqfbym
1,19snow16,I have dementia. Doctor Ronny will swear by that.,10,politics,1598994889,g3n5xbz,Manhattan DA Again Hints That Trump Investigat...,ikm9od
2,pgabrielfreak,No shit. Trump busted in the 80's for laubderi...,7,politics,1552576791,eiijxhv,The Manafort case is a reminder that we invest...,b0wfug
3,,Reddit should be renamed CHINA. Since they own...,5,Conservative,1585926937,fmbibrz,Whistleblowing coronavirus doctor at Wuhan hos...,ftvfit
4,Assuranceagent,Cardi B taking on Candace Owens in a battle of...,335,Conservative,1599508286,g4dbe4t,Candace Owens slams Cardi B for hypocrisy as w...,ioeic2


In [4]:
df[df['subreddit'] == 'Conservative'].shape
df[df['subreddit'] == 'politics'].shape

(144724, 8)

In [5]:
# text length cutoff
MAX_WORD_COUNT = 256 

#keeps first MAX_WORD_COUNT words in text
def trim_text(text):
    text = text.split(maxsplit=MAX_WORD_COUNT)
    text = ' '.join(text[:MAX_WORD_COUNT])
    return text


df.drop(columns=['author', 'score', 'created', 'id', 'post_title', 'post_id'], inplace=True)


# asserting text type is str
df['body'] = df['body'].astype(str)

# trimming text
df['body'] = df['body'].apply(trim_text)

# since we are planning on using bert-base-uncased pretrained model, text should be lowercase
df['body'] = df['body'].str.lower()



print(df.shape)

# converting subreddit vals to ints --> r/republican is represented by 0, r/democrats by 1

index_democrat = df[df['subreddit'] == 'politics'].index
df['subreddit'] =0 
df.loc[index_democrat, 'subreddit'] =1

df.head(5)

(302896, 2)


Unnamed: 0,body,subreddit
0,georgia is perdue for a change!,1
1,i have dementia. doctor ronny will swear by that.,1
2,no shit. trump busted in the 80's for laubderi...,1
3,reddit should be renamed china. since they own...,0
4,cardi b taking on candace owens in a battle of...,0


In [6]:
dems =df[df['subreddit'] ==1]
reps =df[df['subreddit'] ==0]

print(f'dems: {dems.shape} reps: {reps.shape}')

dems: (144724, 2) reps: (158172, 2)


In [7]:
# 70/15/15 train/valid/test split

train, temp = train_test_split(df, train_size=0.7)
valid, test = train_test_split(temp, train_size=0.5)

In [8]:
#saving dfs to csv

train.to_csv('train.csv', index=False)
valid.to_csv('valid.csv', index=False)
test.to_csv('test.csv', index=False)