# Master Builder

This notebook has the whole pipeline of build / analyze for the antiwork dataset.

General components
- 80,000 ids and general details
- 80,000 pickle files of complete posts in `/raw_data`

General Steps
- load up
- create subset
- apply different analysis measures


Analysis Measures
- automatic keyword generation
- identified keyword flagging
- VADER Scores



In [240]:
# Libraries and setup

import pandas as pd
import os

# NOTE(review): praw is not referenced by name in the cells visible here;
# presumably it must be importable so the pickled submission objects can be
# loaded — confirm before removing.
import praw
import pickle
import pprint
import csv

from textblob import TextBlob
from nltk.corpus import stopwords

# Do not truncate cell contents or hide columns when a DataFrame is displayed.
pd.set_option('display.max_colwidth', None)
pd.set_option('display.max_columns', None)


from nltk.sentiment.vader import SentimentIntensityAnalyzer

print("Done!")

Done!


In [241]:
## Flags to skip steps
# True: reuse the saved BIG_DS_FILE instead of re-reading every pickle.
SKIP_PICKLES = True
# True: load the keyword list from GENERATED_KEYWORDS instead of regenerating it.
SKIP_KEYWORD_GEN = True

# CSV holding the full dataset assembled from the pickles.
BIG_DS_FILE = "antiwork_massive.csv"


# Number of rows kept in each of the top-scoring and low-scoring dataframes.
SAMPLE_SIZE = 1000


# Filenames for the smaller subset datasets. Derived from SAMPLE_SIZE so the
# name on disk always matches the number of rows actually written.
PLUS_FILENAME = f"antiwork_plus_top_{SAMPLE_SIZE}.csv"
NEG_FILENAME = f"antiwork_neg_top_{SAMPLE_SIZE}.csv"


# Posts (by id) whose selftext is very long and is cut to MAX_LENGTH_POST characters.
MAX_LENGTH_POST = 30000
LONG_ENTRIES = [
    "qvne64",
    "r61fv8",
    "rmegty"
]


# Number of most frequent generated keywords to keep.
NUM_TO_KEEP = 25
# CSV the generated keywords are written to / loaded from.
GENERATED_KEYWORDS = "keywords_1000_top_bottom.csv"

# Fixed keywords. Each entry is matched against selftext as a regular
# expression, so "labour|labor" matches either spelling.
FIXED_KEYWORDS = [
    "strike",
    "union",
    "capitalism",
    "socialism",
    "anarchism",
    "resignation",
    "quit",
    "abolition",
    "asshole",
    "labour|labor",
    "scab",
    "contract",
    "temp",
    "wage",
    "fired"
]


# PICKLE section

# Appended to the current working directory to locate the pickle files.
PICKLE_PATH = "/raw_data"

# Attributes read from each unpickled PRAW object during the pickle load.
# Each name becomes one column of the assembled dataset, in this order; add
# further attribute names here to keep more columns.
FIELDS_TO_KEEP = [
    'author', 'banned_by', 'created_utc', 'downs', 'id',
    'likes', 'removal_reason', 'score', 'selftext', 'shortlink',
    'title', 'ups', 'upvote_ratio', 'url', 'view_count',
]


## Load & Restrict Columns

Rebuilds the full dataset from the raw pickle files and saves it to `BIG_DS_FILE`.

Set **SKIP_PICKLES** to `True` to skip the rebuild and load the previously saved `BIG_DS_FILE` instead.

In [242]:

# Assemble the full dataset. When SKIP_PICKLES is False, every pickle under
# PICKLE_PATH is read, reduced to FIELDS_TO_KEEP and written to BIG_DS_FILE.
# In both cases ds_build is then loaded from BIG_DS_FILE, so later cells see
# the same column types whether or not the pickles were re-read.
if not SKIP_PICKLES:

    print("Loading Pickle Data")

    # Whole dataset assemble
    path = os.getcwd() + PICKLE_PATH

    total = []
    problems = []
    count = 0

    for root, _dirs, files in os.walk(path):

        for f in files:

            count += 1
            if count % 5000 == 0:
                print(count)

            try:
                # SECURITY: pickle.load can execute arbitrary code; only use
                # pickle files from a trusted source.
                with open(os.path.join(root, f), "rb") as handle:
                    sub = pickle.load(handle)

                total.append([str(getattr(sub, attrib)) for attrib in FIELDS_TO_KEEP])

            except Exception:
                # Best effort: remember the file and keep going.
                problems.append(f)

    if problems:
        print("Could not load", len(problems), "pickle files")

    ds_from_pickles = pd.DataFrame(total, columns=FIELDS_TO_KEEP)

    # Convert timestamps. created_utc was stored as a string above, so it is
    # cast to a number before being interpreted as seconds since the epoch.
    ds_from_pickles["timestamp"] = pd.to_datetime(
        pd.to_numeric(ds_from_pickles["created_utc"]), unit='s'
    )

    ds_from_pickles.to_csv(BIG_DS_FILE, index=False)


ds_build = pd.read_csv(BIG_DS_FILE)


print('Done with assembling massive DF')

Done with assembling massive DF


## Restrict Rows

- Currently items with `selftext` not null
- Truncates down those really long column entries

In [243]:
# Must have full text, i.e. keep only posts whose selftext is present.
# reset_index returns a new frame, so its result has to be assigned; drop=True
# discards the old row labels instead of adding them as a column.
ds_build = ds_build[ds_build["selftext"].notnull()].reset_index(drop=True)

print("Done with dropping rows")


# Enforce max size for known offenders: cut the selftext of every post listed
# in LONG_ENTRIES down to its first MAX_LENGTH_POST characters.
long_mask = ds_build["id"].isin(LONG_ENTRIES)
ds_build.loc[long_mask, "selftext"] = (
    ds_build.loc[long_mask, "selftext"].str.slice(start=0, stop=MAX_LENGTH_POST)
)

print("Done with truncating long entries")

Done with dropping rows
Done with truncating long entries


## Split Into top & bottom datasets

In [244]:
# Rank posts by score, then upvote_ratio, and keep SAMPLE_SIZE rows from each
# end. Each subset is an explicit copy because later cells add columns to it;
# without the copy those writes target a slice of the sorted frame.
SCORE_COLUMNS = ["score", "upvote_ratio"]

# High scores
top_scoring = ds_build.sort_values(by=SCORE_COLUMNS, ascending=False).head(SAMPLE_SIZE).copy()

# Low scores
low_scoring = ds_build.sort_values(by=SCORE_COLUMNS, ascending=True).head(SAMPLE_SIZE).copy()

print("Subsets Constructed!")

Subsets Constructed!


## Analyze - VADER

In [245]:

sid = SentimentIntensityAnalyzer()

# Keys read from each polarity_scores() result, in output-column order.
_VADER_KEYS = ("pos", "neg", "neu", "compound")


def _add_vader_scores(frame, analyzer):
    """Add one float ``vscore_<key>`` column per VADER key to ``frame`` in place.

    Each row's ``selftext`` is scored once with ``analyzer.polarity_scores``;
    the resulting "pos", "neg", "neu" and "compound" values are stored in
    ``vscore_pos``, ``vscore_neg``, ``vscore_neu`` and ``vscore_compound``.
    """
    scores = [analyzer.polarity_scores(text) for text in frame["selftext"]]
    for key in _VADER_KEYS:
        # Built on frame.index so values stay attached to their own rows, and
        # typed as float so an empty frame still gets float columns.
        frame["vscore_" + key] = pd.Series(
            [float(score[key]) for score in scores],
            index=frame.index,
            dtype=float,
        )


print("Applying VADER to top posts")
_add_vader_scores(top_scoring, sid)

print("Applying VADER to low posts")
_add_vader_scores(low_scoring, sid)

print("Done!")

Applying VADER to top posts
Applying VADER to low posts
Done!


## Analyze - Apply Identified Keywords

In [246]:

print("Appling Fixed Keyword Search")

# Add one 0/1 column per fixed keyword to both subsets.
for kw in FIXED_KEYWORDS:
    col_label = "has_fixed_"+kw.replace(" ","_")

    # kw is applied as a regular expression, so "labour|labor" matches either
    # spelling. na=False counts rows without text as "not present", and the
    # boolean result is cast to 0/1 directly.
    # NOTE(review): matching is case-sensitive ("Union" does not match
    # "union") — confirm this is intended.
    for frame in (top_scoring, low_scoring):
        frame[col_label] = frame["selftext"].str.contains(kw, na=False).astype("int64")


print("Done!")

Appling Fixed Keyword Search
Done!


## Analyze - Generate Keywords

Set **SKIP_KEYWORD_GEN** to `True` to skip generation and load the keywords from the **GENERATED_KEYWORDS** file instead.

In [247]:
# Start from NLTK's English stopwords and manually add some stinker
# "phrases" that should never be counted as keywords.
stopwords_final = stopwords.words('english')
stopwords_final.extend([
    '’ s',
    '’ m',
    'edit',
    '# x200b',
    '’ t',
    '’ ve',
    '’ re',
    '’ ll',
    '> >',
    '’ d',
    '[ https',
    '**\\ [',   # literal backslash, written with a valid escape sequence
    "ca n't",
    'don ’ t',
    'didn ’ t',
    "wo n't",
    'inc.**',
    "n't care",
    'isn ’ t',
    '',         # the empty string is never a keyword
])


In [248]:

# Build gen_kw_df, a two-column frame ("keyword", "freq") of the NUM_TO_KEEP
# most frequent noun phrases. When SKIP_KEYWORD_GEN is False the phrases are
# counted across both subsets and saved to GENERATED_KEYWORDS; otherwise the
# previously saved file is loaded.
if not SKIP_KEYWORD_GEN:

    print("Generating Automatic Keywords")

    # Count every noun phrase that is not a stopword, top subset first.
    keyword_freq = {}
    for subset in (top_scoring, low_scoring):
        for text in subset["selftext"]:
            for phrase in TextBlob(text).noun_phrases:
                if phrase.lower() not in stopwords_final:
                    keyword_freq[phrase] = keyword_freq.get(phrase, 0) + 1

    # Rank once, so the file on disk and the in-memory frame always hold the
    # same keywords in the same order (ties keep first-seen order).
    top_keywords = sorted(
        keyword_freq.items(), key=lambda item: item[1], reverse=True
    )[:NUM_TO_KEEP]

    # newline='' is what the csv module expects of the file object; utf-8
    # matches the default encoding read_csv uses when the file is loaded back.
    with open(GENERATED_KEYWORDS, 'w', newline='', encoding='utf-8') as csv_file:
        csv.writer(csv_file).writerows(top_keywords)

    gen_kw_df = pd.DataFrame(top_keywords, columns=["keyword", "freq"])

else:

    gen_kw_df = pd.read_csv(GENERATED_KEYWORDS, header=None)
    gen_kw_df.columns = ["keyword", "freq"]


print("\nDone!")
    


Done!


In [249]:
# Display the generated keywords and their frequencies.
gen_kw_df

Unnamed: 0,keyword,freq
0,well,32
1,hr,32
2,minimum wage,28
3,fuck,24
4,christmas,24
5,america,24
6,thanks,18
7,black,17
8,boss,16
9,oh,15


## Analyze - Apply Generated Keywords

In [250]:
# Add one 0/1 column per generated keyword to both subsets.
for kw in gen_kw_df["keyword"]:

    col_label = "has_generated_"+kw.replace(" ","_")

    # Generated keywords are raw phrases taken from the posts, so they are
    # matched literally (regex=False): characters such as "." or "*" must not
    # be interpreted as a pattern. na=False counts rows without text as
    # "not present", and the boolean result is cast to 0/1 directly.
    for frame in (top_scoring, low_scoring):
        frame[col_label] = (
            frame["selftext"].str.contains(kw, regex=False, na=False).astype("int64")
        )

## Final sets to disk

In [251]:
# Persist both annotated subsets to disk; the row index is not written.
for subset, filename in ((top_scoring, PLUS_FILENAME), (low_scoring, NEG_FILENAME)):
    subset.to_csv(filename, index=False)