In [38]:
import pandas as pd
import numpy as np
import scipy.stats as stats
from sklearn.linear_model import LinearRegression
import sklearn.metrics as metrics
from sklearn.model_selection import cross_val_score, cross_val_predict, train_test_split
from sklearn.preprocessing import PolynomialFeatures, StandardScaler
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

In [39]:
hist = pd.read_csv("../datasets/history_subs5", index_col = False)
hist_2 = pd.read_csv("../datasets/history_subs2", index_col = False) # saving the two scrapes as separate variables
hist_3 = pd.read_csv("../datasets/history_subs3", index_col = False)
hist_4 = pd.read_csv("../datasets/history_subs4", index_col = False)
cons = pd.read_csv("../datasets/conspiracy_subs5", index_col = False)
cons_2 = pd.read_csv('../datasets/conspiracy_subs2', index_col = False)
cons_3 = pd.read_csv("../datasets/conspiracy_subs3", index_col = False)

In [40]:
hist.title

0      The Spencer Rifle (A little info on a historic...
1                         Organising Your Family History
2      Best Deals &amp; Offer Sony confirms Sony conf...
3                      A Brief History of Women's Rights
4                                                  Helen
                             ...                        
995    La Thaïlande, une dictature militaire sous com...
996    Phone X Korean High super ace duplicate | Genu...
997    LG India Job Vacancies In 2019 | All Engineeri...
998    Get Electrical Engineering Best Courses for Fr...
999    Why Cold War era politicians were so charismatic?
Name: title, Length: 1000, dtype: object

In [41]:
cons.title

0      SOS the government is Gang Stalking me and pla...
1                    Brainwashing? Is that a real thing?
2       Let us not Forget what Europe did during 1900-45
3                    Hillary Clinton's guilt demystified
4      The Coronavirus was created in a laboratory an...
                             ...                        
995    The Pentagon Wars is a 1998 dark comedy film f...
996        Prolly against the rules. But any good ideas?
997      Florida Sandy Hook Conspiracy Theorist Arrested
998    Group of Real Americans Concerned About Consti...
999    I think China is planning to let this corona v...
Name: title, Length: 1000, dtype: object

In [42]:
# combining the subreddit scrapes

hist = hist.append(hist_2, sort = True)
hist = hist.append(hist_3, sort = True)
hist = hist.append(hist_4, sort = True)
cons = cons.append(cons_2, sort = True)
cons = cons.append(cons_3, sort = True)

In [43]:
hist.shape 

(4000, 83)

In [44]:
hist.title

0      The Spencer Rifle (A little info on a historic...
1                         Organising Your Family History
2      Best Deals &amp; Offer Sony confirms Sony conf...
3                      A Brief History of Women's Rights
4                                                  Helen
                             ...                        
995                                                  NaN
996                                                  NaN
997                                                  NaN
998                                                  NaN
999                                                  NaN
Name: title, Length: 4000, dtype: object

In [45]:
cons.shape

(3000, 74)

In [46]:
cons.title

0      SOS the government is Gang Stalking me and pla...
1                    Brainwashing? Is that a real thing?
2       Let us not Forget what Europe did during 1900-45
3                    Hillary Clinton's guilt demystified
4      The Coronavirus was created in a laboratory an...
                             ...                        
995    Pirbright Institute, funded by the Bill and Me...
996    The Pentagon Wars is a 1998 dark comedy film f...
997        Prolly against the rules. But any good ideas?
998      Florida Sandy Hook Conspiracy Theorist Arrested
999    Group of Real Americans Concerned About Consti...
Name: title, Length: 3000, dtype: object

Looking at both scrapes it looks like we have 4000 submissions /r/history and 3000 from /r/conspiracy.

In [47]:
# see what the columns look like
hist.columns

Index(['Unnamed: 0', 'all_awardings', 'allow_live_comments',
       'associated_award', 'author', 'author_cakeday',
       'author_flair_background_color', 'author_flair_css_class',
       'author_flair_richtext', 'author_flair_template_id',
       'author_flair_text', 'author_flair_text_color', 'author_flair_type',
       'author_fullname', 'author_patreon_flair', 'author_premium', 'awarders',
       'banned_by', 'body', 'can_mod_post', 'collapsed_because_crowd_control',
       'contest_mode', 'created_utc', 'distinguished', 'domain', 'edited',
       'full_link', 'gildings', 'id', 'is_crosspostable', 'is_meta',
       'is_original_content', 'is_reddit_media_domain', 'is_robot_indexable',
       'is_self', 'is_submitter', 'is_video', 'link_flair_background_color',
       'link_flair_css_class', 'link_flair_richtext', 'link_flair_template_id',
       'link_flair_text', 'link_flair_text_color', 'link_flair_type',
       'link_id', 'locked', 'media', 'media_embed', 'media_only', 'no_foll

In [48]:
# title is the "title" of the submission, selftext is where the body text is written
hist = hist[['author', 'id', 'selftext', 'subreddit', 'title']]
hist.head()

Unnamed: 0,author,id,selftext,subreddit,title
0,santee2171,ew76w8,,history,The Spencer Rifle (A little info on a historic...
1,chiddicks,ew70pw,[removed],history,Organising Your Family History
2,Subhasmitasamal,ew6g9k,,history,Best Deals &amp; Offer Sony confirms Sony conf...
3,dineshtaylr,ew6ayf,,history,A Brief History of Women's Rights
4,atumhal,ew656m,[removed],history,Helen


In [49]:
# let's get rid of all the comments made by the auto-moderator

hist = hist[hist['author'] != 'AutoModerator']

In [50]:
hist.head()

Unnamed: 0,author,id,selftext,subreddit,title
0,santee2171,ew76w8,,history,The Spencer Rifle (A little info on a historic...
1,chiddicks,ew70pw,[removed],history,Organising Your Family History
2,Subhasmitasamal,ew6g9k,,history,Best Deals &amp; Offer Sony confirms Sony conf...
3,dineshtaylr,ew6ayf,,history,A Brief History of Women's Rights
4,atumhal,ew656m,[removed],history,Helen


In [51]:
hist.shape

(3916, 5)

It looks like removing all the automod comments dropped our pool of comments a bit. Let's get rid of removed/deleted submissions as well

In [52]:
hist = hist[hist['title'] != '[deleted]']
hist = hist[hist['title'] != '[removed]']

In [53]:
#  see how many comments we have left
hist.shape

(3916, 5)

In [54]:
hist['title'] = hist['title'].replace(r'[^a-zA-Z0-9\s\/]', '', regex=True) # using .replace & regex tp replace every
                                                                         # non-alphanumeric character 
hist['title'] = hist['title'].replace(r'[\r\n]+', ' ', regex=True) # replacing cairrage returns

In [55]:
# check for NaNs

hist.isna().sum()

author          0
id              0
selftext     2508
subreddit       0
title        1924
dtype: int64

In [62]:
hist.dropna(inplace = True)

In [65]:
#  let's make sure we don't have any duplicate posts

hist['id'].drop_duplicates(inplace = True)

In [66]:
hist.shape

(1408, 5)

We have a final pool of 1408 submissions from /r/history to import and use for our modeling. Time to clean the /r/conspiracy data!

In [67]:
cons = cons[['author', 'id', 'selftext', 'subreddit', 'title']]

In [68]:
cons.head()

Unnamed: 0,author,id,selftext,subreddit,title
0,dopamineiscool,ew7gd2,The government is manipulating me with mind co...,conspiracy,SOS the government is Gang Stalking me and pla...
1,ifiagreedwithu,ew7gbu,,conspiracy,Brainwashing? Is that a real thing?
2,GreyFox78659,ew7de0,https://images.app.goo.gl/fy6em1W5pDvTKUNW9\n\...,conspiracy,Let us not Forget what Europe did during 1900-45
3,Locomule,ew7czg,When you become designated to handle classifie...,conspiracy,Hillary Clinton's guilt demystified
4,DeViN_tHa_DuDe,ew7bk3,,conspiracy,The Coronavirus was created in a laboratory an...


In [69]:
# doing the same title/author clean-up
cons = cons[cons['title'] != '[deleted]']
cons = cons[cons['title'] != '[removed]']
cons = cons[cons['author'] != 'AutoModerator']

In [70]:
cons['title'] = cons['title'].replace(r'[^a-zA-Z0-9\s\/]', '', regex=True) # using .replace & regex tp replace every
                                                                         # non-alphanumeric character 
cons['title'] = cons['title'].replace(r'[\r\n]+', ' ', regex=True) # replacing cairrage returns

In [71]:
cons.head()

Unnamed: 0,author,id,selftext,subreddit,title
0,dopamineiscool,ew7gd2,The government is manipulating me with mind co...,conspiracy,SOS the government is Gang Stalking me and pla...
1,ifiagreedwithu,ew7gbu,,conspiracy,Brainwashing Is that a real thing
2,GreyFox78659,ew7de0,https://images.app.goo.gl/fy6em1W5pDvTKUNW9\n\...,conspiracy,Let us not Forget what Europe did during 190045
3,Locomule,ew7czg,When you become designated to handle classifie...,conspiracy,Hillary Clintons guilt demystified
4,DeViN_tHa_DuDe,ew7bk3,,conspiracy,The Coronavirus was created in a laboratory an...


In [72]:
cons.dropna(inplace = True)

In [73]:
cons['id'].drop_duplicates(inplace = True)

In [74]:
cons.shape

(1287, 5)

In [75]:
# exporting our cleaned data

hist_clean = hist.to_csv("../datasets/clean_hist_sub.csv")
cons_clean = cons.to_csv("../datasets/clean_cons_sub.csv")