In [1]:
import pandas as pd
import numpy as np
import datetime as datetime
import time
import requests
import json
np.random.seed(42)

In [2]:
url = 'https://api.pushshift.io/reddit/search/submission/?subreddit=space&size=500&after=30d'

In [3]:
#Note- this function was adapted from an in-class codealong taught by Brian Collins(TA, Washington DC, GA DSI-5)

def query_pushshift(subreddit, kind='submission', skip=30, times=6, size=500, start=0, pseudoverbose=5,
                   subfield = ['title', 'selftext', 'subreddit', 'created_utc',
                               'author', 'num_comments', 'score', 'is_self'],
                    comfields=['body', 'score', 'created_utc', 'subreddit', 'author']):
#subreddit= name of the subreddit, 
#kind={'submission', 'comment'}, 
#skip=number of days to skip after each scrape,
#times= how many iterations to do(one iter= 1 scrape, skip days once).
#size = how many posts to scrape during each iteration.
#start= how many days back to start from.  This parameter exists so you can pick up where you left off if you 
#didn't get as many posts as you would have liked.
#pseudoverbose= prints updates after n cycles.  Default 5
#subfield = all of the specific fields from each post we will be scraping and saving.  
#other subfields can be found by visiting a subreddit on reddit.com, adding .json to the end of the url, and looking 
#through the 'data' dictionary keys
#comfields = same as subfields, but for comments, should you choose to set the param kind='comment'
    
    
    
    stem = "https://api.pushshift.io/reddit/search/{}/?subreddit={}&size={}".format(kind, subreddit, size)
    #stem is the url with certain fields missing.  The missing fields are filled using the .format method to pass 
    #parameters into a string
    #also note that we are not scraping from reddit directly.  We are scraping from a datadase of reddit posts 
    #provided by pushshift.io, a website designed to help people gather data from social media and to serve as a data
    #science teaching aid.  To donate to pushshift, visit https://pushshift.io/donations/.  
    
    mylist = []                                              #This empty list will become a list of dfs to be concatted
                                                             #into a single df
    
    count = 0                                                #count keeps track of iters.  
    for x in range(0, times):                                #loop will run for 'times=' iterations
        
        count += 1
        
        URL = '{}&after={}d'.format(stem, skip * x + start)  #This sets up which URL will be scraped.
        
        response = requests.get(URL)                         #This where we save everything scraped 
        
        assert response.status_code == 200                   #This only lets the loop continue as long as we've got 
                                                             #a status code of 200('all good')
        
        mine = response.json()['data']                       #This saves the data from json format as a variable.  
                                                             #Data is a dictionary where all content is stored.
        
        df = pd.DataFrame.from_dict(mine)                    #this saves the dict as df, separated by dictionary key
        
        mylist.append(df)                                    #appends our list.  When complete, we'll have a list of
                                                             #dataframes
        
        time.sleep(.25)                                      #time.sleep pauses our loop for n seconds.  I've set it so 
                                                             #low because we don't need to sleep when scraping from
                                                             #pushshift
        
        if count%pseudoverbose == 0:                         #This was added to print how many times we've cycled
            print(count, 'cycles complete.')                 #every n cycles.  It used to print the whole url, every
                                                             #time, but that would fill the notebook with urls and 
                                                             #make scrolling through the notebook a bit busy for my tastes.
        
    full = pd.concat(mylist)                                 #full = a full df of all info scraped
    mylist = []                                              #overwriting to dump out the list, for the sake of memory
    
    if kind == 'submission':                                 #the following is if kind is set to submission
        
        full = full[subfield]                                #this strips the df down to the fields we want
        
        full = full.drop_duplicates()                        #this drops duplicates
        
        full = full.loc[full['is_self'] == True]             #this drops rows where the main text is empty
                    
    elif kind == 'comment':                                  #the following is if kind is set to comment
                    
#         for thing in full['body']:                           #this makes it so we don't save empty rows
#             if thing != '':
                    
        full = full[comfields]                               #strips the df to the fields we want

        #full = full.drop_duplicates()                        #drops duplicates
                
    else:
        print ("kind must be 'submission' or 'comment'")     #sends user a message if they haven't set the param correctly
        
        
        
#     def get_date(created):                                   #function that retrieves timestamp from when scrape occured
#         return datetime.date.fromtimestamp(created)              
    
#     _timestamp = full['created_utc'].apply(get_date)         #creates a column of timestamps
    
#     full['timestamp'] = _timestamp                           #saves the timestamp to the df
    
    print(count, 'cycles complete.')
    print(full.shape)
    
    
    return full
    
    


#### interesting subreddits scraped for consideration included: <br>
-showerthoughts  (13,636,977 subscribers, ranked \# 24 on reddit)<br>
-askscience (15,197,492 subscribers, ranked \# 18 on reddit)<br>
-democrat/republican <br>
-space/sea 


### Decided to scrape from r/Democrats and r/Republicans, going back as many comments as possible going back to about early October 2016 (500 per day)
### The body of most of the actual submissions were links or photos, so scraped comments as a better practice for NLP.

In [None]:
#Note that the number of rows is nearly maxed out (350000 would be max) for democrats most days, 
#there are more than 500 comments.

In [31]:
dems_comment2 = query_pushshift('democrats', kind='comment', skip=1, times=700, size=500, pseudoverbose=50)

50 cycles complete.
100 cycles complete.
150 cycles complete.
200 cycles complete.
250 cycles complete.
300 cycles complete.
350 cycles complete.
400 cycles complete.
450 cycles complete.
500 cycles complete.
550 cycles complete.
600 cycles complete.
650 cycles complete.
700 cycles complete.


of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=True'.




700 cycles complete.
(349271, 5)


In [32]:
dems_comment2.to_csv('dems_raw_comment2.csv') 

In [33]:
reps_comment2 = query_pushshift('republicans', kind='comment', skip=1, times=700, size=500, pseudoverbose=50)

50 cycles complete.
100 cycles complete.
150 cycles complete.
200 cycles complete.
250 cycles complete.
300 cycles complete.
350 cycles complete.
400 cycles complete.
450 cycles complete.
500 cycles complete.
550 cycles complete.
600 cycles complete.
650 cycles complete.
700 cycles complete.


of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=True'.




700 cycles complete.
(315274, 5)


In [35]:
reps_comment2.to_csv('reps_raw_comment2.csv') 

basic observation #1:  dems more active on reddit.  Dem comments over max(by how much?, unknown), rep comments 90% of max.

In [40]:
dems_comment2.author.value_counts()[:10]

[deleted]             45662
VegaThePunisher       25070
AutoModerator          7360
Gsteel11               3365
backpackwayne          3157
therecordcorrected     2918
KubrickIsMyCopilot     2882
data2dave              2241
election_info_bot      2088
Brysynner              1671
Name: author, dtype: int64

In [41]:
reps_comment2.author.value_counts()[:10]

FranklinAbernathy     17546
[deleted]             14095
Wannabe2good          13972
AutoModerator          8995
BobcatBarry            6314
jesse11551             6153
BaronBifford           5522
chainsawx72            4213
Cuckold-doodle-doo     3516
BatMally               3346
Name: author, dtype: int64

The author value_counts suggest that author may be a good predictor, outside of NLP tasks.<br>
<br>
Also, notice the proportion of deleted to total in each.

In [44]:
dems_deleted = (45662/349271)*100
reps_deleted = (14095/315274)*100
print(f'democrat deleted comment rate: {dems_deleted}')
print(f'republican deleted comment rate: {reps_deleted}')

democrat deleted comment rate: 13.073515980427805
republican deleted comment rate: 4.470714362744787


Democrat comments are nearly 3 times as likely to be deleted

In [6]:
from datetime import datetime

#https://stackoverflow.com/questions/45140034/python-convert-seconds-to-datetime-date-and-time
#comment from 'Alex Hristov' taught me to convert datetime from seconds to utc time

#following info from google/wikipedia:
#Going to use UTC-4hrs to convert to eastern, note that this can be -5hrs too, depending on time of year.
#Note that I could make this more precise, by subtracting another hour based on conditions that fit for 
#2x yearly time changes

In [51]:
dems_comment2['created_utc'] = dems_comment2['created_utc'] - (60*60*4)  #subtracting 4 hrs from each utc in seconds

In [53]:
reps_comment2['created_utc'] = reps_comment2['created_utc'] - (60*60*4)

In [60]:
datetime.fromtimestamp(1536386494-14400).strftime("%A, %B %d, %Y %I:%M:%S")

'Friday, September 07, 2018 10:01:34'

In [68]:
#not working, getting "TypeError: cannot convert the series to <class 'int'>"

#dems_comment2['day_of_week'] = datetime.fromtimestamp(dems_comment2['created_utc']).strftime("%A")

In [92]:
#breaking down timestamp for later analysis

days = []
for x in dems_comment2['created_utc']:
    j = datetime.fromtimestamp(x).strftime("%A")
    days.append(j)

months = []
for x in dems_comment2['created_utc']:
    j = datetime.fromtimestamp(x).strftime("%B")
    months.append(j)

date = []
for x in dems_comment2['created_utc']:
    j = int(datetime.fromtimestamp(x).strftime("%d"))
    date.append(j)

year = []
for x in dems_comment2['created_utc']:
    j = int(datetime.fromtimestamp(x).strftime("%y"))
    year.append(j)

hour = []
for x in dems_comment2['created_utc']:
    j = int(datetime.fromtimestamp(x).strftime("%I"))
    hour.append(j)

#note that I had no intention on breaking down to minute, but what if there is some weird connection
#ex, comments spike during CNN/FOX news commercial breaks.  I don't intend on examining it, but the data could
#have use

minute = []
for x in dems_comment2['created_utc']:
    j = int(datetime.fromtimestamp(x).strftime("%M"))
    minute.append(j)

dems_comment2['day_of_week'] = days
dems_comment2['month'] = months
dems_comment2['date'] = date
dems_comment2['year'] = year
dems_comment2['hour'] = hour
dems_comment2['minute'] = minute

#Rearranging columns
dems_comment2 = pd.DataFrame(dems_comment2, columns = ['subreddit', 'author', 'score', 'created_utc', 
                         'day_of_week', 'month', 'date', 'year','hour', 'minute',
                         'body'])

In [105]:
#doing for reps now

days = []
for x in reps_comment2['created_utc']:
    j = datetime.fromtimestamp(x).strftime("%A")
    days.append(j)

months = []
for x in reps_comment2['created_utc']:
    j = datetime.fromtimestamp(x).strftime("%B")
    months.append(j)

date = []
for x in reps_comment2['created_utc']:
    j = int(datetime.fromtimestamp(x).strftime("%d"))
    date.append(j)

year = []
for x in reps_comment2['created_utc']:
    j = int(datetime.fromtimestamp(x).strftime("%y"))
    year.append(j)

hour = []
for x in reps_comment2['created_utc']:
    j = int(datetime.fromtimestamp(x).strftime("%I"))
    hour.append(j)

#note that I had no intention on breaking down to minute, but what if there is some weird connection
#ex, comments spike during CNN/FOX news commercial breaks.  I don't intend on examining it, but the data could
#have use

minute = []
for x in reps_comment2['created_utc']:
    j = int(datetime.fromtimestamp(x).strftime("%M"))
    minute.append(j)

reps_comment2['day_of_week'] = days
reps_comment2['month'] = months
reps_comment2['date'] = date
reps_comment2['year'] = year
reps_comment2['hour'] = hour
reps_comment2['minute'] = minute

#Rearranging columns
reps_comment2 = pd.DataFrame(reps_comment2, columns = ['subreddit', 'author', 'score', 'created_utc', 
                         'day_of_week', 'month', 'date', 'year','hour', 'minute',
                         'body'])


In [108]:
#updating subreddits to switch subreddits to targets 1 and 0

dems_comment2['subreddit'] = 1

reps_comment2['subreddit'] = 0


In [77]:
#continually loading from kernel crashes

#reps_comment2.drop(columns=['Unnamed: 0'], inplace=True)
#dems_comment2.drop(columns=['Unnamed: 0'], inplace=True)
dems_comment2.head()

Unnamed: 0,subreddit,author,score,created_utc,day_of_week,month,date,year,hour,minute,body
0,1.0,VegaThePunisher,1.0,1536372000.0,Friday,September,7.0,18.0,10.0,1.0,And told us to vote.
1,1.0,IAMA_Drunk_Armadillo,1.0,1536373000.0,Friday,September,7.0,18.0,10.0,9.0,Aww did the Nazi snowflake get triggered? You ...
2,1.0,IAMA_Drunk_Armadillo,1.0,1536373000.0,Friday,September,7.0,18.0,10.0,11.0,Aww did the Nazi snowflake get triggered?
3,1.0,FyreTroll,1.0,1536374000.0,Friday,September,7.0,18.0,10.0,31.0,"Excuse me, what?\nThat logic also applies to “..."
4,1.0,FyreTroll,1.0,1536374000.0,Friday,September,7.0,18.0,10.0,34.0,Your comment made no actual factual refutation...


In [81]:
#would work if used AM/PM

'''latenight = 0
morning = 0
afternoon = 0
evening = 0

for x in dems_comment2.hour:
    if x in [22, 23, 0, 1, 2, 3]:
        latenight += 1
    elif x in [4, 5,6,7,8,9]:
        morning += 1
    elif x in [10,11,12,13,14,15]:
        afternoon += 1
    elif x in [16,17,18,19,20,21]:
        evening += 1
print(latenight/dems_comment2.shape[0], 'democrats latenight percentage')
print(morning/dems_comment2.shape[0], 'democrats morning percentage')
print(afternoon/dems_comment2.shape[0], 'democrats afternoon percentage')
print(evening/dems_comment2.shape[0], 'democrats evening percentage')'''

"latenight = 0\nmorning = 0\nafternoon = 0\nevening = 0\n\nfor x in dems_comment2.hour:\n    if x in [22, 23, 0, 1, 2, 3]:\n        latenight += 1\n    elif x in [4, 5,6,7,8,9]:\n        morning += 1\n    elif x in [10,11,12,13,14,15]:\n        afternoon += 1\n    elif x in [16,17,18,19,20,21]:\n        evening += 1\nprint(latenight/dems_comment2.shape[0], 'democrats latenight percentage')\nprint(morning/dems_comment2.shape[0], 'democrats morning percentage')\nprint(afternoon/dems_comment2.shape[0], 'democrats afternoon percentage')\nprint(evening/dems_comment2.shape[0], 'democrats evening percentage')"

In [110]:
#now will merge randomly.
df = pd.concat([dems_comment2, reps_comment2], ignore_index=True)

#learned about 'ignore_index=' from pandas.concat documentation @
#https://pandas.pydata.org/pandas-docs/stable/generated/pandas.concat.html

df = df.sample(frac=1).reset_index(drop=True) #this will shuffle all rows

In [111]:
df.head(10)

Unnamed: 0,subreddit,author,score,created_utc,day_of_week,month,date,year,hour,minute,body
0,1,waldrop02,11,1490791891,Wednesday,March,29,17,8,51,Him being a rude person in general doesn't eli...
1,0,nakdamink,1,1508882556,Tuesday,October,24,17,6,2,"I fully agree, Moore holds absolutely reprehen..."
2,1,HankMoodyMF,1,1511794509,Monday,November,27,17,9,55,These guys are awful for the left.
3,0,mydadisnotyourdad,1,1507227093,Thursday,October,5,17,2,11,"Not sure where the ""slam"" was."
4,0,BobcatBarry,9,1499876621,Wednesday,July,12,17,12,23,"I think ""destroys"" is a pretty strong word for..."
5,1,Credulous7,1,1495068773,Wednesday,May,17,17,8,52,Written evidence of collaboration between Russ...
6,1,Pylons,1,1535807729,Saturday,September,1,18,9,15,"Yes, I am aware. That's not what I'm saying. \..."
7,1,orr250mph,7,1481736275,Wednesday,December,14,16,12,24,Bush lied and people died.
8,1,[deleted],1,1499656564,Sunday,July,9,17,11,16,deleted ^^^^^^^^^^^^^^^^0.1027 .message here....
9,1,crosscheck87,1,1531091225,Sunday,July,8,18,7,7,"I'll take a link to where he said that, you ca..."


In [119]:
#dropping deleted and removed posts, 59757 in total

df.drop(df[df['author'] == '[deleted]'].index, inplace=True)

In [121]:
#after dropping deleted comments, classes are balanced nearly perfectly
df.subreddit.value_counts(normalize=True)

1    0.502009
0    0.497991
Name: subreddit, dtype: float64

In [122]:
df.shape

(604788, 11)

_____________________

In [None]:
dems_comment2.to_csv('dems_raw_comment2.csv') 
reps_comment2.to_csv('reps_raw_comment2.csv') 
df.to_csv('df_dem_rep_comments.csv')

_________

In [3]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from bs4 import BeautifulSoup 
import regex as re

### Using a Cleaning function from class to remove any HTML, all non-letters, stopwords, and making all words lowercase.  Also, added a step to Lemmatize.

In [24]:
df.body()

Unnamed: 0.1,Unnamed: 0,subreddit,author,score,created_utc,day_of_week,month,date,year,hour,minute,body
0,0,1.0,waldrop02,11.0,1490792000.0,Wednesday,March,29.0,17.0,8.0,51.0,Him being a rude person in general doesn't eli...
1,1,0.0,nakdamink,1.0,1508883000.0,Tuesday,October,24.0,17.0,6.0,2.0,"I fully agree, Moore holds absolutely reprehen..."
2,2,1.0,HankMoodyMF,1.0,1511795000.0,Monday,November,27.0,17.0,9.0,55.0,These guys are awful for the left.
3,3,0.0,mydadisnotyourdad,1.0,1507227000.0,Thursday,October,5.0,17.0,2.0,11.0,"Not sure where the ""slam"" was."
4,4,0.0,BobcatBarry,9.0,1499877000.0,Wednesday,July,12.0,17.0,12.0,23.0,"I think ""destroys"" is a pretty strong word for..."


In [27]:
#Adapted from code in GA lesson nlp-i-notes-starter-code
#Step Four: Combine our cleaning into one function
#Lesson given by Matt Brems.
#added step to lemmatize.

def review_to_words(raw_review):
    # Function to convert a raw review to a string of words
    # The input is a single string (a raw movie review), and 
    # the output is a single string (a preprocessed movie review)
    #
    # 1. Remove HTML
    review_text = BeautifulSoup(str(raw_review)).get_text()          
                                     #No clue why, but this was working and now not working after dead kernel
        
                                    #somehow, somewhere, something turned into a float.  
                                    #so by wrapping raw_review with str() in 2 steps, problem seems fixed
    # 2. Remove non-letters        
    letters_only = re.sub("[^a-zA-Z]", " ", str(raw_review))
    #
    # 3. Convert to lower case, split into individual words
    words = letters_only.lower().split()
    #
    # 4. In Python, searching a set is much faster than searching
    #   a list, so convert the stop words to a set
    stops = set(stopwords.words('english'))
    # 
    # 5. Remove stop words
    meaningful_words = [w for w in words if not w in stops]
    #
    # 6. Lemmatize words  not in Matt's original function
    lemmatizer = WordNetLemmatizer()
    lem_words = [lemmatizer.lemmatize(w) for w in meaningful_words]
    
    # 7. Join the words back into one string separated by space, 
    # and return the result.
    return (' '.join(lem_words))

In [31]:
df['body'] = df.body.apply(review_to_words)



 BeautifulSoup(YOUR_MARKUP})

to this:

 BeautifulSoup(YOUR_MARKUP, "lxml")

  markup_type=markup_type))
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' Beautiful Soup.' % markup)
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.'

  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup

" looks like a URL. Beautiful Soup is not an HTTP client. You sh

  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup

" looks like a URL. Beautiful Soup is not an HTTP client. You should probably use an HTTP client like requests to get the document behind the URL, and feed that document to B

  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
" looks like a URL. Beautiful Soup is not an HTTP client. You should probably use an HTTP client like requests to get the document behind the URL, and feed that document to Beautiful Soup.
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % d

  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
" looks like a URL. Beautiful Soup is not an HTTP client. You should probably use an HTTP client like requests to get the document behind the URL, and feed that document to Beautiful Soup.
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % d

  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup

https://thefederalistpapers.org/wp-content/uploads/2017/03/imageedit_645_3117981996.jpg

" looks like a URL. Beautiful Soup is not an HTTP client. You should probably use an HTTP client like requests to get the document behind the URL, and feed that document to Beautiful Soup.
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' th

  ' that document to Beautiful Soup.' % decoded_markup
" looks like a URL. Beautiful Soup is not an HTTP client. You should probably use an HTTP client like requests to get the document behind the URL, and feed that document to Beautiful Soup.
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % d

  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that d


LOL" looks like a URL. Beautiful Soup is not an HTTP client. You should probably use an HTTP client like requests to get the document behind the URL, and feed that document to Beautiful Soup.
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.'

  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup

" looks like a URL. Beautiful Soup is not an HTTP client. You should probably use an HTTP client like requests to get the document behind the URL, and feed that document to Beautiful Soup.
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
" looks like a URL. Beautiful Soup is not an HTTP client. You should probably use an HTTP client like requests to get the document behind the URL, and feed that document to Beautiful Soup.
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that documen

  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that d

  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
" looks like a URL. Beautiful Soup is not an HTTP client. You should probably use an HTTP client like requests to get the document behind the URL, and feed that document to Beautiful Soup.
  ' that document to Beautiful Soup.' % decoded_markup
http://www.who.int/mediacentre/factsheets/fs313/en/
https://www.epa.gov/haps/health-and-environmental-effects-hazardous-air-pollutants" looks like a URL. Beautiful Soup is not an HTTP client. You should probably use an HTTP client like requests to get the document behind the URL, and feed that document to Beautiful Soup.
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
" looks like a URL. Beautiful Soup is not an HTT

  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that d

  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup


https://www.theguardian.com/us-news/2018/may/28/ivanka-trump-won-china-trademarks-donald-trump-zte-reversal" looks like a URL. Beautiful Soup is not an HTTP client. You shou

  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup

http://i.imgur.com/J1jBkzY.jpg" looks like a URL. Beautiful Soup is not an HTTP client. You should probably use an HTTP client like requests to get the document behind the URL, and feed that document to Beautiful Soup.
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
" looks like a URL. Beautiful Soup is not an HTTP client. You should probably use an HTTP client like requests to get the document behind the URL, and feed that document to Beautiful Soup.
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % d

  ' that document to Beautiful Soup.' % decoded_markup

https://ballotpedia.org/Rob_Bishop

https://en.wikipedia.org/wiki/Rob_Bishop" looks like a URL. Beautiful Soup is not an HTTP client. You should probably use an HTTP client like requests to get the document behind the URL, and feed that document to Beautiful Soup.
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document t

  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' Beautiful Soup.' % markup)
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup

" looks like a URL. Beautiful Soup is not an HTTP client. You should probably use an HTTP client like requests to get the document behind the URL, and feed that document to Beautiful Soup.
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
" looks l

  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that d

  ' that document to Beautiful Soup.' % decoded_markup

https://imgur.com/a/lIk0TuN" looks like a URL. Beautiful Soup is not an HTTP client. You should probably use an HTTP client like requests to get the document behind the URL, and feed that document to Beautiful Soup.
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that docu

  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
" looks like a URL. Beautiful Soup is not an HTTP client. You should probably use an HTTP client like requests to get the document behind the URL, and feed that document to Beautiful Soup.
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
" looks like a URL. Beautiful Soup is not an HTTP client. You should probably use an HTTP client like requests to get the document behind the URL, and feed that document to Beautiful Soup.
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document

  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
" looks like a URL. Beautiful Soup is not an HTTP client. You sho

  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup

" looks like a URL. Beautiful Soup is not an HTTP client. You should probably use an HTTP client like requests to get the document behind the URL, and feed that document to Beautiful Soup.
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % 

  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup


In [33]:
df.to_csv('df_dem_rep_comments.csv')

In [52]:
df.drop(columns=['Unnamed: 0'], inplace=True)

In [57]:
df.info()
#got 5 nulls somehow

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 604793 entries, 0 to 604792
Data columns (total 11 columns):
subreddit      604788 non-null float64
author         604788 non-null object
score          604788 non-null float64
created_utc    604788 non-null float64
day_of_week    604788 non-null object
month          604788 non-null object
date           604788 non-null float64
year           604788 non-null float64
hour           604788 non-null float64
minute         604788 non-null float64
body           604793 non-null object
dtypes: float64(7), object(4)
memory usage: 50.8+ MB


In [9]:
df.drop(df[pd.isnull(df).any(axis=1)].index, inplace=True)

In [8]:
df.isnull().sum()

subreddit         0
author            0
score             0
created_utc       0
day_of_week       0
month             0
date              0
year              0
hour              0
minute            0
body           5525
dtype: int64

In [55]:
df.head()

Unnamed: 0,subreddit,author,score,created_utc,day_of_week,month,date,year,hour,minute,body
0,1.0,waldrop02,11.0,1490792000.0,Wednesday,March,29.0,17.0,8.0,51.0,rude person general eliminate idea rude higher...
1,0.0,nakdamink,1.0,1508883000.0,Tuesday,October,24.0,17.0,6.0,2.0,fully agree moore hold absolutely reprehensibl...
2,1.0,HankMoodyMF,1.0,1511795000.0,Monday,November,27.0,17.0,9.0,55.0,guy awful left
3,0.0,mydadisnotyourdad,1.0,1507227000.0,Thursday,October,5.0,17.0,2.0,11.0,sure slam
4,0.0,BobcatBarry,9.0,1499877000.0,Wednesday,July,12.0,17.0,12.0,23.0,think destroys pretty strong word whatever


In [80]:
#df values must have encoded as floats when exporting/importing
df.subreddit = df.subreddit.astype(int)
df.score = df.score.astype(int)
df.date = df.date.astype(int)
df.year = df.year.astype(int)
df.hour = df.hour.astype(int)
df.minute = df.minute.astype(int)
df.head(2)

Unnamed: 0,subreddit,author,score,created_utc,day_of_week,month,date,year,hour,minute,body
0,1,waldrop02,11,1490792000.0,Wednesday,March,29,17,8,51,rude person general eliminate idea rude higher...
1,0,nakdamink,1,1508883000.0,Tuesday,October,24,17,6,2,fully agree moore hold absolutely reprehensibl...


In [81]:
df.to_csv('df_dem_rep_comments.csv', encoding='utf-8')

__________

### Separating comments into individual words with count vectorizer

In [10]:
#will do this once without ngrams, once with ngrams(1,3)

vectorizer = CountVectorizer(analyzer = "word",   
                             tokenizer = None,    
                             preprocessor = None, 
                             stop_words = None,   #stopwords already removed
                             max_features = 5000,
                             max_df=1.0, #default
                             min_df=1    #default
                            )
df_vect = vectorizer.fit_transform(df['body'])

In [14]:
#df_words = pd.DataFrame(df_vect.todense(), columns=vectorizer.get_feature_names(), index=df.index)

In [15]:
#df_words.size  #6GB?

2996315000

In [83]:
#was taking too long to export.  Probably don't need to encode and save a giant file like this.
#df_words.to_csv('df_words.csv', encoding='utf-8')

_________

### Count vectorizer with (3,3) ngrams

In [12]:
vectorizer_ng = CountVectorizer(analyzer = "word",   
                             tokenizer = None,    
                             preprocessor = None, 
                             stop_words = None,   #stopwords already removed
                             max_features = 5000,
                             ngram_range = (3,3),
                             max_df=1.0, #default
                             min_df=1    #default
                            )
df_vect_ng = vectorizer_ng.fit_transform(df['body'])

In [17]:
#df_words_ng = pd.DataFrame(df_vect_ng.todense(), columns=vectorizer_ng.get_feature_names(), index=df.index)

In [18]:
#df_words_ng.size

2996315000

In [84]:
#was taking too long to export.  Probably don't need to encode and save a giant file like this.
#df_words_ng.to_csv('df_words_ng.csv', encoding='utf-8')

_________

### Tfidf Vectorizer

In [14]:
dcorpus = list(df.body[(df.subreddit ==1)])

In [15]:
rcorpus = list(df.body[(df.subreddit ==0)])

In [20]:
tfidf = TfidfVectorizer(analyzer='word', max_features=300)
dem_tf = tfidf.fit_transform(dcorpus)

In [21]:
dem_tf = pd.DataFrame(dem_tf.todense(), columns=tfidf.get_feature_names())

In [30]:
dem_tf.mean().sum()      #hmmmmm, thought it would be 1, or 2.7xxxx(euler's #)

2.2432293107600088

In [32]:
dem_tf.mean().sort_values(ascending=False)[:25]

trump         0.035459
people        0.032827
like          0.029966
would         0.026215
democrat      0.024385
think         0.023376
one           0.022949
get           0.022807
right         0.021139
republican    0.020101
http          0.019694
party         0.018798
vote          0.018767
know          0.018023
need          0.017372
make          0.016513
gt            0.016385
even          0.016151
say           0.016132
thing         0.016111
election      0.016091
good          0.016050
want          0.015903
time          0.015689
com           0.014779
dtype: float64

-------------

In [33]:
Rtfidf = TfidfVectorizer(analyzer='word', max_features=300)
rep_tf = Rtfidf.fit_transform(rcorpus)

In [34]:
rep_tf = pd.DataFrame(rep_tf.todense(), columns=Rtfidf.get_feature_names())

In [35]:
rep_tf.mean().sum()

2.284270537989342

In [36]:
rep_tf.mean().sort_values(ascending=False)[:25]

trump         0.040294
like          0.032391
people        0.031002
republican    0.030129
one           0.021800
would         0.021175
http          0.020890
right         0.020443
get           0.019768
think         0.019422
know          0.018595
com           0.018415
president     0.018215
gt            0.017870
please        0.017741
say           0.017034
democrat      0.016907
obama         0.016816
good          0.016776
thing         0.016509
even          0.016207
news          0.016059
year          0.015087
make          0.015045
want          0.015040
dtype: float64

__________

In [4]:
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.naive_bayes import MultinomialNB

In [5]:
#kernel died from train_test_split
df = pd.read_csv('./df_dem_rep_comments.csv')
df.drop(columns=['Unnamed: 0'], inplace=True)
df.head()

Unnamed: 0,subreddit,author,score,created_utc,day_of_week,month,date,year,hour,minute,body
0,1,waldrop02,11,1490792000.0,Wednesday,March,29,17,8,51,rude person general eliminate idea rude higher...
1,0,nakdamink,1,1508883000.0,Tuesday,October,24,17,6,2,fully agree moore hold absolutely reprehensibl...
2,1,HankMoodyMF,1,1511795000.0,Monday,November,27,17,9,55,guy awful left
3,0,mydadisnotyourdad,1,1507227000.0,Thursday,October,5,17,2,11,sure slam
4,0,BobcatBarry,9,1499877000.0,Wednesday,July,12,17,12,23,think destroys pretty strong word whatever


In [12]:
df.isnull().sum()

subreddit      0
author         0
score          0
created_utc    0
day_of_week    0
month          0
date           0
year           0
hour           0
minute         0
body           0
dtype: int64

In [11]:
#somehow lose 5525 body during every export import to_csv/read_csv

In [10]:
df.drop(df[pd.isnull(df).any(axis=1)].index, inplace=True)

In [11]:
#train test split
#note, because the dataframe is over 600,000 rows, i'm going to split this 5%train 95%test
#i think that this better characterizes 'real world' data problems, and 30,000 is still a good size to train on
X_train, X_test, y_train, y_test = train_test_split(df_vect, df['subreddit'], random_state=42, test_size=.95)

## 1-RandomForestClassifier with GridSearchCV

In [15]:
grid_params = {
    'n_estimators' : [10, 100, 1000, 2000],
    'max_features': [5, 10, 50, 100]
}

gs = GridSearchCV(
    RandomForestClassifier(),
    grid_params,
    verbose = 2.5,
    n_jobs = 4
)

gs_results = gs.fit(X_train,y_train)

Fitting 3 folds for each of 12 candidates, totalling 36 fits
[CV] max_features=5, n_estimators=10 .................................
[CV] max_features=5, n_estimators=10 .................................
[CV] max_features=5, n_estimators=10 .................................
[CV] max_features=5, n_estimators=100 ................................
[CV]  max_features=5, n_estimators=10, score=0.8571285542651181, total=   3.2s
[CV] max_features=5, n_estimators=100 ................................
[CV]  max_features=5, n_estimators=10, score=0.8580296355626752, total=   3.3s
[CV] max_features=5, n_estimators=100 ................................
[CV]  max_features=5, n_estimators=10, score=0.8520076098928607, total=   3.3s
[CV] max_features=5, n_estimators=1000 ...............................
[CV]  max_features=5, n_estimators=100, score=0.8986784140969163, total=  34.2s
[CV] max_features=5, n_estimators=1000 ...............................
[CV]  max_features=5, n_estimators=100, score=0.900170

[Parallel(n_jobs=4)]: Done  29 tasks      | elapsed: 14.8min


[CV]  max_features=50, n_estimators=1000, score=0.8910692831397677, total= 5.0min
[CV] max_features=100, n_estimators=1000 .............................
[CV]  max_features=100, n_estimators=100, score=0.8789547456948338, total=  32.4s
[CV] max_features=100, n_estimators=1000 .............................
[CV]  max_features=100, n_estimators=100, score=0.8684289576449384, total=  32.0s
[CV] max_features=100, n_estimators=1000 .............................
[CV]  max_features=50, n_estimators=1000, score=0.8817462701511966, total= 4.9min
[CV]  max_features=100, n_estimators=1000, score=0.8731350755982777, total= 5.3min
[CV]  max_features=100, n_estimators=1000, score=0.8747496996395675, total= 5.4min
[CV]  max_features=100, n_estimators=1000, score=0.8837605126151382, total= 5.4min


[Parallel(n_jobs=4)]: Done  36 out of  36 | elapsed: 21.3min finished


KeyboardInterrupt: 

I watched all of the outputs through verbose and noted that 5 features with 1000 estimators was the best model.  Lower features and higher estimators were always a better score over 36 attempts (3 * 4 * 3 folds).  Didn't try higher estimators in GridSearchCV, but will experiment below with 2000 estimators

In [16]:
rf = RandomForestClassifier(n_jobs=-1, n_estimators=2000, max_features=5)
rf.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features=5, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=2000, n_jobs=-1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [18]:
rf.score(X_train, y_train)

0.9868504488869606

In [19]:
rf.score(X_test, y_test)

0.942352011241876

In [21]:
#y_test_hat = rf.predict(X_test)

### These scores seem really good.  over 94% on test data with a 5/95 train-test-split.  I'd be happy with this as is.

-----------------

## Multinomial Naïve Bayes

In [22]:
bayes = MultinomialNB()
bayes.fit(X_train, y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [23]:
bayes.score(X_train, y_train)

0.7199212361913027

In [24]:
bayes.score(X_test, y_test)

0.6966959423853856

In [26]:
y_test_hat_bayes = bayes.predict(X_test)

---------

#### Changing alpha hyperparameter

In [30]:
bayes = MultinomialNB(alpha=.01)
bayes.fit(X_train, y_train)
bayes.score(X_train, y_train), bayes.score(X_test, y_test)

(0.7232920602075894, 0.6991779378183735)

In [27]:
bayes = MultinomialNB(alpha=.1)
bayes.fit(X_train, y_train)
bayes.score(X_train, y_train), bayes.score(X_test, y_test)

(0.7225911958081634, 0.6985807131565079)

In [28]:
bayes = MultinomialNB(alpha=10)
bayes.fit(X_train, y_train)
bayes.score(X_train, y_train), bayes.score(X_test, y_test)

(0.7028001201481827, 0.6818426137361672)

In [29]:
bayes = MultinomialNB(alpha=100)
bayes.fit(X_train, y_train)
bayes.score(X_train, y_train), bayes.score(X_test, y_test)

(0.6726295764776558, 0.6587019146320042)

#### Slight improvement with a lowered alpha, but still not good results

These scores are very low compared to the RandomForest.  I wouldn't even want to include it in a voting classifier.
It is worth mentioning that the results from Multinomial Naïve Bayes are instant!  The RandomForestClassifier ran for hours.  Still, I wouldn't choose this model.

---------

## Logistic Regression with GridsearchCV

In [32]:
#note that we can't run other solvers through gridsearch if we want to try both Lasso and Ridge regularization.
#some solvers only work with Ridge.

grid_params = {
    'solver' : ['saga'],
    'penalty': ['l1', 'l2'],
    'C' : [.1, 1.0, 10]
}

gs_log = GridSearchCV(
    LogisticRegression(),
    grid_params,
    verbose = 2.5,
    n_jobs = 4
)

gs_log_results = gs_log.fit(X_train,y_train)

Fitting 3 folds for each of 6 candidates, totalling 18 fits
[CV] C=0.1, penalty=l1, solver=saga ..................................
[CV] C=0.1, penalty=l1, solver=saga ..................................
[CV] C=0.1, penalty=l1, solver=saga ..................................
[CV] C=0.1, penalty=l2, solver=saga ..................................




[CV]  C=0.1, penalty=l2, solver=saga, score=0.6954345214257108, total=   1.0s
[CV] C=0.1, penalty=l2, solver=saga ..................................




[CV]  C=0.1, penalty=l2, solver=saga, score=0.7019423307969563, total=   1.0s
[CV] C=0.1, penalty=l2, solver=saga ..................................




[CV]  C=0.1, penalty=l2, solver=saga, score=0.6868929608491038, total=   0.9s
[CV] C=1.0, penalty=l1, solver=saga ..................................




[CV]  C=0.1, penalty=l1, solver=saga, score=0.6359631557869443, total=   6.5s
[CV] C=1.0, penalty=l1, solver=saga ..................................




[CV]  C=0.1, penalty=l1, solver=saga, score=0.6345248823470512, total=   6.6s
[CV] C=1.0, penalty=l1, solver=saga ..................................




[CV]  C=0.1, penalty=l1, solver=saga, score=0.6416700040048058, total=   6.7s
[CV] C=1.0, penalty=l2, solver=saga ..................................




[CV]  C=1.0, penalty=l2, solver=saga, score=0.6993392070484582, total=   0.9s
[CV] C=1.0, penalty=l2, solver=saga ..................................




[CV]  C=1.0, penalty=l2, solver=saga, score=0.7035442531037245, total=   0.9s
[CV] C=1.0, penalty=l2, solver=saga ..................................




[CV]  C=1.0, penalty=l2, solver=saga, score=0.6907980374486833, total=   1.1s
[CV] C=10, penalty=l1, solver=saga ...................................




[CV]  C=1.0, penalty=l1, solver=saga, score=0.6895274329195034, total=  41.2s
[CV] C=10, penalty=l1, solver=saga ...................................




[CV]  C=1.0, penalty=l1, solver=saga, score=0.697236684020825, total=  41.6s
[CV] C=10, penalty=l1, solver=saga ...................................




[CV]  C=1.0, penalty=l1, solver=saga, score=0.6793831981576048, total=  41.5s
[CV] C=10, penalty=l2, solver=saga ...................................




[CV]  C=10, penalty=l2, solver=saga, score=0.6998398077693232, total=   1.0s
[CV] C=10, penalty=l2, solver=saga ...................................




[CV]  C=10, penalty=l2, solver=saga, score=0.7028434120945134, total=   1.1s
[CV] C=10, penalty=l2, solver=saga ...................................




[CV]  C=10, penalty=l2, solver=saga, score=0.6907980374486833, total=   1.0s




[CV]  C=10, penalty=l1, solver=saga, score=0.697336804164998, total=  53.2s


Process ForkPoolWorker-15:
Process ForkPoolWorker-14:
Traceback (most recent call last):
Traceback (most recent call last):
  File "/Users/erikgreenj/anaconda3/lib/python3.6/multiprocessing/process.py", line 258, in _bootstrap
    self.run()
  File "/Users/erikgreenj/anaconda3/lib/python3.6/multiprocessing/process.py", line 258, in _bootstrap
    self.run()
  File "/Users/erikgreenj/anaconda3/lib/python3.6/multiprocessing/process.py", line 93, in run
    self._target(*self._args, **self._kwargs)
  File "/Users/erikgreenj/anaconda3/lib/python3.6/multiprocessing/process.py", line 93, in run
    self._target(*self._args, **self._kwargs)
  File "/Users/erikgreenj/anaconda3/lib/python3.6/multiprocessing/pool.py", line 108, in worker
    task = get()
  File "/Users/erikgreenj/anaconda3/lib/python3.6/multiprocessing/pool.py", line 108, in worker
    task = get()
  File "/Users/erikgreenj/anaconda3/lib/python3.6/site-packages/sklearn/externals/joblib/pool.py", line 360, in get
    racquire()
 

KeyboardInterrupt: 

##### Interrupted cell, wouldn't terminate on it's own. Best score was .703

##### Will try with different solvers now

In [33]:
grid_params = {
    'solver' : ['sag', 'lbfgs', 'newton-cg'],
    'penalty': ['l2'],
    'C' : [.1, 1.0, 10],
    'max_iter' : [100, 500, 1000]
}

gs_log = GridSearchCV(
    LogisticRegression(),
    grid_params,
    verbose = 2.5,
    n_jobs = 4
)

gs_log_results = gs_log.fit(X_train,y_train)

Fitting 3 folds for each of 27 candidates, totalling 81 fits
[CV] C=0.1, max_iter=100, penalty=l2, solver=sag .....................
[CV] C=0.1, max_iter=100, penalty=l2, solver=sag .....................
[CV] C=0.1, max_iter=100, penalty=l2, solver=sag .....................
[CV] C=0.1, max_iter=100, penalty=l2, solver=lbfgs ...................
[CV]  C=0.1, max_iter=100, penalty=l2, solver=lbfgs, score=0.7247697236684021, total=   0.4s
[CV] C=0.1, max_iter=100, penalty=l2, solver=lbfgs ...................
[CV]  C=0.1, max_iter=100, penalty=l2, solver=lbfgs, score=0.7286744092911493, total=   0.3s
[CV] C=0.1, max_iter=100, penalty=l2, solver=lbfgs ...................
[CV]  C=0.1, max_iter=100, penalty=l2, solver=lbfgs, score=0.7190347451687193, total=   0.3s
[CV] C=0.1, max_iter=100, penalty=l2, solver=newton-cg ...............




[CV]  C=0.1, max_iter=100, penalty=l2, solver=sag, score=0.7038149594472815, total=   1.3s
[CV]  C=0.1, max_iter=100, penalty=l2, solver=sag, score=0.710853023628354, total=   1.4s
[CV] C=0.1, max_iter=100, penalty=l2, solver=newton-cg ...............




[CV] C=0.1, max_iter=100, penalty=l2, solver=newton-cg ...............
[CV]  C=0.1, max_iter=100, penalty=l2, solver=sag, score=0.7137565078093713, total=   1.4s
[CV] C=0.1, max_iter=500, penalty=l2, solver=sag .....................
[CV]  C=0.1, max_iter=100, penalty=l2, solver=newton-cg, score=0.7247697236684021, total=   0.6s
[CV] C=0.1, max_iter=500, penalty=l2, solver=sag .....................
[CV]  C=0.1, max_iter=100, penalty=l2, solver=newton-cg, score=0.7190347451687193, total=   0.6s
[CV] C=0.1, max_iter=500, penalty=l2, solver=sag .....................
[CV]  C=0.1, max_iter=100, penalty=l2, solver=newton-cg, score=0.7286744092911493, total=   0.6s
[CV] C=0.1, max_iter=500, penalty=l2, solver=lbfgs ...................
[CV]  C=0.1, max_iter=500, penalty=l2, solver=lbfgs, score=0.7247697236684021, total=   0.3s
[CV] C=0.1, max_iter=500, penalty=l2, solver=lbfgs ...................
[CV]  C=0.1, max_iter=500, penalty=l2, solver=lbfgs, score=0.7286744092911493, total=   0.3s
[CV] C



[CV]  C=1.0, max_iter=100, penalty=l2, solver=sag, score=0.7149579495394474, total=   1.1s
[CV] C=1.0, max_iter=100, penalty=l2, solver=sag .....................




[CV]  C=1.0, max_iter=100, penalty=l2, solver=sag, score=0.7196635963155787, total=   1.1s
[CV] C=1.0, max_iter=100, penalty=l2, solver=lbfgs ...................
[CV]  C=1.0, max_iter=100, penalty=l2, solver=lbfgs, score=0.7611133360032039, total=   0.4s
[CV] C=1.0, max_iter=100, penalty=l2, solver=lbfgs ...................
[CV]  C=1.0, max_iter=100, penalty=l2, solver=lbfgs, score=0.7604124949939928, total=   0.4s
[CV] C=1.0, max_iter=100, penalty=l2, solver=lbfgs ...................


[Parallel(n_jobs=4)]: Done  29 tasks      | elapsed:   11.9s


[CV]  C=1.0, max_iter=100, penalty=l2, solver=sag, score=0.707019124862321, total=   1.4s
[CV] C=1.0, max_iter=100, penalty=l2, solver=newton-cg ...............
[CV]  C=1.0, max_iter=100, penalty=l2, solver=lbfgs, score=0.7521778311805347, total=   0.5s
[CV] C=1.0, max_iter=100, penalty=l2, solver=newton-cg ...............
[CV]  C=0.1, max_iter=1000, penalty=l2, solver=sag, score=0.7285742891469764, total=   6.4s
[CV] C=1.0, max_iter=100, penalty=l2, solver=newton-cg ...............
[CV]  C=1.0, max_iter=100, penalty=l2, solver=newton-cg, score=0.7611133360032039, total=   0.9s
[CV] C=1.0, max_iter=500, penalty=l2, solver=sag .....................
[CV]  C=1.0, max_iter=100, penalty=l2, solver=newton-cg, score=0.7601121345614738, total=   0.9s
[CV] C=1.0, max_iter=500, penalty=l2, solver=sag .....................
[CV]  C=0.1, max_iter=1000, penalty=l2, solver=sag, score=0.7188344848302793, total=   6.6s
[CV] C=1.0, max_iter=500, penalty=l2, solver=sag .....................
[CV]  C=1.0, 



[CV]  C=1.0, max_iter=500, penalty=l2, solver=sag, score=0.7438926712054466, total=   6.6s
[CV]  C=1.0, max_iter=500, penalty=l2, solver=sag, score=0.7423908690428515, total=   6.5s
[CV] C=1.0, max_iter=1000, penalty=l2, solver=sag ....................
[CV] C=1.0, max_iter=1000, penalty=l2, solver=sag ....................




[CV]  C=1.0, max_iter=500, penalty=l2, solver=sag, score=0.735255832582357, total=   6.5s
[CV] C=1.0, max_iter=1000, penalty=l2, solver=lbfgs ..................
[CV]  C=1.0, max_iter=1000, penalty=l2, solver=lbfgs, score=0.7611133360032039, total=   0.8s
[CV] C=1.0, max_iter=1000, penalty=l2, solver=lbfgs ..................
[CV]  C=1.0, max_iter=1000, penalty=l2, solver=lbfgs, score=0.7601121345614738, total=   0.8s
[CV] C=1.0, max_iter=1000, penalty=l2, solver=lbfgs ..................
[CV]  C=1.0, max_iter=1000, penalty=l2, solver=lbfgs, score=0.7519775708420947, total=   0.8s
[CV] C=1.0, max_iter=1000, penalty=l2, solver=newton-cg ..............
[CV]  C=1.0, max_iter=1000, penalty=l2, solver=newton-cg, score=0.7611133360032039, total=   0.8s
[CV] C=1.0, max_iter=1000, penalty=l2, solver=newton-cg ..............
[CV]  C=1.0, max_iter=1000, penalty=l2, solver=newton-cg, score=0.7601121345614738, total=   0.7s
[CV] C=1.0, max_iter=1000, penalty=l2, solver=newton-cg ..............
[CV]  



[CV]  C=10, max_iter=100, penalty=l2, solver=sag, score=0.7154585502603124, total=   1.0s
[CV] C=10, max_iter=100, penalty=l2, solver=sag ......................




[CV]  C=10, max_iter=100, penalty=l2, solver=sag, score=0.7209651581898278, total=   1.0s
[CV] C=10, max_iter=100, penalty=l2, solver=sag ......................




[CV]  C=10, max_iter=100, penalty=l2, solver=sag, score=0.707419645539201, total=   1.0s
[CV] C=10, max_iter=100, penalty=l2, solver=lbfgs ....................
[CV]  C=10, max_iter=100, penalty=l2, solver=lbfgs, score=0.775330396475771, total=   0.4s
[CV] C=10, max_iter=100, penalty=l2, solver=lbfgs ....................
[CV]  C=10, max_iter=100, penalty=l2, solver=lbfgs, score=0.7789347216659992, total=   0.4s
[CV] C=10, max_iter=100, penalty=l2, solver=lbfgs ....................
[CV]  C=10, max_iter=100, penalty=l2, solver=lbfgs, score=0.7698007409632522, total=   0.4s
[CV] C=10, max_iter=100, penalty=l2, solver=newton-cg ................




[CV]  C=1.0, max_iter=1000, penalty=l2, solver=sag, score=0.7514016820184221, total=  12.2s
[CV] C=10, max_iter=100, penalty=l2, solver=newton-cg ................
[CV]  C=10, max_iter=100, penalty=l2, solver=newton-cg, score=0.776531838205847, total=   1.8s
[CV] C=10, max_iter=100, penalty=l2, solver=newton-cg ................




[CV]  C=1.0, max_iter=1000, penalty=l2, solver=sag, score=0.7423650745969761, total=  12.8s
[CV] C=10, max_iter=500, penalty=l2, solver=sag ......................




[CV]  C=1.0, max_iter=1000, penalty=l2, solver=sag, score=0.7551061273528233, total=  13.1s
[CV] C=10, max_iter=500, penalty=l2, solver=sag ......................
[CV]  C=10, max_iter=100, penalty=l2, solver=newton-cg, score=0.7829395274329195, total=   2.0s
[CV] C=10, max_iter=500, penalty=l2, solver=sag ......................
[CV]  C=10, max_iter=100, penalty=l2, solver=newton-cg, score=0.7725042555321918, total=   2.2s
[CV] C=10, max_iter=500, penalty=l2, solver=lbfgs ....................
[CV]  C=10, max_iter=500, penalty=l2, solver=lbfgs, score=0.776531838205847, total=   1.6s
[CV] C=10, max_iter=500, penalty=l2, solver=lbfgs ....................
[CV]  C=10, max_iter=500, penalty=l2, solver=lbfgs, score=0.7826391670004005, total=   1.7s
[CV] C=10, max_iter=500, penalty=l2, solver=lbfgs ....................
[CV]  C=10, max_iter=500, penalty=l2, solver=lbfgs, score=0.7725042555321918, total=   1.7s
[CV] C=10, max_iter=500, penalty=l2, solver=newton-cg ................




[CV]  C=10, max_iter=500, penalty=l2, solver=sag, score=0.7443932719263116, total=   7.1s
[CV] C=10, max_iter=500, penalty=l2, solver=newton-cg ................




[CV]  C=10, max_iter=500, penalty=l2, solver=sag, score=0.7464957949539447, total=   7.0s
[CV] C=10, max_iter=500, penalty=l2, solver=newton-cg ................




[CV]  C=10, max_iter=500, penalty=l2, solver=sag, score=0.7366576549514369, total=   7.0s
[CV] C=10, max_iter=1000, penalty=l2, solver=sag .....................
[CV]  C=10, max_iter=500, penalty=l2, solver=newton-cg, score=0.776531838205847, total=   2.0s
[CV] C=10, max_iter=1000, penalty=l2, solver=sag .....................
[CV]  C=10, max_iter=500, penalty=l2, solver=newton-cg, score=0.7829395274329195, total=   2.1s
[CV] C=10, max_iter=1000, penalty=l2, solver=sag .....................
[CV]  C=10, max_iter=500, penalty=l2, solver=newton-cg, score=0.7725042555321918, total=   2.3s
[CV] C=10, max_iter=1000, penalty=l2, solver=lbfgs ...................
[CV]  C=10, max_iter=1000, penalty=l2, solver=lbfgs, score=0.776531838205847, total=   1.7s
[CV] C=10, max_iter=1000, penalty=l2, solver=lbfgs ...................
[CV]  C=10, max_iter=1000, penalty=l2, solver=lbfgs, score=0.7826391670004005, total=   1.9s
[CV] C=10, max_iter=1000, penalty=l2, solver=lbfgs ...................
[CV]  C=10, 



[CV]  C=10, max_iter=1000, penalty=l2, solver=sag, score=0.7561073287945534, total=  14.9s




[CV]  C=10, max_iter=1000, penalty=l2, solver=sag, score=0.7558069683620344, total=  14.8s




[CV]  C=10, max_iter=1000, penalty=l2, solver=sag, score=0.7462701511965555, total=  14.0s


[Parallel(n_jobs=4)]: Done  81 out of  81 | elapsed:   55.9s finished


In [36]:
gs_log.best_params_, gs_log.best_score_

({'C': 10, 'max_iter': 100, 'penalty': 'l2', 'solver': 'newton-cg'},
 0.7773253679538097)

##### Noticed an improvement with these solvers, best score was .777<br>
##### Maybe could get a better score with higher iters, because many of the tests did not converge

In [37]:
grid_params = {
    'solver' : ['sag', 'lbfgs', 'newton-cg'],
    'penalty': ['l2'],
    'C' : [.1, 1.0, 10],
    'max_iter' : [100, 500, 1000, 5000]
}

gs_log = GridSearchCV(
    LogisticRegression(),
    grid_params,
    verbose = 2.5,
    n_jobs = 4
)

gs_log_results = gs_log.fit(X_train,y_train)

Fitting 3 folds for each of 36 candidates, totalling 108 fits
[CV] C=0.1, max_iter=100, penalty=l2, solver=sag .....................
[CV] C=0.1, max_iter=100, penalty=l2, solver=sag .....................
[CV] C=0.1, max_iter=100, penalty=l2, solver=sag .....................
[CV] C=0.1, max_iter=100, penalty=l2, solver=lbfgs ...................
[CV]  C=0.1, max_iter=100, penalty=l2, solver=lbfgs, score=0.7247697236684021, total=   0.4s
[CV] C=0.1, max_iter=100, penalty=l2, solver=lbfgs ...................
[CV]  C=0.1, max_iter=100, penalty=l2, solver=lbfgs, score=0.7286744092911493, total=   0.3s
[CV] C=0.1, max_iter=100, penalty=l2, solver=lbfgs ...................
[CV]  C=0.1, max_iter=100, penalty=l2, solver=lbfgs, score=0.7190347451687193, total=   0.3s
[CV] C=0.1, max_iter=100, penalty=l2, solver=newton-cg ...............




[CV]  C=0.1, max_iter=100, penalty=l2, solver=sag, score=0.710853023628354, total=   1.3s
[CV]  C=0.1, max_iter=100, penalty=l2, solver=sag, score=0.7038149594472815, total=   1.3s
[CV] C=0.1, max_iter=100, penalty=l2, solver=newton-cg ...............
[CV] C=0.1, max_iter=100, penalty=l2, solver=newton-cg ...............




[CV]  C=0.1, max_iter=100, penalty=l2, solver=sag, score=0.7137565078093713, total=   1.4s
[CV] C=0.1, max_iter=500, penalty=l2, solver=sag .....................
[CV]  C=0.1, max_iter=100, penalty=l2, solver=newton-cg, score=0.7247697236684021, total=   0.5s
[CV] C=0.1, max_iter=500, penalty=l2, solver=sag .....................
[CV]  C=0.1, max_iter=100, penalty=l2, solver=newton-cg, score=0.7190347451687193, total=   0.6s
[CV] C=0.1, max_iter=500, penalty=l2, solver=sag .....................
[CV]  C=0.1, max_iter=100, penalty=l2, solver=newton-cg, score=0.7286744092911493, total=   0.6s
[CV] C=0.1, max_iter=500, penalty=l2, solver=lbfgs ...................
[CV]  C=0.1, max_iter=500, penalty=l2, solver=lbfgs, score=0.7247697236684021, total=   0.3s
[CV] C=0.1, max_iter=500, penalty=l2, solver=lbfgs ...................
[CV]  C=0.1, max_iter=500, penalty=l2, solver=lbfgs, score=0.7286744092911493, total=   0.3s
[CV] C=0.1, max_iter=500, penalty=l2, solver=lbfgs ...................
[CV]  

[Parallel(n_jobs=4)]: Done  29 tasks      | elapsed:   13.5s


[CV]  C=0.1, max_iter=5000, penalty=l2, solver=lbfgs, score=0.7190347451687193, total=   0.3s
[CV] C=0.1, max_iter=5000, penalty=l2, solver=newton-cg ..............
[CV]  C=0.1, max_iter=5000, penalty=l2, solver=newton-cg, score=0.7247697236684021, total=   0.6s
[CV] C=0.1, max_iter=5000, penalty=l2, solver=newton-cg ..............
[CV]  C=0.1, max_iter=5000, penalty=l2, solver=newton-cg, score=0.7286744092911493, total=   0.6s
[CV] C=0.1, max_iter=5000, penalty=l2, solver=newton-cg ..............
[CV]  C=0.1, max_iter=5000, penalty=l2, solver=newton-cg, score=0.7190347451687193, total=   0.5s
[CV] C=1.0, max_iter=100, penalty=l2, solver=sag .....................
[CV]  C=0.1, max_iter=5000, penalty=l2, solver=sag, score=0.723568281938326, total=   6.0s
[CV] C=1.0, max_iter=100, penalty=l2, solver=sag .....................
[CV]  C=0.1, max_iter=5000, penalty=l2, solver=sag, score=0.7285742891469764, total=   6.0s
[CV] C=1.0, max_iter=100, penalty=l2, solver=sag .....................




[CV]  C=1.0, max_iter=100, penalty=l2, solver=sag, score=0.7149579495394474, total=   1.1s
[CV] C=1.0, max_iter=100, penalty=l2, solver=lbfgs ...................




[CV]  C=1.0, max_iter=100, penalty=l2, solver=sag, score=0.7195634761714057, total=   1.1s
[CV] C=1.0, max_iter=100, penalty=l2, solver=lbfgs ...................




[CV]  C=1.0, max_iter=100, penalty=l2, solver=sag, score=0.707019124862321, total=   1.1s
[CV] C=1.0, max_iter=100, penalty=l2, solver=lbfgs ...................
[CV]  C=1.0, max_iter=100, penalty=l2, solver=lbfgs, score=0.7611133360032039, total=   0.4s
[CV] C=1.0, max_iter=100, penalty=l2, solver=newton-cg ...............
[CV]  C=1.0, max_iter=100, penalty=l2, solver=lbfgs, score=0.7604124949939928, total=   0.5s
[CV] C=1.0, max_iter=100, penalty=l2, solver=newton-cg ...............
[CV]  C=1.0, max_iter=100, penalty=l2, solver=lbfgs, score=0.7521778311805347, total=   0.5s
[CV] C=1.0, max_iter=100, penalty=l2, solver=newton-cg ...............
[CV]  C=1.0, max_iter=100, penalty=l2, solver=newton-cg, score=0.7611133360032039, total=   0.9s
[CV] C=1.0, max_iter=500, penalty=l2, solver=sag .....................
[CV]  C=1.0, max_iter=100, penalty=l2, solver=newton-cg, score=0.7601121345614738, total=   0.9s
[CV] C=1.0, max_iter=500, penalty=l2, solver=sag .....................
[CV]  C=1.0



[CV]  C=1.0, max_iter=500, penalty=l2, solver=sag, score=0.7437925510612735, total=   6.4s
[CV] C=1.0, max_iter=1000, penalty=l2, solver=sag ....................




[CV]  C=1.0, max_iter=500, penalty=l2, solver=sag, score=0.7424909891870244, total=   6.4s
[CV] C=1.0, max_iter=1000, penalty=l2, solver=sag ....................




[CV]  C=1.0, max_iter=500, penalty=l2, solver=sag, score=0.735355962751577, total=   6.2s
[CV] C=1.0, max_iter=1000, penalty=l2, solver=lbfgs ..................
[CV]  C=1.0, max_iter=1000, penalty=l2, solver=lbfgs, score=0.7611133360032039, total=   0.8s
[CV] C=1.0, max_iter=1000, penalty=l2, solver=lbfgs ..................
[CV]  C=1.0, max_iter=1000, penalty=l2, solver=lbfgs, score=0.7601121345614738, total=   0.7s
[CV] C=1.0, max_iter=1000, penalty=l2, solver=lbfgs ..................
[CV]  C=1.0, max_iter=1000, penalty=l2, solver=lbfgs, score=0.7519775708420947, total=   0.7s
[CV] C=1.0, max_iter=1000, penalty=l2, solver=newton-cg ..............
[CV]  C=1.0, max_iter=1000, penalty=l2, solver=newton-cg, score=0.7611133360032039, total=   0.8s
[CV] C=1.0, max_iter=1000, penalty=l2, solver=newton-cg ..............
[CV]  C=1.0, max_iter=1000, penalty=l2, solver=newton-cg, score=0.7601121345614738, total=   0.7s
[CV] C=1.0, max_iter=1000, penalty=l2, solver=newton-cg ..............
[CV]  



[CV]  C=1.0, max_iter=1000, penalty=l2, solver=sag, score=0.7514016820184221, total=  10.7s
[CV] C=1.0, max_iter=5000, penalty=l2, solver=sag ....................




[CV]  C=1.0, max_iter=1000, penalty=l2, solver=sag, score=0.742765595273856, total=  10.5s
[CV] C=1.0, max_iter=5000, penalty=l2, solver=sag ....................




[CV]  C=1.0, max_iter=1000, penalty=l2, solver=sag, score=0.7550060072086504, total=  10.6s
[CV] C=1.0, max_iter=5000, penalty=l2, solver=lbfgs ..................
[CV]  C=1.0, max_iter=5000, penalty=l2, solver=lbfgs, score=0.7611133360032039, total=   0.7s
[CV] C=1.0, max_iter=5000, penalty=l2, solver=lbfgs ..................
[CV]  C=1.0, max_iter=5000, penalty=l2, solver=lbfgs, score=0.7601121345614738, total=   0.7s
[CV] C=1.0, max_iter=5000, penalty=l2, solver=lbfgs ..................
[CV]  C=1.0, max_iter=5000, penalty=l2, solver=lbfgs, score=0.7519775708420947, total=   0.7s
[CV] C=1.0, max_iter=5000, penalty=l2, solver=newton-cg ..............
[CV]  C=1.0, max_iter=5000, penalty=l2, solver=newton-cg, score=0.7611133360032039, total=   0.8s
[CV] C=1.0, max_iter=5000, penalty=l2, solver=newton-cg ..............
[CV]  C=1.0, max_iter=5000, penalty=l2, solver=newton-cg, score=0.7601121345614738, total=   0.7s
[CV] C=1.0, max_iter=5000, penalty=l2, solver=newton-cg ..............
[CV]



[CV]  C=10, max_iter=100, penalty=l2, solver=sag, score=0.7144573488185822, total=   1.0s
[CV] C=10, max_iter=100, penalty=l2, solver=sag ......................




[CV]  C=10, max_iter=100, penalty=l2, solver=sag, score=0.7203644373247897, total=   1.0s
[CV] C=10, max_iter=100, penalty=l2, solver=sag ......................




[CV]  C=10, max_iter=100, penalty=l2, solver=sag, score=0.706518474016221, total=   1.0s
[CV] C=10, max_iter=100, penalty=l2, solver=lbfgs ....................
[CV]  C=10, max_iter=100, penalty=l2, solver=lbfgs, score=0.775330396475771, total=   0.4s
[CV] C=10, max_iter=100, penalty=l2, solver=lbfgs ....................
[CV]  C=10, max_iter=100, penalty=l2, solver=lbfgs, score=0.7789347216659992, total=   0.4s
[CV] C=10, max_iter=100, penalty=l2, solver=lbfgs ....................
[CV]  C=10, max_iter=100, penalty=l2, solver=lbfgs, score=0.7698007409632522, total=   0.4s
[CV] C=10, max_iter=100, penalty=l2, solver=newton-cg ................
[CV]  C=10, max_iter=100, penalty=l2, solver=newton-cg, score=0.776531838205847, total=   1.9s
[CV] C=10, max_iter=100, penalty=l2, solver=newton-cg ................
[CV]  C=10, max_iter=100, penalty=l2, solver=newton-cg, score=0.7829395274329195, total=   1.7s
[CV] C=10, max_iter=100, penalty=l2, solver=newton-cg ................
[CV]  C=10, max_ite



[CV]  C=10, max_iter=500, penalty=l2, solver=sag, score=0.7441930316379656, total=   4.7s
[CV] C=10, max_iter=500, penalty=l2, solver=sag ......................




[CV]  C=10, max_iter=500, penalty=l2, solver=sag, score=0.7464957949539447, total=   4.7s
[CV] C=10, max_iter=500, penalty=l2, solver=lbfgs ....................
[CV]  C=1.0, max_iter=5000, penalty=l2, solver=sag, score=0.749474316611595, total=  22.9s
[CV] C=10, max_iter=500, penalty=l2, solver=lbfgs ....................
[CV]  C=1.0, max_iter=5000, penalty=l2, solver=sag, score=0.7595114136964357, total=  24.0s
[CV] C=10, max_iter=500, penalty=l2, solver=lbfgs ....................
[CV]  C=10, max_iter=500, penalty=l2, solver=lbfgs, score=0.776531838205847, total=   1.6s
[CV] C=10, max_iter=500, penalty=l2, solver=newton-cg ................
[CV]  C=10, max_iter=500, penalty=l2, solver=lbfgs, score=0.7826391670004005, total=   1.9s
[CV] C=10, max_iter=500, penalty=l2, solver=newton-cg ................




[CV]  C=10, max_iter=500, penalty=l2, solver=sag, score=0.7369580454590968, total=   5.7s
[CV] C=10, max_iter=500, penalty=l2, solver=newton-cg ................
[CV]  C=10, max_iter=500, penalty=l2, solver=lbfgs, score=0.7725042555321918, total=   2.0s
[CV] C=10, max_iter=1000, penalty=l2, solver=sag .....................
[CV]  C=10, max_iter=500, penalty=l2, solver=newton-cg, score=0.776531838205847, total=   2.3s
[CV] C=10, max_iter=1000, penalty=l2, solver=sag .....................
[CV]  C=10, max_iter=500, penalty=l2, solver=newton-cg, score=0.7829395274329195, total=   2.1s
[CV] C=10, max_iter=1000, penalty=l2, solver=sag .....................
[CV]  C=10, max_iter=500, penalty=l2, solver=newton-cg, score=0.7725042555321918, total=   2.4s
[CV] C=10, max_iter=1000, penalty=l2, solver=lbfgs ...................
[CV]  C=10, max_iter=1000, penalty=l2, solver=lbfgs, score=0.776531838205847, total=   1.7s
[CV] C=10, max_iter=1000, penalty=l2, solver=lbfgs ...................
[CV]  C=10, m



[CV]  C=10, max_iter=1000, penalty=l2, solver=sag, score=0.7559070885062075, total=  13.4s
[CV] C=10, max_iter=5000, penalty=l2, solver=sag .....................




[CV]  C=10, max_iter=1000, penalty=l2, solver=sag, score=0.7558069683620344, total=  13.1s
[CV] C=10, max_iter=5000, penalty=l2, solver=sag .....................




[CV]  C=10, max_iter=1000, penalty=l2, solver=sag, score=0.7459697606888955, total=  12.8s
[CV] C=10, max_iter=5000, penalty=l2, solver=lbfgs ...................
[CV]  C=10, max_iter=5000, penalty=l2, solver=lbfgs, score=0.776531838205847, total=   1.5s
[CV] C=10, max_iter=5000, penalty=l2, solver=lbfgs ...................
[CV]  C=10, max_iter=5000, penalty=l2, solver=lbfgs, score=0.7826391670004005, total=   1.6s
[CV] C=10, max_iter=5000, penalty=l2, solver=lbfgs ...................
[CV]  C=10, max_iter=5000, penalty=l2, solver=lbfgs, score=0.7725042555321918, total=   1.6s
[CV] C=10, max_iter=5000, penalty=l2, solver=newton-cg ...............
[CV]  C=10, max_iter=5000, penalty=l2, solver=newton-cg, score=0.776531838205847, total=   1.8s
[CV] C=10, max_iter=5000, penalty=l2, solver=newton-cg ...............
[CV]  C=10, max_iter=5000, penalty=l2, solver=newton-cg, score=0.7829395274329195, total=   1.6s
[CV] C=10, max_iter=5000, penalty=l2, solver=newton-cg ...............
[CV]  C=10, 

[Parallel(n_jobs=4)]: Done 108 out of 108 | elapsed:  1.9min finished


In [38]:
gs_log.best_params_, gs_log.best_score_

({'C': 10, 'max_iter': 100, 'penalty': 'l2', 'solver': 'newton-cg'},
 0.7773253679538097)

In [41]:
gs_log.score(X_train, y_train)

0.8652337883389514

In [42]:
logr = LogisticRegression(C=100, solver='newton-cg', max_iter=10000)
logr.fit(X_train, y_train)

LogisticRegression(C=100, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=10000, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='newton-cg', tol=0.0001,
          verbose=0, warm_start=False)

In [43]:
logr.score(X_train, y_train)

0.8824550278677035

In [44]:
logr.score(X_test, y_test)

0.7965589320217812

##### With hyperparameter tuning, got better scores, but evidence of overfitting exists.  
##### Given more time to experiment, I would feed Logreg higher %s of the dataset and tune a bit more.
##### As is, still pleased enough with the results of RandomForestClassifier

---------

## Support Vector Classifier

In [45]:
from sklearn import svm

In [46]:
svc = svm.SVC(C=.1, kernel='linear')
svc.fit(X_train, y_train)

SVC(C=0.1, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='linear',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [47]:
svc.score(X_train, y_train)

0.8080632780429196

In [48]:
svc.score(X_test, y_test)

0.7528297909713684

##### SVC scores not great, but could have promise?  Considering a GridSearchCV to try other kernel tricks but this process is computationally expensive

In [50]:
grid_params = {
    'kernel' : ['linear', 'rbf', 'sigmoid', 'precomputed'],  # all kernels except poly
    'C' : [.1, 1.0, 10],
}

gs_SVC = GridSearchCV(
    svm.SVC(),
    grid_params,
    verbose = 2.5,
    n_jobs = -1
)

gs_SVC_results = gs_SVC.fit(X_train,y_train)

Fitting 3 folds for each of 12 candidates, totalling 36 fits
[CV] C=0.1, kernel=linear ............................................
[CV] C=0.1, kernel=linear ............................................
[CV] C=0.1, kernel=linear ............................................
[CV] C=0.1, kernel=rbf ...............................................
[CV] C=0.1, kernel=rbf ...............................................
[CV] C=0.1, kernel=rbf ...............................................
[CV] C=0.1, kernel=sigmoid ...........................................
[CV] C=0.1, kernel=sigmoid ...........................................
[CV] ... C=0.1, kernel=linear, score=0.7358830596716059, total= 1.4min
[CV] C=0.1, kernel=sigmoid ...........................................
[CV] ... C=0.1, kernel=linear, score=0.7303494542905777, total= 1.6min
[CV] C=0.1, kernel=precomputed .......................................
[CV] C=0.1, kernel=precomputed .......................................
[CV] C=0.1, kern

KeyboardInterrupt: 

Terminated GridSearchCV as no good results were coming out

#### Conclusion for Model selection, RandomForestClassifier was the best model of these 4 without a doubt.  I wouldn't consider putting these into a Voting Classifier, because it would likely return worse results than the RFC

--------------

# Other

### Getting total word counts for each class (top 100 words) to put in word clouds for visualization.

In [38]:
DEMvectorizer = CountVectorizer(analyzer = "word",   
                             tokenizer = None,    
                             preprocessor = None, 
                             stop_words = None,   #stopwords already removed
                             max_features = 300,
                             max_df=1.0, #default
                             min_df=1    #default
                            )
DEMvect = DEMvectorizer.fit_transform(df[df['subreddit'] == 1].body)

In [39]:
DEMwords = pd.DataFrame(DEMvect.todense(), columns=DEMvectorizer.get_feature_names(), index=None)

In [48]:
words = []
counts = []
for x in list(DEMwords.columns):
    #if sum(DEMwords[f'{x}'])//3871 < 8:
    words.append(x) 
    counts.append(sum(DEMwords[f'{x}'])//3871)       #floor dividing by the lowest count, to maintain proportion

count_dict = dict(zip(words, counts))
count_frame = pd.DataFrame.from_dict(count_dict, orient='index')
count_frame.sort_values(by=0, ascending=False, inplace=True)

In [49]:
#Dividing all totals by lowest count to maintain proportion but allow it to fit in the cloud.
#min(counts)
count_frame.head(2)

Unnamed: 0,0
people,15
trump,14


In [50]:
#making a proportional list, by iterating through count_dict made above

dem_cloud_list = []
for k,v in count_dict.items():
    for x in range(v):                      #We will append word(k) v times
        dem_cloud_list.append(k)

In [51]:
len(dem_cloud_list)      #this returns a proportional list

626

In [52]:
#dem_cloud_list

-------------

In [54]:
REPvectorizer = CountVectorizer(analyzer = "word",   
                             tokenizer = None,    
                             preprocessor = None, 
                             stop_words = None,   #stopwords already removed
                             max_features = 300,
                             max_df=1.0, #default
                             min_df=1    #default
                            )
REPvect = REPvectorizer.fit_transform(df[df['subreddit'] == 0].body)

In [55]:
REPwords = pd.DataFrame(REPvect.todense(), columns=REPvectorizer.get_feature_names(), index=None)

In [66]:
Rwords = []
Rcounts = []
for x in list(REPwords.columns):
    #if sum(REPwords[f'{x}'])//4107 < 7:
    Rwords.append(x) 
    Rcounts.append(sum(REPwords[f'{x}'])//4107)

Rcount_dict = dict(zip(Rwords, Rcounts))
Rcount_frame = pd.DataFrame.from_dict(Rcount_dict, orient='index')
Rcount_frame.sort_values(by=0, ascending=False, inplace=True)

In [67]:
Rcount_frame.head(2)

Unnamed: 0,0
trump,17
republican,13


In [68]:
rep_cloud_list = []
for k,v in Rcount_dict.items():
    for x in range(v):                      #We will append word(k) v times
        rep_cloud_list.append(k)

In [69]:
len(rep_cloud_list)

645

In [70]:
#rep_cloud_list

Put the proportional clouds in wordclouds.com and took screenshot for presentation.