In [24]:
import pandas as pd
import numpy as np
from psaw import PushshiftAPI
import datetime as dt
from bs4 import BeautifulSoup 
import requests
import re
import time
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier

%matplotlib inline

In [25]:
url = 'https://www.reddit.com/r/Nootropics.json'

In [26]:
headers = {'User-agent': 'Edubs'}

In [27]:
res = requests.get(url, headers=headers)
res.status_code

200

In [28]:
jsondict = res.json()

In [29]:
sorted(jsondict['data'].keys())

['after', 'before', 'children', 'dist', 'modhash']

In [30]:
[post['data']['name'] for post in jsondict['data']['children']]

['t3_5yisj3',
 't3_783dac',
 't3_a8h704',
 't3_a87nrg',
 't3_a8i7xw',
 't3_a8i0nv',
 't3_a8g4c9',
 't3_a8h8s4',
 't3_a8h227',
 't3_a89opv',
 't3_a8d0j0',
 't3_a8hmpa',
 't3_a8f3uf',
 't3_a8dpku',
 't3_a8f05p',
 't3_a8ihp9',
 't3_a8i5ma',
 't3_a8c2wg',
 't3_a8hhwl',
 't3_a8fqwj',
 't3_a8axw3',
 't3_a8eeby',
 't3_a8c2w5',
 't3_a83pdy',
 't3_a8g27j',
 't3_a8fo6c',
 't3_a8a93i']

In [31]:
param = {'after':'t3_a7v54a'}

In [32]:
requests.get(url, params=param, headers=headers)

<Response [200]>

In [33]:
posts = []
after = None
for x in range(35):
    print(x)
    if after == None:
        params = {}
    else:
        params = {'after': after}
    url = 'https://www.reddit.com/r/Nootropics.json'
    res = requests.get(url, params=params, headers=headers)
    if res.status_code == 200:
        jsondict = res.json()
        posts.extend(jsondict['data']['children'])
        after= jsondict['data']['after']
    else:
        print(res.status_code)
        break
    time.sleep(2)

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34


In [34]:
len(set([p['data']['name'] for p in posts]))

775

In [35]:
title = []
for x in range(0, len(posts)):
    title.append(posts[x]['data']['title'])

nootropics = pd.DataFrame(title)
nootropics['target'] = 0
nootropics.drop_duplicates(inplace=True)
nootropics.to_csv('nootropics.csv')
nootropics.shape

(774, 2)

In [36]:
posts2 = []
after = None
for x in range(35):
    print(x)
    if after == None:
        params = {}
    else:
        params = {'after': after}
    url = 'https://www.reddit.com/r/Supplements.json'
    res = requests.get(url, params=params, headers=headers)
    if res.status_code == 200:
        jsondict = res.json()
        posts2.extend(jsondict['data']['children'])
        after= jsondict['data']['after']
    else:
        print(res.status_code)
        break
    time.sleep(2)

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34


In [37]:
len(set([p['data']['name'] for p in posts2]))

877

In [38]:
title = []
for x in range(0, len(posts2)):
    title.append(posts2[x]['data']['title'])

supp = pd.DataFrame(title)
supp['target'] = 1
supp.head
supp.drop_duplicates(inplace=True)
supp.to_csv('supp.csv')
supp.shape

(876, 2)

In [42]:
df = pd.concat([nootropics, supp], axis = 0)
df = pd.DataFrame(df)
df.drop_duplicates(inplace=True)
df['target'].value_counts()
df.columns = ['title', 'target']
df.reset_index(inplace=True, drop=True)
df.shape

(1650, 2)

In [43]:
df.head()

Unnamed: 0,title,target
0,A Beginner's Guide to Nootropics,0
1,/r/Longevity - For Longevity Related Research ...,0
2,Nicotine without an MAOI is less addictive? So...,0
3,Ever wondered why fungi contain compounds that...,0
4,The Past and Future of Psychedelics with Rick ...,0


In [44]:
#Train test split
X_train, X_test, y_train, y_test = train_test_split(df['title'], df['target'])

In [45]:
vect = CountVectorizer(max_features = 500, max_df=1000,stop_words='english', analyzer='word')
X_text = vect.fit_transform(X_train)
X_trainfit = pd.DataFrame(X_text.toarray(), columns=vect.get_feature_names())
X_trainfit.sum().sort_values(ascending=False).head(10)

supplements    121
vitamin         72
does            57
best            55
oil             48
supplement      46
effects         46
magnesium       43
help            43
taking          43
dtype: int64

In [46]:
X_textvect = vect.transform(X_test)
X_testfit = pd.DataFrame(X_textvect.toarray(), columns=vect.get_feature_names())
X_testfit.sum().sort_values(ascending=False).head(10)

supplements    29
supplement     25
vitamin        22
does           20
vs             19
magnesium      18
good           16
ashwagandha    15
taking         15
nootropics     14
dtype: int64

In [47]:
lr = LogisticRegression()
lr.fit(X_trainfit, y_train)
lr.score(X_trainfit, y_train), lr.score(X_testfit, y_test)





(0.8868229587712207, 0.7312348668280871)

In [48]:
tvec = TfidfVectorizer(stop_words='english')
trainme = tvec.fit_transform(X_train) 
train_tvec = pd.DataFrame(trainme.toarray(), columns=tvec.get_feature_names())
train_tvec.sum().sort_values(ascending=False).head(10)

supplements    34.002917
vitamin        22.406917
best           17.998299
does           16.253679
supplement     16.000463
oil            15.642858
magnesium      15.404443
help           14.077036
effects        13.477539
taking         13.086812
dtype: float64

In [49]:
testme = tvec.transform(X_test) 
test_tvec = pd.DataFrame(testme.toarray(), columns=tvec.get_feature_names())
test_tvec.sum().sort_values(ascending=False).head(10)

supplements    11.157235
supplement     10.322732
magnesium       8.366257
vs              7.586044
vitamin         7.429541
good            7.219730
does            6.832328
creatine        6.634549
ashwagandha     6.082381
theanine        5.879286
dtype: float64

In [50]:
#random forest

In [51]:
rf = RandomForestClassifier()
rf_params = {'n_estimators' : [10,20,30],
             'max_depth': [None, 1,2,3,4,5],
             'min_samples_split':[2,3,4] }

In [52]:
gs = GridSearchCV(rf, param_grid=rf_params)
gs.fit(X_text, y_train)
print(gs.best_params_, gs.best_score_)


You should specify a value for 'cv' instead of relying on the default value. The default value will change from 3 to 5 in version 0.22.



{'max_depth': None, 'min_samples_split': 4, 'n_estimators': 10} 0.740501212611156


In [53]:
gs.score(X_text, y_train)

0.9434114793856103

In [54]:
gs.score(X_textvect, y_test)

0.6900726392251816

In [None]:
#Add charts and models
#add confusion matrix