In [1]:
import pandas as pd
import datetime as dt
import time
import requests

import nltk
nltk.download('wordnet')
nltk.download('stopwords')

from nltk.tokenize import sent_tokenize, word_tokenize, RegexpTokenizer
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import re

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, plot_confusion_matrix
from sklearn.metrics import accuracy_score, plot_roc_curve, roc_auc_score, recall_score, precision_score, f1_score


# Import CountVectorizer and TFIDFVectorizer from feature_extraction.text.
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/edwardmendoza/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/edwardmendoza/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
def query_pushshift(subreddit, kind = 'submission', day_window = 15, n = 150):
    """Download posts for one subreddit from the Pushshift search API.

    Sends ``n`` GET requests to the ``kind`` search endpoint. Request ``i``
    (1-based) uses ``after={day_window * i}d`` and ``size=500``, and the
    function pauses two seconds after every request.

    Parameters
    ----------
    subreddit : str
        Subreddit name; inserted verbatim into the query string.
    kind : str, default 'submission'
        Name of the Pushshift search endpoint. Only ``'submission'`` results
        are trimmed to ``SUBFIELDS``, de-duplicated and restricted to rows
        whose ``is_self`` is True; any other kind is returned unfiltered.
    day_window : int, default 15
        Step, in days, between the ``after`` cut-offs of successive requests.
    n : int, default 150
        Number of requests to send.

    Returns
    -------
    pandas.DataFrame
        The collected rows plus a ``timestamp`` column holding
        ``datetime.date.fromtimestamp(created_utc)``. The row index is the
        one produced by concatenating the per-request frames (it is not
        reset). When nothing was retrieved the frame is empty but still has
        the expected columns.

    Raises
    ------
    requests.HTTPError
        If the API answers with a 4xx/5xx status.
    requests.Timeout
        If a request does not complete within ``REQUEST_TIMEOUT`` seconds.
    """
    SUBFIELDS = ['title', 'selftext', 'subreddit', 'created_utc', 'author', 'num_comments', 'score', 'is_self']
    REQUEST_TIMEOUT = 30  # seconds; without it one stalled connection blocks the scrape forever

    # establish base url and stem
    BASE_URL = f"https://api.pushshift.io/reddit/search/{kind}" # also known as the "API endpoint"
    stem = f"{BASE_URL}?subreddit={subreddit}&size=500" # always pulling max of 500

    # per-request frames; empty responses are skipped so they cannot disturb the concat
    posts = []

    for i in range(1, n + 1):
        URL = "{}&after={}d".format(stem, day_window * i)
        print("Querying from: " + URL)
        response = requests.get(URL, timeout=REQUEST_TIMEOUT)
        # Explicit check instead of `assert`: assertions are stripped under
        # `python -O` and carry no information about the failing request.
        response.raise_for_status()
        batch = pd.DataFrame(response.json()['data'])
        if not batch.empty:
            posts.append(batch)
        time.sleep(2)  # stay polite to the API between requests

    # pd.concat raises on an empty list, so fall back to an empty frame
    full = pd.concat(posts, sort=False) if posts else pd.DataFrame()

    if kind == "submission":
        # reindex (rather than full[SUBFIELDS]) tolerates a field that is
        # absent from every response: it becomes an all-NaN column instead
        # of raising KeyError.
        full = full.reindex(columns=SUBFIELDS).drop_duplicates()
        # keep text ("self") posts only; NaN in `is_self` compares False
        full = full.loc[full['is_self'] == True]
    elif "created_utc" not in full.columns:
        # only reachable when nothing usable came back for a non-submission kind
        full = full.reindex(columns=[*full.columns, "created_utc"])

    # NOTE: date.fromtimestamp converts using the machine's *local* time zone,
    # so `timestamp` can differ by a day from the UTC date of `created_utc`.
    # `assign` builds a new frame, so no column is written onto a slice.
    full = full.assign(timestamp=full["created_utc"].map(dt.date.fromtimestamp))

    print("Query Complete!")
    return full

In [3]:
# Scrape r/malementalhealth using the function defaults (day_window=15, n=150 requests).
mmm = query_pushshift("malementalhealth")

Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=malementalhealth&size=500&after=15d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=malementalhealth&size=500&after=30d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=malementalhealth&size=500&after=45d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=malementalhealth&size=500&after=60d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=malementalhealth&size=500&after=75d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=malementalhealth&size=500&after=90d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=malementalhealth&size=500&after=105d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=malementalhealth&size=500&after=120d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=malementalhealth&size=500&after=135

Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=malementalhealth&size=500&after=1125d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=malementalhealth&size=500&after=1140d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=malementalhealth&size=500&after=1155d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=malementalhealth&size=500&after=1170d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=malementalhealth&size=500&after=1185d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=malementalhealth&size=500&after=1200d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=malementalhealth&size=500&after=1215d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=malementalhealth&size=500&after=1230d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=malementalhealth&size

Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=malementalhealth&size=500&after=2220d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=malementalhealth&size=500&after=2235d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=malementalhealth&size=500&after=2250d
Query Complete!


In [4]:
# (rows, columns) of the r/malementalhealth frame returned by query_pushshift.
mmm.shape

(4080, 9)

In [5]:
# Preview the first rows of the r/malementalhealth frame.
mmm.head()

Unnamed: 0,title,selftext,subreddit,created_utc,author,num_comments,score,is_self,timestamp
0,I need some advice because I don’t know what t...,I hate myself and every part of me. I’m in a r...,malementalhealth,1614052809,Affectionate-Weird17,2,1,True,2021-02-22
1,I told her and was rejected,Feeling pretty devastated. Caught feelings for...,malementalhealth,1614067245,bennbatt,12,7,True,2021-02-23
2,Did Gabapentin lower your sex drive?,[deleted]\n\n[View Poll](https://www.reddit.co...,malementalhealth,1614096478,[deleted],2,0,True,2021-02-23
3,The fact that assisted suicide always excludes...,,malementalhealth,1614096905,I8banana,22,0,True,2021-02-23
4,Did social media use relieve OR increase anxie...,&amp;#x200B;\n\n[https://uclioe.eu.qualtrics.c...,malementalhealth,1614097896,txsn1m,4,1,True,2021-02-23


In [None]:
# The r/malementalhealth scrape is bound to `mmm`; no `malementalhealth`
# variable is defined in the cells shown, so the old name raised NameError
# on a fresh kernel.
mmm.shape

In [6]:
# Scrape r/MensLib using the function defaults (day_window=15, n=150 requests).
Mens_Lib = query_pushshift("MensLib")

Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=MensLib&size=500&after=15d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=MensLib&size=500&after=30d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=MensLib&size=500&after=45d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=MensLib&size=500&after=60d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=MensLib&size=500&after=75d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=MensLib&size=500&after=90d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=MensLib&size=500&after=105d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=MensLib&size=500&after=120d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=MensLib&size=500&after=135d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=Mens

Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=MensLib&size=500&after=1215d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=MensLib&size=500&after=1230d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=MensLib&size=500&after=1245d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=MensLib&size=500&after=1260d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=MensLib&size=500&after=1275d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=MensLib&size=500&after=1290d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=MensLib&size=500&after=1305d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=MensLib&size=500&after=1320d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=MensLib&size=500&after=1335d
Querying from: https://api.pushshift.io/reddit/search/submission

In [7]:
# (rows, columns) of the r/MensLib frame returned by query_pushshift.
Mens_Lib.shape

(6716, 9)

In [8]:
# Scrape r/bropill using the function defaults (day_window=15, n=150 requests).
bro_pill = query_pushshift("bropill")

Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=bropill&size=500&after=15d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=bropill&size=500&after=30d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=bropill&size=500&after=45d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=bropill&size=500&after=60d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=bropill&size=500&after=75d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=bropill&size=500&after=90d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=bropill&size=500&after=105d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=bropill&size=500&after=120d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=bropill&size=500&after=135d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=brop

Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=bropill&size=500&after=1215d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=bropill&size=500&after=1230d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=bropill&size=500&after=1245d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=bropill&size=500&after=1260d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=bropill&size=500&after=1275d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=bropill&size=500&after=1290d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=bropill&size=500&after=1305d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=bropill&size=500&after=1320d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=bropill&size=500&after=1335d
Querying from: https://api.pushshift.io/reddit/search/submission

In [None]:
#bro_pill.shape

In [None]:
# The r/malementalhealth scrape is bound to `mmm`; no `malementalhealth`
# variable is defined in the cells shown, so the old name raised NameError
# on a fresh kernel.
combined_dfs = [mmm, bro_pill, Mens_Lib]

# Give every frame the same column labels before stacking them.
for df in combined_dfs:
    df.columns = ['title', 'selftext', 'subreddit', 'created_utc',
                  'author', 'num_comments', 'score', 'is_self', 'timestamp']

# Stack the three subreddits under a fresh 0..N-1 index and keep the result
# in a variable so it is still available after the cell output is cleared.
combined_posts = pd.concat(combined_dfs, ignore_index=True)
combined_posts

In [5]:
#mmm.to_csv('./malementalhealth.csv', index = False)

In [12]:
#bro_pill.to_csv('./bro_pill.csv', index = False)

In [13]:
#Mens_Lib.to_csv('./Mens_Lib.csv', index = False)

In [9]:
# Scrape r/AskMen using the function defaults (day_window=15, n=150 requests).
ask_men = query_pushshift("AskMen")

Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=AskMen&size=500&after=15d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=AskMen&size=500&after=30d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=AskMen&size=500&after=45d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=AskMen&size=500&after=60d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=AskMen&size=500&after=75d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=AskMen&size=500&after=90d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=AskMen&size=500&after=105d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=AskMen&size=500&after=120d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=AskMen&size=500&after=135d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=AskMen&size=5

Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=AskMen&size=500&after=1230d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=AskMen&size=500&after=1245d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=AskMen&size=500&after=1260d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=AskMen&size=500&after=1275d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=AskMen&size=500&after=1290d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=AskMen&size=500&after=1305d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=AskMen&size=500&after=1320d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=AskMen&size=500&after=1335d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=AskMen&size=500&after=1350d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddi

In [10]:
#ask_men.to_csv('./ask_men.csv', index = False)