In [191]:
import pandas as pd
import time
from datetime import datetime, timezone
import requests

In [192]:
# Raise pandas' display limits to 1,000 rows and 1,000 columns.
pd.options.display.max_rows = 1_000
pd.options.display.max_columns = 1_000

In [193]:
def get_posts(params, num_req):
    """Page backwards through Pushshift submission search results.

    Sends up to ``num_req`` GET requests to the Pushshift submission search
    endpoint. After each successful page, the ``created_utc`` of the last
    post on that page is sent as the ``before`` parameter of the next
    request.

    Parameters
    ----------
    params : dict
        Query-string parameters for the first request (e.g. ``subreddit``,
        ``size``). The dict is copied, so the caller's object is never
        modified.
    num_req : int
        Maximum number of requests to send. Failed requests count toward
        this limit.

    Returns
    -------
    pandas.DataFrame
        One row per retrieved post, in the order received, with a fresh
        0..n-1 index. Empty when no page could be retrieved.
    """
    url = 'https://api.pushshift.io/reddit/search/submission'
    # Work on a copy: the paging cursor must not leak into the caller's dict.
    query = dict(params)
    pages = []

    for _ in range(num_req):
        try:
            # A timeout keeps one stalled connection from hanging the notebook.
            res = requests.get(url, params=query, timeout=30)
        except requests.RequestException as exc:
            print('Request failed:', exc)
            time.sleep(5)
            continue

        if res.status_code != 200:
            # Back off and retry with the same cursor instead of trying to
            # parse an error body.
            print('Status code:', res.status_code)
            time.sleep(5)
            continue

        try:
            posts = res.json()['data']
        except (ValueError, KeyError) as exc:
            print('Unexpected response body:', exc)
            continue

        if not posts:
            # An empty page cannot advance the cursor, so further requests
            # would only repeat this one.
            break

        page = pd.DataFrame(posts)
        pages.append(page)

        if 'created_utc' not in page.columns:
            # Without a timestamp there is nothing to page on.
            break
        query['before'] = page['created_utc'].iloc[-1]

    if not pages:
        # pd.concat raises on an empty list; hand back an empty frame instead.
        return pd.DataFrame()
    # ignore_index avoids every page repeating the labels 0..len(page)-1.
    return pd.concat(pages, ignore_index=True)

### Retrieve posts from the r/toddlers subreddit.

In [194]:
# Settings for the r/toddlers pull: 10 requests with size=100 each.
subreddit = 'toddlers'
num_req = 10
params = dict(subreddit=subreddit, size=100)
toddlers_df = pd.DataFrame()  # empty placeholder

In [195]:
%time toddlers_df = get_posts(params, num_req)

CPU times: user 353 ms, sys: 25.6 ms, total: 379 ms
Wall time: 18.4 s


In [196]:
toddlers_df.shape

(1000, 72)

In [197]:
toddlers_df[['title', 'selftext']].duplicated().sum()

6

In [198]:
toddlers_df.drop_duplicates(subset=['title', 'selftext'], inplace=True)

In [199]:
toddlers_df.shape

(994, 72)

In [200]:
toddlers_df['class_target'] = 1

In [201]:
toddlers_df[['title', 'selftext', 'class_target']].head()

Unnamed: 0,title,selftext,class_target
0,My 2yo doesn’t like the potty.,Hey!\n\nI’ve been trying to potty train my dau...,1
1,Numberblocks Number Magic Run - Pre-schoolers ...,I Watch this App for Toddlers in Youtube [http...,1
2,He has us both losing it with his repetitive p...,"""See me mommy! See me daddy! See me! See me! S...",1
3,Is it inconsiderate to ask daycare to not let ...,"So, our almost 3 y/o started going to daycare ...",1
4,baby girl frock,[removed],1


In [202]:
toddlers_df.to_csv('../data/toddlers.csv', index=False)

### Retrieve posts from the r/Parenting subreddit.

In [203]:
# Settings for the r/Parenting pull: 100 requests with size=100 each.
subreddit = 'Parenting'
flair = 'Teenager 13-19 Years'  # defined here but not included in params
num_req = 100
params = dict(subreddit=subreddit, size=100)
parenting_df = pd.DataFrame()  # empty placeholder

In [204]:
%time parenting_df = get_posts(params, num_req)

CPU times: user 3.83 s, sys: 172 ms, total: 4 s
Wall time: 3min 3s


In [205]:
parenting_df.shape

(9900, 73)

In [206]:
parenting_df[['title', 'selftext']].duplicated().sum()

55

In [207]:
parenting_df.drop_duplicates(subset=['title', 'selftext'], inplace=True)

In [208]:
parenting_df.shape

(9845, 73)

In [209]:
parenting_df['class_target'] = 0

In [210]:
parenting_df[['title', 'selftext', 'class_target']].head()

Unnamed: 0,title,selftext,class_target
0,Why does my 10 month old wake up at 5am?,I just want to know if there’s light at the en...,0
1,Idk what to do?,[removed],0
2,I caught my 15 year old brother smoking I need...,"Hello everyone today I caught brother smoking,...",0
3,How would you judge this behavior of a young g...,[removed],0
4,Dear parents and children,My parents ALWAYS open any mail received in my...,0


In [211]:
parenting_df.to_csv('../data/parenting.csv', index=False)

### Retrieve posts from the r/Parenting subreddit, but only keep the posts where the flair matches "Teenager 13-19 Years".

In [212]:
# Settings for the r/Parenting pull that also sends the flair as 'link_flair_text'.
# NOTE(review): the recorded outputs after this pull (duplicate count, first
# rows) match the pull made without the flair parameter, and a later cell
# filters on flair locally -- the API may not be applying this filter; confirm.
subreddit = 'Parenting'
flair = 'Teenager 13-19 Years'
num_req = 100
params = dict(subreddit=subreddit, size=100, link_flair_text=flair)
parenting_df = pd.DataFrame()  # empty placeholder

In [213]:
%time parenting_df = get_posts(params, num_req)

CPU times: user 3.67 s, sys: 157 ms, total: 3.83 s
Wall time: 2min 20s


In [214]:
parenting_df.shape

(10000, 73)

In [215]:
parenting_df[['title', 'selftext']].duplicated().sum()

55

In [216]:
parenting_df.drop_duplicates(subset=['title', 'selftext'], inplace=True)

In [217]:
parenting_df.shape

(9945, 73)

In [218]:
parenting_df['class_target'] = 0

In [219]:
parenting_df[['title', 'selftext', 'class_target']].head()

Unnamed: 0,title,selftext,class_target
0,Why does my 10 month old wake up at 5am?,I just want to know if there’s light at the en...,0
1,Idk what to do?,[removed],0
2,I caught my 15 year old brother smoking I need...,"Hello everyone today I caught brother smoking,...",0
3,How would you judge this behavior of a young g...,[removed],0
4,Dear parents and children,My parents ALWAYS open any mail received in my...,0


In [220]:
# Keep only the rows whose flair is exactly "Teenager 13-19 Years".
teenagers_df = parenting_df.loc[parenting_df['link_flair_text'].eq('Teenager 13-19 Years')]

In [221]:
teenagers_df.shape

(485, 74)

In [222]:
# `created_utc` of the last row of teenagers_df; the next params cell passes
# this value as 'before'.
oldest = teenagers_df.iloc[-1]['created_utc']

In [223]:
# Settings for a further r/Parenting pull with 'before' set to `oldest`.
flair = 'Teenager 13-19 Years'  # defined here but not included in params
subreddit = 'Parenting'
num_req = 100
params = dict(subreddit=subreddit, size=100, before=oldest)

In [224]:
%time parenting_df = get_posts(params, num_req)

CPU times: user 3.9 s, sys: 159 ms, total: 4.06 s
Wall time: 2min 37s


In [225]:
parenting_df.shape

(10000, 71)

In [226]:
parenting_df[['title', 'selftext']].duplicated().sum()

78

In [227]:
parenting_df.drop_duplicates(subset=['title', 'selftext'], inplace=True)

In [228]:
parenting_df.shape

(9922, 71)

In [229]:
parenting_df['class_target'] = 0

In [230]:
parenting_df[['title', 'selftext', 'class_target']].head()

Unnamed: 0,title,selftext,class_target
0,2YO son about to become a big brother and does...,I’m going to have my baby by the end of the we...,0
1,7 year old suddenly scared of bridges??,We have a walking bridge not far from our plac...,0
2,Favorite books for 4-5 year old.,My 4.5 year old LOVES books and while we have ...,0
3,Why do some parents choose not to use the the ...,"So, lets start off by saying I know toddlers a...",0
4,I am so lucky.,[removed],0


In [231]:
# Add the teen-flair rows from the latest parenting_df to those already kept.
# pd.concat replaces DataFrame.append, which was deprecated in pandas 1.4 and
# removed in pandas 2.0.
# Caution: this cell extends teenagers_df from itself, so running it twice
# adds the same rows twice.
teenagers_df = pd.concat(
    [
        teenagers_df,
        parenting_df[parenting_df['link_flair_text'] == 'Teenager 13-19 Years'],
    ]
)

In [232]:
teenagers_df.shape

(1022, 75)

In [233]:
teenagers_df.to_csv('../data/teenagers.csv', index=False)

In [234]:
toddlers_df.shape, teenagers_df.shape

((994, 73), (1022, 75))