# Project 3 Data Scraping

In [50]:
import pandas as pd
import requests 
import time

from nltk.tokenize import RegexpTokenizer

In [51]:
# Pushshift endpoint that every submission search in this notebook is sent to
url = 'https://api.pushshift.io/reddit/search/submission'

In [52]:
# Starting query for r/depression: up to 100 submissions per request, created
# before 1647561600 (2022-03-18 00:00:00 UTC). The cut-off was chosen so the posts
# are recent enough to still be topical, while leaving a gap of roughly a week
# for submissions to be moderated.
params = dict(subreddit='depression', size=100, before=1647561600)


In [53]:
# Exploratory first request; `params` is passed positionally as the query string.
# The scraping loop further down rebinds `res` with its own requests.
res = requests.get(url,  params)

In [54]:
# Decode the JSON body of the response; later cells read the submissions from data['data']
data = res.json()

In [55]:
# Widen pandas' display limits so long post text and wide frames can be inspected
pd.set_option('display.width', 1200)
pd.set_option('display.max_colwidth', 100)
pd.set_option('display.max_columns', 100)

In [57]:
# Tokenizer the scraping loops use to count the words in titles and post bodies.
# The pattern matches runs of two or more word characters between word boundaries,
# so punctuation and single-character tokens do not count; (?u) is the Unicode flag.
tokenizer = RegexpTokenizer(r"(?u)\b\w\w+\b")

## r/Depression loop to get posts

In [60]:
%%time

#### ~31 MINUTES RUNTIME! ####

# Page backwards in time through r/depression, one request per page, keeping only
# posts that pass the filters below, until data_list holds at least 10,000
# (title, selftext) tuples. The first request uses the `params` dict defined in an
# earlier cell; every later request asks for posts older than the last one seen.

MAX_CONSECUTIVE_FAILURES = 5   # stop instead of retrying forever if the API stays unreachable
REQUEST_TIMEOUT = 30           # seconds; without a timeout a stalled connection can hang the cell

data_list = []
failed_requests = 0

while len(data_list) < 10000:
    # The request and JSON decode are the steps that fail when the API stream
    # breaks, so they sit inside the try. On failure `params` is left untouched,
    # which means the retry resumes from the last page processed correctly.
    try:
        res = requests.get(url, params, timeout=REQUEST_TIMEOUT)
        res.raise_for_status()
        data = res.json()
        posts = data['data']
    except (requests.exceptions.RequestException, KeyError, ValueError) as err:
        # RequestException covers ChunkedEncodingError, timeouts and HTTP errors;
        # ValueError covers an undecodable body; KeyError a body without 'data'.
        failed_requests += 1
        if failed_requests >= MAX_CONSECUTIVE_FAILURES:
            raise
        print(f"Request failed ({err!r}); retrying the same page.")
        time.sleep(5)
        continue
    failed_requests = 0

    if not posts:
        # An empty page has no last post to take the next 'before' value from.
        print(f"No more posts returned; stopping with {len(data_list)} posts.")
        break

    for post in posts:
        # Filtering here reduces the cleaning required in later stages:
        # - removed/deleted posts are not scraped
        # - token counts ensure posts have some degree of substance
        try:
            title = post['title']
            selftext = post['selftext']
        except KeyError:
            # A submission without one of the two fields cannot be used.
            continue

        if title in ('[removed]', '[deleted]') or selftext in ('[removed]', '[deleted]'):
            continue

        # Tokenize each text once; titles need 1-15 tokens, bodies 6-400.
        if not 0 < len(tokenizer.tokenize(title)) < 16:
            continue
        if not 5 < len(tokenizer.tokenize(selftext)) < 401:
            continue

        data_list.append((title.lower(), selftext.lower()))

    # Next page: everything created before the oldest post of this page.
    params = {
        'subreddit': 'depression',
        'size': 100,
        'before': posts[-1]['created_utc']
    }
    time.sleep(5)
    print(f"{len(data_list)} posts gathered.")

73 posts gathered.
145 posts gathered.
218 posts gathered.
288 posts gathered.
352 posts gathered.
409 posts gathered.
475 posts gathered.
540 posts gathered.
612 posts gathered.
680 posts gathered.
745 posts gathered.
809 posts gathered.
874 posts gathered.
936 posts gathered.
1002 posts gathered.
1063 posts gathered.
1127 posts gathered.
1195 posts gathered.
1263 posts gathered.
1329 posts gathered.
1395 posts gathered.
1465 posts gathered.
1529 posts gathered.
1594 posts gathered.
1666 posts gathered.
1741 posts gathered.
1804 posts gathered.
1862 posts gathered.
1933 posts gathered.
1988 posts gathered.
2043 posts gathered.
2110 posts gathered.
2180 posts gathered.
2247 posts gathered.
2324 posts gathered.
2389 posts gathered.
2459 posts gathered.
2531 posts gathered.
2591 posts gathered.
2659 posts gathered.
2730 posts gathered.
2802 posts gathered.
2868 posts gathered.
2936 posts gathered.
2999 posts gathered.
3069 posts gathered.
3135 posts gathered.
3205 posts gathered.
3270 po

In [61]:
# Number of depression posts; later cells use it to split data_list into the
# r/depression part and the r/SuicideWatch part
dep_count = len(data_list)

### Saving Scraped r/Depression Data as a `.csv` file

In [68]:
# DataFrame of the r/depression posts: the first dep_count (title, selftext) tuples of data_list
depression = pd.DataFrame(data_list[:dep_count], columns=['title', 'selftext'])

In [69]:
# Save the r/depression posts to disk.
# NOTE(review): per the pandas docs, index_label=False only omits the header field
# for the index column; the row index itself is still written — confirm that is
# intended rather than index=False.
depression.to_csv('depression.csv', index_label = False)

## r/SuicideWatch loop to get posts

In [66]:
#### RECOVERY CELL: RUN THIS ONLY IF THE R/SUICIDEWATCH SCRAPING BELOW HAD A CONNECTION
#### ISSUE AND dep_count MUST BE REASSIGNED WITHOUT RE-RUNNING THE r/Depression LOOP.
#### It rebuilds data_list and dep_count from the saved depression.csv.

depression = pd.read_csv('depression.csv')
data_list = list(zip(depression['title'], depression['selftext']))
dep_count = len(depression)

In [67]:
%%time

#### ~33 MINUTES RUNTIME! ####

# Scrape r/SuicideWatch with the same filters used for r/depression, appending to
# data_list until it holds as many r/SuicideWatch posts as r/depression posts.
#
# 'before' starts at 1647561600 (2022-03-18 00:00:00 UTC): recent enough for the
# posts to still be topical, while leaving a gap of roughly a week for submissions
# to be moderated.
params = {
    'subreddit': 'suicidewatch',
    'size': 100,
    'before': 1647561600
}

MAX_CONSECUTIVE_FAILURES = 5   # stop instead of retrying forever if the API stays unreachable
REQUEST_TIMEOUT = 30           # seconds; without a timeout a stalled connection can hang the cell

# data_list and dep_count come from earlier cells: data_list already contains
# dep_count r/depression posts, so the loop runs until it is twice that long.
target_count = 2 * dep_count
failed_requests = 0

while len(data_list) < target_count:
    # The request and JSON decode are the steps that fail when the API stream
    # breaks, so they sit inside the try. On failure `params` is left untouched,
    # which means the retry resumes from the last page processed correctly.
    try:
        res = requests.get(url, params, timeout=REQUEST_TIMEOUT)
        res.raise_for_status()
        data = res.json()
        posts = data['data']
    except (requests.exceptions.RequestException, KeyError, ValueError) as err:
        # RequestException covers ChunkedEncodingError, timeouts and HTTP errors;
        # ValueError covers an undecodable body; KeyError a body without 'data'.
        failed_requests += 1
        if failed_requests >= MAX_CONSECUTIVE_FAILURES:
            raise
        print(f"Request failed ({err!r}); retrying the same page.")
        time.sleep(5)
        continue
    failed_requests = 0

    if not posts:
        # An empty page has no last post to take the next 'before' value from.
        print(f"No more posts returned; stopping with {len(data_list)} posts.")
        break

    for post in posts:
        # Filtering here reduces the cleaning required in later stages:
        # - removed/deleted posts are not scraped
        # - token counts ensure posts have some degree of substance
        #     - the minimum and maximum values for the title and selftext tokens
        #       were chosen based on previous scraping efforts
        #     - histograms of previous scraping efforts indicated that the majority
        #       of posts were within these parameters
        #     - while we would endeavour to reduce bias in our models by ignoring
        #       outliers, we still have to treat every data point as credible even
        #       though it might not be accepted were we dealing with a different
        #       subreddit, i.e. posts with single-word titles
        try:
            title = post['title']
            selftext = post['selftext']
        except KeyError:
            # A submission without one of the two fields cannot be used.
            continue

        if title in ('[removed]', '[deleted]') or selftext in ('[removed]', '[deleted]'):
            continue

        # Tokenize each text once; titles need 1-15 tokens, bodies 6-400.
        if not 0 < len(tokenizer.tokenize(title)) < 16:
            continue
        if not 5 < len(tokenizer.tokenize(selftext)) < 401:
            continue

        data_list.append((title.lower(), selftext.lower()))

        # When we have scraped as many posts from r/suicidewatch as we did from
        # r/depression, stop consuming this page.
        if len(data_list) >= target_count:
            break

    # Next page: everything created before the oldest post of this page.
    params = {
        'subreddit': 'suicidewatch',
        'size': 100,
        'before': posts[-1]['created_utc']
    }
    time.sleep(5)
    print(f"{len(data_list)} posts gathered.")

10084 posts gathered.
10133 posts gathered.
10188 posts gathered.
10257 posts gathered.
10325 posts gathered.
10393 posts gathered.
10457 posts gathered.
10528 posts gathered.
10597 posts gathered.
10669 posts gathered.
10740 posts gathered.
10803 posts gathered.
10870 posts gathered.
10935 posts gathered.
11005 posts gathered.
11063 posts gathered.
11134 posts gathered.
11203 posts gathered.
11266 posts gathered.
11335 posts gathered.
11400 posts gathered.
11472 posts gathered.
11539 posts gathered.
11607 posts gathered.
11671 posts gathered.
11737 posts gathered.
11793 posts gathered.
11858 posts gathered.
11919 posts gathered.
11992 posts gathered.
12064 posts gathered.
12130 posts gathered.
12186 posts gathered.
12255 posts gathered.
12319 posts gathered.
12388 posts gathered.
12464 posts gathered.
12525 posts gathered.
12594 posts gathered.
12655 posts gathered.
12724 posts gathered.
12795 posts gathered.
12858 posts gathered.
12934 posts gathered.
12993 posts gathered.
13059 post

In [36]:
# Creation timestamp of the last submission in the most recent response, i.e. the
# value the scraping loop uses as the next 'before' cursor
data['data'][-1]['created_utc']

1646235450

### Saving Scraped r/SuicideWatch Data as a `.csv` file

In [70]:
# DataFrame of the r/SuicideWatch posts: everything in data_list after the first
# dep_count entries, which are the r/depression posts
suicidewatch = pd.DataFrame(data_list[dep_count:], columns=['title', 'selftext'])

In [71]:
# Save the r/SuicideWatch posts to disk.
# NOTE(review): per the pandas docs, index_label=False only omits the header field
# for the index column; the row index itself is still written — confirm that is
# intended rather than index=False.
suicidewatch.to_csv('suicidewatch.csv', index_label = False)