# 01 Get Data

## Imports & Functions

In [1]:
import requests
import time
import json

In [4]:
def get_posts(sub='all', num_pages=4, avoid_distinguished=True, attached=None,
              headers=None, timeout=30):
    """
    Returns a list of posts (listing children) from a subreddit.

    ===========================
    ======= Parameters ========
    ===========================

    sub = 'all' (default): type = string
        The subreddit you want to query.
        https://reddit.com/r/{sub}/
    -------------------------------------------------------------
    num_pages = 4 (default): type = int
        Maximum number of listing pages to read.
        The function sleeps one second between pages, so this is
        also roughly the number of seconds it takes to run.
        Fewer pages are read if the listing runs out first.
    -------------------------------------------------------------
    avoid_distinguished = True (default): type = bool
        Whether or not to skip stickied, archived,
        and distinguished posts
    -------------------------------------------------------------
    attached = None (default): type = List
        The list that you are appending new data onto
        (it is extended in place, even when it is empty).
        Default is to make a new list.
    -------------------------------------------------------------
    headers = None (default): type = dict
        HTTP headers sent with every request.
        Default is {'User-agent': 'bing1'}.
    -------------------------------------------------------------
    timeout = 30 (default): type = int or float
        Seconds to wait for each HTTP response before requests
        raises a timeout error instead of hanging forever.

    ===========================
    ========  Returns =========
    ===========================

    The list of post dictionaries (the same object as `attached`
    when one was given), or None if any request comes back with
    a status code other than 200.

    ===========================
    ========  Example =========
    ===========================

    the_posts = get_posts(sub='jokes',
                          num_pages=1,
                          avoid_distinguished=True)

    the_posts = get_posts(sub='nosleep',
                          num_pages=1,
                          avoid_distinguished=True,
                          attached=the_posts)

    Result: a list of ~25 posts from reddit.com/r/jokes and
            ~25 posts from reddit.com/r/nosleep
    """
    # `is not None` (rather than truthiness) so that an empty list handed in by
    # the caller is still the list that gets extended and returned.
    posts = attached if attached is not None else []
    # Build the default headers per call instead of sharing one mutable default dict.
    if headers is None:
        headers = {'User-agent': 'bing1'}

    counter = 0
    after = None
    while counter < num_pages:
        # The first request has no cursor; later ones continue after the last post seen.
        params = {} if after is None else {'after': after}
        res = requests.get(f'https://reddit.com/r/{sub}/.json',
                           params=params, headers=headers, timeout=timeout)
        if res.status_code != 200:
            # Not only unknown subs end up here: rate limiting and server errors
            # do too, so report the status code alongside the original message.
            print(f'invalid sub (HTTP {res.status_code})')
            return None

        listing = res.json()['data']
        page = listing.get('children') or []
        if avoid_distinguished:
            page = [
                child for child in page
                if not child['data']['stickied']
                and not child['data']['archived']
                and not child['data']['distinguished']
            ]
        posts.extend(page)

        after = listing.get('after')
        counter += 1
        print(counter)
        if after is None:
            # No further pages: requesting again without a cursor would fetch
            # the first page a second time and duplicate its posts.
            break
        time.sleep(1)  # pause between requests
    return posts

## Scrape using Max's Function

In [5]:
# Gather 40 listing pages from each subreddit into one combined result.
# Every call is handed the previous result through `attached`; the first
# call receives None, which is get_posts' own default.
data = None
for subreddit in ('confessions', 'jokes'):
    data = get_posts(subreddit, 40, attached=data)

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40


## Dump the scraped posts as JSON into the Data folder

In [6]:
# Serialise everything held in `data` as a single JSON document on disk.
with open('../Data/raw.json', 'w+') as raw_file:
    json.dump(data, fp=raw_file)