# Predicting Subreddits for Posts from 'The Onion' and 'News' Subreddits

### Jupyter Notebook 1 - Scraping posts from both subreddits using the Pushshift API

In [2]:
import requests
import pandas as pd
import time

#use r/news and r/theonion
#followed the primer here https://www.youtube.com/watch?v=AcrjEWsMi_E&feature=youtu.be but turned it into a function

In [3]:
# Pushshift submission-search endpoint.
# NOTE(review): pull_posts assigns its own local copy of this same URL, so this
# module-level name is not read by any cell shown in this notebook.
url = 'https://api.pushshift.io/reddit/search/submission'

### Define a function to pull post information from reddit

In [10]:
def pull_posts(subreddit: str, length: int) -> list:
    """Pull batches of submissions for one subreddit from the Pushshift API.

    Posts are requested one page at a time. After each full page, the
    ``created_utc`` of the last post on that page is sent as the ``before``
    parameter of the next request, so each page is older than the previous one.

    Parameters
    ----------
    subreddit : str
        Name of the subreddit to pull from (e.g. ``'theonion'``).
    length : int
        Maximum number of *batches* (pages of up to 100 posts) to collect.
        Note that this counts pages, not individual posts.

    Returns
    -------
    list
        A list of batches; every batch is the non-empty list of post
        dictionaries returned by one request. Fewer than ``length`` batches
        are returned when the API runs out of posts.

    Raises
    ------
    requests.HTTPError
        If the API answers with a 4xx/5xx status code.
    requests.Timeout
        If a single request does not complete within the timeout.
    """
    page_size = 100  # number of posts requested per call
    endpoint = 'https://api.pushshift.io/reddit/search/submission'

    post_info = []  # one entry per page of scraped posts
    utc_date = None  # None on the first request means "start from the newest post"

    while len(post_info) < length:
        params = {
            'size': page_size,
            'subreddit': subreddit,
            'before': utc_date
        }
        # A timeout keeps a stalled connection from hanging the notebook forever.
        res = requests.get(endpoint, params=params, timeout=30)
        # Fail loudly on an HTTP error instead of on an opaque JSON/KeyError later.
        res.raise_for_status()
        posts = res.json()['data']

        if not posts:
            break  # nothing left to pull; do not store an empty batch

        post_info.append(posts)

        if len(posts) < page_size:
            break  # a short page is treated as the last one available

        # Date of the last post on this page, so the next page only holds older posts.
        utc_date = posts[-1].get('created_utc')
        if utc_date is None:
            # Without a cursor the next request would start again from the newest post.
            break

        if len(post_info) < length:
            time.sleep(3)  # wait 3 seconds between requests to go easy on the API
    return post_info
#While loop idea thanks to Chris

### Add post info from The Onion to my total list

In [11]:
# Request up to 100 batches from r/theonion; each batch is one API page of at
# most 100 posts (the second argument counts batches, not individual posts).
onion_posts = pull_posts('theonion', 100)

In [12]:
# Flatten the list of r/theonion batches into a single list holding one
# dictionary per post.
total_list = [post for batch in onion_posts for post in batch]

In [13]:
# Sanity check: number of individual r/theonion posts collected so far.
len(total_list)

7178

### Do the same steps for the news subreddit

In [15]:
# Same pull as for r/theonion, now for r/news: up to 100 batches of at most
# 100 posts each.
news_posts = pull_posts('news', 100)

In [16]:
# Rebuild the combined list from both pulls (r/theonion first, then r/news)
# instead of appending in place, so re-running this cell cannot add the
# r/news posts a second time.
total_list = [
    post
    for batches in (onion_posts, news_posts)
    for batch in batches
    for post in batch
]

In [17]:
# Combined number of posts now that the r/news posts are included.
len(total_list)

17178

### Make the total list a data frame and save it

In [18]:
# Build one DataFrame from the list of post dictionaries (one row per post).
reddit_posts = pd.DataFrame(total_list)

In [19]:
# Preview the first rows of the combined DataFrame.
reddit_posts.head()

Unnamed: 0,all_awardings,allow_live_comments,author,author_flair_css_class,author_flair_richtext,author_flair_text,author_flair_type,author_fullname,author_patreon_flair,author_premium,...,author_flair_background_color,author_flair_text_color,steward_reports,removed_by,updated_utc,og_description,og_title,gilded,rte_mode,link_flair_text
0,[],False,ManofTheNightsWatch,,[],,text,t2_dgjcs,False,False,...,,,,,,,,,,
1,[],False,dwaxe,,[],,text,t2_3jamc,False,True,...,,,,,,,,,,
2,[],True,Sanlear,,[],,text,t2_d0xcf,False,True,...,,,,,,,,,,
3,[],False,aresef,,[],,text,t2_5mtwj,False,True,...,,,,,,,,,,
4,[],False,aresef,,[],,text,t2_5mtwj,False,True,...,,,,,,,,,,


In [20]:
# (number of rows, number of columns)
reddit_posts.shape

(17178, 80)

In [21]:
# Count the missing values in each column.
reddit_posts.isnull().sum()

all_awardings              3969
allow_live_comments        4598
author                        0
author_flair_css_class    17177
author_flair_richtext       116
                          ...  
og_description            17147
og_title                  17147
gilded                    16372
rte_mode                  17156
link_flair_text           16954
Length: 80, dtype: int64

In [22]:
# Save the combined DataFrame as a CSV file in the data folder.
# NOTE(review): no index argument is passed, so pandas' default applies and the
# row index is presumably written as an extra unnamed first column — confirm how
# the next notebook reads this file.
# NOTE(review): assumes the ./data directory already exists — TODO confirm.
reddit_posts.to_csv('./data/reddit_posts.csv')

I scraped posts from 'The Onion' and 'News' subreddits using the Pushshift API. I compiled all the gathered information into one DataFrame and saved it.