# Project 3: Web APIs & NLP

## Part 1: Web Scrape Reddit

### Import Libraries

In [61]:
#import libraries
import requests
import pandas as pd
import numpy as np

### Pushshift API URL

In [62]:
#API to use
# Pushshift endpoint for searching Reddit submissions.
url = 'https://api.pushshift.io/reddit/search/submission'

### Create functions to fetch submissions from subreddits

In [63]:
def get_base_df(url, subreddit, timeout=30):
    """Fetch one page of submissions for a subreddit and return it as a DataFrame.

    Parameters
    ----------
    url : str
        Submission-search endpoint to query.
    subreddit : str
        Value sent as the ``subreddit`` query parameter.
    timeout : float, optional
        Seconds to wait on the server before ``requests`` raises a timeout
        exception. Defaults to 30.

    Returns
    -------
    pandas.DataFrame or str
        On HTTP 200, a DataFrame built from the ``data`` entry of the JSON
        response. On any other status, the string ``'Error: <status code>'``.
    """
    params = {
        'subreddit': subreddit,
        'size': 100,  # 'size' query parameter (presumably rows per response)
    }

    # Without a timeout, requests waits indefinitely on a stalled connection,
    # which would hang the notebook kernel.
    res = requests.get(url, params=params, timeout=timeout)

    if res.status_code != 200:
        return f'Error: {res.status_code}'

    return pd.DataFrame(res.json()['data'])


# update params
def update_params(base_df, subreddit):
    """Build the query parameters for the next request.

    The result carries the same ``subreddit`` and ``size`` as the first
    request, plus a ``before`` cutoff equal to the ``created_utc`` value of
    the last row (by position) of ``base_df``.
    """
    last_row = base_df.iloc[-1]
    next_params = dict(subreddit=subreddit, size=100)
    next_params['before'] = last_row['created_utc']
    return next_params


#pull posts

def pull_posts(url, params, timeout=30):
    """Send one GET request and return the ``data`` entry of its JSON body.

    Parameters
    ----------
    url : str
        Endpoint to query.
    params : dict
        Query parameters passed to ``requests.get``.
    timeout : float, optional
        Seconds to wait on the server before ``requests`` raises a timeout
        exception. Defaults to 30.

    Returns
    -------
    object or str
        On HTTP 200, whatever the response JSON holds under ``'data'``.
        On any other status, the string ``'Error: <status code>'``.
    """
    # Without a timeout, requests waits indefinitely on a stalled connection,
    # which would hang the notebook kernel partway through a paging loop.
    res = requests.get(url, params=params, timeout=timeout)

    if res.status_code != 200:
        return f'Error: {res.status_code}'

    return res.json()['data']

#convert new posts to df

def posts_to_df(posts):
    """Wrap ``posts`` in a new pandas DataFrame and return it."""
    frame = pd.DataFrame(data=posts)
    return frame

# add to base_df
def update_base_df(base_df, posts):
    """Return ``base_df`` and ``posts`` concatenated with ``pd.concat``.

    ``posts`` is handed straight to ``pd.concat`` alongside ``base_df``, so
    it should already be a DataFrame rather than a raw list of records.
    Neither input is modified; the combined frame is a new object.
    """
    combined = pd.concat(objs=[base_df, posts])
    return combined

#create function to update base_df with 100 posts
def total_df(base_df, subreddit, url):
    """Fetch one more page of submissions and add it to ``base_df``.

    Steps: build the next request's parameters from the last row of
    ``base_df`` (``update_params``), request them (``pull_posts``), convert
    the result to a DataFrame (``posts_to_df``) and concatenate it onto
    ``base_df`` (``update_base_df``).

    Parameters
    ----------
    base_df : pandas.DataFrame
        Rows collected so far; must have a ``created_utc`` column and at
        least one row.
    subreddit : str
        Subreddit name forwarded to ``update_params``.
    url : str
        Endpoint forwarded to ``pull_posts``.

    Returns
    -------
    pandas.DataFrame
        A new frame holding the rows of ``base_df`` followed by the newly
        fetched rows.

    Notes
    -----
    ``pull_posts`` returns an error string on a non-200 response; that case
    is not handled here, and building a DataFrame from the string is
    expected to raise.
    """
    new_params = update_params(base_df, subreddit)
    new_posts = pull_posts(url, new_params)
    new_df = posts_to_df(new_posts)

    # DataFrame.append was deprecated in pandas 1.4 and removed in 2.0.
    # The update_base_df helper does the same concatenation with pd.concat.
    return update_base_df(base_df, new_df)

### Web Scrape r/iPhone

In [64]:
#Set up base df
# First page of r/iphone submissions. Uses the `url` constant defined in the
# "Pushshift API URL" cell instead of re-typing the endpoint literal.
base_df_iphone = get_base_df(url, 'iphone')
base_df_iphone.shape

(100, 78)

In [65]:
# Preview the subreddit, text fields and timestamp of the first rows.
base_df_iphone[['subreddit', 'selftext', 'title', 'created_utc']].head()

Unnamed: 0,subreddit,selftext,title,created_utc
0,iphone,&amp;#x200B;\n\n[I noticed each apple blue pro...,Apple's blues,1659534803
1,iphone,The top left kind of corner on my iPhone 12 is...,Blurry corner,1659534340
2,iphone,iPhone 13 pro max now or is it better to wait ...,Iphone 13 pro max or wait for iPhone 14,1659534112
3,iphone,There are some weird things going on with my m...,Switched from a Pixel 6 to iPhone 13. Now I ca...,1659533915
4,iphone,"Hey there, I’ve had iphone 4, iphone 8, and cu...",Is it worth buying iPhone pro max now or is it...,1659533779


In [66]:
# Make 99 further requests. Each total_df call asks for rows before the
# created_utc of the last row collected so far and adds them to the frame.
# Uses the `url` constant defined in the "Pushshift API URL" cell instead of
# re-typing the endpoint literal.
# NOTE: this cell is not idempotent -- re-running it fetches and adds another
# 99 pages on top of whatever base_df_iphone already holds.
for i in range(99):
    base_df_iphone = total_df(base_df_iphone, 'iphone', url)

    # Progress report on every 10th iteration.
    if i % 10 == 0:
        print(base_df_iphone.shape)

base_df_iphone.shape

(200, 79)
(1198, 80)
(2196, 81)
(3196, 81)
(4195, 81)
(5195, 81)
(6193, 81)
(7171, 81)
(8171, 81)
(9171, 81)


(9971, 81)

In [68]:
# Number of distinct selftext values (missing values are included in the
# count). This counts distinct values, not de-duplicated rows.
base_df_iphone["selftext"].nunique(dropna=False)

6020

In [72]:
# Save the collected r/iphone rows to CSV without the index column.
# NOTE(review): assumes the datasets/ directory already exists -- confirm.
base_df_iphone.to_csv('datasets/iphone.csv', index=False)

### Web Scrape r/GooglePixel

In [69]:
#Set up base df
# First page of r/googlepixel submissions. Uses the `url` constant defined in
# the "Pushshift API URL" cell instead of re-typing the endpoint literal.
base_df_pixel = get_base_df(url, 'googlepixel')
base_df_pixel.shape

(100, 76)

In [70]:
# Preview the subreddit, text fields and timestamp of the first rows.
base_df_pixel[['subreddit', 'selftext', 'title', 'created_utc']].head()

Unnamed: 0,subreddit,selftext,title,created_utc
0,GooglePixel,I change phones every 2-3 years and the Pixel ...,Will the Pixel 5 be able to safely last anothe...,1659535413
1,GooglePixel,My 4a's touch screen was partially not working...,Sent my pixel4a to B2X (out of warranty) and w...,1659535137
2,GooglePixel,That's it... I'm so burned out to put up with ...,Sick of the bugs in the Pixel 6,1659534933
3,GooglePixel,,Buy Facebook Ads Accounts - 100% document Veri...,1659534253
4,GooglePixel,"Hey everyone, is anyone else having a weird is...",Clock suddenly dark?,1659533888


In [71]:
# Make 99 further requests. Each total_df call asks for rows before the
# created_utc of the last row collected so far and adds them to the frame.
# Uses the `url` constant defined in the "Pushshift API URL" cell instead of
# re-typing the endpoint literal.
# NOTE: this cell is not idempotent -- re-running it fetches and adds another
# 99 pages on top of whatever base_df_pixel already holds.
for i in range(99):
    base_df_pixel = total_df(base_df_pixel, 'googlepixel', url)

    # Progress report on every 10th iteration.
    if i % 10 == 0:
        print(base_df_pixel.shape)

base_df_pixel.shape

(199, 76)
(1198, 78)
(2198, 78)
(3197, 79)
(4197, 79)
(5197, 79)
(6196, 79)
(7195, 79)
(8195, 82)
(9193, 82)


(9990, 82)

In [73]:
# Number of distinct selftext values (missing values are included in the
# count). This counts distinct values, not de-duplicated rows.
base_df_pixel["selftext"].nunique(dropna=False)

5892

In [74]:
# Save the collected r/googlepixel rows to CSV without the index column.
# NOTE(review): assumes the datasets/ directory already exists -- confirm.
base_df_pixel.to_csv('datasets/pixel.csv', index=False)