# Project 3.02: Web Scraping the Google Pixel Subreddit
---

In [1]:
# Import libraries
import requests
import time
import pandas as pd
import random

from tqdm.notebook import trange, tqdm

### Test pull and review of a Reddit post

In [2]:
# URL of the r/GooglePixel subreddit listing (JSON endpoint)
url = 'https://www.reddit.com/r/GooglePixel/.json'

In [3]:
# Reddit tends to reject requests that carry the default python-requests
# user agent, so send a custom 'User-agent' header instead.
header = {'User-agent': 'Pony Inc 1.0'}

# A timeout keeps the notebook from hanging forever if the server never answers.
res = requests.get(url, headers=header, timeout=30)

In [4]:
res.status_code

200

In [5]:
# JSON is a language-agnostic format for structuring data.
# Parse the response body into a Python dictionary.
reddit_dict1 = res.json()

In [6]:
# Review sorted keys
sorted(reddit_dict1.keys())

['data', 'kind']

In [7]:
# The 'kind' key carries little information (it only names the response type)
reddit_dict1['kind']

'Listing'

In [8]:
# Review sorted keys of data
sorted(reddit_dict1['data'].keys())

['after', 'before', 'children', 'dist', 'modhash']

In [9]:
# The 'children' key is the one of interest to us: it is where the posts are.
# Inspect the payload of the first post.
reddit_dict1['data']['children'][0]['data']

{'approved_at_utc': None,
 'subreddit': 'GooglePixel',
 'selftext': 'This is the weekly photo megathread. Photos captured with your Pixel (or other Google devices) posted outside of this thread are not allowed. Also, please mention the device you took the photo with. For more pictures, check out r/Pixelography.\n\n**\\#teampixel**\n\n*An archive of past photo megathreads can be found [here.](https://www.reddit.com/r/GooglePixel/wiki/photos)*  \n*To return to the Superthread, [click here](https://reddit.com/r/GooglePixel/about/sticky?num=2).*',
 'author_fullname': 't2_6l4z3',
 'saved': False,
 'mod_reason_title': None,
 'gilded': 0,
 'clicked': False,
 'title': 'Weekly #madebygoogle Photos Megathread - May 07 2020',
 'link_flair_richtext': [],
 'subreddit_name_prefixed': 'r/GooglePixel',
 'hidden': False,
 'pwls': 6,
 'link_flair_css_class': None,
 'downs': 0,
 'thumbnail_height': None,
 'hide_score': False,
 'name': 't3_gf88f8',
 'quarantine': False,
 'link_flair_text_color': 'dark',
 

In [10]:
# Grab the first post's payload once instead of re-indexing for every field
first_post = reddit_dict1['data']['children'][0]['data']

# Class label (target) first, then the title of the post
for field in ('subreddit', 'title'):
    print(first_post[field])

# Text of the post (shown as the cell's output)
first_post['selftext']

GooglePixel
Weekly #madebygoogle Photos Megathread - May 07 2020


'This is the weekly photo megathread. Photos captured with your Pixel (or other Google devices) posted outside of this thread are not allowed. Also, please mention the device you took the photo with. For more pictures, check out r/Pixelography.\n\n**\\#teampixel**\n\n*An archive of past photo megathreads can be found [here.](https://www.reddit.com/r/GooglePixel/wiki/photos)*  \n*To return to the Superthread, [click here](https://reddit.com/r/GooglePixel/about/sticky?num=2).*'

In [11]:
# Each child wraps the actual post under its 'data' key; unwrap them all
testposts = [child['data'] for child in reddit_dict1['data']['children']]

# One row per post, one column per post attribute
df_test = pd.DataFrame(testposts)
df_test

Unnamed: 0,approved_at_utc,subreddit,selftext,author_fullname,saved,mod_reason_title,gilded,clicked,title,link_flair_richtext,...,created_utc,num_crossposts,media,is_video,event_start,post_hint,preview,event_end,event_is_live,link_flair_template_id
0,,GooglePixel,This is the weekly photo megathread. Photos ca...,t2_6l4z3,False,,0,False,Weekly #madebygoogle Photos Megathread - May 0...,[],...,1588864000.0,0,,False,,,,,,
1,,GooglePixel,*If you were redirected here from a removed po...,t2_zmjf4,False,,0,False,"The May 2020 Superthread: Pixel 4a, Battery, ""...",[],...,1588345000.0,0,,False,1588345000.0,self,{'images': [{'source': {'url': 'https://extern...,1588360000.0,False,
2,,GooglePixel,It was supposed to take up to 10 days. But it ...,t2_10j65hn2,False,,0,False,"So I had some troubles with my phone, and sent...","[{'e': 'text', 't': 'Pixel 3a'}]",...,1589184000.0,0,,False,,,,,,bd247e74-7476-11e9-b802-0e2121dcd2a2
3,,GooglePixel,Yall weren't kidding about the battery. It's p...,t2_zfetx,False,,0,False,Had my pixel 4 for a few days already and holy...,"[{'e': 'text', 't': 'Pixel 4'}]",...,1589188000.0,0,,False,,,,,,8697ce76-f2dc-11e9-8412-0eae23347634
4,,GooglePixel,,t2_bkfyp,False,,0,False,Pixel 3a black screen after overnight charging,"[{'e': 'text', 't': 'Pixel 3a'}]",...,1589108000.0,0,{'reddit_video': {'fallback_url': 'https://v.r...,True,,hosted:video,{'images': [{'source': {'url': 'https://extern...,,,bd247e74-7476-11e9-b802-0e2121dcd2a2
5,,GooglePixel,Is anyone expecting the 4a to be announced tom...,t2_2rv4s7me,False,,0,False,Will the Pixel 4a be announced tomorrow?,[],...,1589187000.0,0,,False,,,,,,
6,,GooglePixel,Where can I get my hands on them in Canada. E...,t2_kzmev,False,,0,False,Pixel Buds 2 in Canada?,[],...,1589177000.0,0,,False,,,,,,
7,,GooglePixel,Phone is for my mom as a gift as her OG pixel ...,t2_7aopy,False,,0,False,Pixel 2 XL worth it in 2020?,"[{'e': 'text', 't': 'Pixel 2 XL'}]",...,1589170000.0,0,,False,,,,,,0a8dc2bc-3b5f-11e8-b18b-0e6314b679fc
8,,GooglePixel,"There's not better way to say it than that, my...",t2_534i1ize,False,,0,False,My Pixel Imprint just stopped reading altogeth...,"[{'e': 'text', 't': 'Pixel 3'}]",...,1589184000.0,0,,False,,,,,,baf9bb90-e2ff-11e8-9d1c-0ec050ca02ae
9,,GooglePixel,Vodafone Germany leaked two weeks ago that Pi...,t2_5ed2p65l,False,,0,False,Pixel 4a maybe delayed,[],...,1589197000.0,0,,False,,self,{'images': [{'source': {'url': 'https://extern...,,,


In [12]:
# text for each post
df_test['selftext']

0     This is the weekly photo megathread. Photos ca...
1     *If you were redirected here from a removed po...
2     It was supposed to take up to 10 days. But it ...
3     Yall weren't kidding about the battery. It's p...
4                                                      
5     Is anyone expecting the 4a to be announced tom...
6      Where can I get my hands on them in Canada. E...
7     Phone is for my mom as a gift as her OG pixel ...
8     There's not better way to say it than that, my...
9      Vodafone Germany leaked two weeks ago that Pi...
10    Just like the title says, my daughter has trie...
11    I've had my pixel 4xl for about 3 months now a...
12    Good Morning \n\nThis morning I attempted to u...
13    Some apps like Netflix and Hulu are opening ve...
14    1\`. I know it has that feature where you can ...
15    Hi guys I dropped my pixel 3a XL in water, abo...
16    Hello! I tripped and fell on a hike. As a resu...
17    I didnt really follow the pixel 4 after la

In [13]:
# Features (columns) available for each post; df_test already holds
# pd.DataFrame(testposts), so there is no need to build the frame again
df_test.columns

Index(['approved_at_utc', 'subreddit', 'selftext', 'author_fullname', 'saved',
       'mod_reason_title', 'gilded', 'clicked', 'title', 'link_flair_richtext',
       ...
       'created_utc', 'num_crossposts', 'media', 'is_video', 'event_start',
       'post_hint', 'preview', 'event_end', 'event_is_live',
       'link_flair_template_id'],
      dtype='object', length=109)

In [14]:
# Save the test pull to disk; df_test is the frame built from testposts
df_test.to_csv('testpull2.csv', index=False)

In [15]:
# Read the CSV back in to check what was written: print its shape, then
# preview the first rows
df_check = pd.read_csv('testpull2.csv')
print(df_check.shape)
df_check.head()

(27, 109)


Unnamed: 0,approved_at_utc,subreddit,selftext,author_fullname,saved,mod_reason_title,gilded,clicked,title,link_flair_richtext,...,created_utc,num_crossposts,media,is_video,event_start,post_hint,preview,event_end,event_is_live,link_flair_template_id
0,,GooglePixel,This is the weekly photo megathread. Photos ca...,t2_6l4z3,False,,0,False,Weekly #madebygoogle Photos Megathread - May 0...,[],...,1588864000.0,0,,False,,,,,,
1,,GooglePixel,*If you were redirected here from a removed po...,t2_zmjf4,False,,0,False,"The May 2020 Superthread: Pixel 4a, Battery, ""...",[],...,1588345000.0,0,,False,1588345000.0,self,{'images': [{'source': {'url': 'https://extern...,1588360000.0,False,
2,,GooglePixel,It was supposed to take up to 10 days. But it ...,t2_10j65hn2,False,,0,False,"So I had some troubles with my phone, and sent...","[{'e': 'text', 't': 'Pixel 3a'}]",...,1589184000.0,0,,False,,,,,,bd247e74-7476-11e9-b802-0e2121dcd2a2
3,,GooglePixel,Yall weren't kidding about the battery. It's p...,t2_zfetx,False,,0,False,Had my pixel 4 for a few days already and holy...,"[{'e': 'text', 't': 'Pixel 4'}]",...,1589188000.0,0,,False,,,,,,8697ce76-f2dc-11e9-8412-0eae23347634
4,,GooglePixel,,t2_bkfyp,False,,0,False,Pixel 3a black screen after overnight charging,"[{'e': 'text', 't': 'Pixel 3a'}]",...,1589108000.0,0,{'reddit_video': {'fallback_url': 'https://v.r...,True,,hosted:video,{'images': [{'source': {'url': 'https://extern...,,,bd247e74-7476-11e9-b802-0e2121dcd2a2


In [16]:
# 'after' contains the id of the last post of the current pull.
# Anything following the '?' in a URL is the query string (key=value pairs),
# which is where this id can be supplied on a subsequent request.
reddit_dict1['data']['after']

't3_gha96d'

### Actual Reddit Web Scrape

In [17]:
# Parameters for the 1_000-post pull: the subreddit's JSON listing URL and the
# custom User-agent header sent with each request
url = 'https://www.reddit.com/r/GooglePixel/.json'
header = {'User-agent': 'Pony Inc 1.0'}

In [18]:
# Get about 1_000 posts; Reddit returns approx. 25 posts per request, so 40
# requests are issued.
# `posts` accumulates the post payloads across all pulls.
# `after` is the pagination cursor: None for the first pull, afterwards the
# value of the 'after' field returned by the previous pull.
posts = []
after = None

# Feedback per pull: progress bar, requested URL, number of posts received and
# the pause taken before the next request.
for _ in trange(40, desc='pull'):
    # Only send the 'after' query parameter once a cursor is available.
    param = {} if after is None else {'after': after}

    # Print the URL, including its query string, that is about to be requested.
    print(url if after is None else f"{url}?after={after}")

    # The timeout keeps a stalled connection from hanging the loop forever.
    res = requests.get(url, params=param, headers=header, timeout=30)

    # Stop pulling as soon as a request does not come back with status 200.
    if res.status_code != 200:
        print(res.status_code)
        break

    reddit_gp = res.json()

    # Track and print the number of posts returned by this pull.
    current_posts = [p['data'] for p in reddit_gp['data']['children']]
    print("No. of posts pulled: " + str(len(current_posts)))

    # Extend the running list with the newly pulled posts.
    posts.extend(current_posts)

    # Checkpoint everything pulled so far, so an interrupted run keeps its data.
    pd.DataFrame(posts).to_csv('reddit_gp.csv', index=False)

    # Move the cursor to the last post of this pull. A missing cursor means
    # there is nothing further to page through; requesting again without
    # 'after' would start over at the first page and collect duplicates.
    after = reddit_gp['data']['after']
    if after is None:
        break

    # Generate a random sleep duration to look more 'natural'.
    sleep_duration = random.randint(1, 5)
    print(sleep_duration)
    time.sleep(sleep_duration)

HBox(children=(FloatProgress(value=0.0, description='pull', max=40.0, style=ProgressStyle(description_width='i…

https://www.reddit.com/r/GooglePixel/.json?afterNone
No. of posts pulled: 27
4
https://www.reddit.com/r/GooglePixel/.json?aftert3_gha96d
No. of posts pulled: 25
5
https://www.reddit.com/r/GooglePixel/.json?aftert3_gh6sp5
No. of posts pulled: 25
2
https://www.reddit.com/r/GooglePixel/.json?aftert3_gh9kwk
No. of posts pulled: 25
1
https://www.reddit.com/r/GooglePixel/.json?aftert3_ggizl8
No. of posts pulled: 25
3
https://www.reddit.com/r/GooglePixel/.json?aftert3_gge9cd
No. of posts pulled: 25
3
https://www.reddit.com/r/GooglePixel/.json?aftert3_gg0h1j
No. of posts pulled: 25
5
https://www.reddit.com/r/GooglePixel/.json?aftert3_gg2me3
No. of posts pulled: 25
2
https://www.reddit.com/r/GooglePixel/.json?aftert3_gg1sfk
No. of posts pulled: 25
4
https://www.reddit.com/r/GooglePixel/.json?aftert3_gfrrd4
No. of posts pulled: 25
3
https://www.reddit.com/r/GooglePixel/.json?aftert3_gfn5zd
No. of posts pulled: 25
5
https://www.reddit.com/r/GooglePixel/.json?aftert3_gfj041
No. of posts pulled: 25

In [19]:
len(posts)

980

In [20]:
# Read the saved CSV file back in to check what the scrape wrote
df_read = pd.read_csv('./reddit_gp.csv')

In [21]:
df_read.shape

(980, 113)

In [22]:
df_read.tail()

Unnamed: 0,approved_at_utc,subreddit,selftext,author_fullname,saved,mod_reason_title,gilded,clicked,title,link_flair_richtext,...,event_start,post_hint,preview,event_end,event_is_live,link_flair_template_id,media_metadata,poll_data,crosspost_parent_list,crosspost_parent
975,,GooglePixel,So my wireless buds I've been using finally ga...,t2_4nm3kucn,False,,0,False,Test drive Pixel Buds 2,[],...,,,,,,,,,,
976,,GooglePixel,I'm in love with the Pixel-phones so far. My P...,t2_30g2z7cr,False,,0,False,Anti-conspiracy comfort needed,[],...,,,,,,,,,,
977,,GooglePixel,I'm not even using the new pixel buds. They're...,t2_l7kh9,False,,0,False,Why is my Pixel 3 XL connecting to the New Pix...,"[{'e': 'text', 't': 'Pixel Buds'}]",...,,,,,,d2f582a6-e2ff-11e8-b5ec-0eb37d706444,,,,
978,,GooglePixel,Google support is honestly the worse. I hope s...,t2_15w1cw37,False,,0,False,Anyone pixel 4xl knows the correct Smallest Wi...,[],...,,,,,,,,,,
979,,GooglePixel,"So I bought pixel 4 after my phone died, and I...",t2_u0kwp,False,,0,False,Changed from pixel 2 to 4 and it feels really ...,[],...,,,,,,,,,,
