In [1]:
# default_exp instagram

# Instagram Scraping

<br>

### Imports

In [2]:
#export
import json
import pandas as pd

import instaloader

import os
import time
import dotenv

<br>

### Loading Environment Variables

We need to read in a username and password for instagram, here we've stored them as environment variables

In [3]:
_ = dotenv.load_dotenv('../.env')

<br>

We'll then use these to initialise the instagram loader

In [4]:
#export
def initialise_loader():
    user = os.getenv('INSTA_USER')
    password = os.getenv('INSTA_PSWD')
    
    loader = instaloader.Instaloader()

    if (user is not None) and (password is not None):
        loader.login(user, password) 
    
    return loader

In [5]:
loader = initialise_loader()

loader

<instaloader.instaloader.Instaloader at 0x234151ad448>

<br>

### Loading Posts

We'll start by loading the posts

In [6]:
#export
def load_profile_posts(loader, profile='percythevizslaadventures'):    
    percy_profile = instaloader.Profile.from_username(loader.context, profile)
    posts = percy_profile.get_posts()
    
    return posts

<br>

We'll load the posts, then extract the first post which will be used to construct our content extractor functions

In [7]:
%%time

posts = load_profile_posts(loader)

for post in posts:
    pass

post

Wall time: 13.9 s


<Post B7G5-fiHpCL>

<br>

### Content Extractors

The first content we'll extract is the date of the post

In [8]:
#export
post_to_date = lambda post: pd.to_datetime(post.date).strftime('%Y-%m-%d %H:%M')

In [9]:
post_to_date(post)

<br>

Next we'll extract the caption

In [10]:
#export
def post_to_caption(post, remove_hashtags=True):
    caption = post.caption
    
    if caption is None:
        return ''

    if remove_hashtags == True:
        sep = ' #'
        hashtag_text = sep + sep.join(post.caption_hashtags)
        caption = caption.replace(hashtag_text, '')

    return caption

In [11]:
post_to_caption(post)

<br>

And finally we'll extract the media url

In [12]:
#export
def post_to_media_url(post):
    if post.is_video:
        url = post.video_url
    else:
        url = post.url

    return url

In [13]:
post_to_media_url(post)

<br>

### Full-Scrape Wrapper

Now we need to repeat this for all posts

In [18]:
#export
def extract_post_content(posts, latest_recorded_post_date=None):
    post_content = []

    for post in posts:
        post_date = post_to_date(post)
        
        if latest_recorded_post_date is not None:
            if pd.to_datetime(post_date) < latest_recorded_post_date:
                return post_content
        
        post_content += [{
            'date': post_date,
            'caption': post_to_caption(post),
            'media_url': post_to_media_url(post)
        }]
        
    return post_content

def retrieve_posts(loader, fp):
    df_posts = pd.read_csv(fp)
    latest_recorded_post_date = pd.to_datetime(df_posts['date'].max())
    
    posts = load_profile_posts(loader)
    
    if posts == []:
        return df_posts
    
    post_content = extract_post_content(posts, latest_recorded_post_date)

    df_posts = pd.DataFrame(post_content).append(df_posts)
    
    return df_posts

In [19]:
df_posts = retrieve_posts(loader, '../data/percy_posts.csv')

df_posts.head()

JSON Query to api/v1/media/2448978791175956406/info/: 429 Too Many Requests [retrying; skip with ^C]
Requests within last 10/11/20/22/30/60 minutes grouped by type:
                              other:    3    3    3    3    3    3
   472f257a40c653c64c666ce877d59d2b:    3    3    3    3    3    3
 *                           iphone:    1    1    1    1    1    1
Instagram responded with HTTP error "429 - Too Many Requests". Please
do not run multiple instances of Instaloader in parallel or within
short sequence. Also, do not use any Instagram App while Instaloader
is running.
The request will be retried in 666 seconds, at 02:36.
JSON Query to api/v1/media/2448978791175956406/info/: 429 Too Many Requests [retrying; skip with ^C]
Requests within last 10/11/20/22/30/60 minutes grouped by type:
                              other:    0    0    3    3    3    3
   472f257a40c653c64c666ce877d59d2b:    0    0    3    3    3    3
 *                           iphone:    1    1    2    2    2  

added


JSON Query to api/v1/media/2448978791175956406/info/: 429 Too Many Requests Unable to fetch high quality image version of <Post CH8hTw-gie2>.


Unnamed: 0,date,caption,media_url
0,2020-11-23 19:50,Lazy play with squeaky ball while chiilaxing.,https://scontent-lhr8-2.cdninstagram.com/v/t51...
0,2020-11-23 19:50,Lazy play with squeaky ball while chiilaxing.,https://scontent-lhr8-2.cdninstagram.com/v/t51...
1,2020-11-13 14:43,"Living my best life, playing with pals.\nTilly...",https://scontent-lht6-1.cdninstagram.com/v/t51...
2,2020-11-12 17:09,Trio of trouble😁 a brief rest after a good cha...,https://scontent-lht6-1.cdninstagram.com/v/t51...
3,2020-11-08 18:43,Chilly paddle today,https://scontent-lht6-1.cdninstagram.com/v/t51...


<br>

Finally we'll save the post records

In [20]:
df_posts.to_csv('../data/percy_posts.csv', index=False)

In [21]:
#hide
from nbdev.export import *
notebook2script()

Converted 01-instagram-scraping.ipynb.
Converted 02-whatsapp-integration.ipynb.
Converted 03-pipeline.ipynb.
