# Instagram Web Crawler
### Import libraries and login to Instagram
`credentials.py` defines two string variables, `username` and `password`, which hold the Instagram account's username and password respectively.

In [None]:
from instagramapi import InstagramAPI
import numpy as np
import pandas as pd
import json
import time
import datetime
import sys
from IPython.display import clear_output
import credentials
# Build an API client from the stored credentials.
# NOTE: this rebinds the name `InstagramAPI` from the imported module to the
# client instance; every later cell calls its methods through this name.
InstagramAPI = InstagramAPI.InstagramAPI(credentials.username,credentials.password)
# Log in; the trailing semicolon presumably just suppresses the cell's output echo.
InstagramAPI.login();

### Get full name and respective user Id
`InstagramAPI.timelineFeed()` fetches the logged-in account's timeline and returns the latest posts.
The `user` key of every post stores the full name as `full_name`, the user ID as `pk`, and much more information.

In [None]:
# Collect the full name and user ID of every account whose post appears in the
# logged-in account's timeline feed.
InstagramAPI.timelineFeed()
result = InstagramAPI.LastJson

timeline_names = []
timeline_user_ids = []

# `.get` keeps the cell usable when the response carries no 'items' key;
# the printed arrays are then simply empty.
for item in result.get('items', []):
    user_data = item.get('user')
    # Skip feed entries without a usable author. A missing `pk` used to be
    # stored as the string 'nan', which cannot be converted back to an integer
    # ID when the user is crawled later.
    if not user_data or user_data.get('pk') is None:
        continue
    timeline_names.append(user_data.get('full_name', np.nan))
    # IDs are kept as strings, as before.
    timeline_user_ids.append(str(user_data['pk']))

# Build each array once instead of re-allocating it on every iteration.
new_users_name = np.array(timeline_names)
new_users_user_id = np.array(timeline_user_ids)

print(new_users_name)
print(new_users_user_id)

### Function for getting number of tags in post
Returns the number of user tags in the `post` passed as argument

In [None]:
def user_tags(post):
    """Return how many users are tagged in `post`.

    The count is the length of post['usertags']['in']; when either key is
    absent the post is treated as having no tags and 0 is returned.
    """
    tag_info = post['usertags'] if 'usertags' in post else {}
    return len(tag_info['in']) if 'in' in tag_info else 0

### Function for checking if location was added on post

In [None]:
def location_present(post):
    """Return True when `post` has a 'location' key, otherwise False."""
    return 'location' in post

### Function for getting user info from user's profile
`InstagramAPI.getUsernameInfo(userId)` returns user info

If the response from `getUsernameInfo()` has no `user` key, the lookup failed, so the function returns 0 to signal that this user ID should be skipped

In [None]:
def get_user_info(userid):
    """Fetch profile-level attributes for one Instagram user.

    userid: the user's numeric ID; may be an int, a float or a numeric string
        (IDs stored in NumPy arrays arrive as floats or strings).

    Returns a dict with the keys 'is_Private', 'Followers', 'Following',
    'URL' and 'Verified' (np.nan for any attribute the profile lacks), or 0
    when the user should be skipped: the ID is not a finite number, or the
    API response contains no 'user' entry.
    """
    # Validate before sleeping or calling the API: NaN, None or non-numeric
    # IDs used to raise here instead of taking the documented "skip" path.
    try:
        numeric_id = int(float(userid))
    except (TypeError, ValueError, OverflowError):
        print('Skipped')
        return 0

    # Throttle requests so the servers are not flooded.
    time.sleep(2)
    InstagramAPI.getUsernameInfo(numeric_id)

    user_info = InstagramAPI.LastJson.get('user')
    if user_info is None:
        print('Skipped')
        return 0

    # Treat an empty or null external URL the same as a missing one.
    url = user_info.get('external_url', np.nan)
    if url is None or url == "":
        url = np.nan

    return {
        'is_Private': user_info.get('is_private', np.nan),
        'Followers': user_info.get('follower_count', np.nan),
        'Following': user_info.get('following_count', np.nan),
        'URL': url,
        'Verified': user_info.get('is_verified', np.nan),
    }

### Function for getting information about post
Returns `dict object` that stores information about post

If we are not able to extract user info from his/her profile, `get_user_info` returns 0 and `get_data` skips that user, so `get_post_info` is never called for that user's posts

In [None]:
def get_post_info(post, user_data):
    """Flatten one post and its author's profile data into a single record.

    post: media item dict; must contain 'taken_at' and 'user'.
    user_data: dict produced by get_user_info for the post's author.

    Returns a dict with one entry per column of the training data set.
    """
    # Naive local-time datetime of the moment the post was published.
    posted_at = datetime.datetime.fromtimestamp(int(post['taken_at']))
    author = post['user']

    record = {
        # Identification
        'media_id': post.get('id', np.nan),
        'username': author.get('username', np.nan),
        'user_id': author.get('pk', np.nan),
        # Publication time ('Date' holds the day of the month)
        'Hour': posted_at.hour,
        'Date': posted_at.day,
        'Month': posted_at.month,
        'Year': posted_at.year,
        # Post attributes
        'Number_of_Tags': user_tags(post),
        'Location_Available': location_present(post),
        'Media_Type': post.get('media_type', np.nan),
        # Author attributes copied from the profile lookup
        'is_Private': user_data['is_Private'],
        'Followers': user_data['Followers'],
        'Following': user_data['Following'],
        'External Url on Bio': user_data['URL'],
        'Verified': user_data['Verified'],
        # Target value
        'Likes': post.get('like_count', np.nan),
    }
    return record

### Function for getting information about every post of every user Id
At most about 50 posts are examined for each user ID, and only posts published between 3 and 45 days ago are kept, so that the follower and following counts are still roughly what they were when the post was made

In [None]:
def get_data(new_users_user_id, max_posts=50, min_age_days=3, max_age_days=45):
    """Collect post records for every user ID in `new_users_user_id`.

    new_users_user_id: iterable of user IDs (ints, floats or numeric strings).
    max_posts: stop requesting further feed pages for a user once at least
        this many posts have been seen (a whole page is always processed).
    min_age_days, max_age_days: only posts whose age in days lies within this
        inclusive range are kept.

    Returns a NumPy array holding one dict per kept post, as produced by
    get_post_info. Users for which get_user_info returns 0 are skipped.
    """
    seconds_per_day = 60 * 60 * 24

    # Collected in a list and converted once at the end; appending to a NumPy
    # array re-allocates the whole array for every post.
    records = []

    for user_number, user_id in enumerate(new_users_user_id, start=1):

        # Clear the current output and print the new user number
        clear_output(wait=True)
        print('User Number', user_number, sep=': ')

        # Without profile data the posts cannot be completed, so skip the user
        user_data = get_user_info(user_id)
        if user_data == 0:
            continue

        numeric_id = int(float(user_id))
        posts_seen = 0
        max_id = ''
        has_more_posts = True

        while has_more_posts and posts_seen < max_posts:

            # Get the next page of the user's feed
            InstagramAPI.getUserFeed(usernameId=numeric_id, maxid=max_id)
            user_timeline = InstagramAPI.LastJson

            # A response without 'more_available' is treated as "no posts"
            if 'more_available' not in user_timeline:
                break

            posts = user_timeline.get('items') or []
            max_id = user_timeline.get('next_max_id', '')

            # Continue only when the API reports more posts AND supplied both
            # posts and a cursor. Otherwise the same page would be requested
            # again, duplicating records or never terminating.
            has_more_posts = (
                user_timeline['more_available'] is True
                and bool(posts)
                and bool(max_id)
            )

            posts_seen += len(posts)

            # Keep only posts whose age falls inside the requested window
            now = time.time()
            for post in posts:
                age_days = (now - int(post['taken_at'])) / seconds_per_day
                if min_age_days <= age_days <= max_age_days:
                    records.append(get_post_info(post, user_data))

            # Slows the script down to avoid flooding the servers
            time.sleep(2)

    return np.array(records)

### Run the following cell to initialise an empty userID array

In [None]:
# Accumulator for the user IDs gathered by the later cells (name search and
# following list); re-running this cell resets it to empty.
userID = np.array([])

### Function to get user Id from names

### The second part appends the IDs that were found to `userID`

`names` is a list of all the names we want to search for explicitly

In [None]:
def search_user_id(names):
    """Look up the user ID of each name in `names`.

    names: iterable of search strings.

    For every name the first entry of the search response's 'users' list is
    used. Names whose search returns no users, or whose first result has no
    'pk', are reported and skipped instead of raising or adding NaN to the
    result.

    Returns a NumPy array of the IDs found (empty when nothing was found).
    """
    found = []

    for name in names:
        InstagramAPI.searchUsers(name)
        matches = InstagramAPI.LastJson.get('users') or []
        pk = matches[0].get('pk') if matches else None
        if pk is None:
            print('No user found for', name)
            continue
        found.append(pk)

    # Appending to an empty array keeps the result dtype the same as when the
    # IDs were appended one at a time.
    return np.append(np.array([]), found)

# Names to look up explicitly; left empty here, so no search request is made.
names = []
# Add the IDs found for `names` to the shared `userID` array.
userID = np.append(userID,search_user_id(names))

### Run the following cell to get user ID of all people you are following on Instagram

In [None]:
# Fetch the accounts the logged-in user follows and add all of their IDs to
# `userID` in one append.
InstagramAPI.getSelfUsersFollowing()
users = InstagramAPI.LastJson['users']

userID = np.append(userID, [user['pk'] for user in users])

### Run the following cell to get an array of post information

In [None]:
# Crawl posts for the timeline users followed by the IDs collected in `userID`.
# NOTE(review): the two ID arrays are concatenated without de-duplication, so an
# account present in both is crawled twice — confirm whether that is intended.
data_arr = get_data(np.append(new_users_user_id,userID))

`post_df` holds the previously saved data, loaded from `train_instagram_data.csv`

`new_df` is the new DataFrame built from the collected post information

Both DataFrames are combined and saved back to `train_instagram_data.csv`

In [None]:
# Load the previously collected posts; the first CSV column is the saved index.
post_df = pd.read_csv('train_instagram_data.csv',index_col=0)

# One row per collected post, with the columns in the data set's fixed order.
new_df = pd.DataFrame.from_records(data_arr,columns = [
        'media_id',
        'username',
        'user_id',
        'Hour',
        'Date',
        'Month',
        'Year',
        'Number_of_Tags',
        'Location_Available',
        'Media_Type',
        'is_Private',
        'Followers',
        'Following',
        'External Url on Bio',
        'Verified',
        'Likes'
])

# DataFrame.append was deprecated in pandas 1.4 and removed in pandas 2.0;
# pd.concat performs the same row-wise combination on old and new versions.
post_df = pd.concat([post_df, new_df], ignore_index=True)
post_df.to_csv(path_or_buf='train_instagram_data.csv')