In [2]:
import pandas as pd
import numpy as np
import requests
from keys import *
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
# old name: InstaAPI.ipynb

# 3.1) Get account info
## Get infos from all accounts, including faulty entries
- First, get account info with the 'info' endpoint of instagram-scraper-api2.p.rapidapi.com
- The resulting variable is
  'infos' with the columns ['name','acc_name','id', 'follower_cnt', 'following_cnt',
                             'post_cnt', 'bio', 'category']
- Note, that there are still some double names from the accounts_final variable, where
  it's not clear to this point which account is the official

In [4]:
# loading the result from the previous script 'GetAccounts.ipynb'
accounts_final = pd.read_csv('accounts_final.csv')
# delete the old index 
accounts_final = accounts_final.drop('Unnamed: 0', axis = 1)
# accounts_final

In [5]:
# choose some accounts for the purpose of explaining the code
accounts_final = accounts_final[4:10]
accounts_final

Unnamed: 0,name,acc_name,id
4,Alexander Engelhard,alexander_engelhard_csu,48111005815
5,Alexander Engelhard,alex.engelhardt,3679960301
6,Alexander Graf Lambsdorff,alexandergraflambsdorff,5760876068
7,Alexander Hoffmann,alexander.hoffmannmdb,5708286586
8,Alexander Hoffmann,lxhoffmann,1558234338
9,Ana-Maria TrƒÉsnea,amtrasnea,26039470493


In [6]:
# prepare the API call
headers = {
    'x-rapidapi-key': rapid_API_insta_key,
	'x-rapidapi-host': 'instagram-scraper-api2.p.rapidapi.com'
}
url = 'https://instagram-scraper-api2.p.rapidapi.com/v1/info'
# create an empty data frame
infos = pd.DataFrame(columns=['name','acc_name','id', 'follower_cnt', 'following_cnt',
                             'post_cnt', 'bio', 'category'])
# empty list for saving jsons
info_jsons =[]

In [7]:
# loop accounts
for j, row in accounts_final.iterrows():
    print(row['name'])
    # get the account name
    acc_name = row['acc_name']
    # empty dictionary for saving data temporarily
    info = {}
    # querystring with specified account name
    querystring = {'username_or_id_or_url':acc_name}
    
    # API call, get json
    response = requests.get(url, headers=headers, params = querystring)
    info_json = response.json()
    # save jsons to avoid double API calls in case of errors
    info_jsons.append(info_json)

    # ensure successful API call 
    if (response.status_code == 200) & (list(info_json.keys()) == ['data']):

        # save data from the json file
        info = {
        # name
        'name': row['name'],
        # account name
        'acc_name':row['acc_name'],
        # account id
        'id': row['id'],
        # number of followers
        'follower_cnt': info_json['data']['follower_count'],
        # number of persons followed
        'following_cnt': info_json['data']['following_count'],
        # number of posts
        'post_cnt': info_json['data']['media_count'],
        # Instagram bio
        'bio': info_json['data']['biography'],
        # category, e.g 'Politician' or 'Actor'
        'category': info_json['data']['category']
        }
    
        # Convert info dictionary to DataFrame and concatenate to final df infos
        info_df = pd.DataFrame([info])  # Create a DataFrame from the post dictionary
        infos = pd.concat([infos, info_df], ignore_index=True)  
        
    else:
        print('Request failed with code: ', response.status_code, '.', info_json.get('message'))

infos

Alexander Engelhard
Alexander Engelhard
Alexander Graf Lambsdorff
Alexander Hoffmann
Alexander Hoffmann
Ana-Maria TrƒÉsnea


Unnamed: 0,name,acc_name,id,follower_cnt,following_cnt,post_cnt,bio,category
0,Alexander Engelhard,alexander_engelhard_csu,48111005815,1372,101,403,üíôBundestagsabgeordneter f√ºr den Wahlkreis Neu-...,Politician
1,Alexander Engelhard,alex.engelhardt,3679960301,1725,40,121,President @gammaworldwide \nPresident VON/ GAM...,Public figure
2,Alexander Graf Lambsdorff,alexandergraflambsdorff,5760876068,17812,448,415,Botschafter der Bundesrepublik Deutschland in ...,Consulate & Embassy
3,Alexander Hoffmann,alexander.hoffmannmdb,5708286586,2700,1083,1014,üá©üá™Mitglied des Deutschen Bundestages \nüîπParlam...,Politician
4,Alexander Hoffmann,lxhoffmann,1558234338,3859,2064,69,‚Å†Actor | Hybrid Athlete | Natty | CPT üá©...,Artist
5,Ana-Maria TrƒÉsnea,amtrasnea,26039470493,3855,2292,578,üè´ MdB und Staatssekret√§rin a.D.\nüåπ Co-Vorsitze...,Political Candidate


In [8]:
# save info
# infos.to_csv('Ergebnisse/info1.csv')

In [9]:
# load info
# infos = pd.read_csv('Ergebnisse/info1.csv')
# infos = infos.drop('Unnamed: 0', axis = 1)
# infos

## Filter account info due to bio and category
To get distinguish double account names from the offical accounts, 
- I create filters to identify accounts with political expressions:
      - bio includes buzz words like 'spd','abgeordnete','demokratie'
      - account name contains buzz words like 'mdb'
      - category can be 'Politician'
- I use filters to exclude accounts with non-political expressions:
      - categories like 'Artist','Health/beauty'
      - buzz words in the bio

In [11]:
# expressions to seach in bio for
expression = 'spd|cdu|csu|linke|afd|fdp|bsw|ndr|parlament|mdb|abgeordnete'
expression2 = 'vorsitzpolitik|bundes|wahlkreis|demokratie|minister|republik'
# categories to exclude
cat_list =['Artist', 'Health/beauty', 'Athlete', 'Digital creator','Real Estate Investment Firm',
       'Nutritionist', 'Fashion Model', 'Health & wellness website',
       'Actor', 'Medical & health', 'Photographer', 'Just for fun']
# expression to exclude
expression_ex = 'sexual|papaya'

# create mask with expressions to include
mask = (
    (infos['bio'].str.casefold().str.strip().str.contains(expression + expression2, regex=True)) | # expressions = True
    (infos['acc_name'].str.casefold().str.strip().str.contains('mdb|mdl')) |                       # mdb/mdl in acc_name = True
    (infos['category'] == 'Politician') |                                                          # category politician = True
    (infos['bio'] == '')                                                                           # include empty bios
)
# create mask with expressions to exclude
mask2 =(
    (infos['category'].isin(cat_list)) |                                                          # these categories = False
    (infos['bio'].str.casefold().str.strip().str.contains(expression_ex, regex=True, na=False))  # exclude these = False
)

# apply mask to infos
infos_final = infos.loc[(mask & ~mask2),:]


In [12]:
# save infos_final
# infos_final.to_csv('Ergebnisse/infos_final.csv')

# load infos final
#infos_final = pd.read_csv('Ergebnisse/infos_final.csv')
#infos_final = infos_final.drop('Unnamed: 0',axis = 1)

In [13]:
infos_final.head()

Unnamed: 0,name,acc_name,id,follower_cnt,following_cnt,post_cnt,bio,category
0,Alexander Engelhard,alexander_engelhard_csu,48111005815,1372,101,403,üíôBundestagsabgeordneter f√ºr den Wahlkreis Neu-...,Politician
2,Alexander Graf Lambsdorff,alexandergraflambsdorff,5760876068,17812,448,415,Botschafter der Bundesrepublik Deutschland in ...,Consulate & Embassy
3,Alexander Hoffmann,alexander.hoffmannmdb,5708286586,2700,1083,1014,üá©üá™Mitglied des Deutschen Bundestages \nüîπParlam...,Politician
5,Ana-Maria TrƒÉsnea,amtrasnea,26039470493,3855,2292,578,üè´ MdB und Staatssekret√§rin a.D.\nüåπ Co-Vorsitze...,Political Candidate


## Create infos_show variable for the web app
- add party information to the variable infos
- cleaning to have only info that is necessary for the user

In [15]:
# load election results to get party information
party = pd.read_csv('election2021.csv')
# only keep name and party info
party = party.loc[:,['name','party']]
party = party.astype({'party':'category'})
party.head()

Unnamed: 0,name,party
0,Achim J√ºrgen Post,SPD
1,Adis Ahmetovic,SPD
2,Agnes Monika Brugger,GR√úNE
3,Albert Robert Rupprecht,CSU
4,Albert Stegemann,CDU


In [16]:
# add party to infos_final and make pretty
infos_show = infos_final.merge(party, on='name')
# only info necessary for the user
infos_show = infos_show[['name', 'acc_name','party', 'follower_cnt', 'following_cnt', 'post_cnt','bio']]
# clearer names
infos_show = infos_show.rename(columns={'acc_name':'account'})
# infos_show.to_csv('infos_show.csv')
infos_show.head()

Unnamed: 0,name,account,party,follower_cnt,following_cnt,post_cnt,bio
0,Alexander Engelhard,alexander_engelhard_csu,CSU,1372,101,403,üíôBundestagsabgeordneter f√ºr den Wahlkreis Neu-...
1,Alexander Graf Lambsdorff,alexandergraflambsdorff,FDP,17812,448,415,Botschafter der Bundesrepublik Deutschland in ...
2,Alexander Hoffmann,alexander.hoffmannmdb,CSU,2700,1083,1014,üá©üá™Mitglied des Deutschen Bundestages \nüîπParlam...
3,Ana-Maria TrƒÉsnea,amtrasnea,SPD,3855,2292,578,üè´ MdB und Staatssekret√§rin a.D.\nüåπ Co-Vorsitze...


# 3.2) Get posts
- First, get posts of every single account back a set date by using the posts-endpoint
  of the instagram-scraper-api2.p.rapidapi.com.
- With one API call, 12 posts are pulled. If the account contains more posts, the
  'pagination_token' is given and refers to the next API request
- The resulting variable is
  'posts' with the columns ['name','acc_name','id','shortcode','date', 'media_type',
                              'like_cnt','comment_cnt', 'vid_view_count','comment','url']

In [18]:
# empty df for posts
posts = pd.DataFrame(columns=['name','acc_name','id','shortcode','date', 'media_type',
                              'like_cnt','comment_cnt', 'vid_view_count',
                              'comment','url'])
# specifiy url
url = 'https://instagram-scraper-api2.p.rapidapi.com/v1.2/posts'

# empty list for saving jsons (account level)
post_jsons_accounts = []

# header for API call
headers = {
	'x-rapidapi-key': rapid_API_insta_key,
	'x-rapidapi-host': 'instagram-scraper-api2.p.rapidapi.com'
}

In [19]:
# specify to which date posts should be extracted
exit_time = pd.to_datetime('2024-12-01 00:00:00')

# loop the accounts with j 
for j, row in infos_final.iterrows():
    
    # Set flag for exit time to False (continues pulling posts)
    before_period = False
    # Set flag for account has more posts to True (continues pulling posts)
    has_next = True
    # Track the current status 
    print(row['name'])
    # Get the Instagram ID from accounts df
    account_id = row['id']
    
    # set querystring for first API call
    querystring = {'username_or_id_or_url': account_id,
                   'pagination_token': None}
    
    # create empty list for saving jsons of one account
    post_jsons = []
    # track iteration of a pulled pages
    n = 0
    
    # loop one instagram account with an infinity loop 
    while True:     # needs to break at a timestamp
        
        if has_next:    # check if the account has a next page

            # API request
            response = requests.request('GET', url, headers=headers, params=querystring)

            # ensure a successful API call
            if response.status_code == 200:
                # Track pulled pages
                print('Page: ', n)
                
                post_json = response.json()
                # save all posts from one account
                post_jsons.append(post_json)    
                
                # identifier for the next posts
                querystring['pagination_token'] = post_json['pagination_token']
                # Set flag if account has next page/more posts
                if post_json['pagination_token']:
                    has_next = True
                
                post ={} # empty dictionary for the current 12 posts
                
                # loop all posts from one request (max 12 at a time)
                for item in post_json['data']['items']:
                    
                    # convert timestamp of post
                    time =  pd.to_datetime(item['taken_at'], unit = 's')
                    # Exit loop if time is before the given date and NOT pinned
                    if (time < exit_time) & (item['is_pinned']==False):
                        before_period = True
                        break
                                
                    # write info into temporal dictionary 
                    post =  {
                            # name of politician
                            'name': row['name'],
                            # account name
                            'acc_name':row['acc_name'],
                            # account id
                            'id':row['id'],
                            # shortcode, url identifier
                            'shortcode':item['code'],
                            # time
                            'date': time,
                            # media type
                            'media_type':item['media_name'],
                            # count likes
                            'like_cnt': item['like_count'],
                            # count comments
                            'comment_cnt': item['comment_count'],
                            # view count, or NaN if not available
                            'vid_view_count': item.get('play_count', np.NaN),
                            # comment/caption
                            'comment': item['caption'].get('text', np.NaN),
                            # url of the media, see if video_url or get display url
                            'url': item.get('video_url', item['thumbnail_url'])

                            # # new metrics
                            # # share counts
                            # 'share_cnt': item.get('share_count', np.NaN),
                            # # mentions (tagged users)
                            # 'mentions_list': item['caption'].get('mentions', np.NaN),
                            # # hashtags
                            # 'hashtags_list': item['caption'].get('hashtags', np.NaN)
                            }
                
                    # Convert post dictionary to DataFrame and concatenate to posts
                    post_df = pd.DataFrame([post])  # Create a DataFrame from the post dictionary
                    posts = pd.concat([posts, post_df], ignore_index=True)  # Append to the posts DataFrame
                    
            else:
                print(f'Request failed with status code: ', response.status_code)
                break
                
        n+=1 # count for pages
        
        # Exit infinity loop 
        if not has_next:   # break if has next = False
            break
        if before_period: # break if before_period = True
            break

    # save json with every account
    post_jsons_accounts.append(post_jsons)    

# save posts variable
# posts.to_csv('posts.csv')

posts

Alexander Engelhard
Page:  0
Alexander Graf Lambsdorff


  posts = pd.concat([posts, post_df], ignore_index=True)  # Append to the posts DataFrame


Page:  0
Alexander Hoffmann
Page:  0
Page:  1
Ana-Maria TrƒÉsnea
Page:  0


Unnamed: 0,name,acc_name,id,shortcode,date,media_type,like_cnt,comment_cnt,vid_view_count,comment,url
0,Alexander Engelhard,alexander_engelhard_csu,48111005815,C2W1dPxr0HO,2024-01-21 09:11:10,post,68,0,,Neues Profilbanner! \n\nSchaut gerne auf meine...,https://scontent-fra5-2.cdninstagram.com/v/t51...
1,Alexander Engelhard,alexander_engelhard_csu,48111005815,C2W1aupruPs,2024-01-21 09:10:49,post,40,0,,Neues Profilbanner! \n\nSchaut gerne auf meine...,https://scontent-fra5-2.cdninstagram.com/v/t51...
2,Alexander Engelhard,alexander_engelhard_csu,48111005815,C2W1X-oLEOt,2024-01-21 09:10:26,post,103,0,,Neues Profilbanner! \n\nSchaut gerne auf meine...,https://scontent-fra5-1.cdninstagram.com/v/t51...
3,Alexander Engelhard,alexander_engelhard_csu,48111005815,DDPpklttavJ,2024-12-06 17:00:45,album,52,1,,Die CSU s√§ht Zukunft ‚Äì f√ºr die Landwirtschaft ...,https://scontent-fra3-2.cdninstagram.com/v/t51...
4,Alexander Engelhard,alexander_engelhard_csu,48111005815,DDCDWQ4NenI,2024-12-01 10:18:54,reel,124,1,2158.0,"Besonders sch√∂n ist es, wenn der erste Advent ...",https://scontent-fra3-1.cdninstagram.com/o1/v/...
5,Alexander Hoffmann,alexander.hoffmannmdb,5708286586,DDWqKs-IPfb,2024-12-09 10:20:38,post,36,3,,Verbrecher geh√∂ren in den Knast und Verm√∂gen a...,https://scontent-fra3-1.cdninstagram.com/v/t51...
6,Alexander Hoffmann,alexander.hoffmannmdb,5708286586,DDUCYSEofn7,2024-12-08 09:54:29,post,120,0,,üïØÔ∏èüïØÔ∏è Wie l√§uft bisher euer Advent? Habt ihr et...,https://scontent-fra3-1.cdninstagram.com/v/t51...
7,Alexander Hoffmann,alexander.hoffmannmdb,5708286586,DDSV4MzoVBs,2024-12-07 18:11:34,reel,91,3,2266.0,Auf einen Gl√ºhwein ! #marktheidenfeld #weihnac...,https://scontent-fra3-1.cdninstagram.com/o1/v/...
8,Alexander Hoffmann,alexander.hoffmannmdb,5708286586,DDPmvOQoMMh,2024-12-06 16:35:59,post,57,0,,Kommen Wir ins Gespr√§ch!!!!! #marktheidenfeld ...,https://scontent-fra5-2.cdninstagram.com/v/t51...
9,Alexander Hoffmann,alexander.hoffmannmdb,5708286586,DDPhXkCIHtH,2024-12-06 15:53:13,reel,130,11,4127.0,"Was diese Woche TOP und FLOP der Woche war, er...",https://scontent-fra3-1.cdninstagram.com/o1/v/...


# 3.3) Postprocessing
Some postprocessing steps are needed to ensure the smooth operation of the web app
## Handling large dataframes
- As the database is large, some steps are needed:
- saving memory by changing the types to the minimum types necessary


In [21]:
# function for changing datatypes of posts dataframes 
def change_types(df):
    # change strings to category
    df['name'] = df['name'].astype('category')
    df['acc_name'] = df['acc_name'].astype('category')
    df['media_type'] = df['media_type'].astype('category')
    # save integers with the minimum bytes possible
    df['id'] = df['id'].astype('uint64')    
    df['like_cnt'] = df['like_cnt'].astype('int32')
    df['comment_cnt'] = df['comment_cnt'].astype('uint16')
    # fill NaNs with zeros
    df['vid_view_count'] = df['vid_view_count'].fillna(0)
    df['vid_view_count'] = df['vid_view_count'].astype('uint32')
    # change date to datetime 
    df['date'] = pd.to_datetime(df['date'])
    # show 
    df.info()
    return df

def change_types2(df):

    # fill NaNs with zeros
    df['vid_view_count'] = df['vid_view_count'].fillna(0)
    # change date to datetime 
    df['date'] = pd.to_datetime(df['date'])  

    # change datatypes
    df = df.astype({
                    # change strings to category
                    'media_type':'category',
                    # save integers with the minimum number of bytes possible
                    'id':'uint64',   
                    'like_cnt':'int32',
                    'comment_cnt':'uint16',
                    'vid_view_count':'uint32'
    })
                    
    # show 
    df.info()
    return df

In [22]:
# load posts and use function
posts = pd.read_csv('posts_all.csv')
posts = posts.drop('Unnamed: 0',axis=1)
posts = change_types2(posts)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 308762 entries, 0 to 308761
Data columns (total 11 columns):
 #   Column          Non-Null Count   Dtype         
---  ------          --------------   -----         
 0   name            308762 non-null  object        
 1   acc_name        308762 non-null  object        
 2   id              308762 non-null  uint64        
 3   shortcode       308762 non-null  object        
 4   date            308762 non-null  datetime64[ns]
 5   media_type      308762 non-null  category      
 6   like_cnt        308762 non-null  int32         
 7   comment_cnt     308762 non-null  uint16        
 8   vid_view_count  308762 non-null  uint32        
 9   comment         308415 non-null  object        
 10  url             308751 non-null  object        
dtypes: category(1), datetime64[ns](1), int32(1), object(5), uint16(1), uint32(1), uint64(1)
memory usage: 19.7+ MB


## Create posts_show for the app
- add party information to the variable posts
- cleaning to have only info that is necessary for the user

In [24]:
# merge party information
posts_show = posts.copy().merge(party, on='name')
# create instagram url link using the account name and the shortcode
posts_show['webpage'] = 'https://www.instagram.com/' + posts['acc_name'] + '/p/' + posts['shortcode']
# sort df
posts_show = posts_show[['name','party','date','like_cnt','comment_cnt','vid_view_count','comment','webpage']]
# rename
posts_show = posts_show.rename(columns={'like_cnt':'likes',
                                       'comment_cnt':'comments',
                                       'vid_view_count':'video_views'})
# save as variable
# posts_show.to_csv('posts_show.csv')
posts_show.head()

Unnamed: 0,name,party,date,likes,comments,video_views,comment,webpage
0,Achim J√ºrgen Post,SPD,2024-11-02 07:55:20,211,4,0,"U3-Betreuungsquote, Armutsgef√§hrdungsquote, Le...",https://www.instagram.com/achim_p/p/DB3IIFKtII5
1,Achim J√ºrgen Post,SPD,2024-11-01 12:29:25,25,0,0,Auf Einladung von Gesch√§ftsf√ºhrer Rudi Mantler...,https://www.instagram.com/achim_p/p/DB1Csn8qXGf
2,Achim J√ºrgen Post,SPD,2024-10-31 19:23:40,110,0,0,Leistungstr√§ger:innen kommen nicht nur im Nade...,https://www.instagram.com/achim_p/p/DBzNTzhtLm_
3,Achim J√ºrgen Post,SPD,2024-10-31 11:19:00,17,0,0,Vom 18. bis zum 20.12. finden unter der Schirm...,https://www.instagram.com/achim_p/p/DByV17MKpg6
4,Achim J√ºrgen Post,SPD,2024-10-29 19:36:27,38,2,0,Nach √ºber 70 Jahren ist das Kraftwerk Heyden v...,https://www.instagram.com/achim_p/p/DBuFLwTN62E


In [25]:
posts_show.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 308762 entries, 0 to 308761
Data columns (total 8 columns):
 #   Column       Non-Null Count   Dtype         
---  ------       --------------   -----         
 0   name         308762 non-null  object        
 1   party        308762 non-null  category      
 2   date         308762 non-null  datetime64[ns]
 3   likes        308762 non-null  int32         
 4   comments     308762 non-null  uint16        
 5   video_views  308762 non-null  uint32        
 6   comment      308415 non-null  object        
 7   webpage      308762 non-null  object        
dtypes: category(1), datetime64[ns](1), int32(1), object(3), uint16(1), uint32(1)
memory usage: 12.7+ MB


## Save posts in smaller chunks
- To handle the large dataframe within the webapp, the posts variable is split
  into smaller chunks
- when loading the variable later, this helps improving performance

In [27]:
# saving into csv. with smaller size
var = posts_show
# define the chunksize
chunksize = 30000
# empty list for appending filenames
filenames=[]
# defines a list with row boundaries 
fromto = list(range(0,len(var),chunksize)) + [len(var)]

for i in range(0,len(fromto)-1):
    # define path 
    path = f'post_chunks{i}.csv'
    # saving the filenames into a list
    filenames.append(path)
    # saves only rows within the defined boundaries
    #var.loc[fromto[i]:fromto[i+1]-1].to_csv(path)

# save list of chunk filenames
filenames_df = pd.DataFrame(filenames,columns=['path'])
# filenames_df.to_csv('PostFilenames.csv', index=False)