In [1]:
# import libraries
import pandas as pd
import requests
import regex as re
import numpy as np
# import API keys
from keys import *
# show every column and every row when a DataFrame is displayed (no truncation)
for display_option in ('display.max_columns', 'display.max_rows'):
    pd.set_option(display_option, None)

# Getting the names of politicians
- To get the politicians' names, election2021.csv is used
  (which was created in another script).
- To keep the explanation short, only two politicians are searched.

In [3]:
# read the candidates of the last election (file created by another script)
df = pd.read_csv('election2021.csv')

# only the first two names are kept, which is enough to explain the code
search_names = df['name'].tolist()[0:2]
search_names

['Achim Jürgen Post', 'Adis Ahmetovic']

In [5]:
# the API works best if only the first and last name of the politician is used,
# so everything between the first and the last word is dropped
# (e.g. 'Achim Jürgen Post' -> 'Achim Post')
search_names2 = [
    f'{parts[0]} {parts[-1]}'
    for parts in (full_name.split() for full_name in search_names)  # split each name only once
]
search_names2

['Achim Post', 'Adis Ahmetovic']

# Search for instagram accounts with given names
- First, the Instagram accounts are searched using the
  instagram-scraper API from RapidAPI.
- The resulting dataframe looks like:
  accounts=['search_name','name','acc_name','id']
- search_name should be unique, while name is the name as found by the API
  and acc_name is the account name on Instagram.
- Note that only an exemplary search is shown here.

In [35]:
# endpoint of the user search
url = 'https://instagram-scraper-api2.p.rapidapi.com/v1/search_users'

# request headers expected by the API (key_scraper comes from the keys module)
headers = {
    'x-rapidapi-key': key_scraper,
    'x-rapidapi-host': 'instagram-scraper-api2.p.rapidapi.com',
}

# result container: starts empty, one row per account found
account_columns = ['search_name', 'name', 'acc_name', 'id']
accounts = pd.DataFrame(columns=account_columns)

# iteration counter used by the search loop
j = 0

In [37]:
# loop different search_names/politicians
account_columns = ['search_name', 'name', 'acc_name', 'id']
records = []  # one dict per accepted account; the DataFrame is built once after the loop

# enumerate restarts the iteration counter at 1 every time this cell is run
for j, name in enumerate(search_names2, start=1):
    print(name)

    # API request (with a timeout so a stalled connection cannot hang the notebook)
    querystring = {'search_query': name}
    response = requests.get(url, headers=headers, params=querystring, timeout=30)

    # decode the body only after a successful status code; an error response
    # may not contain JSON at all
    user = None
    if response.status_code == 200:
        try:
            user = response.json()
        except ValueError:  # body was not valid JSON
            user = None

    # check for successful API request: the payload must be a dict whose only key is 'data'
    if isinstance(user, dict) and list(user.keys()) == ['data']:
        # loop over all accounts returned for this search name
        for item in user['data']['items']:

            # take all verified, not private users
            if (item['is_verified'] == 1) and (item['is_private'] == False):
                records.append({'search_name': name,
                                'name': item['full_name'],
                                'acc_name': item['username'],
                                'id': item['id']})
    else:
        print(f'For {name} (in iteration {j}) no entry found.')

# build the result in one step; re-running the cell therefore does not duplicate rows
accounts = pd.DataFrame(records, columns=account_columns)
accounts

Achim Post
Adis Ahmetovic


Unnamed: 0,search_name,name,acc_name,id
0,Achim Post,"Achim Post, MdB",achim_p,1979223691
1,Adis Ahmetovic,Adis Ahmetovic,adis.a93,25699539826


# Cleaning of accounts
- When searching for all politicians, in some cases multiple or wrong accounts were found.
- In a first cleaning step, the real names (search_name) are compared to the names given
  in the Instagram account.
- For explanatory purposes, the original account data (accounts_raw.csv) is used here, which was created with a
  slightly different account-search algorithm than the one above.
- First, a uniform encoding of the names has to be ensured because of the characters 'ä,ü,ö'.

In [43]:
import unicodedata
# load the raw search results and discard the 'Unnamed: 0' column
# (presumably the index saved along with the data)
accounts = (
    pd.read_csv('accounts_raw.csv')
    .drop(columns=['Unnamed: 0'])
)

# Function to normalize strings (to get uniform encoding of ö,ä,ü)
def normalize_string(s):
    """Return ``s`` in Unicode NFC form.

    NFC composes a base letter followed by a combining mark (e.g. 'u' + U+0308)
    into the single precomposed character ('ü'), so that visually identical
    names also compare equal.

    Parameters
    ----------
    s : str or any
        Text to normalize. Values that are not strings (e.g. NaN for a missing
        name, or None) are returned unchanged instead of raising a TypeError.

    Returns
    -------
    str or any
        The NFC-normalized string, or ``s`` itself if it is not a string.
    """
    if not isinstance(s, str):
        return s
    return unicodedata.normalize('NFC', s)
    
# normalize BOTH columns that are compared later: a search name in composed form
# would otherwise never match a display name that spells 'ü' as 'u' + combining mark
accounts['search_name'] = accounts['search_name'].apply(normalize_string)
# the display name can be missing (NaN), so only real strings are normalized
accounts['name'] = accounts['name'].map(
    lambda value: normalize_string(value) if isinstance(value, str) else value
)

accounts.head(20)

Unnamed: 0,search_name,name,acc_name,id
0,Achim Jürgen Post,The Washington Post,washingtonpost,2754610
1,Achim Jürgen Post,Post United,postunited,2972897324
2,Achim Jürgen Post,New York Post,nypost,225193095
3,Achim Jürgen Post,The Jerusalem Post,thejerusalem_post,474857542
4,Achim Jürgen Post,PostSecret,postsecret,572220796
5,Achim Jürgen Post,Quinten Post,quintenpost_,3033694721
6,Achim Jürgen Post,POST ARCHIVE FACTION (PAF),postarchivefaction,3579407134
7,Achim Jürgen Post,National Post,nationalpost,1697666
8,Achim Jürgen Post,Post Cereals,post_cereals,6983606644
9,Achim Jürgen Post,Un Posto Al Sole Rai,unpostoalsolerai3,6551842933


In [55]:
# find first and last name and see if its in the given account name
def _has_first_and_last_name(search_name, insta_name):
    """Return True if the first and the last word of ``search_name`` both occur in ``insta_name``.

    The words are looked up as literal, case-sensitive substrings. They are not
    interpreted as regular expressions, so characters such as '.', '(' or '+'
    inside a name cannot change the result or raise a pattern error.

    Parameters
    ----------
    search_name : str
        Real name of the politician (words separated by whitespace).
    insta_name : str or NaN
        Name as stated in the Instagram account; may be missing.

    Returns
    -------
    bool
        False if either value is not a string or ``search_name`` contains no
        word; otherwise whether both name parts are contained in ``insta_name``.
    """
    if not isinstance(search_name, str) or not isinstance(insta_name, str):
        return False  # no name stated -> cannot be confirmed as the real account
    parts = search_name.split()
    if not parts:
        return False
    return parts[0] in insta_name and parts[-1] in insta_name


# set flag accounts['IsReal'] for all rows in one assignment (boolean column)
accounts['IsReal'] = [
    _has_first_and_last_name(search_name, insta_name)
    for search_name, insta_name in zip(accounts['search_name'], accounts['name'])
]
accounts.head(20)

Unnamed: 0,search_name,name,acc_name,id,IsReal
0,Achim Jürgen Post,The Washington Post,washingtonpost,2754610,False
1,Achim Jürgen Post,Post United,postunited,2972897324,False
2,Achim Jürgen Post,New York Post,nypost,225193095,False
3,Achim Jürgen Post,The Jerusalem Post,thejerusalem_post,474857542,False
4,Achim Jürgen Post,PostSecret,postsecret,572220796,False
5,Achim Jürgen Post,Quinten Post,quintenpost_,3033694721,False
6,Achim Jürgen Post,POST ARCHIVE FACTION (PAF),postarchivefaction,3579407134,False
7,Achim Jürgen Post,National Post,nationalpost,1697666,False
8,Achim Jürgen Post,Post Cereals,post_cereals,6983606644,False
9,Achim Jürgen Post,Un Posto Al Sole Rai,unpostoalsolerai3,6551842933,False


In [49]:
# keep only the rows whose Instagram name matched the search name ...
is_real = accounts['IsReal'].eq(True)
# ... and remove the two columns that are no longer needed
accounts2 = accounts[is_real].drop(columns=['IsReal', 'name'])
accounts2.head(10)

Unnamed: 0,search_name,acc_name,id
13,Achim Jürgen Post,achim_p,1979223691
28,Adis Ahmetovic,adis.a93,25699539826
29,Albert Stegemann,albert.stegemann,25376883363
30,Alexander Bartz,bundesbartz,45486851663
31,Alexander Engelhard,alexander_engelhard_csu,48111005815
32,Alexander Engelhard,alex.engelhardt,3679960301
33,Alexander Graf Lambsdorff,alexandergraflambsdorff,5760876068
34,Alexander Hoffmann,alexander.hoffmannmdb,5708286586
35,Alexander Hoffmann,lxhoffmann,1558234338
40,Ana-Maria Trăsnea,amtrasnea,26039470493


So far I have identified all Instagram accounts that are verified and not private and whose
stated name contains the first and last name of the searched politician. There are still
duplicate entries (e.g. Alexander Engelhard). These are 
dealt with in the next section, after pulling account infos.
Note: not all politicians have an Instagram account.