# Identify PEPs by Id and Occupation

Datasets contain both PEPs and RCAs (relatives and close associates). We identify PEPs by their occupation in the _source.data.occupations.occupation field by looking up each item in the array of occupations against our template of relevant occupations.

In [100]:
import pandas as pd
import re
import ast
from ast import literal_eval
import pyarrow as pa
import pyarrow.parquet as pq
import json
import numpy as np

#### Convert Ids to RCA/PEP

In [101]:
#If the dataframe is DBPedia then identify PEPs by their id. IDs that end in _numbers are RCAs. 
def pep_id(df):
    """Flag each row as PEP or RCA based on the format of its ``_id``.

    IDs that end in an underscore followed by digits (e.g. ``abc_12``) are
    treated as RCAs and get ``PEP_id = False``; every other ID gets
    ``PEP_id = True``.

    Args:
        df: DataFrame with an ``_id`` column of strings.

    Returns:
        The same DataFrame, modified in place, with a boolean ``PEP_id``
        column added.
    """
    # Raw string: '\d' inside a normal string literal is an invalid escape.
    rca_suffix = re.compile(r'_\d+$')
    # Named `record_id` rather than `id` so the builtin is not shadowed.
    df['PEP_id'] = [rca_suffix.search(record_id) is None for record_id in df['_id']]
    return df

### Extract PEP occupations

Load Keywords

In [102]:
def load_keywords(path='../../templates/PEP_keywords.csv'):
    """Load the PEP occupation keywords as a list of lowercase strings.

    Args:
        path: CSV file containing an ``occupation`` column. Defaults to the
            keyword template used by this notebook.

    Returns:
        The non-missing values of the ``occupation`` column, lowercased, in
        file order.
    """
    occupations = pd.read_csv(path)['occupation']
    # Empty cells are read as float NaN, which has no .lower(); drop them
    # instead of crashing on the first gap in the template.
    return [str(keyword).lower() for keyword in occupations.dropna()]

We remove any text within parentheses because, for an RCA, the parentheses describe the PEP they are related to, and we do not want to extract that person's occupation keywords. For example:

- Child of Joe Biden (President of the United States, Senator, Member of Congress)

In [103]:
def remove_keyword_in_parenthesis(text):
    """Return ``text`` with every balanced parenthesised group removed.

    Nested groups are removed entirely, e.g. ``"a (b (c) d) e"`` becomes
    ``"a  e"``. Unbalanced parentheses (a ``)`` with no earlier ``(``, or a
    ``(`` that is never closed) are left in place and do not stop later
    balanced groups from being removed.

    Args:
        text: The string to clean.

    Returns:
        The cleaned string. Surrounding whitespace is not touched.
    """
    # Matches a group with no parentheses inside it, i.e. an innermost group.
    innermost_group = re.compile(r'\([^()]*\)')
    removed = 1
    # Removing innermost groups repeatedly peels nested groups layer by layer.
    while removed:
        text, removed = innermost_group.subn('', text)
    return text

`validate_occupation` checks whether the row is an array. If it is not, it returns `False, []`. If it is, it loops through each item, removes any text in parentheses, checks that the item does not contain an excluded phrase, and checks whether any of our keywords appears in it.

In [104]:
def validate_occupation(occupation_array, keywords):
    """Check whether any occupation in a row matches a PEP keyword.

    Each string item has parenthesised text removed, is discarded if it
    contains an excluded phrase ('Child of ' / 'Spouse of '), and is then
    searched for every keyword as a whole word, ignoring case.

    Args:
        occupation_array: The row's value. Only ``numpy.ndarray`` values are
            inspected; anything else (e.g. NaN for a missing value) counts
            as "no occupation".
        keywords: Iterable of keyword strings to look for.

    Returns:
        A ``(has_match, matching_keywords)`` tuple. ``matching_keywords``
        holds one entry per (keyword, item) match, ordered by keyword and
        then by item, so a keyword can appear more than once.
    """
    # For values that are NaN and not in an array
    if not isinstance(occupation_array, np.ndarray):
        return False, []

    exclude_phrases = ('Child of ', 'Spouse of ')

    # Clean and filter each item once, rather than once per keyword.
    candidates = []
    for item in occupation_array:
        # Skip None/NaN entries inside the array instead of raising.
        if not isinstance(item, str):
            continue
        clean_occupation_str = remove_keyword_in_parenthesis(item)
        if not any(phrase in clean_occupation_str for phrase in exclude_phrases):
            candidates.append(clean_occupation_str)

    matching_keywords = []
    for keyword in keywords:
        pattern = re.compile(r'\b' + re.escape(keyword) + r'\b', re.IGNORECASE)
        for clean_occupation_str in candidates:
            if pattern.search(clean_occupation_str):
                matching_keywords.append(keyword)

    # True if at least one keyword matched, plus the matched keywords
    return bool(matching_keywords), matching_keywords





    
    
       

    


Check excluded phrases

In [122]:
def main(file_path='../../parquet/European Parliament MEPs.parquet'):
    """Load a dataset and annotate each row with its matched PEP keywords.

    Matching uses the '_source.data.occupations.occupation' column when it
    exists and '_source.data.display_fields.value' otherwise.

    Args:
        file_path: Parquet file to read. If the path contains "DBPedia" and
            the data is not empty, rows are also flagged with ``pep_id``.

    Returns:
        The DataFrame with two added columns: ``has_occupation`` (bool) and
        ``occupation_keywords`` (list of matched keywords). If matching
        fails, a warning is issued and the columns are set to ``False`` and
        ``None`` for every row.
    """
    import warnings

    keywords = load_keywords()
    df = pd.read_parquet(file_path)

    if not df.empty and "DBPedia" in file_path:
        df = pep_id(df)

    occupation_column = '_source.data.occupations.occupation'
    fallback_column = '_source.data.display_fields.value'
    source_column = occupation_column if occupation_column in df.columns else fallback_column

    try:
        # Validate each row once; both output columns come from this result.
        results = [validate_occupation(value, keywords) for value in df[source_column]]
    except Exception as error:
        # Best-effort fallback, but report the failure instead of hiding it.
        warnings.warn(
            f"Occupation matching on {source_column!r} failed ({error!r}); "
            "marking every row as has_occupation=False."
        )
        df['has_occupation'] = False
        df['occupation_keywords'] = None
    else:
        df['has_occupation'] = pd.Series(
            [matched for matched, _ in results], index=df.index, dtype=bool)
        df['occupation_keywords'] = pd.Series(
            [found for _, found in results], index=df.index, dtype=object)

    return df

In [114]:
# Run the pipeline and keep the annotated DataFrame for inspection.
df = main()

In [123]:
# Run the pipeline again and keep the result under a separate name.
# NOTE(review): main() as written reads the European Parliament MEPs file;
# confirm which file this run was meant to use.
EveryPolitician = main()

In [126]:
# Show the display-field text next to the computed match columns.
EveryPolitician[['_source.data.display_fields.title',
       '_source.data.display_fields.value', 'has_occupation', 'occupation_keywords']]

Unnamed: 0,_source.data.display_fields.title,_source.data.display_fields.value,has_occupation,occupation_keywords
0,"[Additional Information, Political Group]","[A member of the European Parliament., Europe ...",True,"[member, member of, parliament, parliament, me..."
1,"[Additional Information, Political Group]","[A member of the European Parliament., Group o...",True,"[member, member of, parliament, parliament, pa..."
2,"[Additional Information, Political Group]","[A member of the European Parliament., Group o...",True,"[member, member of, parliament, parliament, me..."
3,"[Additional Information, Political Group]","[A member of the European Parliament., Europea...",True,"[member, member of, parliament, parliament, me..."
4,"[Additional Information, Political Group]","[A member of the European Parliament., Confede...",True,"[member, member of, parliament, parliament, me..."
...,...,...,...,...
746,"[Additional Information, Political Group]","[A member of the European Parliament., Europe ...",True,"[member, member of, parliament, parliament, me..."
747,"[Additional Information, Political Group]","[A member of the European Parliament., Group o...",True,"[member, member of, parliament, party, parliam..."
748,"[Additional Information, Political Group]","[A member of the European Parliament., Group o...",True,"[member, member of, parliament, parliament, pa..."
749,"[Additional Information, Political Group]","[A member of the European Parliament., Europe ...",True,"[member, member of, parliament, parliament, me..."


In [121]:
# Show only the rows where at least one keyword matched.
df[['_source.data.display_fields.title',
       '_source.data.display_fields.value', '_source.data.occupations.occupation', 'has_occupation', 'occupation_keywords']].query('has_occupation == True')

Unnamed: 0,_source.data.display_fields.title,_source.data.display_fields.value,_source.data.occupations.occupation,has_occupation,occupation_keywords
0,,,"[Economist, Madrid Councillor, Member Of The C...",True,"[member, member of, politician, politician, me..."
2,"[Related Url, Related Url, Related Url, Relate...","[http://joiningforces.uso.org/, http://www.bid...",[American educator and academic; Second Lady o...,True,[politician]
3,,,"[Politician, Russian politician and businessman]",True,"[politician, politician]"
4,,,"[Crown Prince Of Saudi Arabia, Governor, King ...",True,"[governor, politician, king, king, prince, gov..."
5,,,"[British politician and peer, Politician]",True,"[politician, politician]"
...,...,...,...,...,...
188339,[Related Url],[http://complyadvantage.com],[Diplomat],True,"[diplomat, diplomat]"
188340,[Related Url],[http://complyadvantage.com],[Politician],True,[politician]
188341,[Related Url],[http://complyadvantage.com],[Diplomat],True,"[diplomat, diplomat]"
188342,[Related Url],[http://complyadvantage.com],[Diplomat],True,"[diplomat, diplomat]"


In [108]:
# Run the pipeline only when this file is executed as a script.
if __name__ == "__main__":
    main()