In [2]:
import pandas as pd
from tqdm import tqdm
import os
import numpy as np
import plotly.graph_objects as go
import matplotlib.pyplot as plt
from matplotlib_venn import venn3
from dataclasses import dataclass

In [3]:
# Read the DBPedia parquet export into the working frame `df`.
df = pd.read_parquet(
    path='../../parquet/DBPedia.parquet',
)

In [6]:
# Display the column labels of the loaded frame.
df.columns

Index(['_id', '_source.sources.source_ids', '_source.data.aml_types.aml_type',
       '_source.data.aml_types.end_date', '_source.data.aml_types.start_date',
       '_source.data.births.max_date', '_source.data.births.min_date',
       '_source.data.births.age', '_source.data.entity_types.entity_type',
       '_source.data.genders.gender', '_source.data.hidden_fields.title',
       '_source.data.hidden_fields.value',
       '_source.data.locations.location_type', '_source.data.locations.name',
       '_source.data.names.name', '_source.data.names.name_type',
       '_source.data.names.aliases', '_source.data.names.primary_name',
       '_source.data.nationalities.country_code',
       '_source.data.occupations.occupation',
       '_source.data.number_aml_type_in_display_field', '_source.source_data',
       '_source.data.associations.association_entity_id',
       '_source.data.associations.association_name',
       '_source.data.associations.association_type',
       '_source.data.ass

In [None]:
@dataclass
class DataPreparation:
    """Normalise the entity frame and restrict it to the sources of interest.

    The individual steps (``data_preparation``, ``filter_rca_dbpedia``,
    ``select_sources``) can be called on their own; ``prepare_data`` chains
    them in that order.

    Side effects: ``data_preparation`` and ``filter_rca_dbpedia`` modify the
    DataFrame passed in as ``data`` in place. ``select_sources`` rebinds
    ``self.data`` to a filtered copy, so the caller's original frame is not
    narrowed by the source selection.

    Required columns: ``_source.sources.source_ids``, ``completeness`` and
    ``PEP_id``.
    """

    data: pd.DataFrame

    # Source identifiers kept by ``select_sources`` when no explicit list is
    # passed. Deliberately un-annotated so the dataclass does not treat it as
    # a field.
    DEFAULT_SOURCE_IDS = (
        'S:FBFYW0 [DBPedia]',
        'S:4CU7GM [PEP Everypolitician]',
        'S:8L276A [Manual PEPs]',
        'S:1GYJGG [The Official Board]',
        'S:MFCNUA [PEP US Diplomat list 2]',
    )

    @staticmethod
    def _join_source_ids(value):
        """Render one ``source_ids`` cell as a comma-separated string."""
        # Strings and missing scalars pass through untouched: joining an
        # already-joined string would split it into single characters, which
        # is what happened when the preparation cell was run twice on the same
        # frame.
        if not pd.api.types.is_list_like(value):
            return value
        return ', '.join(map(str, value))

    def data_preparation(self):
        """Flatten ``source_ids`` to strings and label zero completeness.

        * ``_source.sources.source_ids``: each list-like cell becomes a single
          ``', '``-joined string; cells that are already strings (or missing)
          are left as they are, so the step is safe to re-run.
        * ``completeness``: the value ``0`` is replaced by ``'incomplete'``.

        Returns:
            The (in-place modified) ``self.data``.
        """
        source_col = '_source.sources.source_ids'
        self.data[source_col] = self.data[source_col].apply(self._join_source_ids)
        self.data['completeness'] = self.data['completeness'].replace(0, 'incomplete')
        return self.data

    def filter_rca_dbpedia(self):
        """Drop the rows the notebook treats as RCAs coming from DBPedia.

        A row is dropped when its joined source id is exactly
        ``'S:FBFYW0 [DBPedia]'`` (DBPedia is its only source) and its
        ``PEP_id`` equals ``False``. Must run after ``data_preparation``,
        because it compares against the joined string form of the source ids.

        Returns:
            The (in-place modified) ``self.data``, matching the other steps.
        """
        is_dbpedia_only = self.data['_source.sources.source_ids'] == 'S:FBFYW0 [DBPedia]'
        # Explicit comparison instead of ``~``: rows whose PEP_id is missing
        # compare unequal to False and are therefore kept.
        is_not_pep = self.data['PEP_id'].eq(False)
        rca_labels = self.data.index[is_dbpedia_only & is_not_pep]
        # NOTE: the drop is label-based, so any row sharing an index label
        # with a matched row is removed too; this assumes a unique index.
        self.data.drop(rca_labels, inplace=True)
        return self.data

    def select_sources(self, source_ids=None):
        """Keep only rows whose joined source id is one of ``source_ids``.

        Args:
            source_ids: List-like of source-id strings to keep. Defaults to
                ``DEFAULT_SOURCE_IDS``.

        Returns:
            The filtered frame, which is also stored as ``self.data``.
        """
        if source_ids is None:
            source_ids = self.DEFAULT_SOURCE_IDS
        keep = self.data['_source.sources.source_ids'].isin(source_ids)
        # Copy so later in-place steps on this instance write to an
        # independent frame rather than to a slice of the previous one.
        self.data = self.data[keep].copy()
        return self.data

    def prepare_data(self):
        """Run all preparation steps in order and return the resulting frame."""
        self.data_preparation()
        self.filter_rca_dbpedia()
        self.select_sources()
        return self.data