In [5]:
import pandas as pd
from tqdm import tqdm
import os
import numpy as np
import plotly.graph_objects as go
from dataclasses import dataclass

In [11]:
# Load the DBPedia parquet file; the path is relative to the notebook's working directory.
df = pd.read_parquet(path='../../src/parquet/DBPedia.parquet')

In [12]:
@dataclass
class DataPreparation:
    """Prepare the profile DataFrame for analysis.

    The individual steps are exposed as methods and chained by
    ``prepare_data``: flatten the source-id column to strings, drop the
    DBPedia rows whose ``PEP_id`` is False, then keep only rows coming from
    a fixed list of sources.

    Note: ``data_preparation`` assigns columns on the DataFrame that was
    passed in, so the caller's frame is modified as well.
    """
    # Frame to prepare. The methods read the columns
    # '_source.sources.source_ids', 'completeness' and 'PEP_id'.
    data: pd.DataFrame

    def data_preparation(self):
        """Normalise the source-id and completeness columns in place.

        Each '_source.sources.source_ids' value is joined into one
        ', '-separated string, and the value 0 in 'completeness' is replaced
        by the label 'incomplete'.

        Returns:
            The DataFrame held in ``self.data`` (mutated).
        """
        def _join_ids(ids):
            # A value that is already a string is left untouched; joining it
            # again would split it into single characters when the cell is
            # re-run on an already prepared frame.
            if isinstance(ids, str):
                return ids
            return ', '.join(map(str, ids))

        self.data['_source.sources.source_ids'] = self.data['_source.sources.source_ids'].apply(_join_ids)
        self.data['completeness'] = self.data['completeness'].replace(0, 'incomplete')
        return self.data

    def filter_rca_dbpedia(self):
        """Remove the RCA rows that come from DBPedia only.

        A row is removed when its source-id string is exactly
        'S:FBFYW0 [DBPedia]' and its 'PEP_id' is False. Expects the source-id
        column to hold strings already (see ``data_preparation``).

        Returns:
            The filtered DataFrame, which is also stored in ``self.data``.
        """
        is_dbpedia = self.data['_source.sources.source_ids'] == 'S:FBFYW0 [DBPedia]'
        # `== False` rather than `~`: non-boolean or missing values simply do
        # not match instead of raising or being bit-inverted.
        is_rca = self.data['PEP_id'] == False
        # Keep the result. A boolean mask is used instead of dropping by index
        # label so that rows sharing a duplicated label are not removed too.
        self.data = self.data[~(is_dbpedia & is_rca)]
        return self.data

    def select_sources(self):
        """Keep only rows whose source-id string is one of the listed sources.

        Rows whose joined source-id string names several sources do not match
        any single entry and are therefore removed as well.
        NOTE(review): an earlier note described these sources as covering
        about 97 percent of the data - not verified here.

        Returns:
            The filtered DataFrame, which is also stored in ``self.data``.
        """
        LIST_SOURCE_SOURCE_IDS = ['S:FBFYW0 [DBPedia]','S:4CU7GM [PEP Everypolitician]', 'S:8L276A [Manual PEPs]', 'S:1GYJGG [The Official Board]', 'S:MFCNUA [PEP US Diplomat list 2]']
        self.data = self.data[self.data['_source.sources.source_ids'].isin(LIST_SOURCE_SOURCE_IDS)]
        return self.data

    def prepare_data(self):
        """Run all preparation steps in order and return the resulting frame."""
        self.data_preparation()
        self.filter_rca_dbpedia()
        self.select_sources()
        return self.data

In [37]:
# Among rows with PEP_id == False, keep only the columns whose name contains
# 'score' and count the non-zero entries per column.
rca = df.loc[df['PEP_id'] == False].filter(like='score')
non_zero_count = rca.ne(0).sum()
non_zero_count.to_dict()

{'score.age': 0,
 'score.place_of_birth': 0,
 'score.location': 1867,
 'score.picture': 0,
 'score.occupation': 223,
 'score.related_url': 0,
 'score._source.data.aml_types.end_date': 0,
 'score._source.data.aml_types.start_date': 0,
 'score._source.data.display_fields.value': 13796,
 'score.political_fields': 13796,
 'score.gender': 0,
 'score.primary': 0,
 'score.secondary': 13894,
 'score.other': 0,
 'score.data_quality': 13894,
 'score.delete_profile': 27567}