In [1]:
# Use list of SM ids to extract P ids and RSM ids

In [15]:
import requests
import traceback
import json
import timeit
import pandas as pd

In [16]:
# Read the id list from disk; the 'ids' column holds the ids used later to filter the hits.
ids = pd.read_csv('./ids.csv')
sm_ids = ids.loc[:, 'ids'].tolist()

In [17]:
# Sorcerer removes files that we need for this analysis, so the raw documents are queried
# directly instead. (The original note says "Kibana"; the requests in elastic_search() go to a
# `_search` / `_search/scroll` HTTP endpoint — presumably Elasticsearch; confirm.)

# NOTE(review): `ELASTIC SEARCH` is a redacted placeholder and is not valid Python. Replace it
# with the search endpoint URL before running; elastic_search() appends "?scroll=<scroll_time>".
url = ELASTIC SEARCH
# Scroll keep-alive value sent with the initial search and with every follow-up scroll request.
scroll_time = "1m"
# Sent as the query's "size", i.e. the number of hits requested per page.
# Presumably 10000 because that is Elasticsearch's default max result window — confirm.
page_size = 10000

# Build the query based on the source_id.
def build_query(source_id):
    """Assemble the search request body for a single source id.

    The body asks for `page_size` hits per page, requires BOTH a phrase match of
    `source_id` on `source_ids_original` and a phrase match of "SM" on
    `meta.document_type_code`, and sorts by `meta.updated_utc` in descending order.

    Args:
        source_id: Value to match; it is formatted into a string before use.

    Returns:
        dict: A freshly built query body (a new object on every call).
    """
    source_clause = {"match_phrase": {"source_ids_original": f"{source_id}"}}
    doc_type_clause = {"match_phrase": {"meta.document_type_code": "SM"}}
    newest_first = [{"meta.updated_utc": {"order": "desc"}}]

    return dict(
        size=page_size,
        query={"bool": {"must": [source_clause, doc_type_clause]}},
        sort=newest_first,
    )

# Build the elastic search query and fetch the API with scroll
def elastic_search(source_id):
    """Retrieve every hit for ``source_id`` by paging through the scroll API.

    The first request posts the query from ``build_query(source_id)`` to ``url`` with a
    ``scroll`` keep-alive of ``scroll_time``. Follow-up requests post the returned scroll id
    to the scroll endpoint until a page comes back without hits.

    Errors are handled on a best-effort basis, as before: any exception (now including
    HTTP error statuses and timeouts) is printed with its traceback, and whatever was
    collected up to that point is returned. Callers should therefore treat a printed
    error as a sign that the result may be incomplete.

    Args:
        source_id: Source id forwarded to ``build_query``.

    Returns:
        list: The raw hit dictionaries from all retrieved pages (possibly empty).
    """
    # NOTE(review): this host is hard-coded separately from `url`; presumably both point at
    # the same cluster — confirm, and consider deriving one from the other.
    scroll_url = "https://analytics-es.k8s.euw1.data-production-1.ivxs.uk/_search/scroll"
    # Without a timeout `requests` waits forever on a stalled connection.
    request_timeout = 120  # seconds

    # Container for all retrieved documents
    all_documents = []
    query = build_query(source_id)
    scroll_id = None

    try:
        # Initial search request with scroll parameter in the query string
        r = requests.post(
            url + "?scroll=" + scroll_time, json=query, timeout=request_timeout)
        # Surface HTTP errors directly instead of failing later on a missing "hits" key.
        r.raise_for_status()
        res = r.json()

        scroll_id = res.get("_scroll_id")
        hits = res.get("hits", {}).get("hits", [])

        # An empty page means there is nothing (more) to retrieve.
        while hits:
            all_documents.extend(hits)

            if not scroll_id:
                break  # Cannot continue scrolling without an id

            # Perform the scroll using POST request
            scroll_response = requests.post(
                scroll_url,
                json={"scroll": scroll_time, "scroll_id": scroll_id},
                timeout=request_timeout)
            scroll_response.raise_for_status()
            scroll_data = scroll_response.json()

            # The scroll id may change between pages; always use the latest one.
            scroll_id = scroll_data.get("_scroll_id", scroll_id)
            hits = scroll_data.get("hits", {}).get("hits", [])

    except Exception as e:
        traceback.print_exc()
        print(f"Error while retrieving data: {str(e)}")

    finally:
        # Release the server-side scroll context instead of letting it linger until the
        # keep-alive expires. Failure here must not hide the documents already collected.
        if scroll_id:
            try:
                requests.delete(
                    scroll_url, json={"scroll_id": scroll_id}, timeout=request_timeout)
            except Exception as cleanup_error:
                print(f"Could not clear scroll context: {cleanup_error}")

    # print(f"Total documents retrieved: {len(all_documents)}")

    return all_documents

In [18]:
# Pull every SM document for this one source; the result is a list of raw hit dicts.
db = elastic_search(source_id='S:FBFYW0')

In [19]:
# Flatten the nested hit dicts into one row per hit with dotted column names.
# Note: `db` is rebound from the raw hits to a DataFrame, so this cell must run exactly once
# after the fetch cell.
db = pd.json_normalize(data=db)

In [24]:
# Preview the id-bearing columns for the first five hits.
db.loc[
    :,
    [
        '_id',
        '_source.meta.derived_entity_ids',
        '_source.meta.entity_id',
        '_source.meta.source_entity_ids',
    ],
].head(n=5)

Unnamed: 0,_id,_source.meta.derived_entity_ids,_source.meta.entity_id,_source.meta.source_entity_ids
0,SM:S:FBFYW0:Q631864,[P:GW6OK4Q4QYE3NQD],SM:S:FBFYW0:Q631864 [DBPedia],"[MO::SM:S:FBFYW0:Q631864 [DBPedia], RSM:S:FBFY..."
1,SM:S:FBFYW0:Q311440,[P:RMPLY01QH5T6AU],SM:S:FBFYW0:Q311440 [DBPedia],"[MO::SM:S:FBFYW0:Q311440 [DBPedia], RSM:S:FBFY..."
2,SM:S:FBFYW0:Q5537893,[P:AGX9P6R5L99XJ07],SM:S:FBFYW0:Q5537893 [DBPedia],"[MO::SM:S:FBFYW0:Q5537893 [DBPedia], RSM:S:FBF..."
3,SM:S:FBFYW0:Q132596,[P:7N6M2PZLG6VL47J],SM:S:FBFYW0:Q132596 [DBPedia],"[MO::SM:S:FBFYW0:Q132596 [DBPedia], RSM:S:FBFY..."
4,SM:S:FBFYW0:Q22686_3,[P:BTWCDLKKCMI85JT],SM:S:FBFYW0:Q22686_3 [DBPedia],"[MO::SM:S:FBFYW0:Q22686_3 [DBPedia], RSM:S:FBF..."


In [119]:
# Keep only the hits whose document id appears in sm_ids (2860 ids per the original note),
# then rebind sm_ids to the '_id' column of the surviving rows so it shares their index.
db = db.loc[db['_id'].isin(sm_ids)]
sm_ids = db.loc[:, '_id']

In [96]:
# The primary id is stored in _source.source_ids_original
def remove_list(x):
    """Collapse an iterable into one string: items are str()-converted and joined with ', '."""
    return ', '.join(str(item) for item in x)


primary_ids = db['_source.source_ids_original'].apply(remove_list)

In [107]:
def filter_rsm(x):
    """Return only the entries of ``x`` that start with 'RSM'."""
    return [entity_id for entity_id in x if entity_id.startswith('RSM')]


def _drop_source_tag(entity_id, tag='[DBPedia]'):
    """Remove a trailing ``tag`` (and the whitespace around it) from one id.

    ``str.strip('[DBPedia]')`` is not suitable here: it treats its argument as a SET of
    characters, so it leaves the space before the tag behind and can also eat legitimate
    leading/trailing id characters such as 'D', 'B', 'P', 'a', 'e', 'i' or 'd'.
    """
    cleaned = entity_id.strip()
    if cleaned.endswith(tag):
        cleaned = cleaned[:-len(tag)]
    return cleaned.strip()


# For each row: keep the RSM entries, drop the source tag from EACH id (not just from the
# end of the joined string), then join the ids into one comma-separated string.
rsm_ids = (
    db['_source.meta.source_entity_ids']
    .apply(filter_rsm)
    .apply(lambda entity_ids: [_drop_source_tag(entity_id) for entity_id in entity_ids])
    .apply(remove_list)
)


In [123]:
# Place the SM, RSM and primary ids side by side (aligned on the row index) and export them.
pd.concat(
    objs=[sm_ids, rsm_ids, primary_ids],
    axis='columns',
).to_excel(excel_writer='RSM.xlsx')