# Generating Metadata and Linking Similar Datasets

This notebook combines the output of the scraper functions into a single metadata dataset. It then generates two linkage datasets: one matching datasets by identical columns, and one matching datasets using description embeddings. Both are formatted for use in D3.js to create network diagrams.

In [None]:
#| default_exp core.generate_Metadata

In [None]:
#| hide
from nbdev.showdoc import *

In [None]:
#| hide
#| export
from database_compendium.core.ONS_scraper_functions import *
from database_compendium.core.Nomis_scraper_functions import *
from database_compendium.core.insolvency_stats_scrapers import *
from database_compendium.core.police_data_scrapers import *
from database_compendium.core.NHS_QualityOutcomes_scrapers import *
from database_compendium.core.column_matching import *
from database_compendium.core.similarity_functions import *

import numpy as np
import requests
import pandas as pd
import math
import openai

In [None]:
#| export
def createMetadata(save_file=False):

    """ ONS Data """
    
    print("Loading ONS...")
    
    # Takes over 70 mins to run
    titles, descriptions = get_ONS_datasets_titles_descriptions()
    urls = get_ONS_datasets_urls()
    long_desc = get_ONS_long_description()
    
    latest_release = []
    cols = []
    col_data = []
    count = 0
    
    for url in urls:
        response = requests.get(url)
        try: 
            latest_release.append(response.json()['release_date'])
        except:
            latest_release.append(float('nan'))
    
        try:
            cols.append(find_ONS_cols(url))
        except:
            cols.append('')
    
        try:
            col_data.append(find_ONS_cols_and_unique_vals(url))
        except:
            col_data.append('')
    
        
        count +=1
    
    ONS_df = pd.DataFrame({'Title': titles, 'Description': descriptions, 
                           'Long_description': long_desc, 'Columns': cols, 
                           'Unique_parameters': col_data, 'Latest_release': latest_release})
    
    
    """ Nomis Data """
    
    print("Loading Nomis...")
    
    # Takes around 50 mins
    titles, descriptions, l_descriptions = get_nomis_datasets_titles_descriptions()
    latest_release = get_nomis_last_updated()
    cols = np.empty(len(titles))
    params = get_nomis_datasets_parameters()
    
    nomis_df = pd.DataFrame({'Title': titles, 'Description': descriptions, 
                             'Long_description': l_descriptions, 'Columns': cols, 
                             'Unique_parameters': params, 'Latest_release': latest_release})


    """ Monthly Insolvency Statistics """
    
    print("Loading insolvency stats")
    cols = []
    col_data = []
    insolvency_stats, long_desc = get_insolvency_stats()
    titles = insolvency_stats.keys()
    
    # The descriptions on latest releases are all the same so making a repeated list
    description = [get_mis_description()] * len(titles)
    latest_release = [get_mis_last_updated()] * len(titles)
    long_desc = [long_desc] * len(titles)
    
    for title in titles:
        cols.append(list(insolvency_stats[title].columns))
        col_data.append(get_insolvency_unique_column_vals(insolvency_stats[title]))
    
    insolvency_df = pd.DataFrame({'Title': titles, 'Description': description, 
                              'Long_description': long_desc, 'Columns': cols, 
                              'Unique_parameters': col_data, 'Latest_release': latest_release})

    
    """ Police Data - currently only for Bethnal green so is commented out """
    
    print("Loding police data...")
    url = 'https://nihr.opendatasoft.com/api/records/1.0/search/?dataset=westminster-parliamentary-constituencies&rows=650'
    response = requests.get(url)
    records = response.json()['records']

    constituency_coords = get_constituency_coordinates()

    street_level_crimes, sl_last_updated = get_street_level_crimes(constituency_coords['Bethnal Green and Bow'], '2023-03', 'poly')
    no_loc_crimes = get_crimes_no_loc('metropolitan', '2023-03')
    stop_searches, ss_last_updated = get_stop_searches(constituency_coords['Bethnal Green and Bow'], '2023-03', 'poly')
    searches_no_loc = get_searches_no_loc('metropolitan', '2023-03')

    col_data = []
    col_data.append(get_unique_col_vals(street_level_crimes))
    col_data.append(get_unique_col_vals(no_loc_crimes))
    col_data.append(get_unique_col_vals(stop_searches))
    col_data.append(get_unique_col_vals(searches_no_loc))
    
    street_level_crimes = pd.json_normalize(street_level_crimes)
    street_level_crimes = street_level_crimes.drop(columns='persistent_id')
    
    no_loc_crimes = pd.json_normalize(no_loc_crimes)
    no_loc_crimes = no_loc_crimes.drop(columns='persistent_id')
    
    stop_searches = pd.json_normalize(stop_searches)
    
    searches_no_loc = pd.json_normalize(searches_no_loc)
    searches_no_loc = searches_no_loc.drop(columns='datetime')

    titles = ['Police Data - Street-level crimes', 'Police Data - Crimes with no location',
          'Police Data - Stop and searches by area', 'Police Data - Stop and searches with no location']
    descriptions = ['Crimes at street-level; either within a 1 mile radius of a single point, or within a custom area.',
                    'Returns a list of crimes that could not be mapped to a location.',
                    'Stop and searches at street-level; either within a 1 mile radius of a single point, or within a custom area.',
                    'Stop and searches that could not be mapped to a location.']
    latest_release = [sl_last_updated, sl_last_updated, ss_last_updated, ss_last_updated]
    
    f = open('data/police_long_descriptions.txt', 'r')
    long_desc = f.read().split('\n')
    
    cols = []
    for data in col_data:
        cols.append(list(data.keys()))

    police_df = pd.DataFrame({'Title': titles, 'Description': descriptions, 
                          'Long_description': long_desc, 'Columns': cols, 
                          'Unique_parameters': col_data, 'Latest_release': latest_release})


    """ NHS Quality and Outcomes """

    print("Loading NHS Quality and Outcomes...")
    NHS_quality_outcomes, long_description, latest_release = get_NHS_qualityOutcomes()

    titles = []
    for title in NHS_quality_outcomes.keys():
        titles.append(title.split(':')[1]) # Removing the text before : which is 'Table n:'
    
    num_tables = len(titles)
    descriptions = [''] * num_tables
    long_descriptions = [long_description] * num_tables
    last_rel = [latest_release] * num_tables
    
    cols = []
    unique_params = []
    for dataset in NHS_quality_outcomes.keys():
        sheet = NHS_quality_outcomes[dataset]
        temp_cols, temp_unqParams = get_qualityOutcomes_uniqueColumnValues(sheet)
        cols.append(temp_cols)
        unique_params.append(temp_unqParams)

    qualOutcomes_df = pd.DataFrame({'Title': titles, 'Description': descriptions, 
                                    'Long_description': long_descriptions, 'Columns': cols, 
                                    'Unique_parameters': unique_params, 'Latest_release': latest_release})

    
    metadata_df = pd.concat([ONS_df, nomis_df, insolvency_df, police_df, qualOutcomes_df]).reset_index(drop=True)
    
    if save_file:
        metadata_df.to_csv('data/metadata_dataset.csv', index=False)
        titles_ids = pd.DataFrame({"Title": metadata_df["Title"], "id": metadata_df.index})
        titles_ids.to_json('data/datasets_title_id.json')
    
    print("Complete")

    return metadata_df

In [None]:
# metadata_df = createMetadata(save_file=True)

Loading ONS...
Loading Nomis...
Loading insolvency stats
Loding police data...
Loading NHS Quality and Outcomes...
Complete


In [None]:
metadata_df

Unnamed: 0,Title,Description,Long_description,Columns,Unique_parameters,Latest_release
0,Quarterly personal well-being estimates,Seasonally and non seasonally-adjusted quarter...,We are currently reviewing the measures of nat...,"[v4_2, LCL, UCL, yyyy-qq, Time, uk-only, Geogr...","{'v4_2': None, 'LCL': None, 'UCL': None, 'yyyy...",2023-08-14T00:00:00.000Z
1,Personal well-being estimates by local authority,"Estimates of life satisfaction, feeling that t...",We are currently reviewing the measures of nat...,"[v4_3, Data marking, Lower limit, Upper limit,...","{'v4_3': None, 'Data marking': None, 'Lower li...",2022-10-31T00:00:00.000Z
2,Deaths registered weekly in England and Wales ...,Provisional counts of the number of deaths reg...,Quality and methodology information for mortal...,"[v4_1, Data Marking, calendar-years, Time, adm...","{'v4_1': None, 'Data Marking': None, 'calendar...",2023-08-30T00:00:00.000Z
3,Death registrations and occurrences by local a...,Provisional counts of the number of deaths reg...,Quality and methodology information for mortal...,"[v4_0, calendar-years, Time, administrative-ge...","{'v4_0': None, 'calendar-years': None, 'Time':...",2023-08-30T00:00:00.000Z
4,Death registrations and occurrences by health ...,Provisional counts of the number of deaths reg...,Quality and methodology information for mortal...,"[v4_0, calendar-years, Time, local-health-boar...","{'v4_0': None, 'calendar-years': None, 'Time':...",2023-08-30T00:00:00.000Z
...,...,...,...,...,...,...
1981,"Prevalence, achievement and personalised care...",,The objective of the Quality and Outcomes Fra...,"[Total Achievement Score (max 6), Total Denomi...","{'Region ODS code': ['ENG', 'Y56', 'Y58', 'Y59...",22/09/2022
1982,Achievement and personalised care adjustments...,,The objective of the Quality and Outcomes Fra...,"[Females aged 50-64, Total Denominators, Numbe...","{'Region ODS code': ['ENG', 'Y56', 'Y58', 'Y59...",22/09/2022
1983,Achievement and personalised care adjustments...,,The objective of the Quality and Outcomes Fra...,"[Achievement Score (max 10), List size ages 79...","{'Region ODS code': ['ENG', 'Y56', 'Y58', 'Y59...",22/09/2022
1984,"Achievement, quality improvement group, early...",,The objective of the Quality and Outcomes Fra...,"[Achievement Score (max 10), Achievement (%), ...","{'Region ODS code': ['ENG', 'Y56', 'Y58', 'Y59...",22/09/2022


In [None]:
pd.read_json("data/datasets_title_id.json")

Unnamed: 0,Title,id
0,Quarterly personal well-being estimates,0
1,Personal well-being estimates by local authority,1
2,Deaths registered weekly in England and Wales ...,2
3,Death registrations and occurrences by local a...,3
4,Death registrations and occurrences by health ...,4
...,...,...
1981,"Prevalence, achievement and personalised care...",1981
1982,Achievement and personalised care adjustments...,1982
1983,Achievement and personalised care adjustments...,1983
1984,"Achievement, quality improvement group, early...",1984


In [None]:
#| export
def prepare_identicalColData(metadata_df,         # Dataset
                             method="",           # Leave blank for the default or 'alt' for the alternative method
                             rare=True,           # Increases the weight given to rare connections
                             cutoff=5,            # The minimum weight of a connection to be included in the final file
                             filename="testing"): # The name given to the output file saved to the local data folder
    """
    Using the metadata this creates a dataset showing the connections between datasets 
    using the column data and formats it for use in the d3 network diagram. 
    Saves it to a file, filename and also returns the data as a dataframe.
    """
    cols_list = createColsList(metadata_df)
    scoredConnections = scoreConnections(metadata_df, cols_list, method=method, rare=rare)
    iColData_d3 = formatForD3(metadata_df.Title, scoredConnections, cutoff, True, filename) # True in this case just means the file is being saved which I'm assuming we want

    return iColData_d3

In [None]:
df = pd.read_csv('data/metadata_dataset.csv')
prepare_identicalColData(df, method='alt', cutoff=0, filename="identicalColumns_scoredByRarity_alt")

Unnamed: 0,Source,Target,weight
0,0,1,10.0
1,0,2,10.0
2,0,3,10.0
3,0,4,10.0
4,0,5,10.0
...,...,...,...
41562,1951,1954,10.0
41563,1953,1954,10.0
41564,1955,1956,10.0
41565,1955,1957,10.0


In [None]:
#| export
def generate_embeddings(df, api_key):
    """
    Takes the title, description, and long description, cleans them and combines into one long string.
    These long strings are then given to the openai text-embedding-ada-002 and used to generate an
    embedding. A formatted dataframe of embeddings is saved and returned.
    """

    inputs = df.iloc[:, :3]
    input_strings = {}
    for input in inputs.values:
        temp = [x for x in input if type(x) == str or not math.isnan(x)]
        temp = [str(item) for item in temp]
        input_strings[input[0]] = '|'.join(temp).replace("'", "").replace("\n", "").replace("\xa0", "")
    
    openai.api_key = api_key
    
    embeddings = {}
    # count = 0 # included to prevent wasting money using the api more times than necessary
    for item_name in input_strings.keys():
        if len(input_strings[item_name]) > 39300: # prevent exceeding token limit
            continue
    
        response = openai.Embedding.create(
            input=input_strings[item_name],
            model="text-embedding-ada-002"
        )
        embeddings[item_name] = (response['data'][0]['embedding'])
        
        # count += 1
        # if count >= 5:
        #     break

    embedding_df = pd.DataFrame.from_dict(embeddings)
    embedding_df = embedding_df.T.reset_index()
    embedding_df = embedding_df.rename(columns = {'index': 'Dataset_title'})
    embedding_df.to_csv('data/embeddings_df.csv', index=False)
    
    return embedding_df

In [None]:
df = pd.read_csv('data/metadata_dataset.csv')
# embedding_df = generate_embeddings(df, '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')

In [None]:
def prepare_embeddingData(embedding_df,           # Dataset
                          cutoff = 0.05,          # Value from 0-1 e.g. 0.05 would only save the top 5% of values
                          save = False,           # Do you want to save the dataframe True/False
                          filename = 'testing'):  # The name given to the file
    """
    Creating a dataframe to be used to make a network diagram.
    
    To do this we:
     - Go through every dataset
     - Get a list of the similarity between each dataset and every other dataset
    
    The dataframe has a source, target, and weight columns. The similarity between the source and 
    the target is the weight.

    The weights are normalised so the range is larger to fit more easily with the D3 code
    """
    network_df = pd.DataFrame(columns=['Source', 'Target', 'weight'])
    
    for i in range(len(embedding_df)):
        
        labels = list(embedding_df.iloc[:, 0].values)
        ls, s_values = cos_similarity(embedding_df.iloc[:, :-1], i, num=len(embedding_df)-1)
        
        source_labels = [labels[i]] * len(s_values)
        
        temp_df = pd.DataFrame({'Source': source_labels, 'Target': ls, 'weight': s_values})
        temp_df = temp_df.sort_values(by='weight', ascending=False)
        network_df = pd.concat([network_df, temp_df]).reset_index(drop=True)
    
    top5pct = network_df.sort_values(by='weight', ascending=False).iloc[:math.ceil(len(network_df)*cutoff), :].reset_index(drop=True)

    # Normalising the weights to between 0.2 and 10
    min_weight = top5pct['weight'].min()
    max_weight = top5pct['weight'].max()
    
    # Apply the linear transformation to normalize the 'weight' column
    top5pct.weight = ((top5pct.weight - min_weight) / (max_weight - min_weight)) * (10 - 0.2) + 0.2
    top5pct.weight = (top5pct.weight / 0.2).round() * 0.2
    
    top5pct = formatForD3(df.Title, top5pct, 0, save, filename)

    return top5pct

In [None]:
top5pct = prepare_embeddingData(embedding_df, save=True, filename='descriptionEmbeddings_top5pct')

In [None]:
top5pct

Unnamed: 0,Source,Target,weight
0,1944,1943,10.0
1,1943,1944,10.0
2,144,117,10.0
3,117,144,10.0
4,1979,1976,9.8
...,...,...,...
195125,1684,316,0.2
195126,311,330,0.2
195127,330,311,0.2
195128,1859,1162,0.2


In [None]:
#| hide
import nbdev; nbdev.nbdev_export()