# Matching Datasets Using Their Unique Columns / Parameters

In [None]:
#| default_exp core

In [None]:
#| hide
from nbdev.showdoc import *

In [None]:
#| hide
import pandas as pd
import plotly.express as px
from sklearn.decomposition import PCA
import re
import ast

In [None]:
# Load the dataset metadata table from CSV. Later cells read its 'Columns',
# 'Unique_parameters' and 'Title' columns.
metadata_df = pd.read_csv('~/database_compendium/data/metadata_w_nomis_description.csv')

In [None]:
# Loop through the metadata dataframe and collect, for every dataset, its
# column titles plus the headings (keys) of its unique parameters. Datasets
# with no data in the 'Columns' column still get their unique-parameter keys,
# so they have something to be matched on.
faults = 0  # datasets whose 'Unique_parameters' value is missing or unparseable
cols_list = []
for columns, unique_params in metadata_df.loc[:, ['Columns', 'Unique_parameters']].values:
    temp_col_list = []

    # Column titles: a string holding a Python literal, parsed with
    # ast.literal_eval. A missing value (NaN) contributes nothing; appending
    # it would later be stringified to a spurious 'nan' title that falsely
    # matches every other dataset lacking column data.
    if isinstance(columns, str):
        temp_col_list += ast.literal_eval(columns)
    elif not pd.isna(columns):
        temp_col_list.append(columns)

    # Unique parameters: a string holding a dict literal; only its keys are
    # used. Anything missing or malformed is counted as a fault and skipped.
    if isinstance(unique_params, str):
        try:
            temp_col_list += list(ast.literal_eval(unique_params).keys())
        except (ValueError, TypeError, SyntaxError, AttributeError):
            # literal_eval rejected the text, or the literal was not a dict
            faults += 1
    else:
        faults += 1

    cols_list.append(temp_col_list)

# Removing extra text within brackets, converting to lowercase and removing duplicates
bracketed_text = re.compile(r"\([^()]*\)")
cols_list = [
    list({bracketed_text.sub("", str(title)).lower() for title in titles_of_dataset})
    for titles_of_dataset in cols_list
]

In [None]:
# Inspect the cleaned column / parameter titles collected for the first dataset
cols_list[0]

['v4_2',
 'geography',
 'ucl',
 'yyyy-qq',
 'wellbeing-estimate',
 'lcl',
 'measureofwellbeing',
 'seasonal-adjustment',
 'measure-of-wellbeing',
 'seasonaladjustment',
 'time',
 'estimate',
 'uk-only']

## Matching Datasets by Identical Columns / Parameters

In [None]:
#| export
def find_identical_cols(cols_list, compare_idx):
    """
    Find the column titles each other dataset shares with a chosen dataset.

    Titles are compared after stripping surrounding whitespace. Titles that
    begin with a version tag of the form vX_Y (e.g. 'v4_2') are never
    reported as matches.

    Parameters
    ----------
    cols_list : list of list of str
        Column titles of every dataset, one inner list per dataset. It is
        not modified.
    compare_idx : int
        Position in `cols_list` of the dataset to compare against. To find
        the index of a named dataset, search for the name in metadata_df and
        use its corresponding index.

    Returns
    -------
    list of list of str
        One alphabetically sorted list of shared titles for every dataset
        except the one at `compare_idx`, in the order the datasets appear in
        `cols_list` (entries after `compare_idx` shift down by one position).

    Raises
    ------
    IndexError
        If `compare_idx` is out of range for `cols_list`.
    """
    version_tag = re.compile(r"v\d+_\d+")  # we want to drop strings starting vX_Y

    # Build the reference set once. Dropping version tags here is equivalent
    # to dropping them from every other dataset, because a title can only be
    # shared if it is present on both sides.
    comp_cols = {
        title
        for title in (raw.strip() for raw in cols_list[compare_idx])
        if not version_tag.match(title)
    }

    # Copy then pop (rather than slice around the index) so that negative
    # indices keep their usual list meaning.
    other_datasets = list(cols_list)
    other_datasets.pop(compare_idx)

    # Sort each intersection so the output does not depend on set ordering,
    # which varies between interpreter runs for strings.
    return [
        sorted(comp_cols.intersection(title.strip() for title in titles))
        for titles in other_datasets
    ]

In [None]:
# Rank every other dataset by how many column titles it shares with the
# chosen dataset, then show the 10 best matches
dataset_to_compare = 13

identical_cols = find_identical_cols(cols_list, dataset_to_compare)
num_similarities = list(map(len, identical_cols))

# Leave out the chosen dataset's own title so the titles line up one-to-one
# with the rows returned by find_identical_cols
titles = metadata_df.loc[:, 'Title']
titles = list(titles.drop(dataset_to_compare))

idt_cols_df = pd.DataFrame(
    {
        'Title': titles,
        'Number_of_identical_columns': num_similarities,
        'Identical_columns': identical_cols,
    }
)
idt_cols_df.sort_values(by='Number_of_identical_columns', ascending=False).head(10)

Unnamed: 0,Title,Number_of_identical_columns,Identical_columns
12,Sexual orientation by English regions and UK c...,9,"[cv, sexualorientation, geography, sexual-orie..."
48,"Local authority ageing statistics, based on an...",8,"[geography, calendar-years, unit-of-measure, t..."
47,"Local authority ageing statistics, population ...",8,"[geography, calendar-years, unit-of-measure, t..."
27,UK Labour Market,8,"[geography, unit-of-measure, time, unitofmeasu..."
44,"Earnings and hours worked, age group by occupa...",8,"[cv, geography, calendar-years, time, sex, age..."
19,"Local authority ageing statistics, household p...",6,"[geography, calendar-years, time, sex, age-gro..."
5,Deaths registered weekly in England and Wales ...,6,"[geography, calendar-years, time, sex, age-gro..."
23,"Local authority ageing statistics, net interna...",6,"[geography, calendar-years, time, sex, age-gro..."
42,"Earnings and hours worked, care workers: ASHE ...",6,"[cv, geography, calendar-years, time, sex, uk-..."
40,"Earnings and hours worked, region by occupatio...",5,"[cv, geography, calendar-years, time, sex]"


## Using Fuzzy String Matching

In [None]:
#| hide
import textdistance

In [None]:
# def find_similar_cols(cols_list, compare_idx):


In [None]:
#| hide
import nbdev; nbdev.nbdev_export()