In [None]:
"""
Joinability Analysis using MinHash and Locality Sensitive Hashing (LSH)

This notebook demonstrates the use of MinHash and LSH for identifying joinable columns across multiple datasets. 
The analysis is performed on a collection of CSV files stored in the './somervillema' directory.

Overview of the Process:
1. Read each CSV file in the specified directory.
2. For each column in these CSV files, create a MinHash signature. 
3. Insert these MinHash signatures into an LSH index.
4. Perform a query using a specific column's MinHash to find similar columns in the collection of datasets.
5. Display the results, showing which columns from different datasets are potentially joinable with the query column.

Functions:
- create_minhash(column): Generates a MinHash signature for a given column from a DataFrame.

Key Variables:
- lsh: An instance of MinHashLSH, used for indexing MinHash signatures.
- minhashes: A dictionary that stores the MinHash signatures of each column.
- query_key: A tuple representing the dataset and column name used for querying the LSH index.

Query Execution:
- Queries are executed for the 'Year' and 'Latitude' columns in the '3ms3-ngki.csv' dataset.
- For each query, the LSH index is searched for columns similar to the query column.
- Results are filtered to exclude the query column itself and then printed.

Note: CSV files that pandas reports as empty (EmptyDataError) are skipped instead of aborting the run.
"""

In [59]:
import pandas as pd
from datasketch import MinHash, MinHashLSH
import os

In [60]:
def create_minhash(column, num_perm=128):
    """Build a MinHash signature from the distinct, non-missing values of a column.

    Parameters
    ----------
    column : iterable
        Values of one column (e.g. a pandas Series). Each value is converted
        with ``str`` and hashed as UTF-8 bytes.
    num_perm : int, default 128
        Number of permutation functions used by the signature. It must equal
        the ``num_perm`` of any MinHashLSH index the signature is inserted
        into or queried against.

    Returns
    -------
    MinHash
        Signature of the column's value set. A column with no usable values
        yields a signature that was never updated.
    """
    m = MinHash(num_perm=num_perm)
    # Missing values (NaN/None/NA) are skipped: hashing them as the literal
    # text 'nan' would give every column that contains nulls a shared element
    # and inflate its estimated similarity to unrelated columns.
    # Collecting into a set also avoids re-hashing repeated values; MinHash
    # depends only on the set of tokens, so the signature is unaffected.
    tokens = {
        str(val)
        for val in column
        if not (pd.api.types.is_scalar(val) and pd.isna(val))
    }
    for token in tokens:
        m.update(token.encode('utf8'))
    return m

In [75]:
# Directory holding the CSV datasets to index.
DATA_DIR = './somervillema'

# A low similarity threshold favours recall: weakly overlapping columns are
# still returned as join candidates, at the cost of more false positives.
lsh = MinHashLSH(threshold=0.1, num_perm=128)

# (filename, column) -> MinHash signature, used later to look up query columns.
minhashes = {}

# (filename, error type) for every CSV that could not be loaded.
skipped_files = []

# os.listdir returns entries in arbitrary order; sorting makes runs repeatable.
for filename in sorted(os.listdir(DATA_DIR)):
    if not filename.endswith('.csv'):
        continue
    try:
        # dtype=str keeps values as text so e.g. "2019" is hashed identically
        # no matter how pandas would otherwise infer the column type.
        df = pd.read_csv(os.path.join(DATA_DIR, filename), header=0, dtype=str)
    except (pd.errors.EmptyDataError, pd.errors.ParserError) as exc:
        # One empty or malformed file should not abort indexing of the rest,
        # but it is recorded so the omission is visible.
        skipped_files.append((filename, type(exc).__name__))
        continue
    for column in df.columns:
        mh = create_minhash(df[column])
        minhashes[(filename, column)] = mh
        # LSH keys use the "<filename>_<column>" format; the query cells rely
        # on this exact format to filter out the query column itself.
        lsh.insert(f"{filename}_{column}", mh)

if skipped_files:
    print(f"Skipped {len(skipped_files)} CSV file(s) that could not be read: {skipped_files}")


In [82]:
# Column to look up, identified as (dataset filename, column name).
query_key = ('3ms3-ngki.csv', 'Year')

if query_key not in minhashes:
    print(f"No MinHash found for {query_key}")
else:
    # Label under which the query column itself was inserted into the index.
    own_label = f"{query_key[0]}_{query_key[1]}"

    # Ask the LSH index for every column whose signature collides with the query.
    result = lsh.query(minhashes[query_key])

    # The query column always matches itself, so drop it from the candidates.
    filtered_result = [label for label in result if label != own_label]

    print(f"Columns similar to {query_key}:")
    for match in filtered_result:
        print(match)

Columns similar to ('3ms3-ngki.csv', 'Year'):
6j4n-batb.csv_Year
qu9x-4xq5.csv_Year
3qxw-3aiy.csv_mmwr_year
bi8e-5vw8.csv_Year


In [79]:
# for key in minhashes:
#     print(key,minhashes[key])

In [83]:
# Column to look up, identified as (dataset filename, column name).
query_key = ('3ms3-ngki.csv', 'Latitude')

# Only columns that were indexed have a stored signature to query with.
if query_key not in minhashes:
    print(f"No MinHash found for {query_key}")
else:
    # Label under which the query column itself was inserted into the index.
    own_label = f"{query_key[0]}_{query_key[1]}"

    # Ask the LSH index for every column whose signature collides with the query.
    result = lsh.query(minhashes[query_key])

    # The query column always matches itself, so drop it from the candidates.
    filtered_result = [label for label in result if label != own_label]

    print(f"Columns similar to {query_key}:")
    for match in filtered_result:
        print(match)

Columns similar to ('3ms3-ngki.csv', 'Latitude'):
6j4n-batb.csv_Latitude
