In [4]:
# Script 1: Generic Search Through the Entire Project
from concurrent.futures import ThreadPoolExecutor, as_completed
from google.cloud import bigquery

# Target project and the literal strings to look for (edit these before running)
project_id = "yhcr-prd-phm-bia-core"
search_phrases = [
    "0401010ADAAAAAA",
    "0401010ADAAAHAH",
]

# BigQuery client bound to the target project; the search function below uses it
client = bigquery.Client(project=project_id)

# Function to search for the phrase or value in a project
def search_in_project(project_id, search_phrases):
    """Search every dataset and table of a BigQuery project for the given phrases.

    For each table and each phrase the checks run in this order and stop at
    the first hit: table name (case-insensitive), column name
    (case-insensitive), then a case-sensitive substring match against the
    STRING form of every searchable column.

    All BigQuery calls go through the module-level ``client``.

    Args:
        project_id: ID of the project whose datasets are listed and queried.
        search_phrases: Strings to look for.

    Returns:
        list[str]: One human-readable line per finding, in completion order
        of the per-dataset workers. A dataset or table that cannot be read or
        queried yields an "Error accessing ..." line instead of being skipped
        silently. When nothing was recorded at all, the list holds a single
        "No match found ..." line.
    """
    # BigQuery rejects CAST(... AS STRING) for these types, and one such
    # column would make the whole value query fail. They (and any REPEATED
    # column) are therefore left out of the value search.
    unsearchable_types = {"RECORD", "STRUCT", "GEOGRAPHY", "JSON"}

    def search_table(dataset_id, table_id):
        """Return the result lines for one table."""
        try:
            table = client.get_table(f"{project_id}.{dataset_id}.{table_id}")
        except Exception as e:
            return [f"Error accessing table {dataset_id}.{table_id}: {str(e)}"]

        table_results = []
        columns = {field.name.lower() for field in table.schema}

        # The phrase is bound as a query parameter and matched with STRPOS, so
        # quotes, '%' and '_' inside it are taken literally. Column names are
        # backtick-quoted so reserved words are valid identifiers. SAFE_CAST
        # turns per-row conversion errors into NULL.
        column_checks = " OR ".join(
            f"STRPOS(SAFE_CAST(`{field.name}` AS STRING), @phrase) > 0"
            for field in table.schema
            if field.mode != "REPEATED" and field.field_type not in unsearchable_types
        )
        query = f"""
        SELECT COUNT(*)
        FROM `{project_id}.{dataset_id}.{table_id}`
        WHERE {column_checks}
        """

        for search_phrase in search_phrases:
            # Check if the search phrase is a table name
            if table_id.lower() == search_phrase.lower():
                table_results.append(f"Phrase '{search_phrase}' found as a table name in dataset {dataset_id} of project {project_id}.")
                continue

            # Check if the search phrase is a column name
            if search_phrase.lower() in columns:
                table_results.append(f"Phrase '{search_phrase}' found as a column name in dataset {dataset_id}.{table_id} of project {project_id}.")
                continue

            # Nothing to query when the table has no searchable column
            if not column_checks:
                continue

            # Check if the search phrase is a value in any column of the table
            job_config = bigquery.QueryJobConfig(
                query_parameters=[bigquery.ScalarQueryParameter("phrase", "STRING", search_phrase)]
            )
            try:
                rows = client.query(query, job_config=job_config).result()
                if list(rows)[0][0] > 0:
                    table_results.append(f"Value '{search_phrase}' found in a column of dataset {dataset_id}.{table_id} of project {project_id}.")
            except Exception as e:
                # Report the failure; a swallowed error would look like "no match"
                table_results.append(f"Error accessing table {dataset_id}.{table_id}: {str(e)}")

        return table_results

    def search_dataset(dataset):
        """Return the result lines for every table of one dataset."""
        dataset_id = dataset.dataset_id
        try:
            # Qualify with project_id so the listing targets the same project
            # as the queries, not the client's default project.
            tables = list(client.list_tables(f"{project_id}.{dataset_id}"))
        except Exception as e:
            return [f"Error accessing dataset {dataset_id}: {str(e)}"]

        dataset_results = []
        for table in tables:
            dataset_results.extend(search_table(dataset_id, table.table_id))
        return dataset_results

    results = []
    datasets = list(client.list_datasets(project=project_id))

    # Use ThreadPoolExecutor to search datasets in parallel
    with ThreadPoolExecutor(max_workers=10) as executor:
        futures = [executor.submit(search_dataset, dataset) for dataset in datasets]
        for future in as_completed(futures):
            results.extend(future.result())

    if not results:
        results.append(f"No match found for any search phrase in the project {project_id}.")

    return results

# Run the project-wide search and print each finding on its own line
results = search_in_project(project_id=project_id, search_phrases=search_phrases)
for result in results:
    print(result)




Value '0401010ADAAAAAA' found in a column of dataset CB_LOOKUPS.tbl_BNF_DMD_SNOMED_lkp of project yhcr-prd-phm-bia-core.
Value '0401010ADAAAHAH' found in a column of dataset CB_LOOKUPS.tbl_BNF_DMD_SNOMED_lkp of project yhcr-prd-phm-bia-core.


In [2]:
# Script 2: Search in Specific Datasets
from concurrent.futures import ThreadPoolExecutor, as_completed
from google.cloud import bigquery

# Target project, the datasets to restrict the search to, and the phrases to look for
project_id = "yhcr-prd-phm-bia-core"
dataset_ids = [  # Add the dataset IDs you want to search in
    "CB_FDM_Cardio",
    "CB_FDM_Calderdale",
]
search_phrases = ["observation_period"]

# BigQuery client bound to the target project; the search function below uses it
client = bigquery.Client(project=project_id)

# Function to search for the phrase or value in specific datasets
def search_in_datasets(project_id, dataset_ids, search_phrases):
    """Search every table of the listed datasets for the given phrases.

    For each table and each phrase the checks run in this order and stop at
    the first hit: table name (case-insensitive), column name
    (case-insensitive), then a case-sensitive substring match against the
    STRING form of every searchable column.

    All BigQuery calls go through the module-level ``client``.

    Args:
        project_id: ID of the project that owns the datasets.
        dataset_ids: Dataset IDs whose tables are searched.
        search_phrases: Strings to look for.

    Returns:
        list[str]: One human-readable line per finding, in completion order
        of the per-table workers. A dataset or table that cannot be read or
        queried yields an "Error accessing ..." line instead of being skipped
        silently. When nothing was recorded at all, the list holds a single
        "No match found ..." line.
    """
    # BigQuery rejects CAST(... AS STRING) for these types, and one such
    # column would make the whole value query fail. They (and any REPEATED
    # column) are therefore left out of the value search.
    unsearchable_types = {"RECORD", "STRUCT", "GEOGRAPHY", "JSON"}
    results = []

    def search_table(dataset_id, table_id, search_phrases):
        """Return the result lines for one table."""
        try:
            table = client.get_table(f"{project_id}.{dataset_id}.{table_id}")
        except Exception as e:
            return [f"Error accessing table {dataset_id}.{table_id}: {str(e)}"]

        table_results = []
        columns = {field.name.lower() for field in table.schema}

        # The phrase is bound as a query parameter and matched with STRPOS, so
        # quotes, '%' and '_' inside it are taken literally (note that
        # 'observation_period' contains the LIKE wildcard '_'). Column names
        # are backtick-quoted so reserved words are valid identifiers.
        column_checks = " OR ".join(
            f"STRPOS(SAFE_CAST(`{field.name}` AS STRING), @phrase) > 0"
            for field in table.schema
            if field.mode != "REPEATED" and field.field_type not in unsearchable_types
        )
        query = f"""
        SELECT COUNT(*)
        FROM `{project_id}.{dataset_id}.{table_id}`
        WHERE {column_checks}
        """

        for search_phrase in search_phrases:
            # Check if the search phrase is a table name
            if table_id.lower() == search_phrase.lower():
                table_results.append(f"Phrase '{search_phrase}' found as a table name in dataset {dataset_id} of project {project_id}.")
                continue

            # Check if the search phrase is a column name
            if search_phrase.lower() in columns:
                table_results.append(f"Phrase '{search_phrase}' found as a column name in dataset {dataset_id}.{table_id} of project {project_id}.")
                continue

            # Nothing to query when the table has no searchable column
            if not column_checks:
                continue

            # Check if the search phrase is a value in any column of the table
            job_config = bigquery.QueryJobConfig(
                query_parameters=[bigquery.ScalarQueryParameter("phrase", "STRING", search_phrase)]
            )
            try:
                rows = client.query(query, job_config=job_config).result()
                if list(rows)[0][0] > 0:
                    table_results.append(f"Value '{search_phrase}' found in a column of dataset {dataset_id}.{table_id} of project {project_id}.")
            except Exception as e:
                # Report the failure; a swallowed error would look like "no match"
                table_results.append(f"Error accessing table {dataset_id}.{table_id}: {str(e)}")

        return table_results

    # Use ThreadPoolExecutor to search tables in parallel
    with ThreadPoolExecutor(max_workers=10) as executor:
        futures = []
        for dataset_id in dataset_ids:
            try:
                # Qualify with project_id so the listing targets the same
                # project as the queries, not the client's default project.
                tables = list(client.list_tables(f"{project_id}.{dataset_id}"))
            except Exception as e:
                # One bad dataset ID must not abort the remaining datasets
                results.append(f"Error accessing dataset {dataset_id}: {str(e)}")
                continue
            for table in tables:
                futures.append(executor.submit(search_table, dataset_id, table.table_id, search_phrases))

        for future in as_completed(futures):
            results.extend(future.result())

    if not results:
        results.append(f"No match found for any search phrase in the specified datasets of project {project_id}.")

    return results

# Run the dataset-scoped search and print each finding on its own line
results = search_in_datasets(
    project_id=project_id,
    dataset_ids=dataset_ids,
    search_phrases=search_phrases,
)
for result in results:
    print(result)




Phrase 'observation_period' found as a table name in dataset CB_FDM_Cardio of project yhcr-prd-phm-bia-core.
Phrase 'observation_period' found as a table name in dataset CB_FDM_Calderdale of project yhcr-prd-phm-bia-core.


In [7]:
# Script 3: Search in Specific Tables
from concurrent.futures import ThreadPoolExecutor, as_completed
from google.cloud import bigquery

# Target project, the single dataset to look in, the tables to search, and the phrases
project_id = "yhcr-prd-phm-bia-core"
dataset_id = "CB_FDM_Maternity"
table_ids = [  # Add the table IDs you want to search in
    "data_dictionary",
    "cb_maternity_pathway_postnatal",
]
# NOTE(review): labelled person_id below; if this is a real identifier, confirm
# it is acceptable to keep it (and the printed output) in a shared notebook.
search_phrases = ["13517618"]  # person_id

# BigQuery client bound to the target project; the search functions below use it
client = bigquery.Client(project=project_id)

# Function to search for the phrase or value in a specific table
def search_in_table(project_id, dataset_id, table_id, search_phrases):
    """Search one BigQuery table for the given phrases.

    For each phrase, a case-insensitive column-name match is tried first. If
    that fails, a case-sensitive substring match is run against the STRING
    form of every searchable column.

    All BigQuery calls go through the module-level ``client``.

    Args:
        project_id: ID of the project that owns the table.
        dataset_id: ID of the dataset that owns the table.
        table_id: ID of the table to search.
        search_phrases: Strings to look for.

    Returns:
        list[str]: One human-readable line per finding. If the table cannot
        be read or a query fails, an "Error accessing table ..." line is
        appended and the remaining phrases are not searched. The list is
        empty when nothing matched and no error occurred.
    """
    # BigQuery rejects CAST(... AS STRING) for these types, and one such
    # column would make the whole value query fail. They (and any REPEATED
    # column) are therefore left out of the value search.
    unsearchable_types = {"RECORD", "STRUCT", "GEOGRAPHY", "JSON"}

    results = []
    try:
        # Fully qualified so the metadata lookup targets the same project as
        # the query below, not the client's default project.
        table = client.get_table(f"{project_id}.{dataset_id}.{table_id}")
        columns = {field.name.lower() for field in table.schema}

        # The phrase is bound as a query parameter and matched with STRPOS, so
        # quotes, '%' and '_' inside it are taken literally. Column names are
        # backtick-quoted so reserved words are valid identifiers. SAFE_CAST
        # turns per-row conversion errors into NULL.
        column_checks = " OR ".join(
            f"STRPOS(SAFE_CAST(`{field.name}` AS STRING), @phrase) > 0"
            for field in table.schema
            if field.mode != "REPEATED" and field.field_type not in unsearchable_types
        )
        query = f"""
        SELECT COUNT(*)
        FROM `{project_id}.{dataset_id}.{table_id}`
        WHERE {column_checks}
        """

        for search_phrase in search_phrases:
            # Check if the search phrase is a column name
            if search_phrase.lower() in columns:
                results.append(f"Phrase '{search_phrase}' found as a column name in dataset {dataset_id}.{table_id} of project {project_id}.")
                continue

            # Nothing to query when the table has no searchable column
            if not column_checks:
                continue

            # Check if the search phrase is a value in any column of the table
            job_config = bigquery.QueryJobConfig(
                query_parameters=[bigquery.ScalarQueryParameter("phrase", "STRING", search_phrase)]
            )
            rows = client.query(query, job_config=job_config).result()
            if list(rows)[0][0] > 0:
                results.append(f"Value '{search_phrase}' found in a column of dataset {dataset_id}.{table_id} of project {project_id}.")
    except Exception as e:
        results.append(f"Error accessing table {dataset_id}.{table_id}: {str(e)}")

    return results

# Function to search for the phrase or value in specific tables
def search_in_tables(project_id, dataset_id, table_ids, search_phrases):
    """Run ``search_in_table`` over several tables of one dataset concurrently.

    Args:
        project_id: ID of the project that owns the dataset.
        dataset_id: ID of the dataset that owns the tables.
        table_ids: IDs of the tables to search.
        search_phrases: Strings to look for; passed through unchanged.

    Returns:
        list[str]: The per-table result lines, concatenated in the order the
        workers finish. If no table produced any line, the list holds a
        single "No match found ..." line.
    """
    collected = []

    # Fan the tables out over a pool of up to 10 worker threads
    with ThreadPoolExecutor(max_workers=10) as pool:
        pending = []
        for table_id in table_ids:
            pending.append(
                pool.submit(search_in_table, project_id, dataset_id, table_id, search_phrases)
            )
        # Gather in completion order, not submission order
        for finished in as_completed(pending):
            collected += finished.result()

    if len(collected) == 0:
        collected.append(f"No match found for any search phrase in the specified tables of dataset {dataset_id} of project {project_id}.")

    return collected

# Run the table-scoped search and print each finding on its own line
results = search_in_tables(
    project_id=project_id,
    dataset_id=dataset_id,
    table_ids=table_ids,
    search_phrases=search_phrases,
)
for result in results:
    print(result)




Value '13517618' found in a column of dataset CB_FDM_Maternity.cb_maternity_pathway_postnatal of project yhcr-prd-phm-bia-core.


In [8]:
# Script 4: Search in Targeted Datasets and Tables
from concurrent.futures import ThreadPoolExecutor, as_completed
from google.cloud import bigquery

# Target project, the (dataset ID, table ID) pairs to search, and the phrases to look for
project_id = "yhcr-prd-phm-bia-core"
# Each tuple is (dataset ID, table ID)
dataset_table_pairs = [
    ("CB_LOOKUPS", "tbl_CTV3ToSnomed_Map"),
    ("CB_FDM_YAS", "cb_ObservationsFAST"),
]
search_phrases = [
    "X50JY",
    "F87D933909B3952060D101AB4463808C25496BEA24A00894EA5A903ED0395D78",
]

# BigQuery client bound to the target project; the search functions below use it
client = bigquery.Client(project=project_id)

# Function to search for the phrase or value in a specific table
def search_in_table(project_id, dataset_id, table_id, search_phrases):
    """Search one BigQuery table for the given phrases.

    For each phrase, a case-insensitive column-name match is tried first. If
    that fails, a case-sensitive substring match is run against the STRING
    form of every searchable column.

    All BigQuery calls go through the module-level ``client``.

    Args:
        project_id: ID of the project that owns the table.
        dataset_id: ID of the dataset that owns the table.
        table_id: ID of the table to search.
        search_phrases: Strings to look for.

    Returns:
        list[str]: One human-readable line per finding. If the table cannot
        be read or a query fails, an "Error accessing table ..." line is
        appended and the remaining phrases are not searched. The list is
        empty when nothing matched and no error occurred.
    """
    # BigQuery rejects CAST(... AS STRING) for these types, and one such
    # column would make the whole value query fail. They (and any REPEATED
    # column) are therefore left out of the value search.
    unsearchable_types = {"RECORD", "STRUCT", "GEOGRAPHY", "JSON"}

    results = []
    try:
        # Fully qualified so the metadata lookup targets the same project as
        # the query below, not the client's default project.
        table = client.get_table(f"{project_id}.{dataset_id}.{table_id}")
        columns = {field.name.lower() for field in table.schema}

        # The phrase is bound as a query parameter and matched with STRPOS, so
        # quotes, '%' and '_' inside it are taken literally. Column names are
        # backtick-quoted so reserved words are valid identifiers. SAFE_CAST
        # turns per-row conversion errors into NULL.
        column_checks = " OR ".join(
            f"STRPOS(SAFE_CAST(`{field.name}` AS STRING), @phrase) > 0"
            for field in table.schema
            if field.mode != "REPEATED" and field.field_type not in unsearchable_types
        )
        query = f"""
        SELECT COUNT(*)
        FROM `{project_id}.{dataset_id}.{table_id}`
        WHERE {column_checks}
        """

        for search_phrase in search_phrases:
            # Check if the search phrase is a column name
            if search_phrase.lower() in columns:
                results.append(f"Phrase '{search_phrase}' found as a column name in dataset {dataset_id}.{table_id} of project {project_id}.")
                continue

            # Nothing to query when the table has no searchable column
            if not column_checks:
                continue

            # Check if the search phrase is a value in any column of the table
            job_config = bigquery.QueryJobConfig(
                query_parameters=[bigquery.ScalarQueryParameter("phrase", "STRING", search_phrase)]
            )
            rows = client.query(query, job_config=job_config).result()
            if list(rows)[0][0] > 0:
                results.append(f"Value '{search_phrase}' found in a column of dataset {dataset_id}.{table_id} of project {project_id}.")
    except Exception as e:
        results.append(f"Error accessing table {dataset_id}.{table_id}: {str(e)}")

    return results

# Function to search for the phrase or value in specific tables across the project
def search_in_tables(project_id, dataset_table_pairs, search_phrases):
    """Run ``search_in_table`` over (dataset, table) pairs concurrently.

    Args:
        project_id: ID of the project that owns the tables.
        dataset_table_pairs: Iterable of ``(dataset_id, table_id)`` tuples.
        search_phrases: Strings to look for; passed through unchanged.

    Returns:
        list[str]: The per-table result lines, concatenated in the order the
        workers finish. If no table produced any line, the list holds a
        single "No match found ..." line.
    """
    collected = []

    # Fan the pairs out over a pool of up to 10 worker threads
    with ThreadPoolExecutor(max_workers=10) as pool:
        pending = []
        for dataset_id, table_id in dataset_table_pairs:
            pending.append(
                pool.submit(search_in_table, project_id, dataset_id, table_id, search_phrases)
            )
        # Gather in completion order, not submission order
        for finished in as_completed(pending):
            collected += finished.result()

    if len(collected) == 0:
        collected.append(f"No match found for any search phrase in the specified tables across the project {project_id}.")

    return collected

# Run the search over the targeted (dataset, table) pairs and print each finding
results = search_in_tables(
    project_id=project_id,
    dataset_table_pairs=dataset_table_pairs,
    search_phrases=search_phrases,
)
for result in results:
    print(result)





Value 'X50JY' found in a column of dataset CB_LOOKUPS.tbl_CTV3ToSnomed_Map of project yhcr-prd-phm-bia-core.
Value 'F87D933909B3952060D101AB4463808C25496BEA24A00894EA5A903ED0395D78' found in a column of dataset CB_FDM_YAS.cb_ObservationsFAST of project yhcr-prd-phm-bia-core.
