In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
import os
# For Analysis
import ipywidgets as widgets
from IPython.display import display, clear_output

# Acquiring Data

This will acquire the data from Drive once fully developed. Currently, the data is read from the local `data` folder, which is ignored in GitHub. Right now, we only load the exports of the Hits and Query folders, not Assembly.

In [None]:
# Local data directory under the current working directory.
# This will have to change after we set it up to pull from Drive.
datapath = f"{os.getcwd()}/data/"

In [None]:
# Load the two exported tables. The csv file names are currently hardcoded.
query_csv_path = datapath + "query.csv"
hits_csv_path = datapath + "hits.csv"
query = pd.read_csv(query_csv_path)
hits = pd.read_csv(hits_csv_path)

# Cleaning Data

This section cleans the data and also shows exactly what was dropped/manipulated.

In [None]:
# These functions are for later exception raising/preliminary analysis of the actual csv structure.
# They are mainly for cleaning the data, analysis functions will be later/in a different block.
def compare_columns_sets(df1, df2):
    """Report which column labels are exclusive to each of two DataFrames.

    Returns:
        A pair ``(only_in_df1, only_in_df2)`` of sets of column labels.
    """
    first_cols = set(df1.columns)
    second_cols = set(df2.columns)
    # Set difference keeps the labels of one frame that the other lacks.
    only_in_first = first_cols.difference(second_cols)
    only_in_second = second_cols.difference(first_cols)
    return only_in_first, only_in_second

def drop_nan_columns(df):
    """Return a new DataFrame without the columns in which every value is missing."""
    return df.dropna(how='all', axis='columns')

def unusual_row_mask(df, col, threshold=0.5):
    """Flag the rows that hold a value in a column that is mostly missing.

    If the fraction of missing values in ``df[col]`` is at least ``threshold``,
    the mask is True wherever the column has a value. Otherwise the mask is
    all False.
    """
    column = df[col]
    mostly_missing = column.isna().mean() >= threshold
    if not mostly_missing:
        return pd.Series([False] * len(df), index=df.index)
    return column.notna()

def get_peculiar_columns(df, threshold=0.5):
    """Return the missing-value fraction of every column whose fraction exceeds ``threshold``.

    The result is a Series indexed by column label.
    """
    missing_fraction = df.isnull().sum() / df.shape[0]
    return missing_fraction[missing_fraction > threshold]

def rows_for_peculiar_columns(df, threshold=0.5):
    """Map each mostly-missing column to the mask of rows that do hold a value.

    Returns:
        A dictionary keyed by column label. Columns whose mask selects no row
        are left out.
    """
    candidate_masks = (
        (column_name, unusual_row_mask(df, column_name, threshold))
        for column_name in get_peculiar_columns(df, threshold).index
    )
    return {column_name: mask for column_name, mask in candidate_masks if mask.any()}

def split_query_name(row, splitting_column='Name'):
    """Expand ``key=value`` tokens embedded in one cell into separate fields.

    The cell ``row[splitting_column]`` is split on whitespace. Every token that
    contains ``=`` is stored in the returned row under its key. A token with
    an empty value (``key=``) is stored as NaN.

    Args:
        row: A pandas Series, typically one row passed by ``DataFrame.apply(axis=1)``.
        splitting_column: Label of the cell that carries the packed metadata.

    Returns:
        A copy of ``row`` with the extracted fields added. The input row is
        left untouched, because mutating the object handed over by ``apply``
        is not supported by pandas.
    """
    row = row.copy()
    name = row[splitting_column]
    # A missing cell (NaN) or other non-string value has nothing to split.
    if not isinstance(name, str):
        return row
    for token in name.split():
        # Split on the first "=" only, so values that contain "=" stay intact.
        key, separator, value = token.partition("=")
        if not separator:
            continue
        row[key] = value if value else np.nan
    return row

def compare_columns_rowwise(df, col1, col2):
    """
    Compare two columns in a DataFrame row-wise.

    Rows in which both columns are missing (NaN) count as equal. A plain
    ``!=`` would report them as different, because NaN never equals NaN.

    Returns:
        -1 if all values are the same in every row,
        otherwise, a tuple of row indices where values differ.
    """
    left, right = df[col1], df[col2]
    both_missing = left.isna() & right.isna()
    mask = (left != right) & ~both_missing
    diff_indices = tuple(df.index[mask])
    return diff_indices if diff_indices else -1

def combine_col1_into_col2(df, col1, col2):
    """
    Combine values from col1 into col2 if col2 is NaN.
    If col2 is not NaN, it will keep the original value.

    Args:
        df: Source DataFrame. It is not modified.
        col1: Column whose values fill the gaps. It is dropped from the result.
        col2: Column that receives the values. If it does not exist yet, it is
            created as a copy of col1.

    Returns:
        A new DataFrame containing the combined col2 and no col1.
    """
    combined = df.copy()
    if col2 in combined.columns:
        # Only fill the gaps; existing col2 values win over col1.
        combined[col2] = combined[col2].fillna(combined[col1])
    else:
        combined[col2] = combined[col1]
    return combined.drop(columns=[col1])  # Drops col1 after combining


### Immediate Cleanup

Some of the data has all NaN values for certain columns, so they need to be cleaned up. Also, some qualities of the unclean data are shown.

In [None]:
# Report how many columns each raw table has before any cleaning.
for table_label, raw_table in (("query", query), ("hits", hits)):
    print(len(raw_table.columns), f"columns in uncleaned {table_label}")

In [None]:
# Column labels of the raw query table, shown as the cell output.
query.columns

In [None]:
# Column labels of the raw hits table, shown as the cell output.
hits.columns

In [None]:
# Remove the columns that hold no data at all and report which ones disappeared.
dropped_query = drop_nan_columns(query)
dropped_hits = drop_nan_columns(hits)

print("For dropped query", compare_columns_sets(query, dropped_query))

print("For dropped hits", compare_columns_sets(hits, dropped_hits))

# Error check. Make Better Later
# 'Created Date' and 'Created' must agree in both tables before they are merged;
# the query table is checked first, then the hits table.
for checked_table in (dropped_query, dropped_hits):
    differing_rows = compare_columns_rowwise(checked_table, 'Created Date', 'Created')
    if differing_rows != -1:
        raise ValueError(differing_rows, "are the rows where Created Date and Created differ.")

# Combines Created Date into Created
dropped_query = combine_col1_into_col2(dropped_query, col1='Created Date', col2='Created')
dropped_hits = combine_col1_into_col2(dropped_hits, col1='Created Date', col2='Created')
print("Merged Created Date into Created in both query and hits.")

### Metadata Cleanup

The exportation from Geneious Prime has extra data in Name and Query from the sequences in the folders Query and Hits respectively. These will enable us to partially link them together later in analysis.

This next block shows exactly what we have to split up in our data, as Geneious Prime put more data inside certain cells than others.

In [None]:
# Demonstrates the packed metadata found in certain cells of hits and query.
print("There's metadata in certain cells.")
# Do not truncate long cell contents when printing.
pd.set_option('display.max_colwidth', None)

# Row labels 10 and 20 are arbitrary examples.
query_val = dropped_hits.loc[10, ["Query"]]
print("From the Query column in hits:\n", query_val)

name_val = dropped_query.loc[20, ["Name"]]
print("\nFrom the Name column in query:\n", name_val)

Yet, for our queries that were contigs, the "Name" does not have the extra data (as it is just named "Contig #"), so we lose a lot of extra data. We need to split the query dataframe.

In [None]:
# Separates the query table into contig rows and non-contig rows.
# TODO May need to do equals() to see if each column really does correspond to a contig.
query_masks = rows_for_peculiar_columns(dropped_query)
print("These keys are what contigs but not regular sequences have:\n", query_masks.keys())

# Rows that have a value in '# Source Sequences' are treated as contigs.
is_contig = query_masks['# Source Sequences']
contig_query = dropped_query.loc[is_contig]
noncontig_query = drop_nan_columns(dropped_query.loc[~is_contig])

Some of the extracted metadata fields turn out to be empty, so those also need to be cleaned up.

In [None]:
# Unpacks the key=value metadata into separate columns, then drops columns
# that end up entirely empty (such as sample_id, as there is none).

# Nothing happens for contigs, as Name is just Contig #.
clean_contig_query = contig_query.apply(split_query_name, axis=1)

noncontig_expanded = noncontig_query.apply(split_query_name, axis=1)
clean_noncontig_query = drop_nan_columns(noncontig_expanded)

# For hits, the metadata sits in the Query column instead of Name.
hits_expanded = dropped_hits.apply(
    lambda hit_row: split_query_name(hit_row, splitting_column="Query"),
    axis=1,
)
clean_hits = drop_nan_columns(hits_expanded)

In [None]:
# Lists the columns found only in the contig queries and only in the non-contig queries.
in_contig, not_in_contig = compare_columns_sets(clean_contig_query, clean_noncontig_query)
print(
    "This is what is uniquely inside contigs:\n", in_contig,
    "\nThis is what is uniquely in non-contigs:\n", not_in_contig,
)

In [None]:
# Fold 'barcode_alias' into 'barcode' for both the non-contig queries and the hits;
# the alias column is removed afterwards.
clean_noncontig_query = combine_col1_into_col2(
    clean_noncontig_query, col1='barcode_alias', col2='barcode'
)
clean_hits = combine_col1_into_col2(
    clean_hits, col1='barcode_alias', col2='barcode'
)

In [None]:
# Fills the gaps in clean_hits for rows that came from contigs:
# barcode and protocol_group_id are taken from neighbouring rows, and
# parent_read_id falls back to the Query column.
# NOTE(review): bfill/ffill presumes neighbouring rows share the same barcode
# and protocol_group_id — confirm this holds when several barcodes are mixed.
run_columns = ['barcode', 'protocol_group_id']
clean_hits[run_columns] = clean_hits[run_columns].bfill().ffill()

parent_ids = clean_hits['parent_read_id']
clean_hits['parent_read_id'] = parent_ids.fillna(clean_hits['Query'])

Final look at what the dataframe looks like along with final bug tests before analysis.

In [None]:
# Example record from the cleaned hits table (row label 1), printed without truncation.
c_hits_val = clean_hits.loc[1]
pd.set_option('display.max_colwidth', None)
print("From a row in c_hits:\n", c_hits_val)

In [None]:
# Columns in c_contig_query, illustrated with the fourth row of the table.
c_contig_val = clean_contig_query.iloc[3, :]
pd.set_option('display.max_colwidth', None)
print("Columns (+ examples) in c_contig_query:\n")
# The label previously said "c_hits", copied over from the hits cell.
print("From a row in c_contig_query:\n", c_contig_val)

In [None]:
## TODO, make tests of data connections.
#assert all(hits.columns == query.columns), "Columns in hits and query do not match"
#assert all(hits.columns == query.columns), "Columns in hits and query do not match"

In [None]:
## TODO, analysis of filtered workflow vs unfiltered workflow.
#filtered_hits = pd.read_csv(datapath + "barcode13-filtered-hits.csv")
#filtered_hits = filtered_hits.apply(split_query_name, axis=1)
#filtered_hits.columns
#hits[hits["parent_read_id"] != filtered_hits["parent_read_id"]]

# Analysis

The data has been cleaned up. Now it is time to use it to see the details of the run.

In [None]:
## Columns of clean_hits shown in the analysis tables (kept in one place so we can change later):
USEFUL_COLS = [
    'Name', 'parent_read_id', 'Accession',
    'Grade', 'E Value', 'Bit-Score',
    '% Identical Sites', '% Pairwise Identity',
    'barcode', 'Sequence Length',
]

# Share of hits per organism, in percent.
organism_share = clean_hits['Organism'].value_counts(normalize=True)
organism_percent = organism_share.mul(100)

In [None]:
# Interactive controls for the analysis section.
# Look at https://ipywidgets.readthedocs.io/en/7.7.1/examples/Widget%20Basics.html for more info on widgets.

## Organism picker; starts on the first entry of organism_percent.
organism_dropdown = widgets.Dropdown(
    description='Organism:',
    options=list(organism_percent.index),
    value=organism_percent.index[0],
    continuous_update=False,
    disabled=False,
)

## Number of results to show, limited to the number of rows in clean_hits.
num_results_input = widgets.BoundedIntText(
    description='Top Grade #:',
    value=3,
    min=1,
    max=len(clean_hits),
    step=1,
    continuous_update=False,
    disabled=False,
)

## Toggle for ascending grade order (unchecked by default).
ascending_input = widgets.Checkbox(
    description='Ascending Grade Order',
    value=False,
    continuous_update=False,
    disabled=False,
)



In [None]:
# Functions for Analysis
## Bordered output area that shows the Grade-sorted hits of the chosen organism.
output_organism = widgets.Output(layout=dict(border='1px solid black'))
def show_top_hits_dropdown(organism_name, n_results, ascending=False):
    """Display the first ``n_results`` hits of one organism, ordered by Grade.

    Grade is first interpreted as a percentage string (``"97.5%"``) and sorted
    numerically. If that conversion fails for any reason, the rows are sorted
    by the raw Grade column instead. Only the columns listed in USEFUL_COLS
    are displayed.
    """
    organism_hits = clean_hits[clean_hits['Organism'] == organism_name]
    try:
        organism_hits = organism_hits.copy()
        grade_text = organism_hits['Grade'].str.rstrip('%')
        organism_hits['Grade_numeric'] = grade_text.astype(float)
        organism_hits = organism_hits.sort_values('Grade_numeric', ascending=ascending)
    except Exception:
        # Fall back to sorting the Grade column as it is.
        organism_hits = organism_hits.sort_values('Grade', ascending=ascending)
    top_rows = organism_hits.loc[:, USEFUL_COLS].head(n_results)
    display(top_rows)

# Redraws the output area from the current state of all three controls.
def _refresh_top_hits(change=None):
    """Clear the output area and show the top hits for the current widget values.

    ``change`` is the notification passed by ``observe``; it is not used,
    because the values are read directly from the widgets.
    """
    with output_organism:
        clear_output()
        show_top_hits_dropdown(organism_dropdown.value, num_results_input.value, ascending_input.value)

# All three controls trigger the same refresh. The individual names are kept
# so each widget can be wired to its own handler name.
on_dropdown_change = _refresh_top_hits         # organism selection changed
on_num_results_change = _refresh_top_hits      # number of results changed
on_ascending_input_change = _refresh_top_hits  # sort direction changed

### Number of Unique Organisms Found

In [None]:
# Print each organism's share of the hits together with its absolute hit count.
# The counts are computed once instead of rescanning clean_hits per organism.
organism_counts = clean_hits['Organism'].value_counts()
for organism, percent in organism_percent.items():
    print(f"{organism} - {percent:.2f} % - {organism_counts.loc[organism]} hits")

In [None]:
# Wires each control to its handler and displays the interactive widget.
organism_dropdown.observe(on_dropdown_change, names='value')
num_results_input.observe(on_num_results_change, names='value')
ascending_input.observe(on_ascending_input_change, names='value')

display(organism_dropdown, num_results_input, ascending_input, output_organism)

# Render the table once for the initial widget values; otherwise the output
# area stays empty until the user changes one of the controls.
on_dropdown_change(None)

In [None]:
# Quick and dirty identified vs. non-identified: hit rows divided by all query rows.
# NOTE(review): this presumes one hit row per identified query — confirm;
# several hits per query would inflate the percentage.
total_queries = clean_contig_query.shape[0] + clean_noncontig_query.shape[0]
identified_percent = clean_hits.shape[0] / total_queries * 100
print(f"{identified_percent:.2f}% of sequences were identified.")