<a href="https://colab.research.google.com/github/DeepHMS/Library-Explorer/blob/main/Library_Explorer_ipywidgets_v2_0.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
%pip install --quiet ipywidgets
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
import matplotlib.patches as patches
import seaborn as sns
from google.colab import drive
import ipywidgets as widgets
from IPython.display import display, clear_output
from tqdm.notebook import tqdm
from ipywidgets import Image
from ipywidgets import Image, HBox, VBox, HTML
import base64
import matplotlib.pyplot as plt
from PIL import Image as PILImage
import io

# Step 0: Add logo icon, tool name, and subtitle at the start

# Step 0: Add logo icon, tool name, and subtitle at the start
from ipywidgets import Image, HBox, VBox, HTML
import base64
import os

# Suppress Google Drive mount message if already mounted
if not os.path.exists('/content/drive'):
    drive.mount('/content/drive')
else:
    print("(Google Drive already mounted)")

# Load the logo image and use HTML to display it
logo_path = '/content/drive/MyDrive/Library_Explorer/Library_Explorer_Icon_v2.png'
try:
    with open(logo_path, 'rb') as f:
        logo_bytes = f.read()  # Read as bytes
        logo_base64 = base64.b64encode(logo_bytes)  # Encode to base64 bytes
        logo_base64_str = logo_base64.decode('utf-8')  # Convert to string for HTML

    # Use HTML widget to display the logo (more reliable than ipywidgets.Image)
    logo_widget = HTML(
        value=f"<img src='data:image/png;base64,{logo_base64_str}' width='100' height='100' style='margin: 0 20px 0 0;' />"
    )

except FileNotFoundError:
    print(f"Error: Logo file not found at {logo_path}. Please ensure the file exists and the path is correct.")
    # Fallback: Display a placeholder text instead of the image
    logo_widget = HTML(
        value="<p style='width: 100px; height: 100px; margin: 0 20px 0 0; text-align: center; color: red;'>[Logo Missing]</p>"
    )
except Exception as e:
    print(f"Error loading logo: {str(e)}")
    # Fallback: Display a placeholder text instead of the image
    logo_widget = HTML(
        value="<p style='width: 100px; height: 100px; margin: 0 20px 0 0; text-align: center; color: red;'>[Logo Missing]</p>"
    )

# Tool name header
tool_name = HTML(
    value="<h1 style='font-size: 36px; font-weight: bold; text-align: center; margin: 0;'>Library Explorer</h1>"
)

# Subtitle
subtitle = HTML(
    value="<p style='font-size: 14px; text-align: center; margin: 0; color: #666;'>Library Explorer helps DIA proteomics users visualize, explore, and search large libraries.</p>"
)

# Combine logo, tool name, and subtitle in a horizontal layout
header_layout = HBox([logo_widget, VBox([tool_name, subtitle], layout={'align_items': 'center', 'width': '100%'})],
                     layout={'justify_content': 'center', 'margin': '20px 0'})


# Step 1: Mount Google Drive
drive.mount('/content/drive')

# Step 2: Define function to create unique key
def create_unique_key(library):
    object_cols = ['ProductMz', 'Annotation', 'ProteinId', 'GeneName', 'ModifiedPeptideSequence', 'PrecursorCharge']
    for col in object_cols:
        if col in library.columns:
            if library[col].dtype == 'object':
                library.loc[:, col] = library[col].fillna('missing')
            else:
                library.loc[:, col] = library[col].fillna(-1)
        else:
            print(f"Warning: Column '{col}' not found in the DataFrame.")
    library['Merged'] = library['PrecursorMz'].astype(str) + "_" + \
                       library['ProductMz'].astype(str) + "_" + \
                       library['Annotation'].astype(str) + "_" + \
                       library['ProteinId'].astype(str) + "_" + \
                       library['GeneName'].astype(str) + "_" + \
                       library['ModifiedPeptideSequence'].astype(str) + "_" + \
                       library['PrecursorCharge'].astype(str)
    return library['Merged'].nunique()

# Step 3: Define function for protein renaming
def query_and_rename_protein(library, query_protein_id, query_AllMappedProteins, renamed_protein_id, renamed_AllMappedProteins):
    total_rows_before = len(library)
    output = [f"Total rows before renaming: {total_rows_before}"]
    if 'ProteinId' in library.columns and 'AllMappedProteins' in library.columns:
        if query_protein_id in library['ProteinId'].values:
            rows_for_protein = len(library[library['ProteinId'] == query_protein_id])
            output.append(f"Rows with Protein ID '{query_protein_id}': {rows_for_protein}")
            library['ProteinId'] = library['ProteinId'].replace(query_protein_id, renamed_protein_id)
            library['AllMappedProteins'] = library['AllMappedProteins'].replace(query_AllMappedProteins, renamed_AllMappedProteins)
            total_rows_after = len(library)
            output.append(f"Protein ID '{query_protein_id}' has been renamed to '{renamed_protein_id}' in both 'ProteinId' and 'AllMappedProteins'.")
            output.append(f"Total rows after renaming: {total_rows_after}")
        else:
            output.append(f"Protein ID '{query_protein_id}' not found in the library.")
            output.append(f"Total rows (no change): {total_rows_before}")
    else:
        output.append("Error: Required columns 'ProteinId' or 'AllMappedProteins' not found.")
    return library, output

# Step 4: Create widgets for file input and header mapping
file_path_input = widgets.Text(
    value='',
    placeholder='e.g., /content/drive/MyDrive/1.Spectral_Library_Merging/03152025_Tryptic_Output_Library_Library.tsv',
    description='TSV File Path:',
    layout={'width': '600px'}
)

load_button = widgets.Button(
    description='Load File',
    button_style='primary',
    tooltip='Click to load the TSV file',
    layout={'width': '200px'}
)

# Widgets for header mapping (will be populated after file load)
header_mapping_widgets = {}
required_columns = [
    'ProductMz', 'Annotation', 'ProteinId', 'GeneName', 'ModifiedPeptideSequence',
    'PrecursorCharge', 'PeptideSequence', 'NormalizedRetentionTime',
    'PrecursorIonMobility', 'AllMappedProteins'
]

# Step 5: Create widgets for workflow and analysis inputs
workflow_dropdown = widgets.Dropdown(
    options=[
        ('1. Search Query Protein with Partial Uniprot ID', '1'),
        ('2. Search Peptide with Partial Peptide Sequence', '2'),
        ('3. Find Proteins against Query Peptide Sequence', '3'),
        ('4. Query Protein Renaming in the Library', '4')
    ],
    description='Workflow:',
    layout={'width': '600px'}
)

partial_id_input = widgets.Text(
    value='',
    placeholder='Enter partial UniProt ID or Peptide',
    description='Partial ID:',
    layout={'width': '400px'}
)

num_matches_input = widgets.IntText(
    value=5,
    description='Max Matches:',
    layout={'width': '200px'}
)

full_peptide_input = widgets.Text(
    value='',
    placeholder='Enter full peptide sequence',
    description='Peptide:',
    layout={'width': '400px'}
)

query_protein_id_input = widgets.Text(
    value='',
    placeholder='Enter query UniProt ID',
    description='Query Protein ID:',
    layout={'width': '400px'}
)

renamed_protein_id_input = widgets.Text(
    value='',
    placeholder='Enter new Protein ID',
    description='New Protein ID:',
    layout={'width': '400px'}
)

query_mapped_proteins_input = widgets.Text(
    value='',
    placeholder='Enter query AllMappedProteins',
    description='Query Mapped:',
    layout={'width': '400px'}
)

renamed_mapped_proteins_input = widgets.Text(
    value='',
    placeholder='Enter new AllMappedProteins',
    description='New Mapped:',
    layout={'width': '400px'}
)

run_button = widgets.Button(
    description='Run Analysis',
    button_style='success',
    tooltip='Click to run the selected workflow',
    layout={'width': '200px'}
)

output_area = widgets.Output()

# Step 6: Function to update input visibility based on workflow
def update_inputs(change):
    with output_area:
        # Do not clear the output, just update visibility
        partial_id_input.layout.display = 'none'
        num_matches_input.layout.display = 'none'
        full_peptide_input.layout.display = 'none'
        query_protein_id_input.layout.display = 'none'
        renamed_protein_id_input.layout.display = 'none'
        query_mapped_proteins_input.layout.display = 'none'
        renamed_mapped_proteins_input.layout.display = 'none'

        if workflow_dropdown.value in ['1', '2']:
            partial_id_input.layout.display = 'flex'
            num_matches_input.layout.display = 'flex'
        elif workflow_dropdown.value == '3':
            full_peptide_input.layout.display = 'flex'
        elif workflow_dropdown.value == '4':
            query_protein_id_input.layout.display = 'flex'
            renamed_protein_id_input.layout.display = 'flex'
            query_mapped_proteins_input.layout.display = 'flex'
            renamed_mapped_proteins_input.layout.display = 'flex'

workflow_dropdown.observe(update_inputs, names='value')

# Step 7: Function to handle file loading and header mapping
def on_load_button_clicked(b):
    global library
    with output_area:
        clear_output()
        if not os.path.exists(file_path_input.value):
            print(f"Error: File not found at {file_path_input.value}")
            return

        try:
            # Display progress bar for file loading
            with tqdm(total=1, desc="Loading TSV file", leave=True) as pbar:
                library = pd.read_csv(file_path_input.value, sep='\t')
                pbar.update(1)  # Complete the progress bar
            print("Columns in the file:")
            print(library.columns.tolist())

            # Create dropdowns for header mapping with shorter labels and comments
            header_mapping_widgets.clear()
            column_options = ['None'] + library.columns.tolist()

            # Define comments for each header
            header_comments = {
                'ProductMz': "The m/z value of the product ion",
                'Annotation': "Annotation or label for the spectrum",
                'ProteinId': "Unique identifier for the protein (e.g., UniProt ID)",
                'GeneName': "Name of the gene associated with the protein",
                'ModifiedPeptideSequence': "Peptide sequence with modifications",
                'PrecursorCharge': "Charge state of the precursor ion",
                'PeptideSequence': "Amino acid sequence of the peptide",
                'NormalizedRetentionTime': "Normalized retention time of the peptide",
                'PrecursorIonMobility': "Ion mobility value of the precursor",
                'AllMappedProteins': "List of all proteins mapped to this proteins"
            }

            # Create a list to hold the HBox widgets (dropdown + comment)
            header_widgets_with_comments = []

            for col in required_columns:
                dropdown = widgets.Dropdown(
                    options=column_options,
                    description=f'{col}:',
                    value='None',
                    layout={'width': '500px'}
                )
                comment = widgets.HTML(
                    value=f"<i>{header_comments.get(col, 'No description available')}</i>",
                    layout={'margin': '0 0 0 10px'}  # Add some margin to separate from dropdown
                )
                header_mapping_widgets[col] = dropdown
                header_widgets_with_comments.append(widgets.HBox([dropdown, comment]))

            # Display header mapping widgets and confirm button
            confirm_button = widgets.Button(
                description='Confirm Headers',
                button_style='info',
                tooltip='Click to confirm column mappings',
                layout={'width': '200px'}
            )

            def on_confirm_button_clicked(b):
                with output_area:
                    clear_output()
                    global library
                    # Rename columns based on user mappings
                    rename_dict = {header_mapping_widgets[col].value: col
                                 for col in required_columns
                                 if header_mapping_widgets[col].value != 'None'}
                    library = library.rename(columns=rename_dict)
                    print("Columns after mapping:")
                    print(library.columns.tolist())

                    # Proceed with analysis
                    perform_analysis(library)

            confirm_button.on_click(on_confirm_button_clicked)
            display(widgets.VBox(header_widgets_with_comments + [confirm_button]))

        except Exception as e:
            print(f"Error loading file: {str(e)}")

load_button.on_click(on_load_button_clicked)

# Step 8: Function to perform analysis
def perform_analysis(library):
    df = library.copy()
    number_of_unique_keys = create_unique_key(df)

    total_rows = library.shape[0]
    unique_ids = library['ProteinId'].dropna().unique() if 'ProteinId' in library.columns else []
    unique_peptide = library['PeptideSequence'].dropna().unique() if 'PeptideSequence' in library.columns else []
    mod_peptide = library['ModifiedPeptideSequence'].dropna().unique() if 'ModifiedPeptideSequence' in library.columns else []
    norm_rt = library['NormalizedRetentionTime'].dropna().unique() if 'NormalizedRetentionTime' in library.columns else []
    ion_mob = library['PrecursorIonMobility'].dropna().unique() if 'PrecursorIonMobility' in library.columns else []

    print("Total number of rows in the TSV file:", total_rows)
    print("Number of unique keys:", number_of_unique_keys)
    print("Number of unique Protein IDs:", len(unique_ids))
    print("Number of unique Peptide:", len(unique_peptide))
    print("Number of Modified Peptide:", len(mod_peptide))
    print("Number of Normalized Retention Time:", len(norm_rt))
    print("Number of Ion Mobility:", len(ion_mob))
    print()
    print("Minimum Normalized Retention Time:", min(norm_rt) if len(norm_rt) > 0 else "N/A")
    print("Maximum Normalized Retention Time:", max(norm_rt) if len(norm_rt) > 0 else "N/A")
    print()
    print("Minimum Ion Mobility:", min(ion_mob) if len(ion_mob) > 0 else "N/A")
    print("Maximum Ion Mobility:", max(ion_mob) if len(ion_mob) > 0 else "N/A")

    # Check if both columns exist to decide subplot arrangement
    if 'NormalizedRetentionTime' in library.columns and 'PrecursorIonMobility' in library.columns:
        # Create a figure with two subplots side by side
        fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(16, 4))  # 1 row, 2 columns

        # Plot Normalized Retention Time on the left
        ax1.hist(df['NormalizedRetentionTime'], bins=50, color='skyblue', edgecolor='black')
        ax1.set_title('Normalized Retention Times')
        ax1.set_xlabel('Normalized Retention Time')
        ax1.set_ylabel('Frequency')
        ax1.grid(False)

        # Plot Precursor Ion Mobility on the right
        ax2.hist(df['PrecursorIonMobility'], bins=50, color='orange', edgecolor='black')
        ax2.set_title('Ion Mobility')
        ax2.set_xlabel('Ion Mobility')
        ax2.set_ylabel('Frequency')
        ax2.grid(False)

        plt.tight_layout()  # Adjust layout to prevent overlap
        plt.show()
    else:
        # Fallback if one of the columns is missing
        if 'NormalizedRetentionTime' in library.columns:
            plt.figure(figsize=(8, 4))
            plt.hist(df['NormalizedRetentionTime'], bins=50, color='skyblue', edgecolor='black')
            plt.title('Normalized Retention Times')
            plt.xlabel('Normalized Retention Time')
            plt.ylabel('Frequency')
            plt.grid(False)
            plt.show()
        else:
            print("Warning: 'NormalizedRetentionTime' column not found for histogram.")

        if 'PrecursorIonMobility' in library.columns:
            plt.figure(figsize=(8, 4))
            plt.hist(df['PrecursorIonMobility'], bins=50, color='orange', edgecolor='black')
            plt.title('Ion Mobility')
            plt.xlabel('Ion Mobility')
            plt.ylabel('Frequency')
            plt.grid(False)
            plt.show()
        else:
            print("Warning: 'PrecursorIonMobility' column not found for histogram.")

    data = {
        "Total rows": total_rows,
        "Unique keys": number_of_unique_keys,
        "Unique Protein IDs": len(unique_ids),
        "Unique Peptides": len(unique_peptide),
        "Unique Mod Peptides": len(mod_peptide),
        "Norm RT": len(norm_rt),
        "No. Ion Mobility": len(ion_mob)
    }
    labels = list(data.keys())
    values = list(data.values())
    figsize = (14, 2)
    cell_text_size = 10
    title_size = 12
    fig, ax = plt.subplots(figsize=figsize)
    ax.axis('tight')
    ax.axis('off')
    the_table = ax.table(cellText=[values],
                        colLabels=labels,
                        colColours=["palegreen"]*len(values),
                        cellLoc='center',
                        loc='center')
    the_table.auto_set_font_size(False)
    the_table.set_fontsize(cell_text_size)
    the_table.scale(1, 2.2)
    plt.title('Summary Statistics Table', fontsize=title_size)
    plt.show()

    # No need to display workflow widgets here; they are already displayed initially

# Step 9: Function to handle run button click
def on_run_button_clicked(b):
    with output_area:
        # Do not clear the entire output, just append new results
        print("\n--- Running Workflow ---")
        global library
        try:
            if workflow_dropdown.value == '1':
                if 'ProteinId' in library.columns and 'AllMappedProteins' in library.columns:
                    partial_id = partial_id_input.value.upper()
                    num_matches = num_matches_input.value
                    matching_ids = library[library['ProteinId'].str.upper().str.startswith(partial_id)]
                    if len(matching_ids) > 0:
                        unique_ids = matching_ids['ProteinId'].unique()
                        print("Matching UniProt IDs:")
                        print(unique_ids)
                        print("\nMatching Protein IDs and their Mapped Proteins:")
                        displayed_ids = unique_ids[:num_matches]
                        for protein_id in displayed_ids:
                            mapped_protein = matching_ids[matching_ids['ProteinId'] == protein_id]['AllMappedProteins'].iloc[0]
                            print(f"{protein_id} = {mapped_protein}")
                    else:
                        print("No matching proteins found with that UniProt ID prefix.")
                else:
                    print("Error: Required columns 'ProteinId' or 'AllMappedProteins' not found.")

            elif workflow_dropdown.value == '2':
                if 'PeptideSequence' in library.columns and 'ProteinId' in library.columns:
                    partial_id = partial_id_input.value.upper()
                    num_matches = num_matches_input.value
                    matching_peptides = library[library['PeptideSequence'].str.contains(partial_id, case=False, na=False)]
                    print(f"Found {len(matching_peptides)} matching entries out of {len(library)} total entries.")
                    if len(matching_peptides) == 0:
                        print("No matching peptides found.")
                    else:
                        print("Matching Peptides and Protein IDs:")
                        for index, row in matching_peptides.head(num_matches).iterrows():
                            print(f"{row['PeptideSequence']} - {row['ProteinId']}")
                else:
                    print("Error: Required columns 'PeptideSequence' or 'ProteinId' not found.")

            elif workflow_dropdown.value == '3':
                if 'PeptideSequence' in library.columns and 'ProteinId' in library.columns:
                    full_peptide = full_peptide_input.value
                    matching_entries = library[library['PeptideSequence'] == full_peptide]
                    unique_proteins = matching_entries['ProteinId'].unique()
                    print(f"Number of unique proteins found: {len(unique_proteins)}")
                    if len(unique_proteins) > 0:
                        print("Protein IDs:")
                        for protein in unique_proteins:
                            print(protein)
                    else:
                        print("No proteins found for the given peptide.")
                else:
                    print("Error: Required columns 'PeptideSequence' or 'ProteinId' not found.")

            elif workflow_dropdown.value == '4':
                if 'ProteinId' in library.columns and 'AllMappedProteins' in library.columns:
                    library, rename_output = query_and_rename_protein(
                        library,
                        query_protein_id_input.value,
                        query_mapped_proteins_input.value,
                        renamed_protein_id_input.value,
                        renamed_mapped_proteins_input.value
                    )
                    for line in rename_output:
                        print(line)
                else:
                    print("Error: Required columns 'ProteinId' or 'AllMappedProteins' not found.")

        except Exception as e:
            print(f"Error during analysis: {str(e)}")

run_button.on_click(on_run_button_clicked)

# Step 10: Display initial widgets with workflow at the top

display(widgets.VBox([
    header_layout,
    workflow_dropdown,
    partial_id_input,
    num_matches_input,
    full_peptide_input,
    query_protein_id_input,
    renamed_protein_id_input,
    query_mapped_proteins_input,
    renamed_mapped_proteins_input,
    file_path_input,
    load_button,
    run_button,
    output_area
]))

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/1.6 MB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.2/1.6 MB[0m [31m5.0 MB/s[0m eta [36m0:00:01[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━[0m [32m1.5/1.6 MB[0m [31m20.4 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.6/1.6 MB[0m [31m14.7 MB/s[0m eta [36m0:00:00[0m
[?25hMounted at /content/drive
Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


VBox(children=(HBox(children=(HTML(value="<img src='data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAABqQAAAaYCAYA…