In [None]:
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Peptide3D Mapper - Interactive Peptide to 3D Structure Mapping Tool\n",
    "# Install required packages (run once).\n",
    "# %pip (rather than !pip) installs into the environment of the running kernel.\n",
    "%pip install -q py3dmol biopython requests ipywidgets matplotlib pandas scipy\n",
    "\n",
    "import pandas as pd\n",
    "import math\n",
    "import requests\n",
    "from Bio import SeqIO\n",
    "import io\n",
    "from matplotlib import cm, colors\n",
    "import py3Dmol\n",
    "from google.colab import files\n",
    "import ipywidgets as widgets\n",
    "from IPython.display import display, clear_output, Javascript\n",
    "import matplotlib.pyplot as plt\n",
    "import matplotlib.patches as patches\n",
    "from scipy.stats import zscore\n",
    "from base64 import b64encode\n",
    "\n",
    "print(\"Welcome to Peptide3D Mapper! Upload your CSV and FASTA files to begin.\")\n",
    "\n",
    "# File Upload Widgets\n",
    "csv_upload = widgets.FileUpload(accept='.csv', multiple=False, description='Upload CSV')\n",
    "# FASTA files are commonly saved as .fa/.faa/.fas as well as .fasta, so accept all of them\n",
    "fasta_upload = widgets.FileUpload(accept='.fasta,.fa,.faa,.fas', multiple=False, description='Upload FASTA')\n",
    "\n",
    "# Other Widgets\n",
    "# NOTE(review): the species value does not influence any result in the code visible in this cell - confirm intent\n",
    "species_input = widgets.Text(value='Human', placeholder='Enter species (Human or Mouse)', description='Species:')\n",
    "# Case-insensitive substring filter applied to the protein dropdown\n",
    "search_input = widgets.Text(value='', placeholder='Search UniProt ID (partial)', description='Search:')\n",
    "# Options are filled in from the 'Protein.Group' column once a CSV has been uploaded\n",
    "protein_dropdown = widgets.Dropdown(options=[], description='Select Protein:')\n",
    "scale_dropdown = widgets.Dropdown(options=['log', 'z-score'], value='log', description='Scaling:')\n",
    "# How residues covered by several peptides are coloured: 'merge' averages, 'highest' takes the max,\n",
    "# 'none' keeps the value of the last peptide that covers the residue\n",
    "overlap_dropdown = widgets.Dropdown(options=['none', 'merge', 'highest'], value='none', description='Overlap Strategy:')\n",
    "combine_isoforms = widgets.Checkbox(value=True, description='Combine Isoforms')\n",
    "# Starts disabled because 'Combine Isoforms' starts ticked\n",
    "isoform_select = widgets.Text(value='', placeholder='Enter isoforms (comma-separated, e.g., O00560,O00560-2)', description='Isoforms:', disabled=True)\n",
    "bg_dropdown = widgets.Dropdown(options=['black', 'white', 'darkgrey'], value='white', description='Background:')\n",
    "download_btn = widgets.Button(description=\"Download Files\")\n",
    "# All analysis output (messages, 3D view, figures) is rendered into this widget\n",
    "output = widgets.Output()\n",
    "\n",
    "# Keep the manual isoform field locked whenever isoforms are being combined\n",
    "def on_combine_change(change):\n",
    "    \"\"\"Observer for the 'Combine Isoforms' checkbox: mirror its new value onto the text box's disabled flag.\"\"\"\n",
    "    combine_checked = change['new']\n",
    "    isoform_select.disabled = combine_checked\n",
    "\n",
    "combine_isoforms.observe(on_combine_change, names='value')\n",
    "\n",
    "# Update protein dropdown based on search and uploaded CSV\n",
    "def update_proteins(*args):\n",
    "    \"\"\"Refresh the protein dropdown from the uploaded CSV, filtered by the search box.\n",
    "\n",
    "    Accepts and ignores any positional arguments so it can serve both as a widget\n",
    "    observer and as a plain call. Does nothing until a CSV has been uploaded. When\n",
    "    the search term matches no protein, the full protein list is shown instead.\n",
    "    \"\"\"\n",
    "    uploaded = csv_upload.value\n",
    "    if not uploaded:\n",
    "        return\n",
    "    # ipywidgets 7 stores uploads in a dict keyed by file name (bytes content);\n",
    "    # ipywidgets 8 stores a tuple of entries whose content is a memoryview.\n",
    "    entry = next(iter(uploaded.values())) if isinstance(uploaded, dict) else uploaded[0]\n",
    "    csv_text = bytes(entry['content']).decode('utf-8')\n",
    "    df = pd.read_csv(io.StringIO(csv_text))\n",
    "    if 'Protein.Group' not in df.columns:\n",
    "        protein_dropdown.options = []\n",
    "        with output:\n",
    "            print(\"CSV has no 'Protein.Group' column; cannot list proteins.\")\n",
    "        return\n",
    "    # Drop missing groups and coerce to str so sorting and .lower() cannot fail on NaN\n",
    "    all_proteins = sorted(df['Protein.Group'].dropna().astype(str).unique())\n",
    "    search_term = search_input.value.lower()\n",
    "    filtered = [p for p in all_proteins if search_term in p.lower()]\n",
    "    # 'filtered' inherits the sorted order of 'all_proteins'\n",
    "    protein_dropdown.options = filtered if filtered else all_proteins\n",
    "\n",
    "search_input.observe(update_proteins, 'value')\n",
    "csv_upload.observe(update_proteins, names='value')\n",
    "\n",
    "# Module-level state written by run_analysis and read by on_download.\n",
    "# Everything starts as None until an analysis has been run.\n",
    "global_peptides_df = global_pdb_str = global_fig_lin = None\n",
    "residue_scaled = min_val = max_val = cmap = None\n",
    "\n",
    "# Function to run analysis\n",
    "def _decode_upload(upload_widget):\n",
    "    \"\"\"Return the first file held by a FileUpload widget as UTF-8 text.\n",
    "\n",
    "    Handles both ipywidgets 7 (``value`` is a dict keyed by file name, content is\n",
    "    ``bytes``) and ipywidgets 8 (``value`` is a tuple of entries, content is a\n",
    "    ``memoryview``).\n",
    "    \"\"\"\n",
    "    uploaded = upload_widget.value\n",
    "    entry = next(iter(uploaded.values())) if isinstance(uploaded, dict) else uploaded[0]\n",
    "    return bytes(entry['content']).decode('utf-8')\n",
    "\n",
    "\n",
    "def run_analysis(change=None):\n",
    "    \"\"\"Map peptide intensities onto the selected protein and render the results.\n",
    "\n",
    "    Reads the current widget selections, finds the selected protein in the uploaded\n",
    "    FASTA, maps each peptide of the selected protein group(s) onto that sequence,\n",
    "    scales the intensities, fetches the AlphaFold model and draws a coloured 3D view,\n",
    "    a linear coverage bar and a colorbar into ``output``.\n",
    "\n",
    "    Args:\n",
    "        change: Ignored; present so the function can be used as a widget observer.\n",
    "\n",
    "    Side effects:\n",
    "        On success, replaces the module-level download state (``global_peptides_df``,\n",
    "        ``global_pdb_str``, ``global_fig_lin``, ``residue_scaled``, ``min_val``,\n",
    "        ``max_val``, ``cmap``). On any early return that state is left untouched, so\n",
    "        a later download always refers to one complete analysis.\n",
    "    \"\"\"\n",
    "    global global_peptides_df, global_pdb_str, global_fig_lin, residue_scaled, min_val, max_val, cmap\n",
    "    import re  # local import: only needed to build the isoform pattern\n",
    "\n",
    "    with output:\n",
    "        clear_output(wait=True)\n",
    "\n",
    "        # Check uploads\n",
    "        if not csv_upload.value or not fasta_upload.value:\n",
    "            print(\"Please upload both CSV and FASTA files.\")\n",
    "            return\n",
    "\n",
    "        protein_of_interest = protein_dropdown.value\n",
    "        if not protein_of_interest:\n",
    "            print(\"Please select a protein.\")\n",
    "            return\n",
    "\n",
    "        # Parse uploaded files\n",
    "        df = pd.read_csv(io.StringIO(_decode_upload(csv_upload)))\n",
    "        required_cols = ['Protein.Group', 'Stripped.Sequence', '10_Cells_Intensity']\n",
    "        missing_cols = [col for col in required_cols if col not in df.columns]\n",
    "        if missing_cols:\n",
    "            print(f\"CSV is missing required column(s): {missing_cols}\")\n",
    "            return\n",
    "        seq_records = list(SeqIO.parse(io.StringIO(_decode_upload(fasta_upload)), \"fasta-pearson\"))\n",
    "        scale_type = scale_dropdown.value\n",
    "        overlap_strategy = overlap_dropdown.value\n",
    "        combine = combine_isoforms.value\n",
    "        isoforms_input = isoform_select.value\n",
    "        bg_color = bg_dropdown.value\n",
    "\n",
    "        # Find the matching sequence (first FASTA record whose id contains the selection)\n",
    "        sequence = None\n",
    "        for record in seq_records:\n",
    "            if protein_of_interest in record.id:\n",
    "                sequence = str(record.seq)\n",
    "                print(f\"Found matching sequence for {record.id}\")\n",
    "                break\n",
    "\n",
    "        if sequence is None:\n",
    "            print(f\"No matching sequence found in FASTA for {protein_of_interest}.\")\n",
    "            return\n",
    "\n",
    "        seq_len = len(sequence)\n",
    "\n",
    "        # Find matching isoforms. The ID is escaped so regex metacharacters in it are\n",
    "        # matched literally, and the suffix group is non-capturing so pandas does not\n",
    "        # warn about match groups.\n",
    "        isoform_pattern = re.escape(protein_of_interest) + r'(?:-\\d+)?$'\n",
    "        group_col = df['Protein.Group']\n",
    "        isoforms = group_col[group_col.str.contains(isoform_pattern, na=False)].unique()\n",
    "        print(f\"Found isoforms: {list(isoforms)}\")\n",
    "\n",
    "        if len(isoforms) > 1 and not combine:\n",
    "            if isoforms_input.strip():\n",
    "                selected_groups = [s.strip() for s in isoforms_input.split(',')]\n",
    "                if not all(s in isoforms for s in selected_groups):\n",
    "                    print(f\"Invalid isoforms selected. Using all: {list(isoforms)}\")\n",
    "                    selected_groups = isoforms\n",
    "            else:\n",
    "                print(f\"No isoforms specified. Using all: {list(isoforms)}\")\n",
    "                selected_groups = isoforms\n",
    "        else:\n",
    "            selected_groups = isoforms\n",
    "\n",
    "        # Filter dataframe\n",
    "        selected_df = df[df['Protein.Group'].isin(selected_groups)]\n",
    "\n",
    "        # Get unique peptides (mean intensity per stripped sequence)\n",
    "        peptides = selected_df.groupby('Stripped.Sequence')['10_Cells_Intensity'].mean().reset_index()\n",
    "\n",
    "        # Map peptides to positions (0-based start, exclusive end; first occurrence only)\n",
    "        peptide_positions = []\n",
    "        for pep, intensity in zip(peptides['Stripped.Sequence'], peptides['10_Cells_Intensity']):\n",
    "            start = sequence.find(pep)\n",
    "            if start != -1:\n",
    "                end = start + len(pep)\n",
    "                peptide_positions.append({'pep': pep, 'start': start, 'end': end, 'intensity': intensity, 'length': end - start})\n",
    "\n",
    "        if not peptide_positions:\n",
    "            print(\"No peptides mapped to the sequence.\")\n",
    "            return\n",
    "\n",
    "        # Scaling logic\n",
    "        intensities = [p['intensity'] for p in peptide_positions]\n",
    "        num_peptides = len(intensities)\n",
    "        if scale_type == 'z-score' and num_peptides >= 3:\n",
    "            scaled_values = zscore(intensities)\n",
    "            print(\"Using Z-score scaling.\")\n",
    "        else:\n",
    "            scaled_values = [math.log10(i + 1) for i in intensities]\n",
    "            print(\"Using Log scaling (due to choice or <3 peptides).\")\n",
    "\n",
    "        # Update peptide_positions with scaled values\n",
    "        for pos, scaled in zip(peptide_positions, scaled_values):\n",
    "            pos['scaled'] = scaled\n",
    "\n",
    "        # Detect overlaps: after sorting, a peptide overlaps its predecessor if it starts before that one ends\n",
    "        ranges = sorted((p['start'], p['end']) for p in peptide_positions)\n",
    "        overlaps = any(ranges[i][0] < ranges[i - 1][1] for i in range(1, len(ranges)))\n",
    "        if overlaps:\n",
    "            print(f\"Overlapping peptides detected. Using strategy: {overlap_strategy}\")\n",
    "\n",
    "        # Collect, per residue, the scaled values of every peptide covering it\n",
    "        per_residue = [None] * seq_len\n",
    "        for pos in peptide_positions:\n",
    "            for i in range(pos['start'], pos['end']):\n",
    "                if per_residue[i] is None:\n",
    "                    per_residue[i] = [pos['scaled']]\n",
    "                else:\n",
    "                    per_residue[i].append(pos['scaled'])\n",
    "\n",
    "        # Aggregate per residue down to a single number\n",
    "        for i in range(seq_len):\n",
    "            if per_residue[i]:\n",
    "                if overlap_strategy == 'merge':\n",
    "                    per_residue[i] = sum(per_residue[i]) / len(per_residue[i])\n",
    "                elif overlap_strategy == 'highest':\n",
    "                    per_residue[i] = max(per_residue[i])\n",
    "                else:\n",
    "                    # 'none': the last peptide covering the residue wins\n",
    "                    per_residue[i] = per_residue[i][-1]\n",
    "\n",
    "        # Min/max for normalization\n",
    "        covered_scaled = [val for val in per_residue if val is not None]\n",
    "        if not covered_scaled:\n",
    "            print(\"No valid values for coloring.\")\n",
    "            return\n",
    "        low = min(covered_scaled)\n",
    "        high = max(covered_scaled)\n",
    "\n",
    "        # Color map: RdYlGn. pyplot.get_cmap is used because matplotlib.cm.get_cmap\n",
    "        # no longer exists in recent Matplotlib releases.\n",
    "        color_map = plt.get_cmap('RdYlGn')\n",
    "\n",
    "        # Resolve each covered residue to an RGB triple once; both views reuse it.\n",
    "        # When all values are equal the midpoint of the colormap is used.\n",
    "        residue_rgb = [\n",
    "            None if val is None else color_map((val - low) / (high - low) if high > low else 0.5)[:3]\n",
    "            for val in per_residue\n",
    "        ]\n",
    "\n",
    "        # Step 5: AlphaFold\n",
    "        base_id = protein_of_interest.split('-')[0]\n",
    "        if '-' in protein_of_interest or len(isoforms) > 1:\n",
    "            print(f\"Disclaimer: Using base ID ({base_id}) for AlphaFold.\")\n",
    "\n",
    "        pdb_url = f\"https://alphafold.ebi.ac.uk/files/AF-{base_id}-F1-model_v4.pdb\"\n",
    "        try:\n",
    "            # A timeout keeps the widget callback from hanging on a stalled connection\n",
    "            pdb_response = requests.get(pdb_url, timeout=30)\n",
    "        except requests.RequestException as exc:\n",
    "            print(f\"Failed to fetch AlphaFold for {base_id}: {exc}\")\n",
    "            return\n",
    "        if pdb_response.status_code != 200:\n",
    "            print(f\"Failed to fetch AlphaFold for {base_id} (HTTP {pdb_response.status_code})\")\n",
    "            return\n",
    "        pdb_str = pdb_response.text\n",
    "\n",
    "        # Debug: Check PDB content\n",
    "        print(f\"PDB content starts with: {pdb_str[:100]}...\")  # Show first 100 chars\n",
    "        if not pdb_str.strip() or pdb_str.startswith('<!DOCTYPE'):\n",
    "            print(\"Error: Invalid PDB content (likely HTML). Using fallback test structure.\")\n",
    "            pdb_str = (\n",
    "                \"ATOM      1  N   ALA A  12      12.000   0.000   0.000  1.00  0.00           N  \\n\"\n",
    "                \"ATOM      2  CA  ALA A  12      13.000   0.000   0.000  1.00  0.00           C  \\n\"\n",
    "                \"ATOM      3  C   ALA A  12      13.000   1.000   0.000  1.00  0.00           C  \\n\"\n",
    "                \"TER\\n\"\n",
    "            )\n",
    "\n",
    "        # 3D View\n",
    "        try:\n",
    "            view = py3Dmol.view(width=800, height=500)\n",
    "            view.addModel(pdb_str, 'pdb')\n",
    "            view.setBackgroundColor(bg_color)\n",
    "            view.setStyle({}, {'cartoon': {'color': 'lightgray'}})\n",
    "\n",
    "            for i, rgb in enumerate(residue_rgb):\n",
    "                if rgb is not None:\n",
    "                    # Sequence index i is 0-based; the residue selector is given i + 1\n",
    "                    view.setStyle({'resi': str(i + 1)}, {'cartoon': {'color': colors.rgb2hex(rgb)}})\n",
    "\n",
    "            view.zoomTo()\n",
    "            display(view.show())\n",
    "            print(\"3D structure rendered successfully.\")\n",
    "        except Exception as e:\n",
    "            # Best effort: a rendering failure must not prevent the linear view below\n",
    "            print(f\"Error rendering 3D structure: {str(e)}\")\n",
    "\n",
    "        # Linear Representation\n",
    "        fig_lin, ax_lin = plt.subplots(figsize=(12, 1))\n",
    "        ax_lin.add_patch(patches.Rectangle((0, 0), seq_len, 1, facecolor='lightgray', edgecolor='none'))\n",
    "\n",
    "        for i, rgb in enumerate(residue_rgb):\n",
    "            if rgb is not None:\n",
    "                ax_lin.add_patch(patches.Rectangle((i, 0), 1, 1, facecolor=rgb, edgecolor='none'))\n",
    "\n",
    "        ax_lin.set_xlim(0, seq_len)\n",
    "        ax_lin.set_ylim(0, 1)\n",
    "        ax_lin.set_yticks([])\n",
    "        ax_lin.set_xlabel('Amino Acid Position', fontsize=8)\n",
    "        ax_lin.set_xticks(range(0, seq_len + 1, max(1, seq_len // 10)))\n",
    "        ax_lin.tick_params(axis='x', labelsize=6)\n",
    "        display(fig_lin)\n",
    "        # Deregister from pyplot so figures do not pile up on every widget change;\n",
    "        # the Figure object itself is kept below for the download handler.\n",
    "        plt.close(fig_lin)\n",
    "\n",
    "        # Single Colorbar for linear\n",
    "        fig_cb_lin, ax_cb_lin = plt.subplots(figsize=(4, 0.5))\n",
    "        norm_cb = colors.Normalize(vmin=low, vmax=high)\n",
    "        cb_lin = plt.colorbar(plt.cm.ScalarMappable(norm=norm_cb, cmap=color_map), cax=ax_cb_lin, orientation='horizontal')\n",
    "        cb_lin.set_label('Scaled Intensity (Log or Z-score)', fontsize=8)\n",
    "        cb_lin.ax.tick_params(labelsize=6)\n",
    "        display(fig_cb_lin)\n",
    "        plt.close(fig_cb_lin)\n",
    "\n",
    "        # Prepare CSV for download\n",
    "        peptides_df = pd.DataFrame(peptide_positions)[['pep', 'intensity', 'length']]\n",
    "        peptides_df['protein'] = protein_of_interest\n",
    "        peptides_df = peptides_df[['protein', 'pep', 'intensity', 'length']]\n",
    "\n",
    "        # Store for download. Everything is committed together, only after the whole\n",
    "        # analysis has succeeded, so the download state is never a mix of two runs.\n",
    "        global_peptides_df = peptides_df\n",
    "        global_pdb_str = pdb_str\n",
    "        global_fig_lin = fig_lin\n",
    "        residue_scaled = per_residue\n",
    "        min_val = low\n",
    "        max_val = high\n",
    "        cmap = color_map\n",
    "\n",
    "# Function to download files\n",
    "def _offer_file_download(path, mime='application/octet-stream'):\n",
    "    \"\"\"Ask the browser to save the file at ``path`` under the same file name.\n",
    "\n",
    "    The file is embedded as a base64 data URL on a temporary ``<a download=...>``\n",
    "    link which is then clicked, so the download keeps its file name and the page\n",
    "    showing the notebook output is not navigated away.\n",
    "    \"\"\"\n",
    "    with open(path, 'rb') as f:\n",
    "        payload = f.read()\n",
    "    b64 = b64encode(payload).decode()\n",
    "    display(Javascript(\n",
    "        \"(function() {\"\n",
    "        \"var link = document.createElement('a');\"\n",
    "        f\"link.href = 'data:{mime};base64,{b64}';\"\n",
    "        f\"link.download = '{path}';\"\n",
    "        \"document.body.appendChild(link);\"\n",
    "        \"link.click();\"\n",
    "        \"document.body.removeChild(link);\"\n",
    "        \"})();\"\n",
    "    ))\n",
    "\n",
    "\n",
    "def on_download(b):\n",
    "    \"\"\"Export the latest analysis as PDB, PyMOL colouring script, peptide CSV and linear-map JPEG.\n",
    "\n",
    "    Each file is written to the working directory and then offered as a browser\n",
    "    download. If no analysis has completed yet, a message is printed instead.\n",
    "\n",
    "    Args:\n",
    "        b: The clicked button (unused).\n",
    "    \"\"\"\n",
    "    with output:\n",
    "        clear_output(wait=True)\n",
    "        # The download state stays None until an analysis has stored its results\n",
    "        if global_pdb_str is None or global_peptides_df is None or global_fig_lin is None or residue_scaled is None:\n",
    "            print(\"Nothing to download yet. Run an analysis first.\")\n",
    "            return\n",
    "\n",
    "        # PDB\n",
    "        with open('protein.pdb', 'w') as f:\n",
    "            f.write(global_pdb_str)\n",
    "        _offer_file_download('protein.pdb')\n",
    "\n",
    "        # PML: one colour command per run of consecutive residues sharing the same value\n",
    "        pml_lines = [\"load protein.pdb\", \"hide everything\", \"show cartoon\", \"color gray90, all\"]\n",
    "        color_index = 0\n",
    "        prev_value = None\n",
    "        start_seg = None\n",
    "        n_res = len(residue_scaled)\n",
    "        for i in range(n_res + 1):\n",
    "            # The extra iteration (i == n_res) is a sentinel that flushes the final run\n",
    "            current = residue_scaled[i] if i < n_res else None\n",
    "            if current != prev_value:\n",
    "                if start_seg is not None and prev_value is not None:\n",
    "                    norm = (prev_value - min_val) / (max_val - min_val) if max_val > min_val else 0.5\n",
    "                    rgb = cmap(norm)[:3]\n",
    "                    pml_lines.append(f\"set_color color{color_index}, [{rgb[0]}, {rgb[1]}, {rgb[2]}]\")\n",
    "                    # The run covers 0-based indices start_seg..i-1, written as 1-based start_seg+1..i\n",
    "                    pml_lines.append(f\"color color{color_index}, resi {start_seg + 1}-{i}\")\n",
    "                    color_index += 1\n",
    "                start_seg = i if current is not None else None\n",
    "            prev_value = current\n",
    "        pml_lines.append(\"zoom\")\n",
    "        with open('pymol_script.pml', 'w') as f:\n",
    "            f.write(\"\\n\".join(pml_lines) + \"\\n\")\n",
    "        _offer_file_download('pymol_script.pml')\n",
    "\n",
    "        # CSV\n",
    "        global_peptides_df.to_csv('peptides.csv', index=False)\n",
    "        _offer_file_download('peptides.csv')\n",
    "\n",
    "        # JPEG\n",
    "        global_fig_lin.savefig('linear_protein.jpeg', dpi=600, transparent=True, bbox_inches='tight')\n",
    "        _offer_file_download('linear_protein.jpeg', mime='image/jpeg')\n",
    "\n",
    "# Re-run the analysis whenever one of the controls changes\n",
    "def on_widget_change(change):\n",
    "    \"\"\"Shared observer: run the analysis once if any of the key inputs currently holds a value.\"\"\"\n",
    "    for key_widget in (csv_upload, fasta_upload, protein_dropdown):\n",
    "        if hasattr(key_widget, 'value') and key_widget.value:\n",
    "            run_analysis()\n",
    "            return\n",
    "\n",
    "# Every control funnels its value changes into the same observer\n",
    "csv_upload.observe(on_widget_change, names='value')\n",
    "fasta_upload.observe(on_widget_change, names='value')\n",
    "protein_dropdown.observe(on_widget_change, names='value')\n",
    "scale_dropdown.observe(on_widget_change, names='value')\n",
    "overlap_dropdown.observe(on_widget_change, names='value')\n",
    "combine_isoforms.observe(on_widget_change, names='value')\n",
    "isoform_select.observe(on_widget_change, names='value')\n",
    "bg_dropdown.observe(on_widget_change, names='value')\n",
    "download_btn.on_click(on_download)\n",
    "\n",
    "# Render the controls top to bottom, with the shared output area last\n",
    "display(\n",
    "    csv_upload,\n",
    "    fasta_upload,\n",
    "    species_input,\n",
    "    search_input,\n",
    "    protein_dropdown,\n",
    "    scale_dropdown,\n",
    "    overlap_dropdown,\n",
    "    combine_isoforms,\n",
    "    isoform_select,\n",
    "    bg_dropdown,\n",
    "    download_btn,\n",
    "    output,\n",
    ")\n",
    "\n",
    "# Populate the protein list immediately in case a CSV is already loaded\n",
    "update_proteins()\n"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.7"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}