<a href="https://colab.research.google.com/github/Ash100/Minor/blob/main/Viral_Protein_Motif_Search.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

The following notebook is designed to launch a search against specific motif in Viral Proteins.

In [None]:
!pip -q install biopython pandas tqdm requests matplotlib

In [2]:
import re
import io
import os
import sys
import json
import time
import gzip
import math
import textwrap
import requests
import pandas as pd
from tqdm import tqdm
import matplotlib.pyplot as plt
from Bio import Entrez, SeqIO

# Always set your email for NCBI Entrez
Entrez.email = "ashfaqahmad82@hotmail.com"


In [None]:
#More Specialized code
import os
from Bio import Entrez, SeqIO
import pandas as pd
import time
import re

# Replace with your actual email (NCBI requires this for Entrez access)
Entrez.email = "ashfaqahmad82@hotmail.com"

# List of virus names
viruses = [
    "Herpes Simplex Virus 1",
    "Herpes Simplex Virus 2",
    "Varicella-Zoster Virus",
    "Epstein-Barr Virus",
    "Cytomegalovirus",
    "Human Herpesvirus 6",
    "Human Herpesvirus 7",
    "Kaposi’s Sarcoma-Associated Herpesvirus",
    "Variola Virus",
    "Monkeypox Virus",
    "Molluscum Contagiosum Virus",
    "Cowpox Virus",
    "Human Papillomavirus",
    "BK Virus",
    "JC Virus",
    "Merkel Cell Polyomavirus",
    "Human Adenovirus",
    "Parvovirus B19",
    "Hepatitis B Virus",
    "Influenza A Virus",
    "Influenza B Virus",
    "Influenza C Virus",
    "Influenza D Virus",
    "SARS-CoV",
    "MERS-CoV",
    "SARS-CoV-2",
    "Human Coronavirus 229E",
    "Human Coronavirus NL63",
    "Human Coronavirus OC43",
    "Human Coronavirus HKU1",
    "Poliovirus",
    "Coxsackievirus A",
    "Coxsackievirus B",
    "Echovirus",
    "Enterovirus",
    "Rhinovirus",
    "Hepatitis A Virus",
    "Dengue Virus",
    "Zika Virus",
    "West Nile Virus",
    "Yellow Fever Virus",
    "Hepatitis C Virus",
    "Japanese Encephalitis Virus",
    "Tick-Borne Encephalitis Virus",
    "Chikungunya Virus",
    "Rubella Virus",
    "Ross River Virus",
    "Rabies Virus",
    "Measles Virus",
    "Mumps Virus",
    "Respiratory Syncytial Virus",
    "Human Metapneumovirus",
    "Parainfluenza Virus",
    "Hantavirus",
    "Hantaan Virus",
    "Sin Nombre Virus",
    "Crimean-Congo Hemorrhagic Fever Virus",
    "Lassa Virus",
    "Lymphocytic Choriomeningitis Virus",
    "Ebola Virus",
    "Marburg Virus",
    "Rotavirus",
    "Colorado Tick Fever Virus",
    "Norovirus",
    "Sapovirus",
    "Human Astrovirus",
    "Hepatitis E Virus",
    "Human Immunodeficiency Virus 1",
    "Human Immunodeficiency Virus 2",
    "Human T-Lymphotropic Virus 1",
    "Human T-Lymphotropic Virus 2",
    "Nipah Virus",
    "Hendra Virus",
    "Rift Valley Fever Virus"
]

# Initialize list to store data
data = []

for virus in viruses:
    print(f"Fetching proteins for {virus}")
    try:
        # Search for protein sequences in NCBI Protein database
        # Limit to 10 sequences per virus to avoid overwhelming downloads (adjust retmax as needed)
        handle = Entrez.esearch(db="protein", term=f"{virus}[Organism]", retmax=20)
        record = Entrez.read(handle)
        handle.close()

        id_list = record["IdList"]

        if not id_list:
            print(f"No proteins found for {virus}")
            continue

        # Fetch the sequences in FASTA format
        fetch_handle = Entrez.efetch(db="protein", id=",".join(id_list), rettype="fasta", retmode="text")
        fasta_data = fetch_handle.read()
        fetch_handle.close()

        # Parse FASTA data
        from io import StringIO
        fasta_io = StringIO(fasta_data)
        for record in SeqIO.parse(fasta_io, "fasta"):
            # Extract ID and sequence
            seq_id = record.id
            sequence = str(record.seq)

            # Extract protein name from description (before first '[' or '|')
            description = record.description
            protein_name = re.split(r'\[|\|', description)[0].replace(f">{seq_id} ", "").strip()
            if not protein_name:
                protein_name = "Unknown"

            # Append to data list
            data.append({
                "ID": seq_id,
                "Sequence": sequence,
                "Virus Name": virus,
                "Protein Name": protein_name
            })

        print(f"Processed {len(id_list)} sequences for {virus}")

        # Sleep to respect NCBI rate limits (max 3 requests per second without API key)
        time.sleep(0.5)

    except Exception as e:
        print(f"Error fetching for {virus}: {e}")
        time.sleep(1)  # Longer sleep on error

# Create DataFrame and save to CSV
df = pd.DataFrame(data)
output_file = "virus_protein_sequences.csv"
df.to_csv(output_file, index=False)
print(f"Data saved to {output_file}")

In [None]:
#Redundancy Filteration
import pandas as pd

# Load the CSV file generated from the previous step
input_file = "virus_protein_sequences.csv"
output_file = "filtered_virus_protein_sequences.csv"

try:
    # Read the CSV file
    df = pd.read_csv(input_file)

    # Check if the required columns exist
    required_columns = ["ID", "Sequence", "Virus Name", "Protein Name"]
    if not all(col in df.columns for col in required_columns):
        raise ValueError("Input CSV must contain columns: ID, Sequence, Virus Name, Protein Name")

    print(f"Original dataset size: {len(df)} sequences")

    # Remove duplicates based on exact sequence matches
    df = df.drop_duplicates(subset=["Sequence"], keep="first")
    print(f"Dataset size after removing redundant sequences: {len(df)} sequences")

    # Remove duplicates based on Protein Name within each Virus Name, keeping the first occurrence
    df = df.drop_duplicates(subset=["Virus Name", "Protein Name"], keep="first")
    print(f"Dataset size after removing redundant protein names: {len(df)} sequences")

    # Save the filtered dataset to a new CSV
    df.to_csv(output_file, index=False)
    print(f"Filtered dataset saved to {output_file}")

except FileNotFoundError:
    print(f"Error: {input_file} not found. Please ensure the input CSV exists.")
except Exception as e:
    print(f"Error during processing: {e}")

In [None]:
import pandas as pd
import re

# Input file from previous step
input_file = "filtered_virus_protein_sequences.csv"
output_file = "virus_proteins_with_lxxll_motif.csv"

# Load the filtered dataset
try:
    df = pd.read_csv(input_file)

    # Check required columns
    required_columns = ["ID", "Sequence", "Virus Name", "Protein Name"]
    if not all(col in df.columns for col in required_columns):
        raise ValueError("Input CSV must contain columns: ID, Sequence, Virus Name, Protein Name")

    print(f"Processing {len(df)} sequences for LXXLL motif...")

    # List to store hits
    hits = []

    # Regex pattern for LXXLL where X is any amino acid (A-Z)
    pattern = re.compile(r'L[A-Z]{2}LL')

    for _, row in df.iterrows():
        sequence = row["Sequence"]
        virus = row["Virus Name"]
        protein_name = row["Protein Name"]
        seq_id = row["ID"]

        # Find all matches with positions (1-indexed)
        for match in pattern.finditer(sequence):
            start_pos = match.start() + 1  # 1-indexed
            motif_seq = match.group()
            hits.append({
                "ID": seq_id,
                "Sequence": sequence,  # Include full sequence
                "Virus Name": virus,
                "Protein Name": protein_name,
                "Motif Position": start_pos,
                "Motif Sequence": motif_seq
            })

    # Create DataFrame for hits
    hits_df = pd.DataFrame(hits)

    if hits_df.empty:
        print("No LXXLL motifs found in any sequences.")
    else:
        # Save to CSV with all columns
        hits_df.to_csv(output_file, index=False)
        print(f"Found {len(hits_df)} motif hits across {hits_df['ID'].nunique()} unique proteins.")
        print(f"Results saved to {output_file}")

        # Optional: Print a preview of hits
        print("\nPreview of hits:")
        # Truncate Sequence column for preview to avoid clutter
        preview_df = hits_df.copy()
        preview_df["Sequence"] = preview_df["Sequence"].apply(lambda x: x[:50] + "..." if len(x) > 50 else x)
        print(preview_df.head(10))  # Show first 10 hits

except FileNotFoundError:
    print(f"Error: {input_file} not found. Please ensure the filtered CSV exists.")
except Exception as e:
    print(f"Error during processing: {e}")

In [13]:
import pandas as pd
import re

# Input file from previous step
input_file = "filtered_virus_protein_sequences.csv"
output_file = "virus_proteins_with_lxxll_motif.csv"

# Load the filtered dataset
try:
    df = pd.read_csv(input_file)

    # Check required columns
    required_columns = ["ID", "Sequence", "Virus Name", "Protein Name"]
    if not all(col in df.columns for col in required_columns):
        raise ValueError("Input CSV must contain columns: ID, Sequence, Virus Name, Protein Name")

    print(f"Processing {len(df)} sequences for LXXLL motif...")

    # Dictionary to store hits
    hits_dict = {}

    # Regex pattern for LXXLL where X is any amino acid (A-Z)
    pattern = re.compile(r'L[A-Z]{2}LL')

    for _, row in df.iterrows():
        sequence = row["Sequence"]
        virus = row["Virus Name"]
        protein_name = row["Protein Name"]
        seq_id = row["ID"]

        # Find all matches with positions (1-indexed)
        matches = pattern.finditer(sequence)
        positions = []
        motifs = []
        for match in matches:
            start_pos = match.start() + 1  # 1-indexed
            motif_seq = match.group()
            positions.append(str(start_pos))
            motifs.append(motif_seq)

        if positions:  # Only add if motifs were found
            hits_dict[seq_id] = {
                "ID": seq_id,
                "Sequence": sequence,
                "Virus Name": virus,
                "Protein Name": protein_name,
                "Motif Positions": ",".join(positions),
                "Motif Sequences": ",".join(motifs)
            }

    # Create DataFrame from hits
    hits_df = pd.DataFrame(list(hits_dict.values()))

    if hits_df.empty:
        print("No LXXLL motifs found in any sequences.")
    else:
        # Save to CSV with all columns
        hits_df.to_csv(output_file, index=False)
        print(f"Found {len(hits_df)} proteins with LXXLL motifs.")
        print(f"Results saved to {output_file}")

        # Optional: Print a preview of hits
        print("\nPreview of hits:")
        # Truncate Sequence column for preview to avoid clutter
        preview_df = hits_df.copy()
        preview_df["Sequence"] = preview_df["Sequence"].apply(lambda x: x[:50] + "..." if len(x) > 50 else x)
        print(preview_df.head(10))  # Show first 10 hits

except FileNotFoundError:
    print(f"Error: {input_file} not found. Please ensure the filtered CSV exists.")
except Exception as e:
    print(f"Error during processing: {e}")

Processing 813 sequences for LXXLL motif...
Found 329 proteins with LXXLL motifs.
Results saved to virus_proteins_with_lxxll_motif.csv

Preview of hits:
           ID                                           Sequence  \
0  XYR92223.1  MQRRARGASSLRLARCLTPANLIRGANAGVPERRIFAGCLLPTPEG...   
1  XYR92222.1  MIPAALPHPTMKRQGDRDIVVTGVRNQFATDLEPGGSVSCMRSSLS...   
2  XYR92220.1  MSAEQRKKKKTTTTTTQGRGAEVAMADEDEGRLRAAAETTGGPGSP...   
3  XYR92216.1  MTSRPADQDSVRSSASVPLYPAASPVPAEAYYSESEDEAANDFLVR...   
4  XYR92208.1  MGVVVVSVVTLLDQRNALPRTSADASPALWSFLLRQCRILASEPLG...   
5  pdb|9NO1|X  MDIIPPIAVTVAGVGSRNQFDGALGPASGLSCLRTSLSFLHMTYAH...   
6  XAL74522.1  APGYAVEAVEGGLYPVARLDAWPYQGSQERLLVGQRTCGVTAASQG...   
7  XAL74519.1  MALSGHVLIDPARLPRDTGPELMWAPSLRNSLRVSPEALELAEREA...   
8  XAL74518.1  MDPTRGLCALSTHDLAKFHSLPPARKAAGKRAHLRCYSKLLSLKSW...   
9  XAL74517.1  MLKCKQPGARFIHGAVHLPSGQIVFHTIHSPTLASALGLPGENVPI...   

               Virus Name                                    Protein Name  \
0  Herpes Simplex Vir

In [15]:
import pandas as pd

# Input CSV from the consolidated motif script
input_file = "virus_proteins_with_lxxll_motif.csv"
output_file = "prioritized_virus_proteins_lxxll.csv"

# High-potential viruses based on literature
priority_viruses = [
    "SARS-CoV-2",
    "Human Immunodeficiency Virus 1",
    "Human Immunodeficiency Virus 2",
    "Kaposi’s Sarcoma-Associated Herpesvirus",
    "Hepatitis B Virus"
]

try:
    df = pd.read_csv(input_file)

    # Check required columns
    required_columns = ["ID", "Sequence", "Virus Name", "Protein Name", "Motif Positions", "Motif Sequences"]
    if not all(col in df.columns for col in required_columns):
        raise ValueError("Input CSV must contain columns: ID, Sequence, Virus Name, Protein Name, Motif Positions, Motif Sequences")

    # Filter to priority viruses
    filtered_df = df[df["Virus Name"].isin(priority_viruses)]

    if filtered_df.empty:
        print("No proteins found for priority viruses. Consider expanding the list.")
    else:
        # Calculate motif count from comma-separated Motif Positions
        filtered_df["Motif Count"] = filtered_df["Motif Positions"].apply(lambda x: len(x.split(",")) if pd.notna(x) else 0)

        # Sort by virus and motif count (descending)
        prioritized_df = filtered_df.sort_values(by=["Virus Name", "Motif Count"], ascending=[True, False])

        # Save to CSV
        prioritized_df.to_csv(output_file, index=False)
        print(f"Prioritized {len(prioritized_df)} proteins saved to {output_file}")
        print("\nPreview:")
        # Truncate Sequence for preview
        preview_df = prioritized_df.copy()
        preview_df["Sequence"] = preview_df["Sequence"].apply(lambda x: x[:50] + "..." if len(x) > 50 else x)
        print(preview_df[["ID", "Virus Name", "Protein Name", "Motif Count", "Motif Positions", "Motif Sequences"]].head(10))

except FileNotFoundError:
    print(f"Error: {input_file} not found.")
except Exception as e:
    print(f"Error: {e}")

Prioritized 34 proteins saved to prioritized_virus_proteins_lxxll.csv

Preview:
            ID         Virus Name                Protein Name  Motif Count  \
67  XYY31501.1  Hepatitis B Virus        XYY31501.1 S protein            3   
68  XYY31500.1  Hepatitis B Virus  XYY31500.1 large S protein            3   
70  XYY31496.1  Hepatitis B Virus        XYY31496.1 S protein            3   
71  XYY31495.1  Hepatitis B Virus  XYY31495.1 large S protein            3   
73  XYY31490.1  Hepatitis B Virus  XYY31490.1 large S protein            3   
75  XYY31486.1  Hepatitis B Virus        XYY31486.1 S protein            3   
76  XYY31485.1  Hepatitis B Virus  XYY31485.1 large S protein            3   
69  XYY31499.1  Hepatitis B Virus       XYY31499.1 polymerase            1   
72  XYY31494.1  Hepatitis B Virus       XYY31494.1 polymerase            1   
74  XYY31489.1  Hepatitis B Virus       XYY31489.1 polymerase            1   

   Motif Positions    Motif Sequences  
67         9,84,91  L

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_df["Motif Count"] = filtered_df["Motif Positions"].apply(lambda x: len(x.split(",")) if pd.notna(x) else 0)
