<a href="https://colab.research.google.com/github/Ash100/Minor/blob/main/Extracting-motif-stritches-and-phylogeny.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install biopython seaborn matplotlib

In [None]:
# Install MAFFT
!apt-get -qq update
!apt-get -qq install mafft

In [None]:
!mafft --version

In [4]:
#@title reads .txt, scan LXXLL, extrat -10-LXXLL-10 aa stritch to .fasta
# 📌 Step 1: Import libraries
import re

# 📌 Step 2: Define motif pattern
motif_pattern = re.compile(r'L.{2}LL')

# 📌 Step 3: Load FASTA sequences
file_path = '/content/controlled_condidates_LXXLL.txt'  # Update if needed
sequences = {}
current_name = None

with open(file_path, 'r') as f:
    for line in f:
        line = line.strip()
        if line.startswith('>'):
            current_name = line[1:].strip()
            sequences[current_name] = ''
        elif current_name:
            sequences[current_name] += line

# 📌 Step 4: Extract motif windows
motif_windows = []
for name, seq in sequences.items():
    for match in motif_pattern.finditer(seq):
        start = match.start()
        end = match.end()
        window_start = max(0, start - 10)
        window_end = min(len(seq), end + 10)
        window_seq = seq[window_start:window_end]
        header = f">{name}|LXXLL@{start}-{end}"
        motif_windows.append((header, window_seq))

# 📌 Step 5: Write to FASTA file
output_path = '/content/LXXLL_windows.fasta'
with open(output_path, 'w') as f:
    for header, window_seq in motif_windows:
        f.write(f"{header}\n{window_seq}\n")

print(f"✅ Extracted {len(motif_windows)} motif windows to: {output_path}")


✅ Extracted 23 motif windows to: /content/LXXLL_windows.fasta


In [None]:
#@title reads .csv and perform the same function as above
# 📌 Step 1: Import libraries
import pandas as pd
import re

# 📌 Step 2: Define file path and motif pattern
csv_path = '/content/human_viral_proteins_with_lxxll_motif.csv'  # Update with your actual file name
motif_pattern = re.compile(r'L.{2}LL')

# 📌 Step 3: Load CSV and extract relevant columns
df = pd.read_csv(csv_path)

# Replace these with your actual column names
id_column = 'ID'         # e.g., UniProt ID or gene name
seq_column = 'Sequence'  # column containing protein sequence

# 📌 Step 4: Scan sequences and extract motif windows
fasta_entries = []

for _, row in df.iterrows():
    seq_id = str(row[id_column])
    sequence = str(row[seq_column]).strip()

    for match in motif_pattern.finditer(sequence):
        start = match.start()
        end = match.end()
        window_start = max(0, start - 10)
        window_end = min(len(sequence), end + 10)
        window_seq = sequence[window_start:window_end]

        header = f">{seq_id}|LXXLL@{start}-{end}"
        fasta_entries.append((header, window_seq))

# 📌 Step 5: Write to FASTA file
fasta_path = '/content/test_LXXLL_windows_from_csv.fasta'
with open(fasta_path, 'w') as f:
    for header, seq in fasta_entries:
        f.write(f"{header}\n{seq}\n")

print(f"✅ Extracted {len(fasta_entries)} motif windows to: {fasta_path}")


In [None]:
#@title running mafft on .fasta file
# Align sequences using MAFFT
!mafft --auto /content/combined_LXXLL.fasta > aligned_combined_sequences.fasta

In [12]:
!head aligned_sequences.fasta

head: cannot open 'aligned_sequences.fasta' for reading: No such file or directory


In [14]:
#@title Compare pairwise distances
from Bio import AlignIO
from Bio.Phylo.TreeConstruction import DistanceCalculator

alignment = AlignIO.read("/content/aligned_combined_sequences.fasta", "fasta")
calculator = DistanceCalculator("blosum62")
dm = calculator.get_distance(alignment)


In [None]:
#@title build phylogenetic tree
from Bio.Phylo.TreeConstruction import DistanceTreeConstructor
from Bio import Phylo

constructor = DistanceTreeConstructor()
tree = constructor.nj(dm)  # or .upgma(dm)
Phylo.write(tree, "motif_tree.nwk", "newick")
Phylo.draw(tree)  # or use iTOL for interactive visualization
