<a href="https://colab.research.google.com/github/Ash100/Minor/blob/main/Extracting-motif-stritches-and-phylogeny.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install biopython seaborn matplotlib

Collecting biopython
  Downloading biopython-1.85-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (13 kB)
Downloading biopython-1.85-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.3/3.3 MB[0m [31m62.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: biopython
Successfully installed biopython-1.85


In [None]:
# Install MAFFT
!apt-get -qq update
!apt-get -qq install mafft

In [None]:
# Sanity check: ask the mafft binary for its version to confirm the install step worked.
!mafft --version

In [6]:
#@title Extract 10aa–Motif–10aa patch only for the motif in "Motif sequence" column
import os
import pandas as pd

# ==== Settings ====
csv_path = "/content/ncbi_refseq_with_lxxll_motif.csv"   # <-- update your CSV path
seq_col = "Sequence"          # column holding the full sequence
id_col = "ID"                 # column holding the record identifier (optional)
motif_col = "Motif Sequence"  # column holding the motif to locate in the sequence
new_col = "Extracted Motif"   # column that will receive the flanked patch
flank = 10                    # residues kept on each side of the motif


def _clean_cell(value):
    """Return a CSV cell as a stripped string; missing values (NaN/None) become ""."""
    if pd.isna(value):
        return ""
    return str(value).strip()


def extract_patch(sequence, motif, flank):
    """Cut the window `flank` + motif + `flank` around the first motif occurrence.

    Parameters
    ----------
    sequence : str
        Full sequence to search.
    motif : str
        Exact substring to locate (first occurrence only, case-sensitive).
    flank : int
        Number of residues to keep on each side of the motif.

    Returns
    -------
    (patch, start, end) : tuple[str, int, int]
        `start`/`end` are the 0-based, end-exclusive motif coordinates in
        `sequence`. Returns ("", -1, -1) when either string is empty or the
        motif does not occur. The window is clipped at the sequence termini,
        so the patch can be shorter than len(motif) + 2 * flank.
    """
    if not sequence or not motif:
        return "", -1, -1
    start = sequence.find(motif)   # single scan; -1 means "not found"
    if start < 0:
        return "", -1, -1
    end = start + len(motif)
    win_start = max(0, start - flank)
    win_end = min(len(sequence), end + flank)
    return sequence[win_start:win_end], start, end


# ==== Load CSV ====
df = pd.read_csv(csv_path)

# Fail early, with a readable message, if the required columns are absent.
missing_cols = [col for col in (seq_col, motif_col) if col not in df.columns]
if missing_cols:
    raise KeyError(f"Missing required column(s) in {csv_path}: {missing_cols}")

# ==== Extract patches ====
patches = []        # one entry per CSV row ("" when the motif was not found)
fasta_entries = []  # (header, patch) pairs, only for rows with a match

for idx, row in df.iterrows():
    # Fall back to a positional ID when the ID column is absent OR the cell is empty,
    # so a missing value never ends up as the literal header ">nan|...".
    raw_id = row.get(id_col)
    seq_id = f"row{idx}" if pd.isna(raw_id) else str(raw_id)

    sequence = _clean_cell(row[seq_col])
    motif = _clean_cell(row[motif_col])

    patch_seq, start, end = extract_patch(sequence, motif, flank)
    if patch_seq:
        # FASTA header with 1-based motif position
        header = f">{seq_id}|{motif}@{start+1}-{end}"
        fasta_entries.append((header, patch_seq))

    patches.append(patch_seq)

# ==== Insert new column ====
# Plain assignment appends the column when it is new and overwrites it when it
# already exists (df.insert would raise ValueError on a CSV that already has it).
df[new_col] = patches

# ==== Save updated CSV ====
base, ext = os.path.splitext(csv_path)
output_csv = f"{base}_with_patch{ext or '.csv'}"
df.to_csv(output_csv, index=False)

# ==== Save FASTA ====
fasta_path = f"{base}_motif_patches.fasta"
with open(fasta_path, "w") as f:
    for header, seq in fasta_entries:
        f.write(f"{header}\n{seq}\n")

print(f"✅ Processed {len(df)} rows from: {csv_path}")
print(f"✅ Extracted {len(fasta_entries)} motif patches ({len(df) - len(fasta_entries)} rows without a match)")
print(f"✅ Updated CSV saved to: {output_csv}")
print(f"✅ FASTA saved to: {fasta_path}")


✅ Processed 355 rows from: /content/ncbi_refseq_with_lxxll_motif.csv
✅ Updated CSV saved to: /content/ncbi_refseq_with_lxxll_motif_with_patch.csv
✅ FASTA saved to: /content/ncbi_refseq_with_lxxll_motif_motif_patches.fasta


In [None]:
#@title running mafft on .fasta file
# Align sequences using MAFFT
!mafft --auto /content/combined_LXXLL.fasta > aligned_combined_sequences.fasta

In [None]:
!head aligned_sequences.fasta

head: cannot open 'aligned_sequences.fasta' for reading: No such file or directory


In [None]:
#@title Compare pairwise distances
from Bio import AlignIO
from Bio.Phylo.TreeConstruction import DistanceCalculator

# Load the aligned FASTA, then compute the pairwise distance matrix with the
# "blosum62" model; `dm` is consumed by the tree-building cell.
alignment = AlignIO.read(
    handle="/content/aligned_combined_sequences.fasta",
    format="fasta",
)
calculator = DistanceCalculator(model="blosum62")
dm = calculator.get_distance(alignment)


In [None]:
#@title build phylogenetic tree
from Bio.Phylo.TreeConstruction import DistanceTreeConstructor
from Bio import Phylo

# Build a neighbor-joining tree from the distance matrix `dm`
# (constructor.upgma(dm) is the UPGMA alternative).
constructor = DistanceTreeConstructor()
tree = constructor.nj(dm)

# Persist the tree in Newick format, then render it inline
# (or load motif_tree.nwk into iTOL for interactive visualization).
Phylo.write(tree, file="motif_tree.nwk", format="newick")
Phylo.draw(tree)
