<a href="https://colab.research.google.com/github/AhmedAboushanab/My-Python-Works/blob/master/DNA_Sequence_Extractor.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
pip install biopython pandas


Collecting biopython
  Downloading biopython-1.85-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (13 kB)
Downloading biopython-1.85-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.3/3.3 MB[0m [31m82.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: biopython
Successfully installed biopython-1.85


In [None]:
import os
from Bio import SeqIO
from Bio.Seq import Seq
import pandas as pd
from google.colab import files

def _parse_coordinate(value):
    """Convert one coordinate cell to an int; return None if it is blank or not a whole number.

    Accepts thousands separators ("1,234") and integral floats ("1234.0").
    """
    if pd.isna(value):
        return None
    text = str(value).replace(',', '').strip()
    if not text:
        return None
    try:
        return int(text)
    except ValueError:
        pass
    try:
        as_float = float(text)
    except ValueError:
        return None
    # is_integer() is False for NaN, +/-inf and fractional values alike.
    if not as_float.is_integer():
        return None
    return int(as_float)


def extract_sequences(fasta_path, coords_csv_path, output_fasta_path, output_csv_path):
    """Extract sub-sequences from a FASTA file using a table of coordinates.

    The coordinates CSV must contain the columns 'Sample-id', 'From' and 'To'
    (1-based, inclusive). An optional 'strand' column selects the strand; a
    value of '-' yields the reverse complement, anything else (including an
    empty cell) is treated as '+' / forward.

    Rows that cannot be used (unknown ID, blank or non-integer coordinates,
    or a range that does not lie on the sequence) are reported with a printed
    warning and skipped. A range that runs past the end of the sequence is
    clamped to the sequence length, with a warning.

    Args:
        fasta_path: Path of the FASTA file to read sequences from.
        coords_csv_path: Path of the CSV file holding the coordinates.
        output_fasta_path: Path the extracted sequences are written to (FASTA).
        output_csv_path: Path the per-record summary table is written to (CSV).

    Raises:
        KeyError: If a required column is missing from the coordinates CSV.
        ValueError: Raised by SeqIO.to_dict when the FASTA file contains
            duplicate record IDs.

    Side effects:
        Writes both output files and hands them to `files.download`.
    """
    sequences = SeqIO.to_dict(SeqIO.parse(fasta_path, "fasta"))
    # Read every cell as text: keeps IDs comparable with the FASTA keys and
    # stops pandas from turning integer columns with blanks into floats.
    df = pd.read_csv(coords_csv_path, dtype=str)

    missing_cols = [col for col in ('Sample-id', 'From', 'To') if col not in df.columns]
    if missing_cols:
        raise KeyError(
            f"{coords_csv_path} is missing required column(s): {', '.join(missing_cols)}"
        )

    extracted_records = []
    extracted_fasta = []

    for row_number, row in enumerate(df.to_dict('records'), start=1):
        raw_id = row['Sample-id']
        if pd.isna(raw_id):
            print(f"Warning: row {row_number} has no Sample-id; skipped")
            continue
        seq_id = str(raw_id).strip()

        if seq_id not in sequences:
            print(f"Warning: {seq_id} not found in {fasta_path}")
            continue

        first = _parse_coordinate(row['From'])  # 1-based, inclusive
        end = _parse_coordinate(row['To'])      # 1-based, inclusive
        if first is None or end is None:
            print(
                f"Warning: row {row_number} ({seq_id}) has invalid coordinates "
                f"From={row['From']!r} To={row['To']!r}; skipped"
            )
            continue

        # An empty strand cell is read as NaN; fall back to the forward strand.
        raw_strand = row.get('strand')
        strand = '+' if pd.isna(raw_strand) else (str(raw_strand).strip() or '+')

        full_seq = sequences[seq_id].seq
        seq_len = len(full_seq)

        # Guard against ranges that Python slicing would silently mangle
        # (From=0 becomes index -1, To<From yields an empty sequence).
        if first < 1 or end < first or first > seq_len:
            print(
                f"Warning: row {row_number} ({seq_id}) range {first}-{end} is not valid "
                f"for a sequence of length {seq_len}; skipped"
            )
            continue
        if end > seq_len:
            print(
                f"Warning: row {row_number} ({seq_id}) end {end} exceeds sequence "
                f"length {seq_len}; clamped to {seq_len}"
            )
            end = seq_len

        sub_seq = full_seq[first - 1:end]  # convert to a 0-based, half-open slice
        if strand == '-':
            sub_seq = sub_seq.reverse_complement()

        record_id = f"{seq_id}_{first}_{end}_{strand}"
        extracted_fasta.append(f">{record_id}\n{sub_seq}")
        extracted_records.append({
            'record_id': record_id,
            'seq_id': seq_id,
            'start': first,
            'end': end,
            'strand': strand,
            'sequence': str(sub_seq)
        })

    # Save output; every record (including the last) ends with a newline.
    with open(output_fasta_path, "w") as f:
        f.writelines(f"{entry}\n" for entry in extracted_fasta)

    # Explicit columns keep the CSV header even when nothing was extracted.
    pd.DataFrame(
        extracted_records,
        columns=['record_id', 'seq_id', 'start', 'end', 'strand', 'sequence'],
    ).to_csv(output_csv_path, index=False)

    # Offer files for download
    files.download(output_fasta_path)
    files.download(output_csv_path)

# Main execution: loop over all FASTA files
def process_all_fastas(coords_csv_path):
    """Run `extract_sequences` on every FASTA file in the current working directory.

    A file is treated as input when its name ends with ".fasta" or ".fa".
    Files whose base name ends with "_extracted" are skipped: those are the
    outputs this function writes next to the inputs, and picking them up on a
    re-run would produce "*_extracted_extracted.*" files.

    Args:
        coords_csv_path: Path of the coordinates CSV passed to each
            `extract_sequences` call.

    Side effects:
        Prints a progress line per input file; for an input "<base>.<ext>" the
        results go to "<base>_extracted.fasta" and "<base>_extracted.csv".
    """
    generated_suffix = "_extracted"

    # Sort so that the processing order does not depend on the filesystem.
    for file in sorted(os.listdir()):
        if not file.endswith((".fasta", ".fa")):
            continue
        if not os.path.isfile(file):
            continue  # a directory that merely looks like a FASTA name
        base = os.path.splitext(file)[0]
        if base.endswith(generated_suffix):
            continue  # output of a previous run, not an input
        print(f"Processing {file}...")
        extract_sequences(
            fasta_path=file,
            coords_csv_path=coords_csv_path,
            output_fasta_path=f"{base}{generated_suffix}.fasta",
            output_csv_path=f"{base}{generated_suffix}.csv"
        )

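# Usage:
# First upload all of your FASTA files and your coordinates CSV to the working directory.
# The CSV needs the columns 'Sample-id', 'From' and 'To'; a 'strand' column is optional.
# Then call the line below, adjusting the filename if needed (file names are case-sensitive):
# process_all_fastas("Coordinates.csv")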


In [22]:
# Process every .fasta/.fa file in the current working directory, using Coordinates.csv as the coordinates table.
process_all_fastas("Coordinates.csv")

Processing Reference-3.fa...


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Processing Reference-1.fa...


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Processing output_sequences.fasta...


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>