In [6]:
!pip install biopython




[notice] A new release of pip is available: 24.3.1 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip


In [None]:
# Colab-only module; this import fails outside a Google Colab runtime.
from google.colab import files
# Ask the user to upload a file. NOTE(review): `uploaded` is not read again in
# this cell and the FASTA path further down is hard-coded, so presumably the
# uploaded file must carry exactly that name — confirm.
uploaded = files.upload()
from Bio import SeqIO

import re

import itertools

import pandas as pd



# Read the FASTA file
fasta_file = "uniprotkb_stress_response_gene_AND_revi_2025_07_04.fasta"

# Parallel lists: sequences[k] and gene_names[k] describe the same record.
sequences = []
gene_names = []

for record in SeqIO.parse(fasta_file, "fasta"):
    # Sequences are stored upper-cased so later residue matching is case-insensitive.
    sequences.append(str(record.seq).upper())

    # Capture the token that follows "GN=" in the header; None when absent.
    gn_match = re.search(r"GN=([^ ]+)", record.description)
    gene_names.append(gn_match.group(1) if gn_match else None)

print(f"Total sequences read: {len(sequences)}")
print("First 3 sequences preview:")
for i, seq in enumerate(sequences[:3]):
    print(f"Seq{i+1} length={len(seq)}, first 30 aa: {seq[:30]}...")

print("\nFirst 3 extracted gene names:")
print(gene_names[:3])



# Step 2: Amino acid composition
# One-letter residue codes, in alphabetical order; this order fixes the
# column order of the single-residue features.
amino_acids = [residue for residue in "ACDEFGHIKLMNPQRSTVWY"]
print("\nList of amino acids considered:", amino_acids)



def aa_composition(seq, residues=None):
    """Return the fraction of ``seq`` made up by each residue.

    Parameters
    ----------
    seq : str
        Protein sequence to analyse.
    residues : iterable of str, optional
        Residue symbols to report, in output order. When omitted, the
        module-level ``amino_acids`` list is used.

    Returns
    -------
    dict
        Maps each residue to ``seq.count(residue) / len(seq)``. Characters of
        ``seq`` that are not in ``residues`` still count toward the length, so
        the values can sum to less than 1. An empty ``seq`` yields 0.0 for
        every residue.
    """
    if residues is None:
        residues = amino_acids

    total_len = len(seq)
    if total_len == 0:
        # Nothing to count; avoid dividing by zero on an empty record.
        return {aa: 0.0 for aa in residues}

    return {aa: seq.count(aa) / total_len for aa in residues}



# Sanity check: residue fractions for the first parsed sequence.
print("\nAmino acid composition example for first sequence:")
first_sequence = sequences[0]
print(aa_composition(first_sequence))



# Step 3: Dipeptide composition
# Every ordered pair of residues from amino_acids, with the first residue
# varying slowest; this order fixes the column order of the dipeptide features.
dipeptides = list(map(''.join, itertools.product(amino_acids, repeat=2)))
print(f"\nTotal dipeptides considered: {len(dipeptides)}")
print("First 10 dipeptides:", dipeptides[:10])



def dipeptide_composition(seq, pairs=None):
    """Return the fraction of overlapping residue pairs in ``seq`` per dipeptide.

    Parameters
    ----------
    seq : str
        Protein sequence to analyse.
    pairs : iterable of str, optional
        Two-letter dipeptides to report, in output order. When omitted, the
        module-level ``dipeptides`` list is used.

    Returns
    -------
    dict
        Maps each dipeptide to ``count / (len(seq) - 1)``, where the count is
        taken over all overlapping windows of width 2. Windows that are not in
        ``pairs`` are not counted but still contribute to the denominator.
        A sequence shorter than two residues has no windows and yields 0.0
        for every dipeptide.
    """
    if pairs is None:
        pairs = dipeptides

    counts = {dp: 0 for dp in pairs}
    total_dipeptides = len(seq) - 1
    if total_dipeptides <= 0:
        # Length 1 would divide by zero; length 0 would divide by -1.
        return {dp: 0.0 for dp in counts}

    for start in range(total_dipeptides):
        window = seq[start:start + 2]
        if window in counts:
            counts[window] += 1

    return {dp: hits / total_dipeptides for dp, hits in counts.items()}



# Sanity check: dipeptide fractions for the first parsed sequence.
print("\nDipeptide composition example for first sequence (first 10 values):")
dp_test = dipeptide_composition(sequences[0])
# Show only the first ten entries, in dict insertion order.
print(dict(itertools.islice(dp_test.items(), 10)))



# Step 4: Create DataFrame for all sequences
print("\nGenerating full feature table...")

all_features = []
for idx, seq in enumerate(sequences):
    # One row per sequence: single-residue fractions first, then dipeptide fractions.
    features = {**aa_composition(seq), **dipeptide_composition(seq)}
    all_features.append(features)

    if idx < 3:  # Print first 3 feature sets
        print(f"\nFeature vector preview for sequence {idx+1}:")
        print(dict(itertools.islice(features.items(), 10)))  # First 10 features only

# Build the frame once from the list of row dicts, then put the gene name
# (None where the header had no "GN=" token) in the first column.
df_features = pd.DataFrame(all_features)
df_features.insert(0, "GeneName", gene_names)



# Step 5: Save and final preview
output_file = "protein_features.csv"
# index=False keeps the positional row index out of the CSV.
df_features.to_csv(output_file, index=False)

print(f"\nFeature table shape: {df_features.shape}")
print("First 5 rows of feature table:")
print(df_features.head())

print(f"\nFeatures saved to: {output_file}")

# `files` is already imported at the top of this cell; the re-import is harmless.
from google.colab import files

# Download the file that was just written. Reusing output_file keeps the saved
# path and the downloaded path from drifting apart if the name is changed.
files.download(output_file)