In [5]:
# import request
import re
from itertools import islice

In [6]:
gene_pfam_list = []

current_pfam = ""

with open("../raw/Pfam-A.seed") as file:
    for line in file:
        line = line.strip()
        if (line.startswith("#=GF AC")):
            current_pfam = line.split(" ")[-1]
            
        elif (line.startswith("#=GS ") and "AC " in line):

            match = re.match(r"#=GS\s+(\S+?)/(\d+-\d+)\s+AC\s+(\S+)", line)
            if match:
                residue_range = match.group(2).split("-")  
                accession = match.group(3)      
            gene_pfam_list.append( [accession, current_pfam, int(residue_range[0]), int(residue_range[1])])

In [3]:
for i in gene_pfam_list[:10]:
    print(i)

print(len(gene_pfam_list))

['A1AL94.1', 'PF10417.14', 160, 195]
['A0A0F7V1V9.1', 'PF10417.14', 243, 280]
['Q4UGC8.1', 'PF10417.14', 223, 258]
['A4H879.1', 'PF10417.14', 162, 198]
['A0A0M5IXP6.1', 'PF10417.14', 166, 205]
['Q1QPP6.1', 'PF10417.14', 169, 202]
['A0A8G2FVQ7.1', 'PF10417.14', 153, 192]
['Q30Y90.1', 'PF10417.14', 159, 202]
['Q9Y9L0.1', 'PF10417.14', 160, 203]
['Q5JF30.1', 'PF10417.14', 155, 198]
1404649


In [7]:
unique_genes = set()
for i in gene_pfam_list:
    unique_genes.add(i[0])

unique_genes = list(unique_genes)
print(len(unique_genes))

1247434


In [8]:
unique_genes = list(unique_genes)

chunk_size = 100000
for idx in range(0, len(unique_genes), chunk_size):
    chunk = unique_genes[idx:idx + chunk_size]
    filename = f"unique_genes_chunk_{idx // chunk_size + 1}.txt"
    with open(filename, "w") as f:
        f.write(" ".join(chunk))


In [None]:
import requests
import os

def fetch_uniprot_fasta_batch(accessions, batch_size=10000, output_prefix="sequences_batch"):
    url = "https://rest.uniprot.org/uniprotkb/stream"
    # os.makedirs("batches", exist_ok=True)  # Save files in a folder named 'batches'

    for i in range(480*1000, int(len(accessions)), batch_size):
        batch = accessions[i:i + batch_size]
        params = {
            "format": "fasta",
            "query": " OR ".join(batch)
        }
        response = requests.get(url, params=params)

        batch_number = i // batch_size + 1
        output_file = f"sequences/{output_prefix}_{batch_number}.fasta"

        if response.ok:
            with open(output_file, "w") as f:
                f.write(response.text)
            print(f"Batch {batch_number} saved to {output_file} ({len(batch)} entries).")
        else:
            print(f"Failed to retrieve batch {batch_number}: {response.status_code}")



fetch_uniprot_fasta_batch(unique_genes, batch_size=1000)

Batch 481 saved to sequences/sequences_batch_481.fasta (1000 entries).
Batch 482 saved to sequences/sequences_batch_482.fasta (1000 entries).
Batch 483 saved to sequences/sequences_batch_483.fasta (1000 entries).
Batch 484 saved to sequences/sequences_batch_484.fasta (1000 entries).
Batch 485 saved to sequences/sequences_batch_485.fasta (1000 entries).
Batch 486 saved to sequences/sequences_batch_486.fasta (1000 entries).
Batch 487 saved to sequences/sequences_batch_487.fasta (1000 entries).
Batch 488 saved to sequences/sequences_batch_488.fasta (1000 entries).
Batch 489 saved to sequences/sequences_batch_489.fasta (1000 entries).
Batch 490 saved to sequences/sequences_batch_490.fasta (1000 entries).
Batch 491 saved to sequences/sequences_batch_491.fasta (1000 entries).
Batch 492 saved to sequences/sequences_batch_492.fasta (1000 entries).
Batch 493 saved to sequences/sequences_batch_493.fasta (1000 entries).
Batch 494 saved to sequences/sequences_batch_494.fasta (1000 entries).
Batch 

In [37]:
from Bio import SeqIO
import pandas as pd
from Bio.SeqIO.FastaIO import SimpleFastaParser

with open('sequences/sequences_batch_10.fasta') as fasta_file:  # Will close handle cleanly
    identifiers = []
    sequences = []
    for title, sequence in SimpleFastaParser(fasta_file):
        identifiers.append(title.split("|")[1])
        sequences.append(sequence)


In [27]:
unique_genes[:10]

['D9QDE2.1',
 'A0A1H3DFC8.1',
 'M9TCE6.1',
 'Q5KFS3.1',
 'S2JEP5.1',
 'A0A0K1XCU9.1',
 'K7ENQ8.2',
 'A9CHP7.1',
 'Q9Y959.2',
 'A0A4P7JMD4.1']

In [38]:
print(len(identifiers))

print(len(set(identifiers)))

951
951


In [34]:
from Bio.SeqIO.FastaIO import SimpleFastaParser
import pandas as pd

identifiers = []
sequences = []

# Loop through batch files 1 to 10
for i in range(1, 11):
    filepath = f'sequences/sequences_batch_{i}.fasta'
    with open(filepath) as fasta_file:
        for title, sequence in SimpleFastaParser(fasta_file):
            identifiers.append(title.split("|")[1])
            sequences.append(sequence)

# Create DataFrame
df = pd.DataFrame({'id': identifiers, 'sequence': sequences})

df['pfams'] = df['sequence'].apply(lambda seq: [None] * len(seq))

df.set_index('id', inplace=True)

In [35]:
df

Unnamed: 0_level_0,sequence,pfams
id,Unnamed: 1_level_1,Unnamed: 2_level_1
B6HMS2,MGEVFLDKTISLLSSDKQKARSDGLAVFHLPRQTLNDKACHKIFES...,"[None, None, None, None, None, None, None, Non..."
O13688,MNASNNISKFPDLDNSSKLIDHILDSDDSEELDELPDISSLVPSAR...,"[None, None, None, None, None, None, None, Non..."
O13710,MDGLRPSKRRKSNPLYSDYALGSIVRIKLVNFVTYDYCELFPGPYL...,"[None, None, None, None, None, None, None, Non..."
O14646,MNGHSDEESVRNSSGESSQSDDDSGSASGSGSGSSSGSSSDGSSSQ...,"[None, None, None, None, None, None, None, Non..."
O60341,MLSGKKAAAAAAAAAAAATGTEAGPGTAGGSENGSEVAAQPAGLSG...,"[None, None, None, None, None, None, None, Non..."
...,...,...
W1QJR7,MPPKKAQKAQKDAEKKKNAKKVQKQLEDKTFGLKNKNKSKKVQQYV...,"[None, None, None, None, None, None, None, Non..."
W5TR13,MAMPDDSEPPPRTRRRTRITLTGDGPALIEGPVELLTADGRTVRSD...,"[None, None, None, None, None, None, None, Non..."
W7YA85,MKKSKLKVKKHLNRYERFIGHTYVFLMFAAILGSCTYFLLKQNKDL...,"[None, None, None, None, None, None, None, Non..."
W8KFH6,MLLKIRDKASGWFAYAIIIMITIPFALWGVHEYFGGGARLVAAEVN...,"[None, None, None, None, None, None, None, Non..."


In [39]:
from collections import defaultdict

# Step 1: Group updates by id
updates = defaultdict(list)
for id, pfam, start, end in gene_pfam_list:
    updates[id].append((start, end, pfam))

print("grouped")

# Step 2: Apply all updates per id
for id, changes in updates.items():
    # Ensure we're using the correct ID without the extra parts after '.'
    id_key = id.split('.')[0]

    if id_key in df.index:
        pfam_list = df.at[id_key, 'pfams']
        
        # Ensure pfam_list is a list
        if isinstance(pfam_list, list):
            for start, end, pfam in changes:
                # Ensure indices are within bounds
                if 0 <= start < end <= len(pfam_list):
                    # Safely modify the list slice
                    pfam_list[start:end] = [pfam] * (end - start)
                else:
                    print(f"Skipping invalid range: {id_key}, start={start}, end={end}, len(pfam_list)={len(pfam_list)}")
        else:
            print(f"Warning: 'pfams' for id={id_key} is not a list!")



grouped


In [51]:
sequences = df.loc["Q8NHU6"]["sequence"]

sequence_lengths = sequences.apply(len)
print(sequence_lengths)

id
Q8NHU6    1098
Q8NHU6    1098
Name: sequence, dtype: int64


In [41]:
non_empty_pfams = df[df['pfams'].apply(lambda pfams: any(p is not None for p in pfams))]
print(non_empty_pfams)


                                                 sequence  \
id                                                          
B6HMS2  MGEVFLDKTISLLSSDKQKARSDGLAVFHLPRQTLNDKACHKIFES...   
O13688  MNASNNISKFPDLDNSSKLIDHILDSDDSEELDELPDISSLVPSAR...   
O14646  MNGHSDEESVRNSSGESSQSDDDSGSASGSGSGSSSGSSSDGSSSQ...   
O60341  MLSGKKAAAAAAAAAAAATGTEAGPGTAGGSENGSEVAAQPAGLSG...   
P0A6E6  MAMTYHLDVVSAEQQMFSGLVEKIQVTGSEGELGIYPGHAPLLTAI...   
...                                                   ...   
W1QJR7  MPPKKAQKAQKDAEKKKNAKKVQKQLEDKTFGLKNKNKSKKVQQYV...   
W5TR13  MAMPDDSEPPPRTRRRTRITLTGDGPALIEGPVELLTADGRTVRSD...   
W7YA85  MKKSKLKVKKHLNRYERFIGHTYVFLMFAAILGSCTYFLLKQNKDL...   
W8KFH6  MLLKIRDKASGWFAYAIIIMITIPFALWGVHEYFGGGARLVAAEVN...   
X0PSY2  MRTSKPYARLTKILITVLIVVFSILIAGGFLIFKNEAPRPAKIVNT...   

                                                    pfams  
id                                                         
B6HMS2  [None, None, None, None, None, None, PF11640.1...  
O13688  [None, PF08691.15,