In [1]:
# import request
import re
from itertools import islice

In [2]:
gene_pfam_list = []

current_pfam = ""

with open("../raw/Pfam-A.seed") as file:
    for line in file:
        line = line.strip()
        if (line.startswith("#=GF AC")):
            current_pfam = line.split(" ")[-1]
            
        elif (line.startswith("#=GS ") and "AC " in line):

            match = re.match(r"#=GS\s+(\S+?)/(\d+-\d+)\s+AC\s+(\S+)", line)
            if match:
                residue_range = match.group(2).split("-")  
                accession = match.group(3)      
            gene_pfam_list.append( [accession, current_pfam, int(residue_range[0]), int(residue_range[1])])

In [3]:
for i in gene_pfam_list[:10]:
    print(i)

print(len(gene_pfam_list))

['A1AL94.1', 'PF10417.14', 160, 195]
['A0A0F7V1V9.1', 'PF10417.14', 243, 280]
['Q4UGC8.1', 'PF10417.14', 223, 258]
['A4H879.1', 'PF10417.14', 162, 198]
['A0A0M5IXP6.1', 'PF10417.14', 166, 205]
['Q1QPP6.1', 'PF10417.14', 169, 202]
['A0A8G2FVQ7.1', 'PF10417.14', 153, 192]
['Q30Y90.1', 'PF10417.14', 159, 202]
['Q9Y9L0.1', 'PF10417.14', 160, 203]
['Q5JF30.1', 'PF10417.14', 155, 198]
1404649


In [16]:
unique_genes = set()
for i in gene_pfam_list:
    unique_genes.add(i[0])

unique_genes = list(unique_genes)
print(len(unique_genes))

1247434


In [8]:
unique_genes = list(unique_genes)

chunk_size = 100000
for idx in range(0, len(unique_genes), chunk_size):
    chunk = unique_genes[idx:idx + chunk_size]
    filename = f"unique_genes_chunk_{idx // chunk_size + 1}.txt"
    with open(filename, "w") as f:
        f.write(" ".join(chunk))


In [15]:
import requests
import os

def fetch_uniprot_fasta_batch(accessions, batch_size=10000, output_prefix="sequences_batch"):
    url = "https://rest.uniprot.org/uniprotkb/stream"
    # os.makedirs("batches", exist_ok=True)  # Save files in a folder named 'batches'

    for i in range(0, int(len(accessions)/50), batch_size):
        batch = accessions[i:i + batch_size]
        params = {
            "format": "fasta",
            "query": " OR ".join(batch)
        }
        response = requests.get(url, params=params)

        batch_number = i // batch_size + 1
        output_file = f"sequences/{output_prefix}_{batch_number}.fasta"

        if response.ok:
            with open(output_file, "w") as f:
                f.write(response.text)
            print(f"Batch {batch_number} saved to {output_file} ({len(batch)} entries).")
        else:
            print(f"Failed to retrieve batch {batch_number}: {response.status_code}")



fetch_uniprot_fasta_batch(unique_genes, batch_size=1000)

Batch 1 saved to sequences/sequences_batch_1.fasta (1000 entries).
Batch 2 saved to sequences/sequences_batch_2.fasta (1000 entries).
Batch 3 saved to sequences/sequences_batch_3.fasta (1000 entries).
Batch 4 saved to sequences/sequences_batch_4.fasta (1000 entries).
Batch 5 saved to sequences/sequences_batch_5.fasta (1000 entries).
Batch 6 saved to sequences/sequences_batch_6.fasta (1000 entries).
Batch 7 saved to sequences/sequences_batch_7.fasta (1000 entries).
Batch 8 saved to sequences/sequences_batch_8.fasta (1000 entries).
Batch 9 saved to sequences/sequences_batch_9.fasta (1000 entries).
Batch 10 saved to sequences/sequences_batch_10.fasta (1000 entries).
Batch 11 saved to sequences/sequences_batch_11.fasta (1000 entries).
Batch 12 saved to sequences/sequences_batch_12.fasta (1000 entries).
Batch 13 saved to sequences/sequences_batch_13.fasta (1000 entries).
Batch 14 saved to sequences/sequences_batch_14.fasta (1000 entries).
Batch 15 saved to sequences/sequences_batch_15.fasta

In [37]:
from Bio import SeqIO
import pandas as pd
from Bio.SeqIO.FastaIO import SimpleFastaParser

with open('sequences/sequences_batch_10.fasta') as fasta_file:  # Will close handle cleanly
    identifiers = []
    sequences = []
    for title, sequence in SimpleFastaParser(fasta_file):
        identifiers.append(title.split("|")[1])
        sequences.append(sequence)


In [27]:
unique_genes[:10]

['D9QDE2.1',
 'A0A1H3DFC8.1',
 'M9TCE6.1',
 'Q5KFS3.1',
 'S2JEP5.1',
 'A0A0K1XCU9.1',
 'K7ENQ8.2',
 'A9CHP7.1',
 'Q9Y959.2',
 'A0A4P7JMD4.1']

In [38]:
print(len(identifiers))

print(len(set(identifiers)))

951
951


In [41]:
from Bio.SeqIO.FastaIO import SimpleFastaParser
import pandas as pd

identifiers = []
sequences = []

# Loop through batch files 1 to 10
for i in range(1, 11):
    filepath = f'sequences/sequences_batch_{i}.fasta'
    with open(filepath) as fasta_file:
        for title, sequence in SimpleFastaParser(fasta_file):
            identifiers.append(title.split("|")[1])
            sequences.append(sequence)

# Create DataFrame
df = pd.DataFrame({'id': identifiers, 'sequence': sequences})

df['pfams'] = df['sequence'].apply(lambda seq: [None] * len(seq))

In [42]:
df

Unnamed: 0,id,sequence,pfams
0,B6HMS2,MGEVFLDKTISLLSSDKQKARSDGLAVFHLPRQTLNDKACHKIFES...,"[None, None, None, None, None, None, None, Non..."
1,O13688,MNASNNISKFPDLDNSSKLIDHILDSDDSEELDELPDISSLVPSAR...,"[None, None, None, None, None, None, None, Non..."
2,O13710,MDGLRPSKRRKSNPLYSDYALGSIVRIKLVNFVTYDYCELFPGPYL...,"[None, None, None, None, None, None, None, Non..."
3,O14646,MNGHSDEESVRNSSGESSQSDDDSGSASGSGSGSSSGSSSDGSSSQ...,"[None, None, None, None, None, None, None, Non..."
4,O60341,MLSGKKAAAAAAAAAAAATGTEAGPGTAGGSENGSEVAAQPAGLSG...,"[None, None, None, None, None, None, None, Non..."
...,...,...,...
9324,W1QJR7,MPPKKAQKAQKDAEKKKNAKKVQKQLEDKTFGLKNKNKSKKVQQYV...,"[None, None, None, None, None, None, None, Non..."
9325,W5TR13,MAMPDDSEPPPRTRRRTRITLTGDGPALIEGPVELLTADGRTVRSD...,"[None, None, None, None, None, None, None, Non..."
9326,W7YA85,MKKSKLKVKKHLNRYERFIGHTYVFLMFAAILGSCTYFLLKQNKDL...,"[None, None, None, None, None, None, None, Non..."
9327,W8KFH6,MLLKIRDKASGWFAYAIIIMITIPFALWGVHEYFGGGARLVAAEVN...,"[None, None, None, None, None, None, None, Non..."


In [46]:
# Populate pfams

for id, pfam, start, end in gene_pfam_list[:10000]:
    # Get the index of the row where 'id' matches
    idx = df[df['id'] == id].index
    if not idx.empty:
        i = idx[0]  # assuming 'id' is unique
        df.at[i, 'pfams'][start:end] = [pfam] * (end - start)


In [48]:
view(df)

NameError: name 'view' is not defined

In [49]:
import itables

itables.init_notebook_mode()


ModuleNotFoundError: No module named 'itables'