In [5]:
from tqdm import tqdm
import pandas as pd
import requests
import time
import re

In [4]:
df = pd.read_csv("df_Intensity_DEinfo.csv")

def fetch_sequence(uniprot_id, retries=3, delay=2):
    url = f"https://rest.uniprot.org/uniprotkb/{uniprot_id}.fasta"
    for attempt in range(retries):
        try:
            r = requests.get(url, timeout=10)
            if r.ok:
                lines = r.text.split('\n')
                return ''.join(lines[1:])
            else:
                print(f"HTTP error {r.status_code} for {uniprot_id}")
        except requests.exceptions.RequestException as e:
            print(f"Connection error for {uniprot_id}: {e}")
            time.sleep(delay)
    return None

sequence_cache = {}
windows = []

for _, row in tqdm(df.iterrows(), total=len(df), desc="Processing"):
    uniprot_id = row['MajorProtein']
    pos = int(row['Position'])

    if uniprot_id not in sequence_cache:
        sequence_cache[uniprot_id] = fetch_sequence(uniprot_id)
        time.sleep(0.5)  # Throttle requests

    seq = sequence_cache[uniprot_id]
    if seq:
        start = max(0, pos - 11)
        end = min(len(seq), pos + 10)
        windows.append(seq[start:end])
    else:
        windows.append("Sequence not found")

df['SequenceWindow'] = windows
df.to_csv("phospho_with_sequence_windows.csv", index=False)
print("✅ Done! Saved to phospho_with_sequence_windows.csv")


Processing:  40%|█████████████████████████▏                                     | 6259/15685 [29:37<1:06:12,  2.37it/s]

HTTP error 400 for 36616
HTTP error 400 for 36616
HTTP error 400 for 36616


Processing:  72%|██████████████████████████████████████████████                  | 11293/15685 [53:00<18:10,  4.03it/s]

HTTP error 404 for Q91V81-2
HTTP error 404 for Q91V81-2
HTTP error 404 for Q91V81-2


Processing: 100%|██████████████████████████████████████████████████████████████| 15685/15685 [1:15:11<00:00,  3.48it/s]


✅ Done! Saved to phospho_with_sequence_windows.csv


In [14]:
# Define stricter regex to check only the central site (PKA motif around phosphosite)
# Expect motif: R-R/K-X-S/T
pka_regex = re.compile(r"[RK][RK].{0,1}[ST][^P]?")

# Function to check only the central S/T for PKA motif
def is_pka_motif_centered(seq):
    if isinstance(seq, str) and len(seq) >= 21:
        center_index = 10  # 0-based index for the central residue
        # Extract 4 aa before and 3 after: total 8 aa around the phosphosite
        sub = seq[center_index - 4: center_index + 4]

        # Ensure the center is S or T (the phospho site)
        if seq[center_index] in ['S', 'T']:
            # Check if this window matches the PKA motif
            return "Yes" if pka_regex.search(sub) else "No"
    return "No"

# Apply to DataFrame
df['IsPKASite'] = df['SequenceWindow'].apply(is_pka_motif_centered)


In [16]:
df.to_csv("phospho_with_pka_annotation.csv", index=False)