In [None]:
import requests
import math

def get_protein_coding_transcripts(gene_id):
    server = "https://rest.ensembl.org"
    ext = f"/lookup/id/{gene_id}?expand=1"

    headers = {
        "Content-Type": "application/json"
    }

    response = requests.get(server + ext, headers=headers)

    if not response.ok:
        response.raise_for_status()
        return

    decoded = response.json()

    if 'Transcript' not in decoded:
        print("No transcripts found for this gene.")
        return

    protein_coding_transcripts = []

    for transcript in decoded['Transcript']:
        if transcript['biotype'] == 'protein_coding':
            protein_coding_transcripts.append(transcript['id'])

    return protein_coding_transcripts

def get_protein_id(transcript_id):
    url = f"https://rest.ensembl.org/lookup/id/{transcript_id}?expand=1"
    headers = {
        "Content-Type": "application/json"
    }

    response = requests.get(url, headers=headers)

    if not response.ok:
        response.raise_for_status()

    data = response.json()

    if "Translation" in data:
        protein_id = data["Translation"]["id"]
    return protein_id

# Function to get protein coding domains with Superfamily IDs
def get_protein_coding_domains(protein_id):
    server = "https://rest.ensembl.org"
    ext = f"/overlap/translation/{protein_id}?type=Superfamily"
    headers = {"Content-Type": "application/json"}

    response = requests.get(server + ext, headers=headers)

    if not response.ok:
        print(f"Error {response.status_code}: {response.reason}")
        return None

    data = response.json()

    # Extract start and end positions and Superfamily IDs
    positions = [(item['start'], item['end'], item['id']) for item in data]

    # Sort positions by start value
    positions.sort(key=lambda x: x[0])

    return positions

# Function to get CDS positions
def get_cds_positions(transcript_id):
    server = "http://rest.ensembl.org"
    ext = f"/overlap/id/{transcript_id}?feature=cds"
    headers = {"Content-Type": "application/json"}

    response = requests.get(server + ext, headers=headers)
    if not response.ok:
        response.raise_for_status()

    cds_positions = []
    for entry in response.json():
        if entry['feature_type'] == 'cds' and entry['Parent'] == transcript_id:
            start = entry['start']
            end = entry['end']
            length = end - start + 1
            cds_positions.append({'start': start, 'end': end, 'length': length})

    return cds_positions

# Custom rounding function
def custom_round(value):
    decimal_part = value - int(value)
    if decimal_part < 0.5:
        return math.floor(value)
    else:
        return math.ceil(value)

# Function to get rounded CDS positions
def get_rounded_cds_positions(cds_positions):
    rounded_cds_positions = []
    current_start = 1  # Start from 1

    for cds in cds_positions:
        length = cds['length']
        rounded_length = custom_round(length / 3)
        new_end = current_start + rounded_length - 1
        rounded_cds_positions.append({'start': current_start, 'end': new_end, 'rounded_length': rounded_length})
        current_start = new_end + 1

    return rounded_cds_positions

def compare_cds_with_domains(cds_positions, protein_positions, file):
    results = []
    for i, cds in enumerate(cds_positions):
        x1, x2 = cds['start'], cds['end']
        for j, (y1, y2, domain_id) in enumerate(protein_positions):
            #Вот этот кусок надо поменять и подумать после какого момента мы считаем что экзон входит или не входит в домен
            if x1 > y1 and x1 - y1 < 120:
                x1 = y1
            if x2 > y2 and x2 - y2 < 120:
                x2 = y2
            if x1 < y1 and y1 - x1 < 120:
                x1 = y1
            if x2 < y2 and y2 - x2 < 120:
                x2 = y2
        for j, (y1, y2, superfamily_id) in enumerate(protein_positions):
            if (y1 > x2) or (y2 < x1):
                file.write(f"CDS {i+1}, Superfamily {superfamily_id}, 1 (END)\n")
            elif (x1 < y2 and x2 > y2):
                file.write(f"CDS {i+1}, Superfamily {superfamily_id}, 3 (DXE)\n")
            elif (y1 < x1 and x2 < y2) or (y1 == x1 and x2 < y2) or (y1 < x1 and x2 == y2):
                file.write(f"CDS {i+1}, Superfamily {superfamily_id}, 4 (EID)\n")
            elif (y1 < x2 and x2 < y2):
                file.write(f"CDS {i+1}, Superfamily {superfamily_id}, 2 (EXD)\n")
            elif (y1 > x1 and y2 < x2) or (y1 == x1 and y2 < x2) or (y1 > x1 and y2 == x2):
                file.write(f"CDS {i+1}, Superfamily {superfamily_id}, 5 (DIE)\n")

# Function to combine both protein domains and CDS positions
def get_protein_and_cds_info(gene_id, transcript_id, protein_id, file_path):
    protein_positions = get_protein_coding_domains(protein_id)
    cds_positions = get_cds_positions(transcript_id)

    with open(file_path, 'a') as file:
        file.write(f"Gene ID: {gene_id}, Transcript ID: {transcript_id}\n")
        if protein_positions:
            file.write("Protein domains:\n")
            for start, end, superfamily_id in protein_positions:
                file.write(f"Superfamily ID: {superfamily_id}, Start: {start}, End: {end}\n")
        else:
            file.write("No protein coding domains found.\n")

        if cds_positions:
            rounded_cds_positions = get_rounded_cds_positions(cds_positions)
            compare_cds_with_domains(rounded_cds_positions, protein_positions, file)
        else:
            file.write("No CDS positions found.\n")

In [None]:
def parse_file(file):
    # Словарь для хранения списков по статусам
    status_dict = {
        "1 (END)": [],
        "2 (EXD)": [],
        "3 (DXE)": [],
        "4 (EID)": [],
        "5 (DIE)": []
    }

    # Чтение и обработка файла
    for line in file:
        parts = line.strip().split()
        if len(parts) >= 2:
            # Определение статуса
            status = parts[-2] + ' ' + parts[-1]
            if status in status_dict:
                status_dict[status].append(line.strip())

    return status_dict

def print_sorted_dict(file_path):
    with open(file_path, 'r') as file:
      status_dict = parse_file(file)

    status_order = ["1 (END)", "2 (EXD)", "3 (DXE)", "4 (EID)", "5 (DIE)"]

    for status in status_order:
        if status in status_dict:
            print(f"\nStatus: {status}")
            for item in status_dict[status]:
                print(item)

In [None]:
def coordinate_domains_with_repeats(gene_id, protein_id, file_path):
  # Lists of ALPHA and BETA Superfamily IDs
  alpha_superfamilies = {"SSF48371", "SSF48452", "SSF74788", "SSF48445", "SSF48403", "SSF140860"}
  beta_superfamilies = {
      "SSF50923", "SSF101898", "SSF69322", "SSF117281", "SSF63825", "SSF63829",
      "SSF50939", "SSF46955", "SSF101912", "SSF69687", "SSF69322", "SSF117289",
      "SSF75011", "SSF74650", "SSF50969", "SSF50974", "SSF50978", "SSF50985",
      "SSF50989", "SSF50993", "SSF69340", "SSF82171", "SSF50998"
  }

  domains = get_protein_coding_domains(protein_id)
  # Initialize sets for ALPHA, BETA, and both
  only_alpha = set()
  only_beta = set()
  both_alpha_beta = set()
  neither_alpha_beta = set()

  # Compare and categorize domains
  if len(domains) != 0:
      for start, end, superfamily_id in domains:
          if superfamily_id in alpha_superfamilies and superfamily_id in beta_superfamilies:
              both_alpha_beta.add(superfamily_id)
          elif superfamily_id in alpha_superfamilies:
              only_alpha.add(superfamily_id)
          elif superfamily_id in beta_superfamilies:
              only_beta.add(superfamily_id)
          else:
              neither_alpha_beta.add(superfamily_id)

  # Determine if both ALPHA and BETA are non-empty
  if only_alpha and only_beta:
      both_alpha_beta.update(only_alpha)
      both_alpha_beta.update(only_beta)
      only_alpha.clear()
      only_beta.clear()

  # Prepare the results in the required format
  result_alpha = "ALPHA:" if not only_alpha else f"Only in ALPHA Superfamily IDs: {gene_id},{protein_id},{len(only_alpha)}," + ",".join(only_alpha)
  result_beta = "BETA:" if not only_beta else f"Only in BETA Superfamily IDs: {gene_id},{protein_id},{len(only_beta)}," + ",".join(only_beta)
  result_both = f"ALPHA and BETA: {gene_id},{protein_id},{len(both_alpha_beta)}," + ",".join(both_alpha_beta) if both_alpha_beta else ""
  result_neither = "Neither ALPHA and BETA:" if not neither_alpha_beta else f"Neither ALPHA and BETA:{gene_id},{protein_id},{len(neither_alpha_beta)}," + ",".join(neither_alpha_beta)


  with open(file_path, "a") as result_file:
    if result_alpha:
      result_file.write(result_alpha)
      result_file.write('\n')
    elif result_beta:
      result_file.write(result_beta)
      result_file.write('\n')
    elif result_both:
      result_file.write(result_both)
      result_file.write('\n')
    elif result_neither:
      result_file.write(result_neither)
      result_file.write('\n')
    elif len(domains) == 0:
      result_file.write(f"no domains found or an error occured for {protein_id}")
      result_file.write('\n')

In [None]:
with open(f"genes_ens.txt", "r") as genes:
  gene_ids = genes.readlines()
  for i in range(len(gene_ids)):
    gene_id = gene_ids[i]
    if "\n" in gene_id:
      gene_id = gene_id.replace("\n", "")
    transcript_ids = get_protein_coding_transcripts(gene_id)
    for transcript_id in transcript_ids:
      protein_id = get_protein_id(transcript_id)
      coordinate_domains_with_repeats(gene_id, protein_id, "result_1.txt")

FileNotFoundError: [Errno 2] No such file or directory: 'genes_ens.txt'

In [None]:
#inesa
import csv

def save_pdb_and_superfamily_ids(input_file, output_file):
    with open(input_file, mode='r', encoding='utf-8') as csv_file, \
         open(output_file, mode='w', encoding='utf-8') as out_file:
        csv_reader = csv.reader(csv_file)
        csv_writer = csv.writer(out_file, delimiter=' ')
        next(csv_reader)
        for row in csv_reader:
            if len(row) > 14:
                pdb_id = row[2].strip()
                sf_id = row[14].strip()[3:]
                csv_writer.writerow([pdb_id, sf_id])

input_file = 'repeats_data_scop-1.csv'
output_file = 'pdb_and_superfamily_ids.txt'

save_pdb_and_superfamily_ids(input_file, output_file)

print(f"Данные сохранены в файл {output_file}")

FileNotFoundError: [Errno 2] No such file or directory: 'repeats_data_scop-1.csv'

In [None]:
def clean_superfamily_ids(input_file, output_file):
    with open(input_file, mode='r', encoding='utf-8') as in_file, \
         open(output_file, mode='w', encoding='utf-8') as out_file:
        for line in in_file:
            cleaned_id = line.strip().replace("SF=", "")
            if cleaned_id:
                out_file.write(f"{cleaned_id}\n")

input_file = 'only_reps_sf_ids.txt'
output_file = 'cleaned_reps_sf_ids.txt'

clean_superfamily_ids(input_file, output_file)
print(f"Очищенные SuperFamily IDs сохранены в файл {output_file}")

In [None]:
def clean_superfamily_ids(input_file, output_file):
    with open(input_file, mode='r', encoding='utf-8') as in_file, \
         open(output_file, mode='w', encoding='utf-8') as out_file:
        for line in in_file:
            cleaned_id = line.strip().replace("SF=", "")
            if cleaned_id:
                out_file.write(f"{cleaned_id}\n")

def filter_matching_lines(cleaned_sf_file, pdb_sf_file, output_file):
    with open(cleaned_sf_file, mode='r', encoding='utf-8') as sf_file:
        superfamily_ids = set(line.strip() for line in sf_file if line.strip())

    with open(pdb_sf_file, mode='r', encoding='utf-8') as pdb_file, \
         open(output_file, mode='w', encoding='utf-8') as out_file:
        for line in pdb_file:
            parts = line.strip().split()
            if len(parts) == 2:
                pdb_id, sf_id = parts
                if sf_id in superfamily_ids:
                    out_file.write(f"{line.strip()}\n")

input_file = 'only_reps_sf_ids.txt'
cleaned_sf_file = 'cleaned_reps_sf_ids.txt'
pdb_sf_file = 'pdb_and_superfamily_ids.txt'
output_file = 'matched_pdb_sf_ids.txt'

clean_superfamily_ids(input_file, cleaned_sf_file)
filter_matching_lines(cleaned_sf_file, pdb_sf_file, output_file)

print(f"Очищенные SuperFamily IDs сохранены в файл {cleaned_sf_file}")
print(f"Совпадающие строки сохранены в файл {output_file}")


FileNotFoundError: [Errno 2] No such file or directory: 'only_reps_sf_ids.txt'

In [None]:
def filter_matching_pdb_ids(matched_pdb_file, repeats_file, output_file):
    with open(repeats_file, mode='r', encoding='utf-8') as repeats:
        pdb_ids = set(line.strip() for line in repeats if line.strip())

    with open(matched_pdb_file, mode='r', encoding='utf-8') as matched, \
         open(output_file, mode='w', encoding='utf-8') as out_file:
        for line in matched:
            parts = line.strip().split()
            if parts and parts[0] in pdb_ids:
                out_file.write(f"{parts[0]}\n")

matched_pdb_file = 'matched_pdb_sf_ids.txt'
repeats_file = 'repeats.txt'
output_file = 'common_pdb_sf_ids.txt'

filter_matching_pdb_ids(matched_pdb_file, repeats_file, output_file)

print(f"Совпадающие PDB IDs сохранены в файл {output_file}")



FileNotFoundError: [Errno 2] No such file or directory: 'repeats.txt'

In [None]:
#толка для чиловека
import requests

def pdb_to_ensembl(input_file, output_file):
    ensembl_url = "https://rest.ensembl.org/xrefs/symbol/homo_sapiens"

    with open(input_file, mode='r', encoding='utf-8') as infile, \
         open(output_file, mode='w', encoding='utf-8') as outfile:

        for line in infile:
            pdb_id = line.strip()
            if not pdb_id:
                continue

            response = requests.get(f"{ensembl_url}/{pdb_id}", headers={"Content-Type": "application/json"})

            if response.status_code == 200:
                data = response.json()
                if data:
                    for entry in data:
                        ensembl_id = entry.get("id")
                        if ensembl_id:
                            outfile.write(f"{pdb_id}\t{ensembl_id}\n")
                            break
                else:
                    outfile.write(f"{pdb_id}\tNOT_FOUND\n")
            else:
                print(f"Ошибка для PDB ID {pdb_id}: {response.status_code}")
                outfile.write(f"{pdb_id}\tERROR\n")

input_file = 'common_pdb_sf_ids.txt'
output_file = 'ensembl_ids.txt'

pdb_to_ensembl(input_file, output_file)
print(f"Преобразование завершено. Результаты сохранены в файл {output_file}")

FileNotFoundError: [Errno 2] No such file or directory: 'common_pdb_sf_ids.txt'

In [None]:
#для всеж
def pdb_to_ensembl_multi_species(input_file, output_file):
    ensembl_url = "https://rest.ensembl.org/xrefs/symbol"
    organisms = ["homo_sapiens", "mus_musculus", "danio_rerio"]

    with open(input_file, mode='r', encoding='utf-8') as infile, \
         open(output_file, mode='w', encoding='utf-8') as outfile:

        for line in infile:
            pdb_id = line.strip()
            if not pdb_id:
                continue

            found = False
            for organism in organisms:
                response = requests.get(f"{ensembl_url}/{organism}/{pdb_id}", headers={"Content-Type": "application/json"})
                if response.status_code == 200:
                    data = response.json()
                    if data:
                        for entry in data:
                            ensembl_id = entry.get("id")
                            if ensembl_id:
                                outfile.write(f"{pdb_id}\t{ensembl_id}\t{organism}\n")
                                found = True
                                break
                if found:
                    break

            if not found:
                outfile.write(f"{pdb_id}\tNOT_FOUND\n")

input_file = 'common_pdb_sf_ids.txt'
outputtt_file = 'ensembl_ids_multi_species.txt'

pdb_to_ensembl_multi_species(input_file, outputtt_file)
print(f"Преобразование завершено. Результаты сохранены в файл {outputtt_file}")


FileNotFoundError: [Errno 2] No such file or directory: 'common_pdb_sf_ids.txt'

In [None]:
#примерный план того что надо сделать с данными в этом коде

folder_with_tables_with_ensp_ids = ""

import pandas as pd

df = pd.read_csv("sf_id_table.csv")

proteins = list(df['Protein_ID'])

for i in range(len(proteins)):
  prot_id = gene_ids[i] #я туту уже не буду исправлять потому что и так понятно
  if "\n" in gene_id:
    gene_id = gene_id.replace("\n", "")
  transcript_ids = get_protein_coding_transcripts(gene_id)
  for transcript_id in transcript_ids:
    protein_id = get_protein_id(transcript_id)
    coordinate_domains_with_repeats(gene_id, protein_id, "result_1.txt")

#потенциальные проблемы этого скрипта:
#надо получить айдишник гена для того чтобы коодрдинаты можно было заметчить
#по идее когда они есть можно просто запустить эту ячейку

FileNotFoundError: [Errno 2] No such file or directory: 'sf_id_table.csv'

In [None]:
import zipfile
import os
import requests
import math

def extract_zip(zip_path, extract_to):
    with zipfile.ZipFile(zip_path, 'r') as zip_ref:
        zip_ref.extractall(extract_to)

def create_zip(directory, zip_name):
    with zipfile.ZipFile(zip_name, 'w') as zipf:
        for root, _, files in os.walk(directory):
            for file in files:
                zipf.write(os.path.join(root, file), arcname=file)

def process_files(input_dir, output_dir):
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    for filename in os.listdir(input_dir):
        input_path = os.path.join(input_dir, filename)
        output_path = os.path.join(output_dir, filename)

        if os.path.isfile(input_path):
            with open(output_path, 'w') as file:
                gene_id = filename.split('.')[0]  # Предположим, что ID гена - имя файла без расширения
                transcripts = get_protein_coding_transcripts(gene_id)
                if transcripts:
                    for transcript in transcripts:
                        protein_id = get_protein_id(transcript)
                        if protein_id:
                            get_protein_and_cds_info(gene_id, transcript, protein_id, output_path)

def get_protein_coding_transcripts(gene_id):
    server = "https://rest.ensembl.org"
    ext = f"/lookup/id/{gene_id}?expand=1"
    headers = {"Content-Type": "application/json"}
    response = requests.get(server + ext, headers=headers)
    if not response.ok:
        return []
    decoded = response.json()
    return [t['id'] for t in decoded.get('Transcript', []) if t['biotype'] == 'protein_coding']

def get_protein_id(transcript_id):
    url = f"https://rest.ensembl.org/lookup/id/{transcript_id}?expand=0"
    headers = {"Content-Type": "application/json"}
    response = requests.get(url, headers=headers)
    if not response.ok:
        return None
    data = response.json()
    return data.get("Translation", {}).get("id")

def get_protein_and_cds_info(gene_id, transcript_id, protein_id, file_path):
    with open(file_path, 'a') as file:
        file.write(f"Gene ID: {gene_id}, Transcript ID: {transcript_id}, Protein ID: {protein_id}\n")

# Пути
input_zip = "processed_tables.zip"
extracted_dir = "extracted_tables"
output_dir = "processed_results"
output_zip = "processed_results.zip"

# Выполнение
extract_zip(input_zip, extracted_dir)
process_files(extracted_dir, output_dir)
create_zip(output_dir, output_zip)

print(f"Обработка завершена. Результаты в {output_zip}")

FileNotFoundError: [Errno 2] No such file or directory: 'processed_tables.zip'

In [None]:
import requests
import math
import zipfile
import csv
import os
import pandas as pd

# def get_transcripts_from_protein(ensp_id):
#     url = f"https://rest.ensembl.org/overlap/id/{ensp_id}?feature=transcript"
#     headers = {"Content-Type": "application/json"}
#     response = requests.get(url, headers=headers)

#     if not response.ok:
#         response.raise_for_status()
#         return []

#     data = response.json()
#     transcripts = [entry['id'] for entry in data if entry['feature_type'] == 'transcript']
#     return transcripts

import requests, sys

def get_transcripts_from_protein(ensp_id):
  transcripts = ensp_id.replace("P", "T")
  return transcripts

def get_protein_coding_transcripts(transcript_id):
    url = f"https://rest.ensembl.org/lookup/id/{transcript_id}?expand=1"
    headers = {"Content-Type": "application/json"}
    response = requests.get(url, headers=headers)

    if not response.ok:
        response.raise_for_status()
        return []

    data = response.json()
    if data.get('biotype') == 'protein_coding':
        return [transcript_id]
    return []

def get_protein_id(transcript_id):
    url = f"https://rest.ensembl.org/lookup/id/{transcript_id}?expand=1"
    headers = {"Content-Type": "application/json"}
    response = requests.get(url, headers=headers)

    if not response.ok:
        response.raise_for_status()
        return None

    data = response.json()
    return data.get("Translation", {}).get("id")

def get_protein_coding_domains(protein_id):
    url = f"https://rest.ensembl.org/overlap/translation/{protein_id}?type=Superfamily"
    headers = {"Content-Type": "application/json"}
    response = requests.get(url, headers=headers)

    if not response.ok:
        response.raise_for_status()
        return []

    data = response.json()
    return [(item['start'], item['end'], item['id']) for item in data]

def get_cds_positions(transcript_id):
    url = f"https://rest.ensembl.org/overlap/id/{transcript_id}?feature=cds"
    headers = {"Content-Type": "application/json"}
    response = requests.get(url, headers=headers)

    if not response.ok:
        response.raise_for_status()
        return []

    data = response.json()
    return [{'start': entry['start'], 'end': entry['end'], 'length': entry['end'] - entry['start'] + 1} for entry in data if entry['feature_type'] == 'cds']

def custom_round(value):
    return math.floor(value) if value - int(value) < 0.5 else math.ceil(value)

def get_rounded_cds_positions(cds_positions):
    rounded_cds_positions = []
    current_start = 1
    for cds in cds_positions:
        rounded_length = custom_round(cds['length'] / 3)
        new_end = current_start + rounded_length - 1
        rounded_cds_positions.append({'start': current_start, 'end': new_end, 'rounded_length': rounded_length})
        current_start = new_end + 1
    return rounded_cds_positions

def compare_cds_with_domains(cds_positions, protein_positions, file):
    for i, cds in enumerate(cds_positions):
        x1, x2 = cds['start'], cds['end']
        for y1, y2, superfamily_id in protein_positions:
            if (y1 > x2) or (y2 < x1):
                file.write(f"CDS {i+1}, Superfamily {superfamily_id}, 1 (END)\n")
            elif (x1 < y2 and x2 > y2):
                file.write(f"CDS {i+1}, Superfamily {superfamily_id}, 3 (DXE)\n")
            elif (y1 < x1 and x2 < y2) or (y1 == x1 and x2 < y2) or (y1 < x1 and x2 == y2):
                file.write(f"CDS {i+1}, Superfamily {superfamily_id}, 4 (EID)\n")
            elif (y1 < x2 and x2 < y2):
                file.write(f"CDS {i+1}, Superfamily {superfamily_id}, 2 (EXD)\n")
            elif (y1 > x1 and y2 < x2) or (y1 == x1 and y2 < x2) or (y1 > x1 and y2 == x2):
                file.write(f"CDS {i+1}, Superfamily {superfamily_id}, 5 (DIE)\n")

file_df = pd.read_csv("161219_table (1).csv")
def collect_statistics(file_df):
  temp_folder = "temp_results"
  os.makedirs(temp_folder, exist_ok=True)
  proteins = file_df['Protein_ID']
  for ensp_id in proteins:
    print(ensp_id)
    transcript = get_transcripts_from_protein(ensp_id)
    print(transcript)
    coding_transcripts = get_protein_coding_transcripts(transcript)
#     for coding_transcript in coding_transcripts:
#         protein_id = get_protein_id(coding_transcript)
#         if protein_id:
#             output_file = os.path.join(temp_folder, f"{ensp_id}.txt")
#             with open(output_file, 'w') as out_file:
#                 get_protein_and_cds_info(ensp_id, coding_transcript, protein_id, out_file)

collect_statistics(file_df)
# def process_zip():
#     input_zip = "processed_tables.zip"
#     output_zip = "processed_results.zip"
#     temp_folder = "temp_results"
#     os.makedirs(temp_folder, exist_ok=True)

#     with zipfile.ZipFile(input_zip, 'r') as archive:
#         for filename in archive.namelist():
#             if filename.endswith('.csv'):
#                 with archive.open(filename) as file:
#                     csv_reader = csv.reader(file.read().decode('utf-8').splitlines())
#                     for row in csv_reader:
#                         for ensp_id in row:
#                             transcripts = get_transcripts_from_protein(ensp_id)
#                             for transcript in transcripts:
#                                 coding_transcripts = get_protein_coding_transcripts(transcript)
#                                 for coding_transcript in coding_transcripts:
#                                     protein_id = get_protein_id(coding_transcript)
#                                     if protein_id:
#                                         output_file = os.path.join(temp_folder, f"{ensp_id}.txt")
#                                         with open(output_file, 'w') as out_file:
#                                             get_protein_and_cds_info(ensp_id, coding_transcript, protein_id, out_file)

#     with zipfile.ZipFile(output_zip, 'w') as zipf:
#         for root, _, files in os.walk(temp_folder):
#             for file in files:
#                 zipf.write(os.path.join(root, file), arcname=file)

#     print(f"Processing complete. Results saved in {output_zip}")

# process_zip()


FileNotFoundError: [Errno 2] No such file or directory: '161219_table (1).csv'

In [None]:
from google.colab import drive
drive.mount('/content/drive')

MessageError: Error: credential propagation was unsuccessful

In [None]:
# import requests
import requests, sys

def get_transcript_id(ensp_id):

    server = "https://rest.ensembl.org"
    ext = f"/overlap/translation/{ensp_id}?type=Superfamily"

    r = requests.get(server+ext, headers={ "Content-Type" : "application/json"})

    if not r.ok:
      r.raise_for_status()
      sys.exit()

    decoded = r.json()
    print(repr(decoded[0]['Parent']))
    # transcript_ids = []
    # for entry in decoded:
    #     if 'type' in entry and entry['type'] == 'transcript':
    #         transcript_ids.append(entry['id'])

    # return transcript_ids

get_transcript_id("ENSP00000078429")

'ENST00000078429'


In [None]:
import requests, zipfile, csv

def get_transcript_id(ensp_id):
    server = "https://rest.ensembl.org"
    ext = f"/overlap/translation/{ensp_id}?type=Superfamily"
    response = requests.get(server + ext, headers={"Content-Type": "application/json"})

    if response.ok:
        decoded = response.json()
        if decoded:
            print(decoded[0].get('Parent', ''))

def process_zip(zip_path):
    with zipfile.ZipFile(zip_path, 'r') as z:
        for filename in z.namelist():
            if filename.endswith(".csv"):
                with z.open(filename) as f:
                    reader = csv.reader(f.read().decode("utf-8").splitlines())
                    for row in reader:
                        if row:
                            get_transcript_id(row[0].strip())

process_zip("processed_tables.zip")


FileNotFoundError: [Errno 2] No such file or directory: 'processed_tables.zip'

In [None]:
import requests, zipfile, csv, math

def get_transcript_id(ensp_id):
    url = f"https://rest.ensembl.org/overlap/translation/{ensp_id}?type=Superfamily"
    response = requests.get(url, headers={"Content-Type": "application/json"})
    if response.ok:
        decoded = response.json()
        if decoded:
            return decoded[0].get('Parent', '')
    return None

def get_protein_id(transcript_id):
    url = f"https://rest.ensembl.org/lookup/id/{transcript_id}?expand=1"
    response = requests.get(url, headers={"Content-Type": "application/json"})
    if response.ok:
        data = response.json()
        return data.get("Translation", {}).get("id", None)
    return None

def get_protein_coding_domains(protein_id):
    url = f"https://rest.ensembl.org/overlap/translation/{protein_id}?type=Superfamily"
    response = requests.get(url, headers={"Content-Type": "application/json"})
    if response.ok:
        data = response.json()
        return sorted([(item['start'], item['end'], item['id']) for item in data], key=lambda x: x[0])
    return []

def get_cds_positions(transcript_id):
    url = f"https://rest.ensembl.org/overlap/id/{transcript_id}?feature=cds"
    response = requests.get(url, headers={"Content-Type": "application/json"})
    if response.ok:
        return [{'start': entry['start'], 'end': entry['end'], 'length': entry['end'] - entry['start'] + 1}
                for entry in response.json() if entry['feature_type'] == 'cds']
    return []

def custom_round(value):
    return math.floor(value) if (value - int(value)) < 0.5 else math.ceil(value)

def get_rounded_cds_positions(cds_positions):
    rounded = []
    current_start = 1
    for cds in cds_positions:
        length = cds['length']
        rounded_length = custom_round(length / 3)
        new_end = current_start + rounded_length - 1
        rounded.append({'start': current_start, 'end': new_end, 'rounded_length': rounded_length})
        current_start = new_end + 1
    return rounded

def compare_cds_with_domains(cds_positions, protein_positions, file):
    for i, cds in enumerate(cds_positions):
        x1, x2 = cds['start'], cds['end']
        for j, (y1, y2, domain_id) in enumerate(protein_positions):
            if x1 > y1 and x1 - y1 < 120: x1 = y1
            if x2 > y2 and x2 - y2 < 120: x2 = y2
            if x1 < y1 and y1 - x1 < 120: x1 = y1
            if x2 < y2 and y2 - x2 < 120: x2 = y2
        for j, (y1, y2, superfamily_id) in enumerate(protein_positions):
            if (y1 > x2) or (y2 < x1):
                file.write(f"CDS {i+1}, Superfamily {superfamily_id}, 1 (END)\n")
            elif (x1 < y2 and x2 > y2):
                file.write(f"CDS {i+1}, Superfamily {superfamily_id}, 3 (DXE)\n")
            elif (y1 < x1 and x2 < y2) or (y1 == x1 and x2 < y2) or (y1 < x1 and x2 == y2):
                file.write(f"CDS {i+1}, Superfamily {superfamily_id}, 4 (EID)\n")
            elif (y1 < x2 and x2 < y2):
                file.write(f"CDS {i+1}, Superfamily {superfamily_id}, 2 (EXD)\n")
            elif (y1 > x1 and y2 < x2) or (y1 == x1 and y2 < x2) or (y1 > x1 and y2 == x2):
                file.write(f"CDS {i+1}, Superfamily {superfamily_id}, 5 (DIE)\n")

def get_protein_and_cds_info(ensp_id, file_path):
    transcript_id = get_transcript_id(ensp_id)
    if not transcript_id:
        return
    protein_id = get_protein_id(transcript_id)
    if not protein_id:
        return
    protein_positions = get_protein_coding_domains(protein_id)
    cds_positions = get_cds_positions(transcript_id)

    with open(file_path, 'a') as file:
        file.write(f"Protein ID: {ensp_id}, Transcript ID: {transcript_id}\n")
        if protein_positions:
            file.write("Protein domains:\n")
            for start, end, superfamily_id in protein_positions:
                file.write(f"Superfamily ID: {superfamily_id}, Start: {start}, End: {end}\n")
        else:
            file.write("No protein coding domains found.\n")

        if cds_positions:
            rounded_cds_positions = get_rounded_cds_positions(cds_positions)
            compare_cds_with_domains(rounded_cds_positions, protein_positions, file)
        else:
            file.write("No CDS positions found.\n")

def process_zip(zip_path, output_file):
    with zipfile.ZipFile(zip_path, 'r') as z:
        for filename in z.namelist():
            if filename.endswith(".csv"):
                with z.open(filename) as f:
                    reader = csv.reader(f.read().decode("utf-8").splitlines())
                    for row in reader:
                        if row:
                            get_protein_and_cds_info(row[0].strip(), output_file)

process_zip("processed_tables.zip", "output.txt")

FileNotFoundError: [Errno 2] No such file or directory: 'processed_tables.zip'

In [None]:
#new
import requests, zipfile, csv, math
import time



def current_milli_time():
    return round(time.time() * 1000)

def printTime():
  print(endTime - startTime)

def get_transcript_id(ensp_id):
    url = f"https://rest.ensembl.org/overlap/translation/{ensp_id}?type=Superfamily"
    response = requests.get(url, headers={"Content-Type": "application/json"})
    if response.ok:
        decoded = response.json()
        if decoded:
            return decoded[0].get('Parent', '')
    return None

def get_protein_id(transcript_id):
    url = f"https://rest.ensembl.org/lookup/id/{transcript_id}?expand=1"
    response = requests.get(url, headers={"Content-Type": "application/json"})
    if response.ok:
        data = response.json()
        return data.get("Translation", {}).get("id", None)
    return None

def get_protein_coding_domains(protein_id):
    url = f"https://rest.ensembl.org/overlap/translation/{protein_id}?type=Superfamily"
    response = requests.get(url, headers={"Content-Type": "application/json"})
    if response.ok:
        data = response.json()
        return sorted([(item['start'], item['end'], item['id']) for item in data], key=lambda x: x[0])
    return []

def get_cds_positions(transcript_id):
    url = f"https://rest.ensembl.org/overlap/id/{transcript_id}?feature=cds"
    response = requests.get(url, headers={"Content-Type": "application/json"})
    if response.ok:
        return [{'start': entry['start'], 'end': entry['end'], 'length': entry['end'] - entry['start'] + 1}
                for entry in response.json() if entry['feature_type'] == 'cds']
    return []

def custom_round(value):
    return math.floor(value) if (value - int(value)) < 0.5 else math.ceil(value)

def get_rounded_cds_positions(cds_positions):
    rounded = []
    current_start = 1
    for cds in cds_positions:
        length = cds['length']
        rounded_length = custom_round(length / 3)
        new_end = current_start + rounded_length - 1
        rounded.append({'start': current_start, 'end': new_end, 'rounded_length': rounded_length})
        current_start = new_end + 1
    return rounded

def compare_cds_with_domains(cds_positions, protein_positions, file):
    for i, cds in enumerate(cds_positions):
        x1, x2 = cds['start'], cds['end']
        cds_len = abs(x1-x2)
        print("length", cds_len)
        for j, (y1, y2, domain_id) in enumerate(protein_positions):
          if abs(x1 - y1) < 120: x1 = y1
          if abs(x2 - y2) < 120: x2 = y2
        for j, (y1, y2, superfamily_id) in enumerate(protein_positions):
            if (y1 > x2) or (y2 < x1):
                label = "1 (END)"
            elif (x1 < y2 and x2 > y2):
                label = "3 (DXE)"
            elif (y1 < x1 and x2 < y2) or (y1 == x1 and x2 < y2) or (y1 < x1 and x2 == y2):
                label = "4 (EID)"
            elif (y1 < x2 and x2 < y2):
                label = "2 (EXD)"
            elif (y1 > x1 and y2 < x2) or (y1 == x1 and y2 < x2) or (y1 > x1 and y2 == x2):
                label = "5 (DIE)"
            else:
              continue
            file.write(f"CDS {i+1}, Superfamily {superfamily_id}, {label}\n")
            endTime = current_milli_time()
            # printTime()

def get_protein_and_cds_info(ensp_id, file_path):
    transcript_id = get_transcript_id(ensp_id)
    if not transcript_id:
        print("not transcript_id")
        return
    protein_id = get_protein_id(transcript_id)
    if not protein_id:
        print("not protein_id")
        return
    protein_positions = get_protein_coding_domains(protein_id)
    cds_positions = get_cds_positions(transcript_id)

    with open(file_path, 'a') as file:
        file.write(f"Protein ID: {ensp_id}, Transcript ID: {transcript_id}\n")
        if protein_positions:
            file.write("Protein domains:\n")
            for start, end, superfamily_id in protein_positions:
                file.write(f"Superfamily ID: {superfamily_id}, Start: {start}, End: {end}\n")
        else:
            file.write("No protein coding domains found.\n")

        if cds_positions:
            rounded_cds_positions = get_rounded_cds_positions(cds_positions)
            compare_cds_with_domains(rounded_cds_positions, protein_positions, file)
        else:
            file.write("No CDS positions found.\n")

def process_zip(zip_path, output_file):
    with zipfile.ZipFile(zip_path, 'r') as z:
        for filename in z.namelist():
            if filename.endswith(".csv"):
                with z.open(filename) as f:
                    reader = csv.reader(f.read().decode("utf-8").splitlines())
                    for row in reader:
                        if row:
                            print(row[0])
                            startTime = current_milli_time()
                            get_protein_and_cds_info(row[0].strip(), output_file)
                            timePassed =  current_milli_time() - startTime
                            timeArr.append(timePassed)
                            print(timePassed)

timeArr = []
StartTimeAll = current_milli_time()
def calculateAvgTime():
  total = 0
  for timePass in timeArr:
    total += timePass
  return total/len(timeArr)
try:
    process_zip("processed_tables (2).zip", "outputL.txt")
    print("AVG:")
    print(calculateAvgTime())
    print("total Estimated:")
    print(calculateAvgTime()*43729/1000/3600)
    print("time passed:")
    print(current_milli_time() - StartTimeAll)
except KeyboardInterrupt:
    # Cleanup/exiting code
    print("AVG:")
    print(calculateAvgTime())
    print("total Estimated:")
    print(calculateAvgTime()*43729/1000/3600)
    print("time passed:")
    print(current_milli_time() - StartTimeAll)



ENSP00000078429
length 44
length 61
length 51
length 42
length 42
length 50
length 63
length 6
length 42
length 42
length 50
length 61
length 29
length 41
length 29
2327
ENSP00000232461
length 34
length 13
length 46
length 52
length 42
length 42
length 50
length 63
length 34
length 13
length 46
length 52
length 42
length 42
length 50
length 63
length 1
length 46
length 52
length 27
2431
ENSP00000251337
length 38
length 13
length 46
length 52
length 42
length 42
length 50
length 63
length 38
length 13
length 46
length 52
length 42
length 42
length 50
length 63
length 38
length 13
length 46
length 22
2305
ENSP00000262493
length 38
length 13
length 56
length 38
length 13
length 46
length 53
length 42
length 42
length 50
length 62
length 38
length 13
length 46
length 53
length 42
length 42
length 30
length 38
length 13
length 46
length 53
length 42
length 42
length 50
length 62
length 38
length 13
length 46
length 53
length 42
length 42
length 50
length 62
length 38
length 15
length 38
len

In [None]:
#рабочая ячейка для дописывания, не менять верхнюю
import requests, zipfile, csv, math
import time



def current_milli_time():
    return round(time.time() * 1000)

def printTime():
  print(endTime - startTime)

def get_transcript_id(ensp_id):
    url = f"https://rest.ensembl.org/overlap/translation/{ensp_id}?type=Superfamily"
    response = requests.get(url, headers={"Content-Type": "application/json"})
    if response.ok:
        decoded = response.json()
        if decoded:
            return decoded[0].get('Parent', '')
    return None

def get_gene_id(transcript_id):
    url = f"https://rest.ensembl.org/lookup/id/{transcript_id}?"
    response = requests.get(url, headers={"Content-Type": "application/json"})
    if response.ok:
        decoded = response.json()
        if decoded:
            return decoded.get("Parent")
    return None

def get_protein_id(transcript_id):
    url = f"https://rest.ensembl.org/lookup/id/{transcript_id}?expand=1"
    response = requests.get(url, headers={"Content-Type": "application/json"})
    if response.ok:
        data = response.json()
        return data.get("Translation", {}).get("id", None)
    return None

def get_protein_coding_domains(protein_id):
    url = f"https://rest.ensembl.org/overlap/translation/{protein_id}?type=Superfamily"
    response = requests.get(url, headers={"Content-Type": "application/json"})
    if response.ok:
        data = response.json()
        return sorted([(item['start'], item['end'], item['id']) for item in data], key=lambda x: x[0])
    return []

def get_cds_positions(transcript_id):
    url = f"https://rest.ensembl.org/overlap/id/{transcript_id}?feature=cds"
    response = requests.get(url, headers={"Content-Type": "application/json"})
    if response.ok:
        return [{'start': entry['start'], 'end': entry['end'], 'length': entry['end'] - entry['start'] + 1}
                for entry in response.json() if entry['feature_type'] == 'cds' and entry['Parent'] == transcript_id]
    return []

def custom_round(value):
    return math.floor(value) if (value - int(value)) < 0.5 else math.ceil(value)

def get_rounded_cds_positions(cds_positions):
    rounded = []
    current_start = 1
    for cds in cds_positions:
        length = cds['length']
        rounded_length = custom_round(length / 3)
        new_end = current_start + rounded_length - 1
        rounded.append({'start': current_start, 'end': new_end, 'rounded_length': rounded_length})
        current_start = new_end + 1
    return rounded

def compare_cds_with_domains(cds_positions, domain_positions, file, ssf_name):
    for i, cds in enumerate(cds_positions):
        x1, x2 = cds['start'], cds['end']
        cds_len = abs(x1-x2)
        for j, (y1, y2, superfamily_id) in enumerate(domain_positions):
          if superfamily_id == ssf_name:
            # if abs(x1 - y1) < cds_len*0.15: x1 = y1
            # if abs(x2 - y2) < cds_len*0.15: x2 = y2
            if (y1 > x2) or (y2 < x1):
                label = "1 (END)"
            elif (y1 < x1 and x2 < y2) or (y1 == x1 and x2 < y2) or (y1 < x1 and x2 == y2):
                label = "4 (EID)"
            elif (y1 < x2 and x2 < y2):
                label = "2 (EXD)"
            elif (x1 < y1 and y2 < x2) or (y1 == x1 and y2 < x2) or (y1 > x1 and y2 == x2):
                label = "5 (DIE)"
            elif (x1 < y2 and x2 > y2):
                label = "3 (DXE)"
            else:
              continue
            file.write(f"CDS {i+1}, cds start {x1}, cds end {x2}, Superfamily {superfamily_id}, {label}\n")
            endTime = current_milli_time()
            # printTime()

def get_protein_and_cds_info(ensp_id, file_path, ssf_name):
    transcript_id = get_transcript_id(ensp_id)
    if not transcript_id:
        print("not transcript_id")
        return
    protein_id = get_protein_id(transcript_id)
    gene_id = get_gene_id(transcript_id)
    if not protein_id:
        print("not protein_id")
        return
    domain_positions = get_protein_coding_domains(protein_id)
    cds_positions = get_cds_positions(transcript_id)
    with open(file_path, 'a') as file:
        file.write(f"Protein ID: {ensp_id}, Transcript ID: {transcript_id}, Gene ID: {gene_id}\n")
        if domain_positions:
            file.write("Protein domains:\n")
            for start, end, superfamily_id in domain_positions:
                file.write(f"Superfamily ID: {superfamily_id}, Start: {start}, End: {end}\n")
        else:
            file.write("No protein coding domains found.\n")

        if cds_positions:
            rounded_cds_positions = get_rounded_cds_positions(cds_positions)
            compare_cds_with_domains(rounded_cds_positions, domain_positions, file, ssf_name)
        else:
            file.write("No CDS positions found.\n")

def process_zip(zip_path):
    with zipfile.ZipFile(zip_path, 'r') as z:
        for filename in z.namelist():
            if filename.endswith(".csv"):
                with z.open(filename) as f:
                    reader = csv.reader(f.read().decode("utf-8").splitlines())
                    outname = filename.replace(".csv", ".txt")
                    ssf_name = "SSF" + outname.split("_")[1]
                    output_file = f"output_{outname}"
                    for row in reader:
                        if row:
                            print(row[0])
                            startTime = current_milli_time()
                            get_protein_and_cds_info(row[0].strip(), output_file, ssf_name)
                            timePassed =  current_milli_time() - startTime
                            timeArr.append(timePassed)
                            print(timePassed)

timeArr = []
StartTimeAll = current_milli_time()
def calculateAvgTime():
  total = 0
  for timePass in timeArr:
    total += timePass
  return total/len(timeArr)
try:
    process_zip("processed_tables (2).zip")
    print("AVG:")
    print(calculateAvgTime())
    print("total Estimated:")
    print(calculateAvgTime()*43729/1000/3600)
    print("time passed:")
    print(current_milli_time() - StartTimeAll)
except KeyboardInterrupt:
    # Cleanup/exiting code
    print("AVG:")
    print(calculateAvgTime())
    print("total Estimated:")
    print(calculateAvgTime()*43729/1000/3600)
    print("time passed:")
    print(current_milli_time() - StartTimeAll)

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
5594
ENSP00000401853
4979
ENSP00000401865
5201
ENSP00000401931
not transcript_id
907
ENSP00000402353
4901
ENSP00000403075
4921
ENSP00000403447
5265
ENSP00000403792
5115
ENSP00000403954
5163
ENSP00000404040
not transcript_id
920
ENSP00000404099
5331
ENSP00000404131
5261
ENSP00000404259
5155
ENSP00000404308
4788
ENSP00000404464
4863
ENSP00000405502
6268
ENSP00000405536
5523
ENSP00000405686
not transcript_id
913
ENSP00000405974
5314
ENSP00000405993
5461
ENSP00000406229
5601
ENSP00000406657
4948
ENSP00000406827
not transcript_id
947
ENSP00000407236
5647
ENSP00000407432
5046
ENSP00000408024
4903
ENSP00000408195
not transcript_id
902
ENSP00000408335
5320
ENSP00000408741
5326
ENSP00000409813
4898
ENSP00000410674
8173
ENSP00000411023
5379
ENSP00000411167
not transcript_id
928
ENSP00000411241
5226
ENSP00000411406
5756
ENSP00000411898
4818
ENSP00000412697
4611
ENSP00000412763
5584
ENSP00000413039
5096
ENSP00000414127
4706
ENSP00000

In [None]:
import requests, zipfile, csv, math
import time
import os


def current_milli_time():
    return round(time.time() * 1000)

def printTime():
  print(endTime - startTime)

def get_transcript_id(ensp_id):
    url = f"https://rest.ensembl.org/overlap/translation/{ensp_id}?type=Superfamily"
    response = requests.get(url, headers={"Content-Type": "application/json"})
    if response.ok:
        decoded = response.json()
        return decoded
    #     if decoded:
    #         return decoded[0].get('Parent', '')
    # return None

def get_gene_id(transcript_id):
    url = f"https://rest.ensembl.org/lookup/id/{transcript_id}?"
    response = requests.get(url, headers={"Content-Type": "application/json"})
    if response.ok:
        decoded = response.json()
        if decoded:
            return decoded.get("Parent")
    return None

def get_protein_id(transcript_id):
    url = f"https://rest.ensembl.org/lookup/id/{transcript_id}?expand=1"
    response = requests.get(url, headers={"Content-Type": "application/json"})
    if response.ok:
        data = response.json()
        return data.get("Translation", {}).get("id", None)
    return None

def get_protein_coding_domains(protein_id):
    url = f"https://rest.ensembl.org/overlap/translation/{protein_id}?type=Superfamily"
    response = requests.get(url, headers={"Content-Type": "application/json"})
    if response.ok:
        data = response.json()
        return sorted([(item['start'], item['end'], item['id']) for item in data], key=lambda x: x[0])
    return []

# def get_protein_coding_domains(protein_id, sff_id):
#     url = f"https://rest.ensembl.org/overlap/translation/{protein_id}?type=Superfamily"
#     response = requests.get(url, headers={"Content-Type": "application/json"})
#     if response.ok:
#         data = response.json()
#         for item in data:
            # id = item['id']
            # if id == sff_id:
            #   start_sff = item['start']
            #   end_sff = item['end']
            # elif id != sff_id:
            #   start_non_sf = item['start']
            #   end_non_sf = item['end']
            # if start_non_sf and end_non_sf:
            #   if abs(start_sff - end_sff) < abs(start_non_sf - end_non_sf):
            #     return sorted([(start_non_sf, end_non_sf, id) for item in data], key=lambda x: x[0])
            # else:
            #     return sorted([(start_sff, end_sff, id) for item in data], key=lambda x: x[0])
#           start = item['start']
#           end = item['end']
#           id = item['id']
#           if id == sff_id:
#             return sorted([(start, end, id) for item in data], key=lambda x: x[0])
#           else:

#     return []

def get_cds_positions(transcript_id):
    url = f"https://rest.ensembl.org/overlap/id/{transcript_id}?feature=cds"
    response = requests.get(url, headers={"Content-Type": "application/json"})
    if response.ok:
        return [{'start': entry['start'], 'end': entry['end'], 'length': entry['end'] - entry['start'] + 1}
                for entry in response.json() if entry['feature_type'] == 'cds']
    return []

def custom_round(value):
    return math.floor(value) if (value - int(value)) < 0.5 else math.ceil(value)

def get_rounded_cds_positions(cds_positions):
    rounded = []
    current_start = 1
    for cds in cds_positions:
        length = cds['length']
        rounded_length = custom_round(length / 3)
        new_end = current_start + rounded_length - 1
        rounded.append({'start': current_start, 'end': new_end, 'rounded_length': rounded_length})
        current_start = new_end + 1
    return rounded

def compare_cds_with_domains(cds_positions, protein_positions, file):
    for i, cds in enumerate(cds_positions):
        x1, x2 = cds['start'], cds['end']
        cds_len = abs(x1-x2)
        print("length", cds_len)
        for j, (y1, y2, domain_id) in enumerate(protein_positions):
          if abs(x1 - y1) < 0.3*cds_len: x1 = y1 #залезшая к общей процентом (или вылезшая). по длине экзона
          if abs(x2 - y2) < 0.3*cds_len: x2 = y2
        for j, (y1, y2, superfamily_id) in enumerate(protein_positions):
            if (y1 > x2) or (y2 < x1):
                label = "1 (END)"
            elif (x1 < y2 and x2 > y2):
                label = "3 (DXE)"
            elif (y1 < x1 and x2 < y2) or (y1 == x1 and x2 < y2) or (y1 < x1 and x2 == y2):
                label = "4 (EID)"
            elif (y1 < x2 and x2 < y2):
                label = "2 (EXD)"
            elif (y1 > x1 and y2 < x2) or (y1 == x1 and y2 < x2) or (y1 > x1 and y2 == x2):
                label = "5 (DIE)"
            else:
              continue
            file.write(f"CDS {i+1}, Superfamily {superfamily_id}, {label}\n")
            endTime = current_milli_time()
            # printTime()

def get_protein_and_cds_info(ensp_id, file_path):
    transcript_id = get_transcript_id(ensp_id)
    if not transcript_id:
        print("not transcript_id", ensp_id)
        pass
    # protein_id = get_protein_id(transcript_id)
    gene_id = get_gene_id(transcript_id)
    # if not protein_id:
    #     print("not protein_id", transcript_id)
    #     return
    protein_positions = get_protein_coding_domains(ensp_id)
    cds_positions = get_cds_positions(transcript_id)

    with open(file_path, 'a') as file:
        file.write(f"Protein ID: {ensp_id}, Transcript ID: {transcript_id}, Gene ID: {gene_id}\n")
        if protein_positions:
            file.write("Protein domains:\n")
            for start, end, superfamily_id in protein_positions:
                file.write(f"Superfamily ID: {superfamily_id}, Start: {start}, End: {end}\n")
        else:
            file.write("No protein coding domains found.\n")

        if cds_positions:
            rounded_cds_positions = get_rounded_cds_positions(cds_positions)
            compare_cds_with_domains(rounded_cds_positions, protein_positions, file)
        else:
            file.write("No CDS positions found.\n")

def process_zip(zip_path):
    with zipfile.ZipFile(zip_path, 'r') as z:
        for filename in z.namelist():
            if filename.endswith(".csv"):
                with z.open(filename) as f:
                    reader = csv.reader(f.read().decode("utf-8").splitlines())
                    outname = filename.replace(".csv", ".txt")
                    ssf_name = "SSF" + outname.split("_")[1]
                    output_file = f"output_{outname}"
                    for row in reader:
                        if row:
                            print(row[0])
                            startTime = current_milli_time()
                            get_protein_and_cds_info(row[0].strip(), output_file)
                            timePassed =  current_milli_time() - startTime
                            timeArr.append(timePassed)
                            # print(timePassed)

timeArr = []
StartTimeAll = current_milli_time()
def calculateAvgTime():
  total = 0
  for timePass in timeArr:
    total += timePass
  return total/len(timeArr)
try:
    process_zip("processed_tables (2).zip")
    print("AVG:")
    print(calculateAvgTime())
    print("total Estimated:")
    print(calculateAvgTime()*43729/1000/3600)
    print("time passed:")
    print(current_milli_time() - StartTimeAll)
except KeyboardInterrupt:
    # Cleanup/exiting code
    print("AVG:")
    print(calculateAvgTime())
    print("total Estimated:")
    print(calculateAvgTime()*43729/1000/3600)
    print("time passed:")
    print(current_milli_time() - StartTimeAll)

ENSP00000078429
ENSP00000232461
AVG:
2386.0
total Estimated:
28.982609444444446
time passed:
4686


In [None]:
#ячейка для Юли
import requests, zipfile, csv, math, sys
import os
import time

def current_milli_time():
    return round(time.time() * 1000)

def get_protein_coding_domains(protein_id):
    url = f"https://rest.ensembl.org/overlap/translation/{protein_id}?type=Superfamily"
    response = requests.get(url, headers={"Content-Type": "application/json"})
    if response.ok:
        data = response.json()
        return sorted([(item['start'], item['end'], item['id']) for item in data], key=lambda x: x[0])
    return []

def get_protein_length(protein_id):

  url = f"https://rest.ensembl.org/sequence/id/{protein_id}?"

  response = requests.get(url, headers={ "Content-Type" : "application/json"})

  if response.ok:
    data = response.json()
    return len(data['seq'])

  return []

# with open("domain_length_to_protein_length.tsv", "w") as fi:
#   fi.write(f"ssf_id\tprotein_id\tpercentage_of_the_domain\tprotein_length")
#   fi.write("\n")

def process_zip(zip_path):
    with zipfile.ZipFile(zip_path, 'r') as z:
        for filename in z.namelist()[:38]:
            print(filename)
            if filename.endswith(".csv"):
                with z.open(filename) as f:
                    reader = csv.reader(f.read().decode("utf-8").splitlines())
                    outname = filename.replace(".csv", ".txt")
                    ssf_name = "SSF" + outname.split("_")[1]
                    output_file = f"output_{outname}"
                    for row in reader:
                        protein_name = row
                        rel = 0
                        while row == protein_name:
                          startTime = current_milli_time()
                          length = get_protein_length(row[0])
                          coords = get_protein_coding_domains(row[0])
                          timePassed =  current_milli_time() - startTime
                          timeArr.append(timePassed)
                          for i in range(len(coords)):
                            if coords[i][2] != ssf_name:
                              pass
                            else:
                              relation = (coords[i][1] - coords[i][0]) / length
                              rel += relation
                            protein_name = row
                            rel = 0
                              with open("domain_length_to_protein_length.tsv", "a") as fi:
                                fi.write(f"{ssf_name}\t{row[0]}\t{relation}\t{length}")
                                fi.write("\n")

timeArr = []
StartTimeAll = current_milli_time()
def calculateAvgTime():
  total = 0
  for timePass in timeArr:
    total += timePass
  return total/len(timeArr)
try:
    process_zip("processed_tables (2).zip")
    print("AVG:")
    print(calculateAvgTime())
    print("total Estimated:")
    print(calculateAvgTime()*43729/1000/3600)
    print("time passed:")
    print(current_milli_time() - StartTimeAll)
except KeyboardInterrupt:
    print("AVG:")
    print(calculateAvgTime())
    print("total Estimated:")
    print(calculateAvgTime()*43729/1000/3600)
    print("time passed:")
    print(current_milli_time() - StartTimeAll)

processed_47895_table.csv
processed_54236_table.csv
processed_57196_table.csv
processed_47370_table.csv
processed_55205_table.csv
processed_81901_table.csv
processed_51092_table.csv
processed_63848_table.csv
processed_51126_table.csv
processed_51730_table.csv
processed_51569_table.csv
processed_56300_table.csv
processed_49417_table.csv
processed_51430_table.csv
processed_49265_table.csv
processed_47113_table.csv
processed_46689_table.csv
processed_81296_table.csv
processed_51679_table.csv
processed_50969_table.csv
processed_50985_table.csv
processed_51658_table.csv
processed_56496_table.csv
processed_50978_table.csv
processed_140860_table.csv
processed_50104_table.csv
processed_101898_table.csv
processed_46894_table.csv
processed_47266_table.csv
processed_47459_table.csv
processed_57501_table.csv
processed_47874_table.csv
processed_144284_table.csv
processed_51735_table.csv
processed_51177_table.csv
processed_54001_table.csv
processed_48097_table.csv
processed_50729_table.csv
AVG:


ZeroDivisionError: division by zero