In [2]:
import pandas as pd

# Parse the protein-abundance table into {protein ID: abundance}.
# The first whitespace-separated field of each row becomes the key and the
# second is converted to float; the header row (starting with "ID") is skipped.
with open("./data/protein_abundance.tsv", "r") as h:
    prot_abundance = {}
    # Iterate the handle directly so the file is streamed rather than loaded
    # into a list first.
    for line in h:
        if line.startswith("ID"):
            continue
        fields = line.split()
        # Blank or truncated rows (e.g. a trailing empty line) carry no
        # ID/value pair and would otherwise raise IndexError below.
        if len(fields) < 2:
            continue
        prot_abundance[fields[0]] = float(fields[1])

prot_abundance


{'3055.EDO98358': 18111.0,
 '3055.EDP00188': 12748.0,
 '3055.EDP02473': 12589.0,
 '3055.EDP02283': 12246.0,
 '3055.DAA00950': 10900.0,
 '3055.EDO96901': 10432.0,
 '3055.EDP03051': 10367.0,
 '3055.EDP04241': 8524.0,
 '3055.EDP03062': 8208.0,
 '3055.EDO95759': 8103.0,
 '3055.EDP01264': 7911.0,
 '3055.EDP03517': 7794.0,
 '3055.EDP01611': 7589.0,
 '3055.EDP02901': 7405.0,
 '3055.DAA00941': 6958.0,
 '3055.EDP08545': 6847.0,
 '3055.EDP00448': 6691.0,
 '3055.DAA01471': 6369.0,
 '3055.EDO96058': 6359.0,
 '3055.EDO99452': 6242.0,
 '3055.EDO96805': 6129.0,
 '3055.EDP04043': 6114.0,
 '3055.DAA00966': 5887.0,
 '3055.DAA00933': 5631.0,
 '3055.EDO98566': 5314.0,
 '3055.EDO99874': 5173.0,
 '3055.EDO96390': 5008.0,
 '3055.EDO97626': 4962.0,
 '3055.DAA00958': 4953.0,
 '3055.DAA00957': 4544.0,
 '3055.EDP01593': 4516.0,
 '3055.DAA00964': 4485.0,
 '3055.EDP01838': 4352.0,
 '3055.EDO98586': 4270.0,
 '3055.EDO98239': 4258.0,
 '3055.DAA00967': 4171.0,
 '3055.DAA00904': 4105.0,
 '3055.EDP01087': 4086.0,
 '305

In [3]:
import requests

def convert_to_chlamydomonas_id(paxdb_id, timeout=30):
    """Look up the Gramene cross-reference UniProt reports for a PaxDB ID.

    Sends a search request to the UniProtKB REST endpoint asking only for the
    ``xref_gramene`` field in TSV form, then reads the first data row (the
    line that follows the header line).

    Args:
        paxdb_id: Identifier to search for, e.g. ``"3055.EDO98358"``.
        timeout: Seconds to wait for the HTTP response. Without a timeout a
            stalled connection would block the notebook indefinitely.

    Returns:
        The first data row with every ``;`` removed. This is an empty string
        when the response contains no data row or the row is blank, and
        ``None`` (after printing an error message) when the HTTP status is
        not 200.
    """
    url = "https://rest.uniprot.org/uniprotkb/search"
    # Let requests build and percent-encode the query string instead of
    # interpolating the ID into the URL by hand.
    params = {
        "fields": "xref_gramene",
        "format": "tsv",
        "query": f"({paxdb_id})",
        "size": 500,
    }
    response = requests.get(url, params=params, timeout=timeout)

    if response.status_code != 200:
        print(f"Error retrieving UniProt entry for PaxDB ID {paxdb_id}")
        return None

    rows = response.text.splitlines()
    # A body with fewer than two lines has no data row at all; report it the
    # same way as a blank data row instead of raising IndexError.
    if len(rows) < 2:
        return ""

    # NOTE(review): if the field ever lists several IDs ("A;B;"), stripping
    # ';' concatenates them into one string -- confirm this is intended.
    return rows[1].replace(";", "")
    

# Smoke-test the lookup on a single PaxDB ID (performs a live HTTP request).
convert_to_chlamydomonas_id("3055.EDO98358")

'PNW89085'

In [4]:

# Resolve every PaxDB ID through convert_to_chlamydomonas_id and keep only
# the proteins for which a non-empty ID came back. The two lists are filled
# in lockstep, so position i in each refers to the same protein.
mRNA_list, prot_list = [], []

for paxdb_id, prot in prot_abundance.items():
    chlamy_id = convert_to_chlamydomonas_id(paxdb_id)
    print(chlamy_id, prot)

    # Both None and "" mean "no usable ID": skip the protein.
    if not chlamy_id:
        continue
    mRNA_list.append(chlamy_id)
    prot_list.append(prot)


PNW89085 18111.0
PNW84742 12748.0
 12589.0
 12246.0
 10900.0
 10432.0
PNW76070 10367.0
PNW84147 8524.0


KeyboardInterrupt: 

In [5]:
# Build the ID mapping table from the downloaded ID-mapping TSV: the first
# whitespace-separated field of each row is the key, the third (with ';'
# removed) is the value. The header row (starting with "From") and rows with
# fewer than three fields are skipped.
#
# The path is a raw string: it contains backslashes, and sequences such as
# "\C" or "\d" are invalid escapes in a normal literal (a warning today, an
# error in future Python versions). The resulting path is unchanged.
with open(r"C:\CRAGjobs\YangLabIntern\codon_optimization\pytorch-transformer\data\idmapping_2023_07_20.tsv", "r") as h:
    mydic = {}
    # Stream the file and split each row only once.
    for line in h:
        if line.startswith("From"):
            continue
        fields = line.split()
        if len(fields) < 3:
            continue
        mydic[fields[0]] = fields[2].replace(";", "")

# Naming the column at construction time also works when no row was kept,
# whereas assigning `.columns` afterwards fails on an empty frame.
mapping = pd.DataFrame.from_dict(mydic, orient="index", columns=["Gramene"])
mapping


Unnamed: 0,Gramene
3055.EDO98358,PNW89085
3055.EDP00188,PNW84742
3055.EDP03051,PNW76070
3055.EDP04241,PNW84147
3055.EDP03062,PNW76019
...,...
3055.EDO98310,PNW89074
3055.EDO96340,PNW70381
3055.EDP05833,PNW77460
3055.EDP04993,PNW83014


In [6]:

# One-column frame of abundances, indexed by the keys of prot_abundance.
abundance_values = pd.DataFrame.from_dict(
    prot_abundance, orient="index"
).set_axis(["Abundance"], axis="columns")
abundance_values

Unnamed: 0,Abundance
3055.EDO98358,18111.00
3055.EDP00188,12748.00
3055.EDP02473,12589.00
3055.EDP02283,12246.00
3055.DAA00950,10900.00
...,...
3055.EDP04993,0.02
3055.EDP08552,0.02
3055.EDP05984,0.02
3055.EDO98570,0.02


In [13]:
# Inner-join the ID mapping with the abundances on their shared index, so
# only index labels present in both frames survive.
combined = pd.merge(
    mapping,
    abundance_values,
    how="inner",
    left_index=True,
    right_index=True,
)
combined

Unnamed: 0,Gramene,Abundance
3055.EDO98358,PNW89085,18111.00
3055.EDP00188,PNW84742,12748.00
3055.EDP03051,PNW76070,10367.00
3055.EDP04241,PNW84147,8524.00
3055.EDP03062,PNW76019,8208.00
...,...,...
3055.EDO98310,PNW89074,0.12
3055.EDO96340,PNW70381,0.08
3055.EDP05833,PNW77460,0.03
3055.EDP04993,PNW83014,0.02


In [8]:
# Read the mRNA FASTA file into {record ID: sequence}. The record ID is the
# first whitespace-separated token of a ">" header line with ">" removed;
# the stripped lines that follow are concatenated into the sequence.
#
# The path is a raw string: it contains backslashes, and sequences such as
# "\C" or "\d" are invalid escapes in a normal literal (a warning today, an
# error in future Python versions). The resulting path is unchanged.
with open(r"C:\CRAGjobs\YangLabIntern\codon_optimization\pytorch-transformer\data\chlamydomonas_mRNA.fa", "r") as h:
    seq_parts = {}
    # No record is open until the first header is seen. Starting from None
    # keeps lines that precede it from hitting an unbound (or, on a re-run,
    # stale) `key`; such lines are ignored.
    key = None
    for line in h:
        if line.startswith(">"):
            key = line.split()[0].replace(">", "")
            seq_parts[key] = []
        elif key is not None:
            seq_parts[key].append(line.strip())
    # Join once per record instead of growing a string line by line.
    chlamy_mRNA = {name: "".join(parts) for name, parts in seq_parts.items()}

chlamy = pd.DataFrame.from_dict(chlamy_mRNA, orient="index", columns=["mRNA"])
chlamy

Unnamed: 0,mRNA
PNW88964,ATGCCGATCAACAGGCAGAAAATCCTTCAGCTGGCCAGCAGCTTCC...
PNW89007,ATGAACGTGAAACGCCGCAAAGGCCCGTTGGTTGAGGCCTTGAGAG...
PNW89008,ATGAACGTGAAACGCCGCAAAGGCCCGTTGGTTGAGGCCTTGAGAG...
PNW87871,ATGGCAATCGTGTACGACGACACCCTGCTCGCCGCCGACTGGGTTG...
PNW88663,ATGGACCTAAATGTTGCCTTGATAGCTTGTGTGTTCCTGGTCATTG...
...,...
PNW69577,GCAGTCGCTGCTGCGGCGGCGGCGGCAGGGATAGCGGGGCTTCCCG...
PNW69575,ATGGCAGGCACATTTAAGAAGTTCTCTAAAGAGGACGTGAGCAATC...
PNW69574,ATGGCAGGCCCGGGCGGCGCGGGCGGAGGTGCTCCCTCCATGGCCG...
PNW69573,ATGGGCTGCAGCAGCAGTAAGCCCGGCGTTGTGCCGGTGCACGACT...


In [15]:
# Attach the sequence to each row by matching the "Gramene" column of
# `combined` against the index of `chlamy`; unmatched rows are dropped.
combined2 = pd.merge(
    combined,
    chlamy,
    how="inner",
    left_on="Gramene",
    right_index=True,
)
combined2

Unnamed: 0,Gramene,Abundance,mRNA
3055.EDO98358,PNW89085,18111.00,ATGGCCTTCGCCCTTGCCAAGTCCTCCGCTCGCGCCGCGGTGTCTC...
3055.EDP00188,PNW84742,12748.00,ATGATGCTGTCTCGCACCGTCGTCAACGTTCAGGCCAAGCTCACCA...
3055.EDP03051,PNW76070,10367.00,ATGGCCGCCATCATGAAGTCCTCCGTCCGCAGCTCCGTGCGCTCCA...
3055.EDP04241,PNW84147,8524.00,ATGGCGCGTACTGGCGCTCTACTCCTGGTCGCGCTGGCGCTTGCGG...
3055.EDP03062,PNW76019,8208.00,ATGGCGACCGCTCTGTGCAACAAGGCCTTCGCTGCCGCCCCCGTGG...
...,...,...,...
3055.EDO98310,PNW89074,0.12,ATGCCTGGCAAAAAACGAATTCGCAAGCCGAAGCATCGGCAGTCGC...
3055.EDO96340,PNW70381,0.08,ATGAACCTCAATGAGCGGTCAGAGGACGACTGCAAGGACTTACCGG...
3055.EDP05833,PNW77460,0.03,ATGCAACTGATGCTGGCCAAACCAAACTGGAGCGTGGCGCGGATCC...
3055.EDP04993,PNW83014,0.02,ATGTCGAGGCGGCGAATGGGCGGCGGGGGGGCTGGTGGGCACGCCG...


In [16]:
# Pull the two columns out as plain Python lists; both come from the same
# frame, so they stay aligned row by row.
mRNA_list = combined2["mRNA"].tolist()
abundance_list = combined2["Abundance"].tolist()

In [19]:
# Export the two parallel lists, one item per line. Line i of each file comes
# from the same row, so the files are always (re)written as a pair.
# Mode "w" rather than "x": exclusive-create raises FileExistsError on every
# re-run of this cell and, when only one of the files already exists, leaves
# the pair out of sync.
with open("mrna_list.txt", "w") as h:
    h.writelines(m + "\n" for m in mRNA_list)

with open("abundance_list.txt", "w") as h:
    h.writelines(str(a) + "\n" for a in abundance_list)