In [2]:
import csv
import requests
import sys
import os
import re
import time
import itertools
import json

In [2]:
with open('mart_export.tsv', 'r') as mart, open('filtered_mart.tsv', 'w', newline='') as filtered:

    # define reader and writer objects
    reader = csv.reader(mart,delimiter="\t", skipinitialspace=True)
    writer = csv.writer(filtered, delimiter="\t")

    # iterate and write rows based on condition
    for i in reader:
        if i[4] != 'MT':
            writer.writerow(i)

In [3]:
tsv_file = open("filtered_mart.tsv")
mart_tsv = csv.reader(tsv_file, delimiter="\t", quotechar='"')

In [4]:
it1, it2 = itertools.tee(mart_tsv, 2)

In [5]:
gene_id = []
for row in it1:
    if row[0] not in gene_id: #Get unique genes only, ensure no duplicates
        gene_id.append(row[0])
print(gene_id)

['Gene stable ID', 'ENSG00000205858', 'ENSG00000145220', 'ENSG00000151789', 'ENSG00000154813', 'ENSG00000171243', 'ENSG00000164822', 'ENSG00000164821', 'ENSG00000101220', 'ENSG00000178591', 'ENSG00000110169', 'ENSG00000177706', 'ENSG00000092330', 'ENSG00000261456', 'ENSG00000125788', 'ENSG00000088782', 'ENSG00000185982', 'ENSG00000276194', 'ENSG00000125903', 'ENSG00000100109', 'ENSG00000186458', 'ENSG00000127418', 'ENSG00000110811', 'ENSG00000100104', 'ENSG00000157796', 'ENSG00000250510', 'ENSG00000275083', 'ENSG00000248767', 'ENSG00000185049', 'ENSG00000276072', 'ENSG00000101193', 'ENSG00000044012', 'ENSG00000148719', 'ENSG00000132465', 'ENSG00000110700', 'ENSG00000196476', 'ENSG00000125872', 'ENSG00000101222', 'ENSG00000197273', 'ENSG00000125817', 'ENSG00000177685', 'ENSG00000164061', 'ENSG00000132464', 'ENSG00000197183', 'ENSG00000185069', 'ENSG00000286038', 'ENSG00000187801', 'ENSG00000179796', 'ENSG00000255151', 'ENSG00000170340', 'ENSG00000137571', 'ENSG00000174799', 'ENSG0000013

In [None]:
print(len(gene_id))

In [6]:
HGNC = []
for row in it2:
    if row[9] not in HGNC: #Get unique genes only, ensure no duplicates
        HGNC.append(row[9])
print(HGNC)

['HGNC symbol', 'LRRC72', 'LYAR', 'ZNF385D', 'DPH3', 'SOSTDC1', 'DEFA6', 'DEFA4', 'C20orf27', 'DEFB125', 'HPX', 'FAM20C', 'TINF2', 'TUBB8', 'DEFB126', 'DEFB127', 'DEFB128', 'HNF1B', 'DEFB129', 'TFIP11', 'DEFB132', 'FGFRL1', 'P3H3', 'SRRD', 'WDR19', 'GPR162', 'KIR3DL2', 'FOXL3', 'NELFA', 'AATF', 'GID8', 'GUCA2B', 'DNAJB12', 'JCHAIN', 'RPS13', 'C20orf96', 'LRRN4', 'SPEF1', 'GUCA2A', 'CENPB', 'CRACR2B', 'BSN', 'ENAM', 'NOL4L', 'KRT76', 'SPDYE13', 'ZFP69B', 'LRRC3B', 'GLYATL1B', 'B3GNT2', 'SLCO5A1', 'CEP135', 'SFTPD', 'SETBP1', 'KIR3DL3', 'PRDM2', 'UCN', 'ACTR3C', 'TESK1', 'ROCK2', 'PPP1R9A', 'SKA3', 'DMTN', 'PNPLA2', 'KIR2DL4', 'TRIM54', 'ANKRD20A2P', 'HAUS6', 'KRT3', 'DBNDD2', 'RARRES2', 'MRPL57', 'ZBED6CL', 'PHF21B', 'LRRC61', 'KRT4', 'SPI1', 'TMEM38B', 'CD72', 'RBM4B', 'KIR2DL1', 'CHST15', 'MCM10', 'CFAP53', 'BEGAIN', 'NUP205', 'KIR2DL3', 'TP53TG5', 'SPDYE14', 'PLAC8', 'RBM14', 'CCDC96', 'FOXJ3', 'LYPLA2', 'TADA2B', 'SH2D4A', 'PRRT4', 'TBC1D14', 'RBM14-RBM4', 'COA5', 'PI15', 'FAM162A',

In [25]:
print(len(HGNC))

19343


In [3]:
#Gets a canonical transcipt id from the gnomad API
def get_ensembl_id(gene_name):
    query = """
    {
        gene(gene_name: "%s") {
        canonical_transcript_id
        }     
    }""" % gene_name
    res = requests.post('https://gnomad.broadinstitute.org/api', json={'query': query})
    if res.ok:
        try:
            return res.json()['data']['gene']['canonical_transcript_id']
        except:
            res = None
    else:
        try:
            res.raise_for_status()
            return "error"
        except requests.exceptions.HTTPError: #I should catch the error too or print, to find out which specific genes
            pass

#Adapted from https://github.com/xjenny2/phospho-programs/blob/master/gnomad.py

In [4]:
#Gets a protein sequence in FASTA format given the ensembl transcript ID
def get_protein_sequence(canonical_transcript):
    server = "http://grch37.rest.ensembl.org"
    ext = "/sequence/id/" + canonical_transcript + "?content-type=text/x-fasta;type=protein"
    r = requests.get(server + ext, headers={"Content-Type": "text/x-fasta"})
    time.sleep(3)
    
    if not r.ok:
        try:
            r.raise_for_status()
            return "error"
        except requests.exceptions.HTTPError: #I should catch the error too or print, to find out which specific genes
            pass
    seqlist = r.text.split("\n", 1)
    
    if len(seqlist) == 2: #A lot of these transcripts are noncoding or introns, don't know why
        sequence = seqlist[1]
        sequence = sequence.replace("\n", "")
        return sequence
    #print("Sequence found")
    else:
        print(canonical_transcript)
    

#Adapted from https://github.com/xjenny2/phospho-programs/blob/master/ensembl.py

In [23]:
canonical_transcript = []
for x in HGNC:
    ensembl_id = get_ensembl_id(x)
    canonical_transcript.append(ensembl_id)
print(canonical_transcript)

[None, 'ENST00000401542', 'ENST00000343470', 'ENST00000281523', 'ENST00000488423', 'ENST00000307068', 'ENST00000297436', 'ENST00000297435', 'ENST00000217195', 'ENST00000382410', 'ENST00000265983', 'ENST00000313766', 'ENST00000267415', 'ENST00000309812', 'ENST00000382398', 'ENST00000382388', 'ENST00000334391', 'ENST00000225893', 'ENST00000246105', 'ENST00000407690', 'ENST00000382376', 'ENST00000398484', 'ENST00000396725', 'ENST00000215917', 'ENST00000399820', 'ENST00000311268', 'ENST00000326321', 'ENST00000510017', 'ENST00000382882', 'ENST00000225402', 'ENST00000266069', 'ENST00000372581', 'ENST00000338820', 'ENST00000254801', 'ENST00000525634', 'ENST00000360321', 'ENST00000378858', 'ENST00000379756', 'ENST00000357001', 'ENST00000379751', 'ENST00000450448', 'ENST00000296452', 'ENST00000396073', 'ENST00000359676', 'ENST00000332411', None, 'ENST00000411995', 'ENST00000396641', 'ENST00000527482', 'ENST00000301998', 'ENST00000260126', 'ENST00000257287', 'ENST00000372292', 'ENST00000282030',

In [24]:
print(len(canonical_transcript))

19343


In [57]:
count = 0
for x in canonical_transcript:
    if x == None:
        count += 1

print(count)

366


In [53]:
protein_sequence = []
for x in canonical_transcript:
    if x != None:
        protein_seq = get_protein_sequence(x)
        protein_sequence.append(protein_seq)

print(len(protein_sequence))
print(protein_sequence)

ENST00000510017
ENST00000527482
ENST00000486187
ENST00000374922
ENST00000398840
ENST00000506599
ENST00000453618
ENST00000415311
ENST00000470589
ENST00000608990
ENST00000504998
ENST00000600799
ENST00000503184
ENST00000423632
ENST00000512227
ENST00000436752
ENST00000509027
ENST00000458170
ENST00000453522
ENST00000395675
ENST00000330020
ENST00000456806
ENST00000534741
ENST00000564840
ENST00000367123
ENST00000429565
ENST00000382849
ENST00000526929
ENST00000529559
ENST00000399677
ENST00000434992
ENST00000538122
ENST00000535957
ENST00000431679
ENST00000609879
ENST00000382628
ENST00000400142
ENST00000400136
ENST00000608034
ENST00000530386
ENST00000527117
ENST00000526662
ENST00000528334
ENST00000450699
ENST00000527940
ENST00000528386
ENST00000530206
ENST00000534388
ENST00000432424
ENST00000530484
ENST00000527080
ENST00000538306
ENST00000545517
ENST00000528056
ENST00000541527
ENST00000467933
ENST00000532411
ENST00000435366
ENST00000435097
ENST00000553483
ENST00000556246
ENST00000340901
ENST0000

ConnectionError: HTTPConnectionPool(host='grch37.rest.ensembl.org', port=80): Max retries exceeded with url: /sequence/id/ENST00000540288?content-type=text/x-fasta;type=protein (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x1120c3940>: Failed to establish a new connection: [Errno 60] Operation timed out'))

In [55]:
len(protein_sequence)

15117

In [56]:
with open('listfile.txt', 'w') as file:
    for x in protein_sequence:
        file.write('%s\n' % x)

In [58]:
new_canonical = []
for x in canonical_transcript:
    if x != None:
        new_canonical.append(x)

print(len(new_canonical))

18977


In [79]:
temp_canonical = new_canonical[-3667:]

In [80]:
print(temp_canonical[0])

ENST00000367728


In [82]:
print(new_canonical[15309]) #length is 15310 (including noncoding transcript), index starts at 0, ends at 15116

ENST00000394947


In [84]:
protein_sequence2 = []
for x in temp_canonical:
    protein_seq = get_protein_sequence(x)
    protein_sequence2.append(protein_seq)

ENST00000542739
ENST00000514853
ENST00000505256
ENST00000573266
ENST00000560415
ENST00000444463
ENST00000420469
ENST00000607121
ENST00000561754
ENST00000436629
ENST00000596623
ENST00000591658
ENST00000445091
ENST00000587304
ENST00000600486
ENST00000454407
ENST00000273083
ENST00000417563
ENST00000433425
ENST00000423661
ENST00000427686
ENST00000439699
ENST00000569275
ENST00000417443
ENST00000592666
ENST00000328207
ENST00000607574
ENST00000425059
ENST00000448587
ENST00000415414
ENST00000605881
ENST00000533122
ENST00000497083
ENST00000544527
ENST00000417339
ENST00000602548
ENST00000344686
ENST00000590505
ENST00000543490
ENST00000442687
ENST00000534738
ENST00000452401
ENST00000371970
ENST00000490341
ENST00000587305
ENST00000445098
ENST00000583195
ENST00000560778
ENST00000573382
ENST00000589787
ENST00000526079


In [97]:
len(protein_sequence2)

3667

In [85]:
with open('listfile2.txt', 'w') as file:
    for x in protein_sequence2:
        file.write('%s\n' % x)

In [99]:
final_canonical = new_canonical[15117:15309]

In [100]:
protein_sequence_3 = []

In [102]:
for x in final_canonical:
    protein_seq = get_protein_sequence(x)
    protein_sequence_3.append(protein_seq)

ENST00000597630
ENST00000504250


In [104]:
temp_final_protein_seq = protein_sequence + protein_sequence_3 + protein_sequence2

In [103]:
len(protein_sequence_3)

192

In [105]:
final_protein_seq = []
for x in temp_final_protein_seq:
    if x != None:
        final_protein_seq.append(x)

In [106]:
print(len(final_protein_seq))

18730


In [114]:
with open("sequences.txt", "w") as file:
    file.write(str(final_protein_seq))

In [115]:
with open("sequences.json", 'w') as f:
    # indent=2 is not needed but makes the file 
    # human-readable for more complicated data
    json.dump(final_protein_seq, f, indent=2) 

In [5]:
get_ensembl_id('CDH23')

'ENST00000398788'

In [9]:
z = get_protein_sequence('ENST00000398788')

In [10]:
len(z)

1114

In [11]:
z = get_protein_sequence('ENST00000224721')

In [12]:
len(z)

3359

In [13]:
len('MPQLYIYIRLLGAYLFIISRVQGQNLDSMLHGTGMKSDSDQKKSENGVTLAPEDTLPFLKCYCSGHCPDDAINNTCITNGHCFAIIEEDDQGETTLASGCMKYEGSDFQCKDSPKAQLRRTIECCRTNLCNQYLQPTLPPVVIGPFFDGSIRWLVLLISMAVCIIAMIIFSSCFCYKHYCKSISSRRRYNRDLEQDEAFIPVGESLKDLIDQSQSSGSGSGLPLLVQRTIAKQIQMVRQVGKGRYGEVWMGKWRGEKVAVKVFFTTEEASWFRETEIYQTVLMRHENILGFIAADIKGTGSWTQLYLITDYHENGSLYDFLKCATLDTRALLKLAYSAACGLCHLHTEIYGTQGKPAIAHRDLKSKNILIKKNGSCCIADLGLAVKFNSDTNEVDVPLNTRVGTKRYMAPEVLDESLNKNHFQPYIMADIYSFGLIIWEMARRCITGGIVEEYQLPYYNMVPSDPSYEDMREVVCVKRLRPIVSNRWNSDECLRAVLKLMSECWAHNPASRLTALRIKKTLAKMVESQDVKI')

532

In [14]:
len('MASQPNSSAKKKEEKGKNIQVVVRCRPFNLAERKASAHSIVECDPVRKEVSVRTGGLADKSSRKTYTFDMVFGASTKQIDVYRSVVCPILDEVIMGYNCTIFAYGQTGTGKTFTMEGERSPNEEYTWEEDPLAGIIPRTLHQIFEKLTDNGTEFSVKVSLLEIYNEELFDLLNPSSDVSERLQMFDDPRNKRGVIIKGLEEITVHNKDEVYQILEKGAAKRTTAATLMNAYSSRSHSVFSVTIHMKETTIDGEELVKIGKLNLVDLAGSENIGRSGAVDKRAREAGNINQSLLTLGRVITALVERTPHVPYRESKLTRILQDSLGGRTRTSIIATISPASLNLEETLSTLEYAHRAKNILNKPEVNQKLTKKALIKEYTEEIERLKRDLAAAREKNGVYISEENFRVMSGKLTVQEEQIVELIEKIGAVEEELNRVTELFMDNKNELDQCKSDLQNKTQELETTQKHLQETKLQLVKEEYITSALESTEEKLHDAASKLLNTVEETTKDVSGLHSKLDRKKAVDQHNAEAQDIFGKNLNSLFNNMEELIKDGSSKQKAMLEVHKTLFGNLLSSSVSALDTITTVALGSLTSIPENVSTHVSQIFNMILKEQSLAAESKTVLQELINVLKTDLLSSLEMILSPTVVSILKINSQLKHIFKTSLTVADKIEDQKKELDGFLSILCNNLHELQENTICSLVESQKQCGNLTEDLKTIKQTHSQELCKLMNLWTERFCALEEKCENIQKPLSSVQENIQQKSKDIVNKMTFHSQKFCADSDGFSQELRNFNQEGTKLVEESVKHSDKLNGNLEKISQETEQRCESLNTRTVYFSEQWVSSLNEREQELHNLLEVVSQCCEASSSDITEKSDGRKAAHEKQHNIFLDQMTIDEDKLIAQNLELNETIKIGLTKLNCFLEQDLKLDIPTGTTPQRKSYLYPSTLVRTEPREHLLDQLKRKQPELLMMLNCSENNKEETIPDVDVEEAVLGQYTEEPLSQEPSVDAGVDCSSIGGVPFFQHKKSHGKDKENRGINTLERSKVEETTEHLVTKSRLPLRAQINL')

1056