In [1]:
# necessary imports
import collections
import csv
import json
import os
from urllib.request import urlopen

In [2]:
# EBI Proteins API endpoint that returns the variation record for accession
# P04637 as JSON
url = 'https://www.ebi.ac.uk/proteins/api/variation/P04637?format=json'

# download and decode the JSON body; the context manager closes the HTTP
# response as soon as it has been read instead of leaving the connection open
with urlopen(url) as response:
    data = json.load(response)

# quick look at the top-level keys and at the keys of the first feature
print(data.keys())
print('\n')
print(data['features'][0].keys())

dict_keys(['accession', 'entryName', 'proteinName', 'geneName', 'organismName', 'proteinExistence', 'sequence', 'sequenceChecksum', 'sequenceVersion', 'taxid', 'features'])


dict_keys(['type', 'alternativeSequence', 'begin', 'end', 'xrefs', 'cytogeneticBand', 'genomicLocation', 'locations', 'consequenceType', 'wildType', 'mutatedType', 'somaticStatus', 'clinicalSignificances', 'association', 'descriptions', 'sourceType'])


In [3]:
# pull the gene name, organism name and reference sequence out of the response
geneName, organismName, sequence = data['geneName'], data['organismName'], data['sequence']

# single store for every variant: keyed by the full variant sequence string,
# each value is a dict of attributes that the following cells fill in
allmutations = collections.defaultdict(dict)

# the 20 one-letter amino-acid codes used for validation and enumeration
aminoacids = 'ACDEFGHIKLMNPQRSTVWY'

# show the three retrieved values, separated by blank lines
print(geneName)
for shown in (organismName, sequence):
    print('\n')
    print(shown)

TP53


Homo sapiens


MEEPQSDPSVEPPLSQETFSDLWKLLPENNVLSPLPSQAMDDLMLSPDDIEQWFTEDPGPDEAPRMPEAAPPVAPAPAAPTPAAPAPAPSWPLSSSVPSQKTYQGSYGFRLGFLHSGTAKSVTCTYSPALNKMFCQLAKTCPVQLWVDSTPPPGTRVRAMAIYKQSQHMTEVVRRCPHHERCSDSDGLAPPQHLIRVEGNLRVEYLDDRNTFRHSVVVPYEPPEVGSDCTTIHYNYMCNSSCMGGMNRRPILTIITLEDSSGNLLGRNSFEVRVCACPGRDRRTEEENLRKKGEPHHELPPGSTKRALPNNTSSSPQPKKKPLDGEYFTLQIRGRERFEMFRELNEALELKDAQAGKEPGGSRAHSSHLKSKKGQSTSRHKKLMFKTEGPDSD


In [4]:
# collect every single-residue substitution annotated in the API response and
# store it in allmutations, keyed by the resulting variant sequence
for features in data['features']:
    begin = features.get('begin')
    wildType = features.get('wildType')
    mutatedType = features.get('mutatedType')

    # a point substitution starts and ends on the same, known position
    if begin is None or begin != features.get('end'):
        continue

    # both residues must be exactly one letter out of `aminoacids`; a bare
    # `x in aminoacids` is a substring test, so it would also accept '' or a
    # multi-letter run such as 'DE', and it raises TypeError when x is None
    residues_ok = all(
        isinstance(residue, str) and len(residue) == 1 and residue in aminoacids
        for residue in (wildType, mutatedType)
    )
    if not residues_ok:
        continue

    # `begin` is 1-based: replace the residue at index position-1
    position = int(begin)
    variantSequence = sequence[:position-1] + mutatedType + sequence[position:]

    # a missing or empty description list falls back to an empty string
    descriptions = features.get('descriptions')

    # update() keeps any keys already stored for this sequence; a later feature
    # producing the same sequence overwrites these five values
    allmutations[variantSequence].update({
        'name': features.get('locations')[0].get('loc'),
        'description': descriptions[0].get('value') if descriptions else "",
        'position': position,
        'originalType': wildType,
        'mutatedType': mutatedType,
    })

# show the record stored for the last accepted feature
print(allmutations[variantSequence].values())

dict_values(['p.Asp393Tyr', '', 393, 'D', 'Y'])


In [5]:
# enumerate every possible single-residue substitution of the reference
# sequence and register the ones that are not stored yet
count = 0
for position, wild in enumerate(sequence, start=1):
    for substitute in aminoacids:
        if substitute == wild:
            # replacing a residue with itself is not a mutation
            continue
        newsequence = sequence[:position-1] + substitute + sequence[position:]
        if newsequence in allmutations:
            # already present (e.g. collected from the API response): keep it
            continue
        allmutations[newsequence] = {
            'name': 'GeneratedMutation' + str(count),
            'description': "",
            'position': position,
            'originalType': wild,
            'mutatedType': substitute,
        }
        count += 1

# show the most recently inserted record
print(allmutations[list(allmutations)[-1]].values())

dict_values(['GeneratedMutation5638', '', 393, 'D', 'W'])


In [6]:
# generating the query files that are submitted to the prediction algorithms
query_dir = 'prediction queries'

# create the target folder when it is missing so open() does not fail on a
# fresh checkout
os.makedirs(query_dir, exist_ok=True)

# reference sequence: '>' header line with gene and organism, then the sequence
# (each `with` block closes its file on exit, so no explicit close() is needed)
with open(os.path.join(query_dir, 'tp53.fasta'), "w") as outfile:
    outfile.write('>'+geneName+' '+organismName)
    outfile.write('\n')
    outfile.write(sequence)

# one substitution per line as <original><position><mutated>
with open(os.path.join(query_dir, 'siftquery.txt'), "w") as outfile:
    for mutation in allmutations.values():
        outfile.write(mutation['originalType']+str(mutation['position'])+mutation['mutatedType'])
        outfile.write('\n')

# one substitution per line as "P04637 <position> <original> <mutated>"
with open(os.path.join(query_dir, 'polyphenquery.txt'), "w") as outfile:
    for mutation in allmutations.values():
        outfile.write('P04637'+' '+str(mutation['position'])+' '+mutation['originalType']+' '+mutation['mutatedType'])
        outfile.write('\n')

In [7]:
# PolyPhen and SIFT matrix creation: each matrix maps a site key (the form
# looked up below is '1M') to a dict {mutated amino acid: prediction}
siftpredictionmatrixtype = collections.defaultdict(dict)
siftpredictionmatrixvalue = collections.defaultdict(dict)
polyphen2predictionmatrixtype = collections.defaultdict(dict)
polyphen2predictionmatrixvalue = collections.defaultdict(dict)

with open(os.path.join('inputs', 'sift predictions.txt')) as f:
    for line in f:
        predictionvalues = line.split()
        if not predictionvalues:
            # blank line (e.g. a trailing newline at the end of the file)
            continue

        sitekey = predictionvalues[0]
        # score columns start at index 2
        # NOTE(review): assumes they follow the order of `aminoacids` - confirm
        for column, mutated in enumerate(aminoacids, start=2):
            score = float(predictionvalues[column])
            # scores above 0.05 are labelled tolerated, everything else damaging
            if score > 0.05:
                predictionValType = 'tolerated'
            else:
                predictionValType = 'damaging'

            siftpredictionmatrixtype[sitekey][mutated] = predictionValType
            siftpredictionmatrixvalue[sitekey][mutated] = score

with open(os.path.join('inputs', 'polyphen predictions.txt')) as f:
    # NOTE(review): the column indexes 6-11 are kept from the original code;
    # the file layout is not visible here - confirm against the input file
    for line in f:
        predictionvalues = line.split()
        if not predictionvalues:
            # blank line (e.g. a trailing newline at the end of the file)
            continue

        sitekey = predictionvalues[6] + predictionvalues[7]
        mutated = predictionvalues[8]
        if predictionvalues[9] == 'benign':
            # one-word label, the score is the next token
            label = predictionvalues[9]
            score = predictionvalues[10]
        else:
            # any other label is read as two words, which shifts the score
            # one token to the right
            label = predictionvalues[9] + ' ' + predictionvalues[10]
            score = predictionvalues[11]

        polyphen2predictionmatrixtype[sitekey][mutated] = label
        polyphen2predictionmatrixvalue[sitekey][mutated] = float(score)

print("Sift Prediction Result: "+siftpredictionmatrixtype['1M']['Q'] + ' ' + str(siftpredictionmatrixvalue['1M']['Q']))
print('\n')
print("Polyphen2 Precition Result: "+polyphen2predictionmatrixtype['1M']['Q'] + ' ' + str(polyphen2predictionmatrixvalue['1M']['Q']))

Sift Prediction Result: damaging 0.0


Polyphen2 Precition Result: possibly damaging 0.874


In [8]:
# copy the SIFT and PolyPhen-2 label and score of each mutation into its record
for record in allmutations.values():
    # the matrices are keyed by <position><original residue>, then by the
    # mutated residue
    sitekey = str(record['position']) + record['originalType']
    substitute = record['mutatedType']

    record['siftpredictionType'] = siftpredictionmatrixtype[sitekey][substitute]
    record['siftpredictionValue'] = siftpredictionmatrixvalue[sitekey][substitute]
    record['polyphen2predictionType'] = polyphen2predictionmatrixtype[sitekey][substitute]
    record['polyphen2predictionValue'] = polyphen2predictionmatrixvalue[sitekey][substitute]

# show the most recently inserted record, now including its predictions
print(allmutations[list(allmutations)[-1]].values())

dict_values(['GeneratedMutation5638', '', 393, 'D', 'W', 'damaging', 0.0, 'probably damaging', 0.995])


In [9]:
# creating the CSV file for R: one header line, then one row per mutation
csv_columns = ["name", "sequence", "description", "position", "originalType", "mutatedType", "siftpredictionType", "siftpredictionValue", "polyphen2predictionType", "polyphen2predictionValue"]

# create the target folder when it is missing so open() does not fail
os.makedirs('outputs', exist_ok=True)

# csv.writer quotes any field containing a comma, quote or newline (free-text
# descriptions can), which plain string concatenation would not do;
# newline='' is what the csv module expects for files it writes
with open(os.path.join('outputs', 'dictionary.csv'), "w", newline='') as outfile:
    writer = csv.writer(outfile, lineterminator='\n')
    writer.writerow(csv_columns)
    for variant, record in allmutations.items():
        # the sequence is the dictionary key; every other column is read from
        # the record under the column's own name
        writer.writerow([record['name'], variant] + [record[column] for column in csv_columns[2:]])

In [10]:
# Sift and Polyphen2 algorithms results 

In [10]:
# Polyphen2 results of every possible substitution at every position
for index, residue in enumerate(sequence):
    # matrix keys are <1-based position><original residue>
    sitekey = str(index+1) + residue
    for aminoacid in aminoacids:
        if residue != aminoacid:
            # the values come from the PolyPhen-2 matrices, so the line is
            # labelled Polyphen2 (it was previously labelled as a Sift result)
            print("Polyphen2 Prediction Result From "+sitekey+" to "+aminoacid+" "+polyphen2predictionmatrixtype[sitekey][aminoacid] + ' ' + str(polyphen2predictionmatrixvalue[sitekey][aminoacid]))
        else:
            # same residue: no substitution, only report the location
            print(sitekey+" == "+aminoacid+", Aminoacid location: "+str(index+1))

Sift Prediction Result From 1M to A benign 0.333
Sift Prediction Result From 1M to C probably damaging 0.933
Sift Prediction Result From 1M to D probably damaging 0.912
Sift Prediction Result From 1M to E possibly damaging 0.874
Sift Prediction Result From 1M to F possibly damaging 0.824
Sift Prediction Result From 1M to G possibly damaging 0.824
Sift Prediction Result From 1M to H probably damaging 0.933
Sift Prediction Result From 1M to I benign 0.22
Sift Prediction Result From 1M to K possibly damaging 0.818
Sift Prediction Result From 1M to L benign 0.226
1M == M, Aminoacid location: 1
Sift Prediction Result From 1M to N probably damaging 0.912
Sift Prediction Result From 1M to P probably damaging 0.912
Sift Prediction Result From 1M to Q possibly damaging 0.874
Sift Prediction Result From 1M to R probably damaging 0.909
Sift Prediction Result From 1M to S possibly damaging 0.739
Sift Prediction Result From 1M to T possibly damaging 0.663
Sift Prediction Result From 1M to V benign 

In [11]:
# Sift label and score for every residue at every position, including the
# entry for the unchanged residue itself
for number, residue in enumerate(sequence, start=1):
    # matrix keys are <1-based position><original residue>
    sitekey = str(number) + residue
    for aminoacid in aminoacids:
        label = siftpredictionmatrixtype[sitekey][aminoacid]
        score = str(siftpredictionmatrixvalue[sitekey][aminoacid])
        print("Sift Prediction Result From "+sitekey+" to "+aminoacid+" "+label + ' ' + score)

Sift Prediction Result From 1M to A damaging 0.0
Sift Prediction Result From 1M to C damaging 0.0
Sift Prediction Result From 1M to D damaging 0.0
Sift Prediction Result From 1M to E damaging 0.0
Sift Prediction Result From 1M to F damaging 0.0
Sift Prediction Result From 1M to G damaging 0.0
Sift Prediction Result From 1M to H damaging 0.0
Sift Prediction Result From 1M to I damaging 0.0
Sift Prediction Result From 1M to K damaging 0.0
Sift Prediction Result From 1M to L damaging 0.0
Sift Prediction Result From 1M to M tolerated 1.0
Sift Prediction Result From 1M to N damaging 0.0
Sift Prediction Result From 1M to P damaging 0.0
Sift Prediction Result From 1M to Q damaging 0.0
Sift Prediction Result From 1M to R damaging 0.0
Sift Prediction Result From 1M to S damaging 0.0
Sift Prediction Result From 1M to T damaging 0.0
Sift Prediction Result From 1M to V damaging 0.0
Sift Prediction Result From 1M to W damaging 0.0
Sift Prediction Result From 1M to Y damaging 0.0
Sift Prediction Res

In [15]:
# inspect the residue at 0-based index 61 of the reference sequence
# (position 62 in the 1-based numbering used for the mutation records)
sequence[61]

'E'

In [None]:
# Conserved positions:
# 2, 4, 5, 8, 9, 11, 12, 15, 21, 22
