In [None]:
import shutil
from Bio import SeqIO
import os


In [None]:
#start from a genbank file of plasmid

######################
##    USER INPUT   ##
#####################
genbk = 'hM3R_hM4R_BS_pcDNA3.1(+).dna'

os.mkdir("new_design")
os.chdir("./new_design/")

#retrieve in plasmid the dna sequence of chimera
!gbseqextractor -f $genbk -prefix "DNAseq_chimera" -types wholeseq
# convert dna to protein
!Python conversion_DNA_prot.py -i DNAseq_chimera.fasta -o prot_chimera.fasta

#retrieve chimera seq
for record in SeqIO.parse(open("prot_chimera.fasta"),"fasta"):
    chimera_seq=record.seq
    if chimera_seq[-1]=="_":
        chimera_seq=chimera_seq[0:-1]

In [None]:
# Entry point B: start directly from the protein sequence of the chimera.

######################
##    USER INPUT   ##
#####################
pasted_sequence = "MTLHNNSTTSPLFPNISSSWIHSPSDAGLPPGTVTHFGSYNVSRAAGNFSSPDGTTDDPLGGHTVWQVVFIAFLTGILALVTIIGNILVIVSFCFHLKSWKSNNYFLLSLACADLIIGVISMNLFTTYIIMNRWALGNLACDLWLAIDCVASNASVMNLLVISFDRYFSIVHPHHALNKISNRTAGVMIGLAWVISFVLWAPAILFWQYFVGKRTVPPGECFIQFLSEPTITFGTAIAGFYMPVTIMTILYWRIIWSLRQRQMDRHAKIKRAIQTLSAILLAFIITWTPYNIMVLVNTFCDSCIPKTFWNLGYWLCYINSTVNPVCYALCNKTFRNFFSTLINRCLQRKMTGEPDNNRSTSVELTGDPNKTRGAPEALMANSGEPWSPSYLGPTSP"

# Strip any blank characters that came along with the copy/paste
chimera_seq = "".join(pasted_sequence.split(" "))
print(chimera_seq)

MTLHNNSTTSPLFPNISSSWIHSPSDAGLPPGTVTHFGSYNVSRAAGNFSSPDGTTDDPLGGHTVWQVVFIAFLTGILALVTIIGNILVIVSFCFHLKSWKSNNYFLLSLACADLIIGVISMNLFTTYIIMNRWALGNLACDLWLAIDCVASNASVMNLLVISFDRYFSIVHPHHALNKISNRTAGVMIGLAWVISFVLWAPAILFWQYFVGKRTVPPGECFIQFLSEPTITFGTAIAGFYMPVTIMTILYWRIIWSLRQRQMDRHAKIKRAIQTLSAILLAFIITWTPYNIMVLVNTFCDSCIPKTFWNLGYWLCYINSTVNPVCYALCNKTFRNFFSTLINRCLQRKMTGEPDNNRSTSVELTGDPNKTRGAPEALMANSGEPWSPSYLGPTSP


In [None]:
from Bio.Blast import NCBIWWW
import xml.etree.ElementTree as ET
import requests, sys
import json
from Bio import pairwise2
import pandas as pd


# 2. Look for the EC parent GPCR (longest sequence donor) with a remote blastp search
result_handle = NCBIWWW.qblast(program="blastp", database="swissprot", sequence=chimera_seq, expect="0.01",hitlist_size = 10)

# 3. Create the directory where all info will be stored. It is renamed later with
# the name of the chimera, which is still to be defined at this point.
# The guard avoids nesting a second "new_design" folder (or failing on mkdir)
# when the GenBank entry cell already moved into it or when this cell is re-run.
if os.path.basename(os.getcwd()) != "new_design":
    os.makedirs("new_design", exist_ok=True)
    os.chdir("./new_design/")

# 4. Write the blastp results. The first hit is considered to be the EC parent GPCR
with open("reference_parent.xml", "w") as out_handle:
    out_handle.write(result_handle.read())

# 5. Read the UniProt id of the EC parent GPCR from the blastp results.
# ET.parse() is given the path so that it opens and closes the file itself.
tree = ET.parse("reference_parent.xml")
root = tree.getroot()
first_hit = root.find('.//Hit_id')
if first_hit is None:
    # No <Hit_id> element: the search returned nothing below the e-value cutoff
    raise ValueError("blastp returned no hit for the chimera sequence; define ref_id manually instead")
# Second "|"-separated field of the hit id, without the ".version" suffix
ref_id = first_hit.text.split("|")[1].split('.')[0]

#Or directly define EC parent UniProt ID if you know it
# ref_id = "P20309"

# 6. Retrieve the entire sequence and the name of the EC parent GPCR from UniProt
requestURL = f"https://rest.uniprot.org/uniprotkb/{ref_id}.json"
# The timeout keeps the notebook from hanging forever if UniProt does not answer
r = requests.get(requestURL, headers={ "Accept" : "application/json"}, timeout=60)
# Raises requests.HTTPError on any 4xx/5xx answer (no further exit call is needed)
r.raise_for_status()
raw_data = r.text
raw_data_ref = json.loads(raw_data)
ref_seq = raw_data_ref["sequence"]["value"]
ref_name=raw_data_ref['uniProtkbId']

# 7. Locally align the chimera against the EC parent GPCR and keep the first alignment returned
alignment = pairwise2.align.localxx(chimera_seq, ref_seq)[0]
aligned_chimera, aligned_ref = alignment.seqA, alignment.seqB

# 8. Save the alignment as a two-record FASTA file in the "new_design" folder
with open("alignment_chimera_reference.fasta", "w") as align:
    align.writelines([
        ">chimera \n",
        aligned_chimera + "\n",
        f">{ref_id} \n",
        aligned_ref + "\n",
    ])

In [None]:
from Bio import SeqIO

# 9. Define the cutting points in the EC parent GPCR.
# Open the "alignment_chimera_reference.fasta" file and define the cutting points.
# They must be written like this:
# if up to residue 137 of the aligned chimera it comes from the EC parent
# if from 161 to 235 of the aligned chimera it comes from the EC parent
# if from 308 to 361 of the aligned chimera it comes from the EC parent
# write:
# cutting_points_ref = {"ref1" : [1,137],
#                  "ref2" : [161,235],
#                  "ref3" : [308,361]}
# The code automatically converts the limits into Python list limits (counting starts at 0)
# The code automatically translates the aligned positions into unaligned positions
######################
##    USER INPUT   ##
#####################

cutting_points_ref = {
                 "ref1" : [1,92],
                 "ref2" : [110,177],
                 "ref3" : [201,268],
                 "ref4" : [510,571],}

# Re-read the alignment from disk so that any manual correction made to the
# file is taken into account. The record named "chimera" is the chimera; any
# other record is taken as the EC parent.
with open("alignment_chimera_reference.fasta") as alignment_handle:
    for record in SeqIO.parse(alignment_handle, 'fasta'):
        if record.id == "chimera":
            aligned_chimera = record.seq
        else:
            aligned_ref = record.seq

# 10. Translate the cutting point limits into the unaligned positions of the EC parent.
# `translate` holds one [position in ref_seq, column in the alignment] pair per
# residue of the EC parent, both counted from 1.
previous = 0
translate = []
msa = aligned_ref
sequence_nogaps = ref_seq
for res_number, residue in enumerate(sequence_nogaps, start=1):
    # Next column at or after `previous` that holds this residue (gap columns are skipped)
    idx_msa = msa.index(residue, previous)
    translate.append([res_number, idx_msa + 1])
    previous = idx_msa + 1

# Alignment column -> position in the unaligned EC parent
column_to_ref_position = {column: position for position, column in translate}

cutting_points_final_ref = {}
for points in cutting_points_ref:
    limits = cutting_points_ref[points]
    lower = limits[0]
    upper = limits[1]
    # A limit that sits on a gap column (or outside the alignment) has no
    # counterpart in the EC parent. Fail loudly instead of silently reusing the
    # limits computed for the previous segment.
    unmapped = [column for column in (lower, upper) if column not in column_to_ref_position]
    if unmapped:
        raise ValueError(
            f"Cutting point(s) {unmapped} of '{points}' do not fall on a residue of the EC parent "
            "in alignment_chimera_reference.fasta"
        )
    cutting_points_final_ref[points] = [column_to_ref_position[lower], column_to_ref_position[upper]]
    # Final cutting points in "human" format: [150,224] means from residue 150 (included) to residue 224 (included)

# 11. Translate the cutting points (alignment columns) into Python slice limits:
# Python starts counting at 0 and a slice stops at upper limit - 1
cutting_points_python = {}
for point in cutting_points_ref:
    limits = cutting_points_ref[point]
    cutting_points_python[point] = [limits[0] - 1, limits[1]]

# 12. Retrieve the parts of the chimera that are not from the EC parent GPCR,
# i.e. the alignment windows located before, between and after the EC parent segments.
ref_spans = list(cutting_points_python.values())
non_ref_windows = []

# Leading window: only present when the first EC parent segment does not start
# at the very first column (it means that the chimera starts with the IC parent).
# The lower limit is already 0-based here, so it has to be compared with 0.
first_lower = ref_spans[0][0]
if first_lower > 0:
    non_ref_windows.append((0, first_lower))

# Windows between two consecutive EC parent segments
for current_span, next_span in zip(ref_spans, ref_spans[1:]):
    non_ref_windows.append((current_span[1], next_span[0]))

# Trailing window: everything after the last EC parent segment
non_ref_windows.append((ref_spans[-1][1], len(aligned_chimera)))

# Residues of each window, without the alignment gaps
parts_target_list = [
    "".join(residue for residue in aligned_chimera[start:stop] if residue != "-")
    for start, stop in non_ref_windows
]
# All the parts glued together: this is the query used to look for the IC parent
parts_target = "".join(parts_target_list)

print("In the alignment_chimera_target.fasta file in the folder, identify the positions of these sections (parts from IC parent GPCR) \n")
print(parts_target_list)

# 13. Run blastp again, this time to find the IC parent GPCR
if not parts_target:
    # Nothing is left once the EC parent segments are removed: there is no query to send
    raise ValueError("No residue left outside the EC parent segments; check cutting_points_ref")
result_handle = NCBIWWW.qblast(program="blastp", database="swissprot", sequence=parts_target, expect="0.01",hitlist_size = 10)

# 14. Write the blastp results. The first hit is considered to be the IC parent GPCR
with open("target_parent.xml", "w") as out_handle:
    out_handle.write(result_handle.read())

# 15. Read the UniProt id of the IC parent GPCR from the blastp results.
# ET.parse() is given the path so that it opens and closes the file itself.
tree = ET.parse("target_parent.xml")
root = tree.getroot()
first_hit = root.find('.//Hit_id')
if first_hit is None:
    # No <Hit_id> element: the search returned nothing below the e-value cutoff
    raise ValueError("blastp returned no hit for the IC parent parts; define target_id manually instead")
# Second "|"-separated field of the hit id, without the ".version" suffix
target_id = first_hit.text.split("|")[1].split('.')[0]

#Or directly define IC parent UniProt ID
# target_id = "Q8TDS4"

# 16. Retrieve the entire sequence and the name of the IC parent GPCR from UniProt
requestURL = f"https://rest.uniprot.org/uniprotkb/{target_id}.json"
# The timeout keeps the notebook from hanging forever if UniProt does not answer
r = requests.get(requestURL, headers={ "Accept" : "application/json"}, timeout=60)
# Raises requests.HTTPError on any 4xx/5xx answer (no further exit call is needed)
r.raise_for_status()
raw_data = r.text
raw_data_target = json.loads(raw_data)
target_seq = raw_data_target["sequence"]["value"]
target_name=raw_data_target['uniProtkbId']

# 17. Locally align the chimera against the IC parent GPCR and keep the first alignment returned
alignment = pairwise2.align.localxx(chimera_seq, target_seq)[0]
aligned_chimera, aligned_target = alignment.seqA, alignment.seqB

# 18. Save the alignment as a two-record FASTA file in the "new_design" folder
with open("alignment_chimera_target.fasta", "w") as align:
    align.writelines([
        ">chimera \n",
        aligned_chimera + "\n",
        f">{target_id} \n",
        aligned_target + "\n",
    ])

In the alignment_chimera_target.fasta file in the folder, identify the positions of these sections (parts from target GPCR) 

['', 'FCFHLKSWKS', 'VHPHHALNKISNRT', 'RIIWSLRQRQMDRHAKIKRAI', 'NFFSTLINRCLQRKMTGEPDNNRSTSVELTGDPNKTRGAPEALMANSGEPWSPSYLGPTSP']


In [None]:
# 19. Define the cutting points in the IC parent GPCR.
# Open the "alignment_chimera_target.fasta" file and define the cutting points
# by looking up the regions printed out in the cell above.
# They must be written like this:
# if from 217 to 229 of the aligned chimera it comes from the IC parent
# if from 351 to 405 of the aligned chimera it comes from the IC parent
# if from 489 to 579 of the aligned chimera it comes from the IC parent
# write:
# cutting_points_target = {"target1" : [217,229],
#                  "target2" : [351,405],
#                  "target3" : [489,579]}
# The code automatically converts the limits into Python list limits (counting starts at 0)
# The code automatically translates the aligned positions into unaligned positions
######################
##    USER INPUT   ##
#####################
cutting_points_target = {
                 "target1" : [120,129],
                 "target2" : [238,251],
                 "target3" : [361,381],
                 "target4" : [492,552],}


# Re-read the alignment from disk so that any manual correction made to the
# file is taken into account. The record named "chimera" is the chimera; any
# other record is taken as the IC parent.
with open("alignment_chimera_target.fasta") as alignment_handle:
    for record in SeqIO.parse(alignment_handle, 'fasta'):
        if record.id == "chimera":
            aligned_chimera = record.seq
        else:
            aligned_target = record.seq

# 20. Translate the cutting point limits into the unaligned positions of the IC parent.
# `translate` holds one [position in target_seq, column in the alignment] pair
# per residue of the IC parent, both counted from 1.
previous = 0
translate = []
msa = aligned_target
sequence_nogaps = target_seq
for res_number, residue in enumerate(sequence_nogaps, start=1):
    # Next column at or after `previous` that holds this residue (gap columns are skipped)
    idx_msa = msa.index(residue, previous)
    translate.append([res_number, idx_msa + 1])
    previous = idx_msa + 1

# Alignment column -> position in the unaligned IC parent
column_to_target_position = {column: position for position, column in translate}

cutting_points_final_target = {}
for points in cutting_points_target:
    limits = cutting_points_target[points]
    lower = limits[0]
    upper = limits[1]
    # A limit that sits on a gap column (or outside the alignment) has no
    # counterpart in the IC parent. Fail loudly instead of silently reusing the
    # limits computed for the previous segment.
    unmapped = [column for column in (lower, upper) if column not in column_to_target_position]
    if unmapped:
        raise ValueError(
            f"Cutting point(s) {unmapped} of '{points}' do not fall on a residue of the IC parent "
            "in alignment_chimera_target.fasta"
        )
    cutting_points_final_target[points] = [column_to_target_position[lower], column_to_target_position[upper]]
    # Final cutting points in "human" format: [150,224] means from residue 150 (included) to residue 224 (included)

In [None]:
# # Uncomment the lines below if the EC parent and the IC parent need to be swapped
# #names
# temp = ref_name
# ref_name = target_name
# target_name = temp

# #ids
# temp = ref_id
# ref_id = target_id
# target_id = temp

# #cutting points
# temp = cutting_points_final_ref
# cutting_points_final_ref = cutting_points_final_target
# cutting_points_final_target = temp

In [58]:
# Create a fasta file for the chimera.
# It holds the sequence that was entered and the sequence rebuilt by assembling the 2 parents.
# If they don't match perfectly, an error was made in the definition of the cutting points.

# Interleave the segments of both parents: ref1, target1, ref2, target2, ...
# When one parent has more segments than the other, its extra segments follow
# in their original order. (A set-based merge would add them in arbitrary order
# and could scramble the assembled sequence.)
cutting = {}
ref_keys = list(cutting_points_final_ref)
target_keys = list(cutting_points_final_target)
for index in range(max(len(ref_keys), len(target_keys))):
    if index < len(ref_keys):
        cutting[ref_keys[index]] = cutting_points_final_ref[ref_keys[index]]
    if index < len(target_keys):
        cutting[target_keys[index]] = cutting_points_final_target[target_keys[index]]
print(cutting)

# Rebuild the chimera from the cutting points and record, for each segment, the
# [first, last] residue (counted from 1) that it occupies in the rebuilt sequence
chimera_parts = []
previous_len = 0
novel_sequence = ""  # must match the original chimera sequence, otherwise the cutting points are wrong
for point in cutting:
    lower = cutting[point][0]
    upper = cutting[point][1]
    # Keys containing "ref" are cut out of the EC parent, all the others out of the IC parent.
    # Limits are 1-based and inclusive, hence the lower - 1.
    parent_seq = ref_seq if "ref" in point else target_seq
    novel_sequence += parent_seq[lower-1:upper]
    chimera_parts.append([previous_len + 1, len(novel_sequence)])
    previous_len = len(novel_sequence)

with open("chimera.fasta","w") as f:
    f.write(">chimera\n")
    f.write(str(chimera_seq) +"\n")
    f.write(">Created with code \n")
    f.write(novel_sequence)

{'ref1': [1, 92], 'target1': [52, 61], 'ref2': [103, 170], 'target2': [130, 143], 'ref3': [185, 252], 'target3': [210, 230], 'ref4': [491, 552], 'target4': [303, 363]}


In [59]:
# 21. Write all info in the Excel file
# Load the Excel file (two levels above the "new_design" working folder)
os.chdir("../../")
df = pd.read_excel('previous_designs.xlsx', sheet_name="GPCRs_A")

# Retrieve the 'Chimera_name' column to check which chimera names have already been used
already_designed_chimeras = list(df['Chimera_name'])

# The design is named <EC parent>_<IC parent>_<n>, where n is one more than the
# number of existing designs made from the same pair of parents
chimera_name = ref_name + "_" + target_name + "_"
counter = 1
for entry in already_designed_chimeras:
    # Empty cells are read as NaN (a float), which has no startswith()
    if isinstance(entry, str) and entry.startswith(chimera_name):
        counter += 1
chimera_name = chimera_name + str(counter)
id_chimera_name = ref_id+"_"+target_id+ "_"+str(counter)
already_designed_chimeras.append(chimera_name)

# #Rename the folder with the name of the design
# os.rename("./Found_designs/new_design/chimera.fasta","./Found_designs/new_design/"+chimera_name+".fasta")
os.rename("./Found_designs/new_design","./Found_designs/"+chimera_name)

new_row = {
    'Chimera_name': chimera_name,
    'Chimera_name_ids' : id_chimera_name,
    'Chimera_parts':chimera_parts,
    'Reference_name': ref_name,
    'Reference_id': ref_id,
    'Reference_cutting_points': cutting_points_final_ref,
    'Target_name': target_name,
    'Target_id': target_id,
    'Target_cutting_points': cutting_points_final_target,
    'Expression?': 'Yes',
    'Function?': 'New function',
    'Comments': 'Additional comments',
    'Authors': '',
    'DOI': ''
}
# Append the new row to the dataframe
df_dictionary = pd.DataFrame([new_row])
df = pd.concat([df, df_dictionary], ignore_index=True)

# Replace only the "GPCRs_A" sheet. Writing the dataframe straight to the path
# would recreate the workbook from scratch and drop every other sheet it contains.
with pd.ExcelWriter('previous_designs.xlsx', engine='openpyxl', mode='a', if_sheet_exists='replace') as writer:
    df.to_excel(writer, sheet_name='GPCRs_A', index=False)


In [60]:
# Find the residues that are mutated in the chimera compared to its parents.
# If nothing is printed, there are no mutations.

# The record whose id contains "Created" is the sequence rebuilt from the
# parents; the other record is the chimera as it was entered
with open("./Found_designs/"+chimera_name+"/chimera.fasta") as fasta_handle:
    for record in SeqIO.parse(fasta_handle, "fasta"):
        if "Created" in record.id:
            novel_sequence = record.seq
        else:
            chimera_seq = record.seq

# Each mutation is written <parent residue><position, counted from 1><chimera residue>
mutated=""
for pos, (res_chimera, res_parents) in enumerate(zip(chimera_seq, novel_sequence), start=1):
    if res_chimera != res_parents:
        mutated+=f"{res_parents}{pos}{res_chimera},"
print(mutated)

# zip() stops at the end of the shorter sequence, so a difference in length
# would otherwise go unnoticed
if len(chimera_seq) != len(novel_sequence):
    print(f"WARNING: the chimera has {len(chimera_seq)} residues but the sequence rebuilt from the parents has {len(novel_sequence)}; check the cutting points")

Y149C,A239G,


In [61]:
# Keep only the chimera itself: the rebuilt sequence was added to the fasta file
# just to check that the correct cutting points were found
chimera_dir = "./Found_designs/"+chimera_name
checked_fasta = chimera_dir+"/chimera.fasta"

# The first record of the file is the chimera as it was entered. The file is
# closed before it is deleted below (deleting an open file fails on some systems).
with open(checked_fasta) as fasta_handle:
    first_record = next(SeqIO.parse(fasta_handle, "fasta"), None)
if first_record is None:
    raise ValueError(f"{checked_fasta} does not contain any sequence")
seq = str(first_record.seq)

# Write the final fasta file, named after the design ("-" is removed from the header)
with open(chimera_dir+"/"+chimera_name+".fasta", "w") as f:
    f.write(">"+chimera_name.replace('-','')+"\n")
    f.write(seq +"\n")

os.remove(checked_fasta)