In [1]:
import os
import pandas as pd
from datetime import datetime

from graphdb_connector import connector

In [2]:
def create_sequence_alignment(query_fasta, taxid="9606", db_dir="../../../../../../Databases/UniProt/"):
    '''
    Align the sequences in a FASTA file against a BLAST protein database.

    Runs ``blastp`` and writes the hits in tabular format (``-outfmt 6``) to
    ``alignment.fa`` in the same directory as ``query_fasta``. The columns
    requested are, in order:

    qseqid is the query sequence id
    sseqid is the subject sequence id
    sstart means Start of alignment in subject
    send means End of alignment in subject
    qseq means Aligned part of query sequence
    sseq means Aligned part of subject sequence

    Anything ``blastp`` writes to stderr is printed; a failure to launch
    ``blastp`` is printed as well instead of being raised, so the function
    always returns the output path (the file may not exist if BLAST failed).

    :param str query_fasta: path to the FASTA file with the query sequences.
    :param str taxid: name of the BLAST database inside ``db_dir``.
    :param str db_dir: directory that holds the BLAST databases.
    :return: path of the alignment file.
    '''
    import subprocess

    # os.path handles a bare file name correctly ("q.fasta" -> "alignment.fa");
    # splitting on "/" by hand produced "/alignment.fa" in that case.
    output_file = os.path.join(os.path.dirname(query_fasta), "alignment.fa")

    # Argument list instead of a shell string: paths containing spaces or shell
    # metacharacters are passed to blastp literally.
    blast_command = [
        "blastp",
        "-query", query_fasta,
        "-db", os.path.join(db_dir, taxid),
        "-evalue", "1",
        "-max_target_seqs", "1",
        "-num_threads", "6",
        "-outfmt", "6 qseqid sseqid sstart send qseq sseq",
        "-parse_deflines",
        "-out", output_file,
    ]

    try:
        execution = subprocess.run(
            blast_command,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            universal_newlines=True,  # decode to str so the emptiness check below works
        )
    except OSError as err:
        # e.g. blastp is not installed or not on PATH
        print("Could not run blastp: {}".format(err))
        return output_file

    if execution.stderr:
        print(execution.stderr)

    return output_file


def build_mapping_from_alignment(alignment_file):
    """
    Build identifier and sequence mappings from a tabular alignment file.

    The file is read as tab-separated text without a header and its six
    columns are named ``previous_id``, ``new_id``, ``start``, ``end``,
    ``previous_sequence`` and ``new_sequence``. When ``new_id`` contains
    ``|`` separators, its second field is used as the identifier; otherwise
    the identifier is kept unchanged. Rows whose previous and new identifiers
    are equal are dropped.

    :param str alignment_file: path to the tab-separated alignment file.
    :return: tuple ``(df, idmapping, seqmapping)`` where ``df`` holds the
        remaining rows, ``idmapping`` maps ``previous_id`` to ``new_id`` and
        ``seqmapping`` maps ``previous_id`` to ``new_sequence``. An empty
        file yields an empty frame and two empty dictionaries.
    """
    columns = ['previous_id', 'new_id', 'start', 'end', 'previous_sequence', 'new_sequence']

    try:
        # Identifiers are read as text so purely numeric ids are not turned
        # into numbers (which would break the split and the comparison below).
        df = pd.read_csv(alignment_file, sep='\t', header=None, dtype={0: str, 1: str})
        df.columns = columns
    except pd.errors.EmptyDataError:
        # An empty file (no alignments written) is not an error: nothing to map.
        df = pd.DataFrame(columns=columns)

    # Ids such as "db|ACCESSION|NAME" are reduced to their second field;
    # ids without "|" are already bare and are kept unchanged.
    df['new_id'] = [ident.split('|')[1] if '|' in ident else ident for ident in df['new_id']]
    df = df[df['previous_id'] != df['new_id']]

    # NOTE(review): if a previous_id occurs in several rows, the last row wins
    # in both dictionaries — confirm this is the intended alignment to keep.
    idmapping = dict(zip(df['previous_id'], df['new_id']))
    seqmapping = dict(zip(df['previous_id'], df['new_sequence']))

    return df, idmapping, seqmapping


def graph_to_fasta(output_dir='tmp'):
    """
    Export protein sequences from the graph database to a FASTA file.

    Queries the database for every ``Protein`` node linked to an
    ``Amino_acid_sequence`` node through ``HAS_SEQUENCE`` and writes the
    result with ``table_to_fasta``. The file is named after the current
    timestamp, so each call produces a new file.

    :param str output_dir: directory for the FASTA file; created if missing.
    :return: path of the FASTA file written.
    """
    timestamp = str(datetime.now().timestamp())
    output_file = os.path.join(output_dir, timestamp + ".fasta")
    query = "MATCH (p:Protein)-[:HAS_SEQUENCE]-(s:Amino_acid_sequence) RETURN p.id AS protein, s.sequence AS sequence"

    driver = connector.getGraphDatabaseConnectionConfiguration()

    # presumably a DataFrame with 'protein' and 'sequence' columns, as
    # table_to_fasta expects — verify against connector.getCursorData
    table = connector.getCursorData(driver, query)

    # The default 'tmp' directory does not exist on a fresh checkout; without
    # this the write below fails. An empty output_dir means the current
    # directory, which needs no creation.
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)
    table_to_fasta(table, output_file)

    return output_file

def table_to_fasta(table, output_file):
    """
    Write a table of proteins to ``output_file`` in FASTA format.

    Each row becomes one record: a ``>`` header line holding the row's
    ``protein`` value followed by a line holding its ``sequence`` value.
    The file is overwritten if it already exists.

    :param table: table with ``protein`` and ``sequence`` columns, iterated with ``iterrows``.
    :param str output_file: path of the FASTA file to write.
    """
    with open(output_file, 'w') as fasta:
        fasta.writelines(
            ">{}\n{}\n".format(str(record['protein']), str(record['sequence']))
            for _, record in table.iterrows()
        )
    

In [3]:
# 1. Dump the protein sequences stored in the graph to a timestamped FASTA file.
query_fasta_file = graph_to_fasta()
# 2. Align those sequences with blastp against the "9606" database.
alignment_file = create_sequence_alignment(query_fasta_file, taxid = "9606")
# 3. Keep the rows whose identifier changed and build id / sequence lookups from them.
df, mapping_id, mapping_seq = build_mapping_from_alignment(alignment_file)



In [4]:
# Show the aligned query and subject sequences of the first row in df.
df.iloc[0][['previous_sequence', 'new_sequence']].values

array(['MRLPAQLLGLLMLWVSGSSGDIVMTQSPLSLPVTPGEPASISCRSSQSLLHSNGYNYLDWYLQKPGQSPQLLIYLGSNRASGVPDRFSGSGSGTDFTLKISRVEAEDVGVYYCMQALQTP',
       'MRLPAQLLGLLMLWVSGSSGDIVMTQSPLSLPVTPGEPASISCRSSQSLLHSNGYNYLDWYLQKPGQSPQLLIYLGSNRASGVPDRFSGSGSGTDFTLKISRVEAEDVGVYYCMQALQTP'],
      dtype=object)

In [10]:
# Look up the new identifier for one protein. This raises KeyError when the id
# is not a key of mapping_id; build_mapping_from_alignment only keeps rows whose
# previous and new identifiers differ.
mapping_id['A0A0U1RR20']

KeyError: 'A0A0U1RR20'