## NCBI format 

In [1]:
## For this method, every header in the .fa file must contain a GeneID tag ("GeneID=<id>]")
def create_splicefile_NCBI(file, formato):
    '''
    Create a splice file from a multi-FASTA file with NCBI-style headers.

    Requires Biopython. Records whose description carries the same
    ``GeneID=<id>]`` tag are treated as isoforms of one gene. For every gene
    with more than one record, one line holding the record ids joined by ';'
    is written to ``<name>.splice`` in the same folder as the input file.
    Genes with a single record are not written.

    Parameters
    ----------
    file : str
        Path to the FASTA file. The output name is the file name cut at its
        last '.fa', plus '.splice'.
    formato : str
        Format name handed to Bio.SeqIO.parse (e.g. 'fasta').

    Returns
    -------
    str
        The message 'Done. Check your folder'.

    Raises
    ------
    IndexError
        If a record description contains no 'GeneID=' tag.
    '''
    import os
    from Bio import SeqIO

    # Single pass over the file: gene id -> record ids, in file order.
    splicedic = {}
    for seq_record in SeqIO.parse(file, formato):
        pieces = seq_record.description.split('GeneID=')
        if len(pieces) < 2:
            raise IndexError(
                "No 'GeneID=' tag in the header of record %r" % seq_record.id
            )
        geneid = pieces[1].split(']')[0]
        splicedic.setdefault(geneid, []).append(seq_record.id)

    # Only the file name is cut at '.fa', so a '.fa' inside a directory name
    # cannot truncate the output path.
    folder, basename = os.path.split(file)
    splicefile = os.path.join(folder, basename.rsplit('.fa', 1)[0] + '.splice')

    with open(splicefile, 'w') as outfile:
        for record_ids in splicedic.values():
            # A gene with one record has no alternative isoforms to list.
            if len(record_ids) > 1:
                outfile.write(';'.join(record_ids) + '\n')

    return 'Done. Check your folder'

#create_splicefile_NCBI("gal6.fa", "fasta")

In [None]:
## Method 2, no GeneID --> done with gff files instead

import pandas as pd

## CHECK THE DIRECTORY IS CORRECT 

#corr=pd.read_csv('/work/FAC/FBM/DBC/cdessim2/default/sprietob/ncbi_dataset/data/SpeciesInfo.csv')

# Rename the column so it can be accessed as an attribute (corr.Correct_NCBI_refseq).
# NOTE: `corr` must already be loaded; the read_csv call above is commented out.
corr.rename(columns={"Correct NCBI refseq": "Correct_NCBI_refseq"}, inplace=True)


##### SPECIFIC FOR NCBI. Every folder in original_dir must be a genome folder containing genomic.gff
##### (and protein.faa).


import os
import gffutils


## CHECK THE DIRECTORIES ARE CORRECT

#original_dir = '/work/FAC/FBM/DBC/cdessim2/default/sprietob/ncbi_dataset/data/GCFs/'
#splice_dir = '/work/FAC/FBM/DBC/cdessim2/default/sprietob/OMAtopNCBI20/DB/'

# For every genome folder in original_dir, read its genomic.gff and group the
# protein ids found on CDS features by gene. After a folder is processed,
# gene_to_prot maps a gene id to the list of its protein ids, and prot_to_gene
# is the reverse lookup used to merge genes that share a protein id.
# `corr` and `original_dir` must be defined beforehand (their assignments
# above are commented out).

# CDS attributes tried, in this order, to obtain the protein id.
type_attribute = ['protein_id', 'Name']

for folder in os.listdir(original_dir):
    gene_to_prot= {}
    prot_to_gene = {}
    folder_path = os.path.join(original_dir, folder)

    # Skip stray files before the species-code lookup: an entry that is not a
    # genome folder has no row in corr and would raise IndexError below.
    if not os.path.isdir(folder_path):
        continue

    # Species code for this assembly; raises IndexError if the folder name is
    # not listed in corr.Correct_NCBI_refseq.
    spcode = corr.loc[corr.Correct_NCBI_refseq == folder, 'Code'].values[0]

    gff = os.path.join(folder_path, 'genomic.gff')
    db = gffutils.create_db(gff, ':memory:', merge_strategy="create_unique", keep_order=True)

    # Loop through all genes
    for t in db.features_of_type('gene', order_by='start'):
        gene = t.id
        gene_list = []
        ordered_child = list(db.children(t, featuretype='CDS', order_by='start'))

        # Loop through all CDS children of the gene
        for child in ordered_child:
            # Take the first attribute that yields a non-empty protein id.
            for att_type in type_attribute:
                protein = child.attributes.get(att_type, [None])[0]
                if protein:
                    break
            if not protein:
                # CDS without a usable protein id: report it and move on.
                print('warning')
                print(child)
                continue

            corr_gene = prot_to_gene.get(protein, None)
            if corr_gene and corr_gene != gene:
                # The protein was already assigned to another gene: continue
                # under that gene's id, re-point the proteins collected so
                # far, and prepend the proteins it already had.
                gene = corr_gene
                for other_prot in gene_list:
                    prot_to_gene[other_prot] = gene
                gene_list = gene_to_prot[gene] + gene_list
            else:
                prot_to_gene[protein] = gene
            if protein not in gene_list:
                gene_list.append(protein)

        # Genes without any protein-coding CDS are left out.
        if gene_list:
            gene_to_prot[gene] = gene_list
                
#UNCOMMENT FOR EXPORT

   # with open(splice_dir+spcode+'.splice','w') as handle_output:
    #    for val in gene_to_prot.values():
     #       handle_output.write(";".join(val)+'\n')

## Ensembl format

In [2]:
def create_splicefile_OMA(file, formato):
    '''
    Create a splice file from a multi-FASTA file with OMA-style headers.

    Requires Biopython. The record description is split on ' | ' and its
    fourth field is used as the gene id; records sharing that id are treated
    as isoforms of one gene. For every gene with more than one record, one
    line holding the record ids joined by ';' is written to ``<name>.splice``
    in the same folder as the input file. Genes with a single record are not
    written.

    Parameters
    ----------
    file : str
        Path to the FASTA file. The output name is the file name cut at its
        last '.fa', plus '.splice'.
    formato : str
        Format name handed to Bio.SeqIO.parse (e.g. 'fasta').

    Returns
    -------
    str
        The message 'Done. Check your folder'.

    Raises
    ------
    IndexError
        If a record description has fewer than four ' | '-separated fields.
    '''
    import os
    from Bio import SeqIO

    # Single pass over the file: gene id -> record ids, in file order.
    splicedic = {}
    for seq_record in SeqIO.parse(file, formato):
        fields = seq_record.description.split(' | ')
        if len(fields) < 4:
            raise IndexError(
                "Header of record %r has fewer than 4 ' | '-separated fields"
                % seq_record.id
            )
        splicedic.setdefault(fields[3], []).append(seq_record.id)

    # Only the file name is cut at '.fa', so a '.fa' inside a directory name
    # cannot truncate the output path.
    folder, basename = os.path.split(file)
    splicefile = os.path.join(folder, basename.rsplit('.fa', 1)[0] + '.splice')

    with open(splicefile, 'w') as outfile:
        for record_ids in splicedic.values():
            # A gene with one record has no alternative isoforms to list.
            if len(record_ids) > 1:
                outfile.write(';'.join(record_ids) + '\n')

    return 'Done. Check your folder'

In [20]:
def create_splicefile_Ensembl(file, formato):
    '''
    Create a splice file from a multi-FASTA file with Ensembl-style headers.

    Requires Biopython. The gene id is the text that follows 'gene:' in the
    record description, up to ' transcript'. One line per gene is written to
    ``<name>.splice`` in the same folder as the input file, holding the ids
    of all its records joined by ';' (genes with a single record included).

    Parameters
    ----------
    file : str
        Path to the FASTA file. The output name is the file name cut at its
        last '.fa', plus '.splice'.
    formato : str
        Format name handed to Bio.SeqIO.parse (e.g. 'fasta').

    Returns
    -------
    str
        The message 'Done. Check your folder'.

    Raises
    ------
    IndexError
        If a record description contains no 'gene:' tag.
    '''
    import os
    from Bio import SeqIO

    # gene id -> record ids, in file order.
    splicedic = {}
    for seq_record in SeqIO.parse(file, formato):
        pieces = seq_record.description.split('gene:')
        if len(pieces) < 2:
            raise IndexError(
                "No 'gene:' tag in the header of record %r" % seq_record.id
            )
        gid = pieces[1].split(' transcript')[0]
        splicedic.setdefault(gid, []).append(seq_record.id)

    # Only the file name is cut at '.fa', so a '.fa' inside a directory name
    # cannot truncate the output path.
    folder, basename = os.path.split(file)
    splicefile = os.path.join(folder, basename.rsplit('.fa', 1)[0] + '.splice')

    with open(splicefile, 'w') as outfile:
        for record_ids in splicedic.values():
            print(';'.join(record_ids), file=outfile)

    return 'Done. Check your folder'

# Implementation

In [18]:
# Folder containing the input FASTA files (note the trailing slash).
wdir='/users/sprietob/mywork/ToyDatasetCharles/'

In [21]:
import glob
import os

# Build a .splice file for every FASTA in wdir that matches the pattern and
# report how many files were processed.
formato = 'fasta'
names = []
for f in glob.glob(os.path.join(wdir, "AEDAE2.fa")):
    # Dataset name = file name up to '.fa'; taken from the base name so it
    # does not depend on what the working folder is called.
    name = os.path.basename(f).split('.fa')[0]
    names.append(name)
    create_splicefile_Ensembl(f, formato)
print(len(names))

1


In [3]:
# Build the splice file for the OMA-style FASTA "HUMAN.fa" (path relative to
# the current working directory).
create_splicefile_OMA("HUMAN.fa", "fasta")

'Done. Check your folder'