In [2]:
import os
from datetime import datetime

import gget

import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns
import arcadia_pycolor as apc
# Run arcadia_pycolor's matplotlib setup hook once, before any figure is drawn.
# NOTE(review): presumably this applies Arcadia's house plotting style — confirm in the arcadia_pycolor docs.
apc.mpl_setup()


In [10]:
# Cutoffs used to call a BLAST hit a homolog. Both values are percentages:
#   query_cover_thresh -> min % of query covered for a homolog
#   perident_thresh    -> min % identity for a homolog
query_cover_thresh, perident_thresh = 50, 30

In [None]:
# Build the Ensembl-ID -> gene-name lookup table.
# The CSV at gene_names_path acts as a cache: if it exists it is loaded as-is,
# otherwise every BLAST result file is resolved through gget and the table is saved.
blast_results_folder = '../inputs/dRE_BlastResults_2023_06_09/'
# The original path was '..inputs/...' (missing slash), so the cache could never be found.
gene_names_path = '../inputs/geneID_Region_andPathogenicLength.csv'

try:
    genenames = pd.read_csv(gene_names_path)
except FileNotFoundError:
    # Only a missing cache triggers the slow rebuild; any other read error
    # (bad permissions, malformed CSV, ...) is surfaced instead of being swallowed.

    # List the folder once, and ignore anything that is not a BLAST result CSV
    # (hidden files, checkpoint folders, ...).
    blast_files = sorted(
        entry for entry in os.listdir(blast_results_folder) if entry.endswith('.csv')
    )

    gene_by_id = {}
    for i, filename in enumerate(blast_files, start=1):
        # Each result file is named "<ensembl id>.csv".
        ensembl_id = os.path.splitext(filename)[0]
        geneinfo = gget.info(ensembl_id)
        gene_by_id[ensembl_id] = geneinfo["primary_gene_name"].values[0]
        print(f"{i}/{len(blast_files)} gene names found")

    # Column names match what the filtering step reads from this table
    # ("GeneId" for the Ensembl id, "Locus ID" for the gene name).
    genenames = pd.DataFrame(
        {"GeneId": list(gene_by_id.keys()), "Locus ID": list(gene_by_id.values())}
    )
    # index=False keeps a spurious "Unnamed: 0" column out of the cached file.
    genenames.to_csv(gene_names_path, index=False)


In [25]:
# Inputs for the filtering step:
#   - folder holding one BLAST result CSV per gene
#   - CSV lookup table used to label each result file with its gene
blast_results_folder = "../inputs/dRE_BlastResults_2023_06_09/"
gene_names_path = "../inputs/geneID_Region_andPathogenicLength.csv"

genenames = pd.read_csv(filepath_or_buffer=gene_names_path)


In [29]:
# Load every per-gene BLAST result table, label its rows with the gene it belongs to,
# then keep only the hits that exceed both the percent-identity and query-coverage thresholds.
results_list = []

# sorted() makes the row order of results_df reproducible (os.listdir order is arbitrary).
for filename in sorted(os.listdir(blast_results_folder)):
    # Skip anything that is not a result CSV (hidden files, checkpoint folders, ...).
    if not filename.endswith('.csv'):
        continue

    blast_result = pd.read_csv(os.path.join(blast_results_folder, filename))

    # Each result file is named "<ensembl id>.csv".
    ensembl_id = os.path.splitext(filename)[0]

    # Literal prefix comparison instead of str.match: the id is not treated as a regex,
    # and rows with a missing GeneId count as "no match" rather than producing an NA mask.
    is_this_gene = genenames["GeneId"].str.startswith(ensembl_id, na=False)
    if not is_this_gene.any():
        raise IndexError(
            f"No 'GeneId' entry for {ensembl_id} (from {filename}) in {gene_names_path}"
        )
    blast_result['gene'] = genenames.loc[is_this_gene, "Locus ID"].values[0]

    # Percent columns arrive as text such as "87%"; astype(str) keeps this working
    # even when pandas has already parsed a column as numeric.
    for pct_col in ('Query Cover', 'Per. Ident'):
        blast_result[pct_col] = (
            blast_result[pct_col].astype(str).str.rstrip('%').astype('float')
        )

    results_list.append(blast_result)

results_df = pd.concat(results_list)
results_df = results_df[
            (results_df['Per. Ident'] > perident_thresh) &
            (results_df['Query Cover'] > query_cover_thresh)
            ].reset_index(drop=True)

In [31]:
# Repair hits whose Taxid is 0 by looking up the organism name that appears in
# square brackets in the hit description against the NCBI taxonomy names table.
# NOTE(review): the delimiter="|" / quotechar='\t' combination presumably strips the tab
# padding around each field of names.dmp — confirm against the taxdump format.
tax_names = pd.read_csv('../inputs/NCBI_taxdump_20230612/names.dmp', delimiter="|", header=None, quotechar='\t')
tax_names.columns = ['Taxid','Scientific Name','NA1','NA2','NA3']

# Rows that need fixing, with the bracketed organism name pulled out of "Description".
missing_taxid = results_df["Taxid"] == 0
taxids_to_fix = results_df[missing_taxid].copy().reset_index()
taxids_to_fix["Scientific Name"] = taxids_to_fix["Description"].str.extract(r'\[(.*?)\]')

# Make the lookup strictly one row per name. Without this, a name listed more than once
# in names.dmp duplicates rows in the left merge, and the positional write-back below
# no longer lines up with the rows being fixed. Rows without a name are dropped so that
# descriptions lacking brackets (NaN) cannot match anything.
# NOTE(review): keep='first' picks an arbitrary taxid when a name is ambiguous — confirm this is acceptable.
name_to_taxid = (
    tax_names[['Scientific Name', 'Taxid']]
    .dropna(subset=['Scientific Name'])
    .drop_duplicates(subset='Scientific Name', keep='first')
)

# Left merge keeps exactly one output row per row of taxids_to_fix, in the same order;
# validate= turns any future fan-out into an immediate error.
merged_df = pd.merge(taxids_to_fix, name_to_taxid, on='Scientific Name', how='left', validate='many_to_one')

# Prefer the looked-up taxid; fall back to the original value (0) when no name matched.
merged_df['Taxid'] = merged_df['Taxid_y'].fillna(merged_df['Taxid_x']).astype(int)
merged_df = merged_df.drop(['Taxid_x', 'Taxid_y'], axis=1)

# Write the repaired taxids straight back onto the rows they came from.
results_df.loc[missing_taxid, 'Taxid'] = merged_df['Taxid'].values

# Drop the hits that could not be fixed; .copy() leaves results_df as an independent frame.
results_df = results_df[results_df["Taxid"]!=0].copy()

In [33]:
# Date stamp (YYYYMMDD) appended to the output file names.
today = datetime.today().strftime('%Y%m%d')

#save homology results_df to csv
filename = "../results/homology_results/dREhomologs_" + today +".csv"
# to_csv does not create missing folders, so make sure the output directory exists first.
os.makedirs(os.path.dirname(filename), exist_ok=True)
results_df.to_csv(filename)

In [None]:
# Export, for tree building, how many distinct genes have a homolog in each taxon.

# One row per unique (Taxid, gene) pair, so a gene with several hits in a taxon counts once.
thisdata = results_df.loc[:, ['Taxid', 'gene']].drop_duplicates()

# Tally the remaining rows per Taxid and flatten the result back into a DataFrame.
homologs_per_taxid = thisdata[['Taxid']].value_counts()
taxid_nhomologs = homologs_per_taxid.reset_index()

filename = f"../results/taxid_nhomologs_forEW{today}.csv"
taxid_nhomologs.to_csv(filename)