In [None]:
# Write function that inputs data sets (parsing and scoring). Make code that pulls
# csv file and scores it

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""

@author: bkearney
email: bkearne5@uncc.edu

"""

### GENE INFO

import numpy as np
import pandas as pd
import re
import statistics 
import allel

# Read ensembl genome and human geneinfo csvs
gene = pd.read_table('gene2ensembl')
human = pd.read_table('Homo_sapiens.gene_info')

# Create Aliases column (delimited Synonyms)
aliases = human['Synonyms'].tolist()
aliases_sep = []
for i in range(len(aliases)):
    my_list = aliases[i].split("|")
    my_list.append(human['Symbol'].iloc[i])
    aliases_sep.append(my_list)
human['Aliases'] = aliases_sep

# Create geneId column from splicing/filtering dbXrefs
dbXrefs = human['dbXrefs'].tolist()
geneIds = []
for i in range(len(dbXrefs)):
    if 'Ensembl' in dbXrefs[i]:
        gene_id = dbXrefs[i].split("Ensembl:")[1]
    else:
        gene_id = ""
    geneIds.append(gene_id)
human['geneId'] = geneIds

gene.loc[gene['Ensembl_gene_identifier'] == 'ENSG00000168653']


In [15]:
def translateCSV(df,start_param,match_in,match_out):
    
    #csvFile - input file to be translated
    #start_param - name of column to be translated (ex: 'Gene name')
    #match_in - name of column in gene_info(human) to use to match
    #match_out - output column, not necessary just to match
    
    #Import file to pd dataframe, toggle header?
#     df = pd.read_csv(csvFile)
    in_list = human[match_in].tolist()
    start_param = df[start_param].tolist()
    
    out_listoflist = []
    
    for i in range(len(start_param)):
        param = start_param[i]
        if type(param)==str:
            if ';' in param:
                param=param.split(';', 1)[-1]
    
        match_list=[]
        for j in range(len(in_list)):
            if param in in_list[j]:
                index_match = human.iloc[j]
                match_list.append(index_match)
                
        out_list = []
        for k in range(len(match_list)):
            out_list.append(match_list[k][match_out])
#             out_list.append("MATCH")
#             print(match_list[k][match_out])
        out_listoflist.append(out_list)
    df['output']=out_listoflist
    
    return df
# ['#tax_id',
#  'GeneID',
#  'Symbol',
#  'LocusTag',
#  'Synonyms',
#  'dbXrefs',
#  'chromosome',
#  'map_location',
#  'description',
#  'type_of_gene',
#  'Symbol_from_nomenclature_authority',
#  'Full_name_from_nomenclature_authority',
#  'Nomenclature_status',
#  'Other_designations',
#  'Modification_date',
#  'Feature_type',
#  'Aliases']

In [108]:
def translateList(list_features, testing_df):
    
    test_col = list_features[0]
    reference_df = list_features[1]
    reference_col = list_features[2]
    output_col = list_features[3]

    start_param = testing_df[test_col].tolist()
    in_list = reference_df[reference_col].tolist()

    out_listoflist = []

    for i in range(len(start_param)):
        param = start_param[i]

        match_list=[]
        for j in range(len(in_list)):
            if str(param) in str(in_list[j]):
                index_match = reference_df.iloc[j]
                match_list.append(index_match)

        out_list = []
        for k in range(len(match_list)):
            out_list.append(match_list[k][output_col])

        out_listoflist.append(out_list)
    testing_df['output '+reference_col]=out_listoflist

    return testing_df
#     'geneId (GRCH37.66)','human','geneId','Full_name_from_nomenclature_authority'

In [5]:
def f_score(TP,FP,FN):
    precision = TP/(TP+FP)
    recall = TP/(TP+FN)
    f_dia = 2 * precision * recall / (precision+recall)
    return f_dia

In [119]:
def scoring(dataF, colname):

    nonmatches_ind = [ind for ind, x in enumerate(dataF[colname]) if len(x)==0 or x != x]
    matches_ind = [ind for ind, x in enumerate(dataF[colname]) if len(x)!=0 or x == x]
    nonmatches = dataF.iloc[nonmatches_ind]
    matches = dataF.iloc[matches_ind]

    truePos = len(dataF)-len(nonmatches)

    falsePos = 0 # How to calculate FP?
    falseNeg = 0
    fScore = f_score(truePos,falsePos,falseNeg)
#     return fScore
    return round(100*len(nonmatches)/len(dataF),2)

In [120]:


esm = pd.read_csv('13059_2019_1621_MOESM2_ESM.csv')
esm_short = esm.loc[0:1000,:]
# esm = translateCSV(esm_short,'gene_symbol','Aliases','Full_name_from_nomenclature_authority')
# sample_score = scoring(esm_short, 'output')
# print(sample_score, "% recall rate")

esm_list = ['gene_symbol',human,'Aliases','Full_name_from_nomenclature_authority']
esm_new = translateList(esm_list, esm_short)
score_esm = scoring(esm_new, 'output Aliases')
score_esm


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


25.57

In [121]:
# Import csv/excel file
dia_df = pd.read_excel('NIHMS795012-supplement-supp_table20.xlsx',header=2)
dia_df = dia_df.fillna(method='ffill')

# OLD METHOD
# dia = translateCSV(dia_df,'geneId (GRCh37.66)','geneId','Full_name_from_nomenclature_authority')

# NEW METHOD (by column)
# Enter list of columns to be translated (as many as possible)
# Each entry is it's own list with 4 elements: 
# column name, reference dataset, reference column name, output column to be matched 

geneID_list = ['geneId (GRCh37.66)',human,'geneId','Full_name_from_nomenclature_authority']
symbol_list = ['Gene name',human,'Aliases','Full_name_from_nomenclature_authority']
geneID_g2E_list = ['Gene name',gene,'Ensembl_gene_identifier','Gene name']

dia = translateList(geneID_list, dia_df)
dia = translateList(symbol_list, dia_df)
# dia = translateList(geneID_g2E_list, dia_df)
score1 = scoring(dia, 'output geneId')
score2 = scoring(dia, 'output Aliases')



Unnamed: 0,Interval name (biological candidate or nearest gene),GWAS index SNP(s) (population(s) in which association was reported),Chr,"Interval start (bp, hg19)","Interval end (bp, hg19)",Gene name,geneId (GRCh37.66),"Gene start (bp, hg19)","Gene end (bp, hg19)",output geneId,output Aliases
0,MACF1,rs2296172 (European),1.0,39251621.0,40389945.0,NDUFS5,ENSG00000168653,39491989.0,39500308.0,[NADH:ubiquinone oxidoreductase subunit S5],"[NADH:ubiquinone oxidoreductase subunit S5, NA..."
1,MACF1,rs2296172 (European),1.0,39251621.0,40389945.0,PABPC4,ENSG00000090621,40026487.0,40042462.0,[poly(A) binding protein cytoplasmic 4],"[poly(A) binding protein cytoplasmic 4, poly(A..."
2,MACF1,rs2296172 (European),1.0,39251621.0,40389945.0,NT5C1A,ENSG00000116981,40124792.0,40137710.0,"[5'-nucleotidase, cytosolic IA]","[5'-nucleotidase, cytosolic IA]"
3,MACF1,rs2296172 (European),1.0,39251621.0,40389945.0,OXCT2,ENSG00000198754,40235194.0,40237020.0,[3-oxoacid CoA-transferase 2],"[3-oxoacid CoA-transferase 2, 3-oxoacid CoA-tr..."
4,MACF1,rs2296172 (European),1.0,39251621.0,40389945.0,MYCL1,ENSG00000116990,40361097.0,40367928.0,"[MYCL proto-oncogene, bHLH transcription factor]","[MYCL proto-oncogene, bHLH transcription facto..."
5,MACF1,rs2296172 (European),1.0,39251621.0,40389945.0,MYCBP,ENSG00000214114,39328635.0,39347289.0,[MYC binding protein],"[MYC binding protein 2, MYC binding protein, M..."
6,MACF1,rs2296172 (European),1.0,39251621.0,40389945.0,GJA9,ENSG00000131233,39330174.0,39347289.0,[gap junction protein alpha 9],"[gap junction protein delta 2, gap junction pr..."
7,MACF1,rs2296172 (European),1.0,39251621.0,40389945.0,RHBDL2,ENSG00000158315,39351478.0,39407471.0,[rhomboid like 2],[rhomboid like 2]
8,MACF1,rs2296172 (European),1.0,39251621.0,40389945.0,AKIRIN1,ENSG00000174574,39456894.0,39471731.0,[akirin 1],"[akirin 1, akirin 1 pseudogene 1, akirin 1 pse..."
9,MACF1,rs2296172 (European),1.0,39251621.0,40389945.0,HEYL,ENSG00000163909,40089824.0,40105617.0,[hes related family bHLH transcription factor ...,[hes related family bHLH transcription factor ...


In [122]:
print(score1,score2)
dia

1.57 0.94


Unnamed: 0,Interval name (biological candidate or nearest gene),GWAS index SNP(s) (population(s) in which association was reported),Chr,"Interval start (bp, hg19)","Interval end (bp, hg19)",Gene name,geneId (GRCh37.66),"Gene start (bp, hg19)","Gene end (bp, hg19)",output geneId,output Aliases
0,MACF1,rs2296172 (European),1.0,39251621.0,40389945.0,NDUFS5,ENSG00000168653,39491989.0,39500308.0,[NADH:ubiquinone oxidoreductase subunit S5],"[NADH:ubiquinone oxidoreductase subunit S5, NA..."
1,MACF1,rs2296172 (European),1.0,39251621.0,40389945.0,PABPC4,ENSG00000090621,40026487.0,40042462.0,[poly(A) binding protein cytoplasmic 4],"[poly(A) binding protein cytoplasmic 4, poly(A..."
2,MACF1,rs2296172 (European),1.0,39251621.0,40389945.0,NT5C1A,ENSG00000116981,40124792.0,40137710.0,"[5'-nucleotidase, cytosolic IA]","[5'-nucleotidase, cytosolic IA]"
3,MACF1,rs2296172 (European),1.0,39251621.0,40389945.0,OXCT2,ENSG00000198754,40235194.0,40237020.0,[3-oxoacid CoA-transferase 2],"[3-oxoacid CoA-transferase 2, 3-oxoacid CoA-tr..."
4,MACF1,rs2296172 (European),1.0,39251621.0,40389945.0,MYCL1,ENSG00000116990,40361097.0,40367928.0,"[MYCL proto-oncogene, bHLH transcription factor]","[MYCL proto-oncogene, bHLH transcription facto..."
5,MACF1,rs2296172 (European),1.0,39251621.0,40389945.0,MYCBP,ENSG00000214114,39328635.0,39347289.0,[MYC binding protein],"[MYC binding protein 2, MYC binding protein, M..."
6,MACF1,rs2296172 (European),1.0,39251621.0,40389945.0,GJA9,ENSG00000131233,39330174.0,39347289.0,[gap junction protein alpha 9],"[gap junction protein delta 2, gap junction pr..."
7,MACF1,rs2296172 (European),1.0,39251621.0,40389945.0,RHBDL2,ENSG00000158315,39351478.0,39407471.0,[rhomboid like 2],[rhomboid like 2]
8,MACF1,rs2296172 (European),1.0,39251621.0,40389945.0,AKIRIN1,ENSG00000174574,39456894.0,39471731.0,[akirin 1],"[akirin 1, akirin 1 pseudogene 1, akirin 1 pse..."
9,MACF1,rs2296172 (European),1.0,39251621.0,40389945.0,HEYL,ENSG00000163909,40089824.0,40105617.0,[hes related family bHLH transcription factor ...,[hes related family bHLH transcription factor ...
