In [1]:
from gensim.models import Word2Vec
import pandas as pd



In [2]:
word_vectors = Word2Vec.load("word2vec_for_gene.model").wv

In [3]:
gene_disease_abstract_map = pd.read_csv('gene_disease_abstract_map_with_stems.csv')

In [4]:
def compute_cosine(row):
    disease = row.disease
    gene = row.gene
    try:
        cosine_similarity = word_vectors.similarity(disease, gene)
    except:
        try:
            cosine_similarity = word_vectors.similarity(disease, gene.lower())
        except:
            cosine_similarity = 'UNKNOWN'
    return cosine_similarity

In [5]:
gene_disease_abstract_map['cosine'] = gene_disease_abstract_map.apply(compute_cosine, axis=1)

In [6]:
gene_disease_abstract_map[gene_disease_abstract_map.cosine == 'UNKNOWN']

Unnamed: 0,title,doi,abstract,metaId,gene,disease,stems,cosine


In [7]:
word_vectors.most_similar('COVID-19')

[('test', 0.9999520182609558),
 (')', 0.9999470710754395),
 ('studi', 0.9999468326568604),
 ('infect', 0.9999461770057678),
 ('antibodi', 0.999944269657135),
 ('2', 0.9999435544013977),
 ('PRNT', 0.9999432563781738),
 ('revers', 0.9999416470527649),
 ('1', 0.9999412894248962),
 ('.', 0.9999408721923828)]

In [111]:
gene_disease_abstract_map.to_csv('gene_disease_abstract_map_with_cosine.csv', index=False)

In [8]:
gene_disease_abstract_map.describe(include='all')

Unnamed: 0,title,doi,abstract,metaId,gene,disease,stems,cosine
count,322,322,322,322.0,322,322,322,322.0
unique,192,192,192,,188,40,297,
top,A combination of four serum miRNAs for screeni...,10.1007/s13577-020-00346-6,"Serum microRNAs (miRNAs), with their noticeabl...",,PRNT,cancer,"['introduct', ':', 'the', 'express', 'specif',...",
freq,10,10,10,,26,80,5,
mean,,,,397598.782609,,,,0.999729
std,,,,280807.382259,,,,0.000244
min,,,,783.0,,,,0.997417
25%,,,,114664.0,,,,0.999674
50%,,,,543811.0,,,,0.9998
75%,,,,651015.0,,,,0.999869


In [9]:
dg_ext = pd.read_csv('../August/Dictionary/all_gene_disease_associations.tsv', sep='\t', header=0)

In [10]:
dg_ext.drop_duplicates()

Unnamed: 0,geneId,geneSymbol,DSI,DPI,diseaseId,diseaseName,diseaseType,diseaseClass,diseaseSemanticType,score,EI,YearInitial,YearFinal,NofPmids,NofSnps,source
0,1,A1BG,0.700,0.538,C0001418,Adenocarcinoma,group,C04,Neoplastic Process,0.01,1.0,2008.0,2008.0,1,0,LHGDN
1,1,A1BG,0.700,0.538,C0002736,Amyotrophic Lateral Sclerosis,disease,C18;C10,Disease or Syndrome,0.01,1.0,2008.0,2008.0,1,0,BEFREE
2,1,A1BG,0.700,0.538,C0003578,Apnea,phenotype,C23;C08,Sign or Symptom,0.01,1.0,2017.0,2017.0,1,0,BEFREE
3,1,A1BG,0.700,0.538,C0003864,Arthritis,disease,C05,Disease or Syndrome,0.01,1.0,2019.0,2019.0,1,0,BEFREE
4,1,A1BG,0.700,0.538,C0008373,Cholesteatoma,disease,C17,Disease or Syndrome,0.01,1.0,2020.0,2020.0,1,0,BEFREE
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1134937,115804232,CEROX1,,,C0005890,Body Height,phenotype,,Organism Attribute,0.10,1.0,2019.0,2019.0,1,0,GWASCAT
1134938,115891964,MIR223HG,0.861,0.077,C0023418,leukemia,disease,C04,Neoplastic Process,0.01,1.0,2016.0,2016.0,1,0,BEFREE
1134939,115891964,MIR223HG,0.861,0.077,C0023467,"Leukemia, Myelocytic, Acute",disease,C04,Neoplastic Process,0.01,1.0,2016.0,2016.0,1,0,BEFREE
1134940,115891964,MIR223HG,0.861,0.077,C0598766,Leukemogenesis,disease,C23;C04,Neoplastic Process,0.01,1.0,2016.0,2016.0,1,0,BEFREE


In [11]:
dg_cosine = gene_disease_abstract_map[['disease', 'gene', 'cosine', 'title', 'abstract', 'doi']]

In [12]:
dg_ext['disease'] = dg_ext.diseaseName.str.lower()
dg_ext['gene'] = dg_ext.geneSymbol

In [13]:
merged_df = dg_ext.merge(dg_cosine, on=['disease', 'gene'])

In [14]:
merged_df

Unnamed: 0,geneId,geneSymbol,DSI,DPI,diseaseId,diseaseName,diseaseType,diseaseClass,diseaseSemanticType,score,...,YearFinal,NofPmids,NofSnps,source,disease,gene,cosine,title,abstract,doi
0,7012,TERC,0.471,0.846,C0002871,Anemia,disease,C15,Disease or Syndrome,0.1,...,,0,0,HPO,anemia,TERC,0.999758,A Case Series of TERC Variant Telomere Biology...,TERC variant telomere biology disorders (TBDs)...,10.14740/jh826
1,8847,DLEU2,0.65,0.5,C0023418,leukemia,disease,C04,Neoplastic Process,0.03,...,2014.0,3,0,BEFREE,leukemia,DLEU2,0.99976,Genomic alterations in chronic lymphocytic leu...,Background Chronic lymphocytic leukemia (CLL) ...,10.5045/br.2020.2020080
2,10866,HCP5,0.595,0.808,C0025202,melanoma,disease,C04,Neoplastic Process,0.01,...,2019.0,1,0,BEFREE,melanoma,HCP5,0.999778,A Nomogram Combining a Four-Gene Biomarker and...,BACKGROUND: Currently there is no effective pr...,10.3389/fonc.2021.593587
3,60674,GAS5,0.497,0.808,C0029456,Osteoporosis,disease,C18;C05,Disease or Syndrome,0.01,...,2019.0,1,0,BEFREE,osteoporosis,GAS5,0.999634,lncRNA GAS5 Is Upregulated in Osteoporosis and...,Background It has been reported that lncRNA gr...,10.2147/cia.s235197
4,283131,NEAT1,0.502,0.769,C0001418,Adenocarcinoma,group,C04,Neoplastic Process,0.01,...,2016.0,1,0,BEFREE,adenocarcinoma,NEAT1,0.999818,Detection of the long noncoding RNAs nuclear-e...,OBJECTIVES Long noncoding RNAs (lncRNAs) in HI...,10.1111/hiv.12276
5,378938,MALAT1,0.435,0.846,C0007097,Carcinoma,group,C04,Neoplastic Process,0.1,...,2019.0,12,0,BEFREE,carcinoma,MALAT1,0.999829,Loss of the abundant nuclear non-coding RNA MA...,The metastasis-associated lung adenocarcinoma ...,10.4161/rna.21089
6,100048912,CDKN2B-AS1,0.491,0.769,C0017601,Glaucoma,disease,C11,Disease or Syndrome,0.15,...,2019.0,9,13,BEFREE;GWASCAT;GWASDB,glaucoma,CDKN2B-AS1,0.999848,Common Variants in CDKN2B-AS1 Associated with ...,"BACKGROUND: To date, only a small portion of t...",10.1371/journal.pone.0033389


In [15]:
dg_cosine['found_flag'] = 0
merged_df['found_flag'] = 1

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dg_cosine['found_flag'] = 0


In [16]:
cosine_similarity_known_list = merged_df['cosine']

In [17]:
cosine_similarity_known_list = [float(x) for x in cosine_similarity_known_list] 

In [18]:
avg_cosine_known = sum(cosine_similarity_known_list) / len(cosine_similarity_known_list)
max_cosine_known = max(cosine_similarity_known_list)
min_cosine_known = min(cosine_similarity_known_list)

In [19]:
print("min known cosine:")
print(min_cosine_known)
print("avg known cosine:")
print(avg_cosine_known)
print("max known cosine:")
print(max_cosine_known)

min known cosine:
0.9996343851089478
avg known cosine:
0.9997752904891968
max known cosine:
0.999848484992981


In [20]:
main_df = dg_cosine.append(merged_df[['disease', 'gene', 'cosine', 'found_flag', 'title', 'abstract', 'doi']], ignore_index=True)

In [21]:
main_df = main_df.sort_values('found_flag', ascending=False).drop_duplicates(subset=['disease', 'gene'])

In [25]:
main_df

Unnamed: 0,disease,gene,cosine,title,abstract,doi,found_flag,association
328,glaucoma,CDKN2B-AS1,0.999848,Common Variants in CDKN2B-AS1 Associated with ...,"BACKGROUND: To date, only a small portion of t...",10.1371/journal.pone.0033389,1,Verified
327,carcinoma,MALAT1,0.999829,Loss of the abundant nuclear non-coding RNA MA...,The metastasis-associated lung adenocarcinoma ...,10.4161/rna.21089,1,Verified
326,adenocarcinoma,NEAT1,0.999818,Detection of the long noncoding RNAs nuclear-e...,OBJECTIVES Long noncoding RNAs (lncRNAs) in HI...,10.1111/hiv.12276,1,Verified
325,osteoporosis,GAS5,0.999634,lncRNA GAS5 Is Upregulated in Osteoporosis and...,Background It has been reported that lncRNA gr...,10.2147/cia.s235197,1,Verified
324,melanoma,HCP5,0.999778,A Nomogram Combining a Four-Gene Biomarker and...,BACKGROUND: Currently there is no effective pr...,10.3389/fonc.2021.593587,1,Verified
...,...,...,...,...,...,...,...,...
109,cancer,miR-125a-5p,0.999750,Upregulation of TRIAP1 by the lncRNA MFI2-AS1/...,Background Thyroid cancer is a very common end...,10.2147/ott.s236476,0,Medium
108,cancer,miR-148b-3p,0.999824,Downregulated exosomal microRNA-148b-3p in can...,OBJECTIVE Exosomes derived from cancer-associa...,10.1007/s13402-020-00500-0,0,High
107,cancer,RFPL3S,0.999853,Long non-coding RNA RFPL3S is a novel prognost...,Long non-coding RNAs (lncRNAs) are functional ...,10.3892/ol.2020.11642,0,High
106,syndrome,MEG3,0.999783,The value of circulating long non-coding RNA m...,OBJECTIVE This study was to evaluate the poten...,10.1002/jcla.23488,0,Medium


In [23]:
def compute_association(row):
    association_type = 'ERROR'
    found_flag = row['found_flag']
    if found_flag == 1:
        association_type = 'Verified'
    else:
        cosine = row['cosine']
        if cosine == 'UNKNOWN':
            association_type = 'Low'
        else:
            cosine = float(cosine)
            distance_from_max_known = abs(max_cosine_known - cosine)
            distance_from_avg_known = abs(avg_cosine_known - cosine)
            distance_from_min_known = abs(min_cosine_known - cosine)
            min_distance = min(distance_from_max_known, distance_from_avg_known, distance_from_min_known)
            if min_distance == distance_from_max_known:
                association_type = 'High'
            elif min_distance == distance_from_avg_known:
                association_type = 'Medium'
            elif min_distance == distance_from_min_known:
                association_type = 'Low'
    return association_type

In [26]:
main_df['association'] = main_df.apply(compute_association, axis=1)

In [27]:
main_df.to_csv("gene_disease_association_with_confidence.csv", index=False)