In [1]:
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
import pandas as pd
import re
import multiprocessing
from gensim.models import Word2Vec

In [83]:
def preprocess_apply(row):
    disease = row.disease
    gene = row.gene
    text = row.abstract

    tokens = word_tokenize(text)
    stop_words = set(stopwords.words('english'))
    tokens = [w for w in tokens if w == 'no' or w == 'not' or not w in stop_words]
    porter = PorterStemmer()
    stems = []
    for t in tokens:
        if t == disease:
            stems.append(t)
        elif t == gene:
            stems.append(t)
        else:
            try:
                stems.append(porter.stem(t.lower()))
            except:
                pass
    return stems

In [51]:
gene_disease_abstract_map = pd.read_csv('gene_disease_abstract_map.csv')

In [71]:
gene_disease_abstract_map['gene'] = gene_disease_abstract_map['gene'].str.strip()
gene_disease_abstract_map['disease'] = gene_disease_abstract_map['disease'].str.strip()

In [75]:
gene_disease_abstract_map = gene_disease_abstract_map[gene_disease_abstract_map.disease != 'disease']

In [84]:
gene_disease_abstract_map['stems'] = gene_disease_abstract_map.apply(preprocess_apply, axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  gene_disease_abstract_map['stems'] = gene_disease_abstract_map.apply(preprocess_apply, axis=1)


In [85]:
gene_disease_abstract_map

Unnamed: 0,title,doi,abstract,metaId,gene,disease,stems
0,Common Variants in CDKN2B-AS1 Associated with ...,10.1371/journal.pone.0033389,"BACKGROUND: To date, only a small portion of t...",783,CDKN2B-AS1,glaucoma,"[background, :, to, date, ,, small, portion, g..."
1,The lncRNA Malat1 functions as a ceRNA to cont...,10.1038/s41401-019-0284-y,Long non-coding RNAs (lncRNAs) have been ident...,12926,miR-181c-5p,adenocarcinoma,"[long, non-cod, rna, (, lncrna, ), identifi, e..."
2,No evidence for sylvatic cycles of chikungunya...,10.1186/s13071-020-04419-1,"BACKGROUND: Dengue, chikungunya and Zika virus...",13550,PRNT,chikungunya,"[background, :, dengu, ,, chikungunya, zika, v..."
3,Clinical implications and nomogram prediction ...,10.1097/md.0000000000022806,"Colorectal cancer, especially colon adenocarci...",13556,FRGCA,adenocarcinoma,"[colorect, cancer, ,, especi, colon, adenocarc..."
4,Droplet Digital PCR Analysis of Liquid Biopsy ...,10.3390/biology9110379,SIMPLE SUMMARY: Despite the availability of sc...,14242,hsa-miR-375-3p,cancer,"[simpl, summari, :, despit, avail, screen, pro..."
...,...,...,...,...,...,...,...
405,Transcriptomic Analysis Reveals Host miRNAs Co...,10.3390/microorganisms9030665,Ebola virus is a continuing threat to human po...,764184,hsa-miR-122-5p,lymphopenia,"[ebola, viru, continu, threat, human, popul, ,..."
406,Transcriptomic Analysis Reveals Host miRNAs Co...,10.3390/microorganisms9030665,Ebola virus is a continuing threat to human po...,764184,hsa-miR-125b-5p,lymphopenia,"[ebola, viru, continu, threat, human, popul, ,..."
409,miRNA-200c-3p is crucial in acute respiratory ...,10.1038/celldisc.2017.21,Influenza infection and pneumonia are known to...,764237,miR-200c-3p,influenza,"[influenza, infect, pneumonia, known, caus, mu..."
410,miRNA-200c-3p is crucial in acute respiratory ...,10.1038/celldisc.2017.21,Influenza infection and pneumonia are known to...,764237,miR-200c-3p,syndrome,"[influenza, infect, pneumonia, known, caus, mu..."


In [86]:
sentences = gene_disease_abstract_map.stems.tolist()

In [33]:
import numpy as np

In [35]:
l = gene_disease_abstract_map.stems.values.flatten().tolist()
flat_l = [e for v in l for e in v]
print(flat_l)

['background', ':', 'to', 'date', ',', 'small', 'portion', 'genet', 'variat', 'primari', 'open-angl', 'glaucoma', '(', 'poag', ')', ',', 'major', 'type', 'glaucoma', ',', 'elucid', '.', 'method', 'and', 'princip', 'find', ':', 'we', 'examin', 'two', 'data', 'set', 'genome-wid', 'associ', 'studi', '(', 'gwa', ')', 'deriv', 'total', '2,219', 'japanes', 'subject', '.', 'first', ',', 'perform', 'gwa', 'analyz', '653,519', 'autosom', 'common', 'single-nucleotid', 'polymorph', '(', 'snp', ')', '833', 'poag', 'patient', '686', 'control', '.', 'as', 'result', ',', 'five', 'variant', 'pass', 'bonferroni', 'correct', 'identifi', 'cdkn2b-as1', 'chromosom', '9p21.3', ',', 'alreadi', 'report', 'signific', 'locu', 'caucasian', 'popul', '.', 'moreov', ',', 'combin', 'data', 'set', 'previou', 'gwa', 'data', 'set', 'deriv', '411', 'poag', 'patient', '289', 'control', 'mantel-haenszel', 'test', ',', 'combin', 'variant', 'show', 'stronger', 'associ', 'poag', '(', 'p', '<', '5.8×10', '(', '−10', ')', ')',

In [87]:
w2v_model = Word2Vec(min_count=1, window=4, vector_size=300, sample=1e-5, alpha=0.03, min_alpha=0.0007, negative=20, workers=multiprocessing.cpu_count() - 1)

In [88]:
w2v_model.build_vocab(sentences, progress_per=50000)

In [89]:
w2v_model.train(sentences, total_examples=w2v_model.corpus_count, epochs=10, report_delay=1)

(129891, 668250)

In [57]:
w2v_model.init_sims(replace=True)

  w2v_model.init_sims(replace=True)


In [90]:
w2v_model.save("word2vec_for_gene.model")

In [91]:
gene_disease_abstract_map.to_csv('gene_disease_abstract_map_with_stems.csv', index=False)

In [92]:
print('cdkn2b-as1' in flat_l)

True
