**Installing Biopython**

In [None]:
!pip install biopython
!pip install --upgrade biopython
!pip install metapub

In [None]:
import Bio
import metapub
from metapub import PubMedFetcher
from Bio import Entrez
from Bio import SeqIO
from Bio.Entrez import efetch

In [None]:
import pandas as pd # Load the Pandas libraries with alias 'pd' 

In [None]:
# (in the same directory that the python process is based)
#pmid = pd.read_csv("/content/only pubmid 799 from ewag.csv",header=None)  # Control delimiters, rows, column names with read_csv (see later) 
#pmid = pd.read_csv("/content/combo 190 799.csv", header=None) # Read data from file 'filename.csv

In [None]:
pmid = pd.read_csv("/content/826.txt",header=None)

In [None]:
pmid = pmid.drop(pmid.columns[1], axis=1)

In [None]:
pmid.head() # Preview the first 5 lines of the loaded data 

In [None]:
pmid.columns = ['Pudmid']

In [None]:
pmid_new = pmid["Pudmid"].tolist()

In [None]:
len(pmid_new)

In [None]:
#pmid_new = "PMID31563105,PMID31549657,PMID31445334,PMID31430718"

In [None]:
#pmid_new = "31563105,31549657,31445334,31430718,31398235,31394342,31351302,31026607,30986386"

In [None]:
# Hand-picked PubMed IDs, joined into the comma-separated string that is
# passed to efetch. This assignment replaces the ID list built from the CSV.
pmid_new = ",".join([
    "2604392",
    "2604394",
    "2604396",
    "2624462",
    "2646661",
    "2729985",
])

In [None]:
# Set the contact address on Bio.Entrez before issuing the request.
# NOTE(review): a personal e-mail address is hard-coded here — consider
# reading it from configuration instead of committing it with the notebook.
Entrez.email = "a.sakapetis@students.uu.nl"
# Request the records listed in pmid_new from the 'pubmed' database, asking
# for plain-text abstracts; `handle` is read in a later cell.
handle = efetch(db='pubmed', id=pmid_new, retmode='text', rettype='abstract')

In [None]:
#Entrez.email = "a.sakapetis@students.uu.nl"
#handle = efetch(db='pubmed', id=pmid_new, retmode='xml', rettype='text')

In [None]:
text_from_file = handle.read()

In [None]:
# Persist the fetched abstracts. The context manager closes the file even if
# the write raises, and the explicit UTF-8 encoding keeps non-ASCII characters
# in the abstracts from depending on the platform's default encoding.
with open("190_799.txt", "w", encoding="utf-8") as abstracts_file:
    abstracts_file.write(str(text_from_file))

In [None]:
with open("/content/clean.txt", 'rb') as corpus:
  text_full = corpus.read()

In [None]:
import nltk, re, string, unicodedata, inflect, glob, os
import re
from nltk.corpus.reader.plaintext import PlaintextCorpusReader
import numpy as np 
import pandas as pd                                 #for data manipulation and analysis
from nltk.corpus import stopwords                   #Stopwords corpus
from nltk.stem import PorterStemmer 
from nltk.tokenize import RegexpTokenizer
from string import punctuation
from nltk import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.stem import LancasterStemmer, WordNetLemmatizer
from nltk.tokenize import TreebankWordTokenizer
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')

In [None]:
stop_words = stopwords.words('english')
stop_words.extend(['from','at','subject','for','of','and','re','edu','use'])

**1) First step: remove noise from the text**




In [None]:
erst = sent_tokenize(str(text_from_file))

In [None]:
erst

In [None]:
# Lower-case the fetched text, then collapse line breaks and runs of
# non-ASCII characters into single spaces.
value = text_from_file.lower()
for pattern in (r'[\r\n]+', r'[^\x00-\x7F]+'):
    value = re.sub(pattern, ' ', value)

# Treebank tokenization splits off "'s"; after re-joining the tokens with
# spaces, glue the possessive back onto the preceding word.
tokenized = TreebankWordTokenizer().tokenize(value)
sentence = re.sub(r"\s's\b", "'s", ' '.join(tokenized))

In [None]:
seconde = sent_tokenize(str(text_from_file))

In [None]:
df = pd.DataFrame(data = seconde)

In [None]:
df.columns = ["Sent"]

In [None]:
df.head()

In [None]:
df['Sent'] = df['Sent'].str.lower()

In [None]:
df.head()

In [None]:
df["text_lower"] = df["Sent"].str.lower()
df.head()

In [None]:
# drop the new column created in last cell
df.drop(["Sent"], axis=1, inplace=True)

PUNCT_TO_REMOVE = string.punctuation
def remove_punctuation(text):
    """Return ``text`` with every character of ``PUNCT_TO_REMOVE`` deleted.

    ``text`` must be a ``str``; all other characters are kept unchanged.
    """
    # Mapping a character to None in a translation table deletes it.
    deletion_table = str.maketrans(dict.fromkeys(PUNCT_TO_REMOVE))
    return text.translate(deletion_table)

df["text_wo_punct"] = df["text_lower"].apply(lambda text: remove_punctuation(text))
df.head()

In [None]:
from nltk.corpus import stopwords
", ".join(stopwords.words('english'))

In [None]:
STOPWORDS = set(stopwords.words('english'))
def remove_stopwords(text):
    """Return ``text`` without the tokens listed in ``STOPWORDS``.

    The input is coerced with ``str``, split on whitespace, filtered against
    the NLTK English stopword list and re-joined with single spaces.
    """
    kept = []
    for token in str(text).split():
        if token in STOPWORDS:
            continue
        kept.append(token)
    return " ".join(kept)

df["text_wo_stop"] = df["text_wo_punct"].apply(lambda text: remove_stopwords(text))
df.head()

In [None]:
from collections import Counter

# Tally how often each whitespace-separated token occurs across all
# stopword-free sentences.
cnt = Counter()
for text in df["text_wo_stop"].values:
    cnt.update(text.split())

# Show the ten most frequent tokens.
cnt.most_common(10)

In [None]:
# The ten most frequent tokens according to the current `cnt` tally.
FREQWORDS = {word for word, _count in cnt.most_common(10)}
def remove_freqwords(text):
    """Return ``text`` without the tokens listed in ``FREQWORDS``.

    The input is coerced with ``str`` and split on whitespace; surviving
    tokens are re-joined with single spaces.
    """
    tokens = str(text).split()
    return " ".join(filter(lambda token: token not in FREQWORDS, tokens))

df["text_wo_stopfreq"] = df["text_wo_stop"].apply(lambda text: remove_freqwords(text))
df.head()

In [None]:
# Drop the two columns which are no more needed 
df.drop(["text_wo_punct", "text_wo_stop"], axis=1, inplace=True)

n_rare_words = 10
# The reversed slice picks the last `n_rare_words` entries of the frequency
# ranking, i.e. the tokens at the bottom of `cnt.most_common()`.
RAREWORDS = {entry[0] for entry in cnt.most_common()[:-n_rare_words-1:-1]}
def remove_rarewords(text):
    """Return ``text`` without the tokens listed in ``RAREWORDS``.

    The input is coerced with ``str`` and split on whitespace; surviving
    tokens are re-joined with single spaces.
    """
    return " ".join(token for token in str(text).split() if token not in RAREWORDS)

df["text_wo_stopfreqrare"] = df["text_wo_stopfreq"].apply(lambda text: remove_rarewords(text))
df.head()

In [None]:
from collections import Counter

# Recount token frequencies on the fully filtered sentences (stopwords,
# frequent words and rare words removed).
cnt = Counter()
for text in df["text_wo_stopfreqrare"].values:
    cnt.update(text.split())

# Show the ten most frequent remaining tokens.
cnt.most_common(10)

In [None]:
#Bert

In [None]:
# Write one punctuation-free sentence per line to output.txt.
# The "text_wo_punct" column is dropped by an earlier cell, so indexing it
# directly raises KeyError. Reuse the column when it is still present and
# otherwise rebuild it from the lower-cased sentences.
if "text_wo_punct" in df.columns:
    punct_free = df["text_wo_punct"]
else:
    punct_free = df["text_lower"].apply(remove_punctuation)

with open('output.txt', 'w') as f:
    f.writelines(text + '\n' for text in punct_free.tolist())

In [None]:
import pandas as pd

# Read the sentences back, one per line, with surrounding whitespace removed.
with open("/content/output.txt", 'r') as infile:
    lines = [raw_line.strip() for raw_line in infile]

# Every sentence gets the placeholder label 0.
zeros = [0] * len(lines)

# Two-column frame (sentence, label) whose index column is named "index".
df = pd.DataFrame({'sentence': lines, 'label': zeros})
df.index.name = "index"

# Serialize the frame as tab-separated text, index column included.
with open('sentence.tsv', 'w') as write_tsv:
    write_tsv.write(df.to_csv(sep='\t', index=True))


**Tokenize Words for SciSpacy Implementation**

In [None]:
from nltk.tokenize import word_tokenize


In [None]:
# Tokenize the fetched abstracts. The name `text` previously used here is only
# the leftover loop variable of the word-count cells (a single sentence, or
# undefined on a fresh kernel), not the corpus.
text_for_spacy = word_tokenize(str(text_from_file))

In [None]:
table = str.maketrans('', '', string.punctuation)

In [None]:
text_for_spacy_stripped = [w.translate(table) for w in text_for_spacy]

In [None]:
text_for_spacy_stripped

In [None]:
type(text_for_spacy_stripped)

In [None]:
# Dump the string form of the sentence list to erst.txt; the context manager
# guarantees the file is closed even if the write fails.
with open("erst.txt", "w") as erst_file:
    erst_file.write(str(erst))

**START SCISPACY IMPLEMENTATION**

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import pickle
import spacy

In [None]:
pip install scispacy spacy

In [None]:
pip install https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.2.3/en_ner_bc5cdr_md-0.2.3.tar.gz

In [None]:
pip install https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.2.3/en_ner_craft_md-0.2.3.tar.gz

In [None]:
pip install https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.2.3/en_ner_jnlpba_md-0.2.3.tar.gz

In [None]:
pip install https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.2.3/en_ner_bionlp13cg_md-0.2.3.tar.gz

In [None]:
import scispacy
from spacy import displacy
from scispacy.abbreviation import AbbreviationDetector
from scispacy.umls_linking import UmlsEntityLinker

In [None]:
import en_ner_craft_md
import en_ner_jnlpba_md
import en_ner_bc5cdr_md

In [None]:
import en_ner_bionlp13cg_md


In [None]:
nlp = en_ner_bc5cdr_md.load()


In [None]:
#nlp.max_length = 4421830             

In [None]:
len(text_for_spacy_stripped)

In [None]:
did = set(text_for_spacy_stripped)

In [None]:
len(did)

In [None]:
# Feed the unique tokens to the pipeline as plain space-separated text.
# str(did) would hand the model the set's Python repr (braces, quotes and
# commas included); sorting makes the input identical from run to run, since
# set iteration order is not stable across kernels.
doc = nlp(" ".join(sorted(did)))

In [None]:
print(doc.ents)


In [None]:
# Print every noun chunk of the parsed document. The loop variable must not be
# called `np`: that would rebind the numpy alias imported by this notebook for
# every cell that runs afterwards.
for noun_chunk in doc.noun_chunks:
    print(noun_chunk.text)

In [None]:
# Collect the text of every entity in doc.ents, echoing each one as it is
# added (printing the whole doc.ents tuple on every iteration repeated the
# full entity list once per entity).
empty_list = []
for entity in doc.ents:
  print(entity)
  empty_list.append(entity.text)


In [None]:
empty_list

In [None]:
for entity in doc.ents:
  print(entity.label_, ' | ', entity.text) 

In [None]:
len(set(empty_list))

In [None]:
# Dump the string form of the sentence list to corpus.txt; the context manager
# guarantees the file is closed even if the write fails.
with open("corpus.txt", "w") as corpus_file:
    corpus_file.write(str(erst))

In [None]:
set(empty_list)

In [None]:
len(empty_list)

**Word2Vec**

In [None]:
import multiprocessing
from gensim.models import Word2Vec
import gensim 
import gensim.utils
from gensim.test.utils import common_texts, get_tmpfile
from gensim.models import Word2Vec

In [None]:
cores = multiprocessing.cpu_count() # Count the number of cores in a computer
cores

In [None]:
import glob
file_names = glob.glob("/content/clean.txt")
with open("/content/clean.txt", "r", encoding = "utf-8") as f:
      print(f.read())

In [None]:
# Read every matched file into memory as UTF-8 text.
sport_text_list = []
for file in file_names:
    try:
        with open(file, "r", encoding="utf-8") as f:
            sport_text_list.append(f.read())
    except (OSError, UnicodeDecodeError) as err:
        # Still best-effort (an unreadable file is skipped), but the failure
        # is reported instead of being silently swallowed by a bare except.
        print("Skipping {}: {}".format(file, err))

In [None]:
# Run gensim's simple_preprocess over each loaded document, producing one
# token list per document.
clean_texts = [gensim.utils.simple_preprocess(document) for document in sport_text_list]

In [None]:
# Split the text into sentences and each sentence into word tokens, giving the
# list of token lists that is passed to Word2Vec in the next cell.
# NOTE(review): `fifth_step` is not assigned in any cell shown above, so on a
# fresh kernel this raises NameError — confirm which text was intended.
sentences = [nltk.word_tokenize(sent) for sent in nltk.sent_tokenize(str(fifth_step))]

In [None]:
model = gensim.models.Word2Vec(#check if works
        sentences,
        size = 150,
        window = 10,
        min_count = 5,
        workers = 10)

In [None]:
vocab = list(model.wv.vocab)
vocab

In [None]:
model = gensim.models.Word2Vec(
        clean_texts,
        size = 150,
        window = 10,
        min_count = 1,
        workers = 10)

In [None]:
# Continue training for 10 epochs on the same corpus. total_examples must be
# the number of documents in the corpus being passed; the previously used
# `third_step` is not defined in this notebook.
model.train(clean_texts, total_examples=len(clean_texts), epochs=10)

In [None]:
model.save("word2vec.model")#save the model

In [None]:
model.wv.save_word2vec_format('model.bin', binary=True)#save the model


In [None]:
from collections import defaultdict


In [None]:
from nltk.tokenize import sent_tokenize, word_tokenize

In [None]:
phrases = sent_tokenize(text_from_file)


In [None]:
print(phrases[:50])

In [None]:
line = 'Once upon a time a time this upon a'


In [None]:
line = list

In [None]:
# Build (word, next_word) pairs for every sentence, splitting on single
# spaces; pairs never cross a sentence boundary.
bigrams = []
for phrase in phrases:
    words = phrase.split(" ")
    bigrams.extend(zip(words, words[1:]))

In [None]:
print(bigrams)

In [None]:
dic = defaultdict(int)

s = list(phrases)

In [None]:
s

In [None]:
len(s)

In [None]:
# NOTE(review): `normalize` is neither imported nor defined in the cells shown
# in this notebook, so this raises NameError on a fresh kernel — confirm where
# the helper comes from and what it returns.
s = normalize(s)

In [None]:
for i in range(0, len(s)-1):
    dic[str(s[i]) + ' ' + str(s[i+1])] += 1

In [None]:
len(dic)

In [None]:
dic

In [None]:
dic

In [None]:
len(model.wv.vocab)

In [None]:
path = get_tmpfile("/content/word2vec.model")


In [None]:
path = get_tmpfile("/content/word2vec.model")

model.wv.save_word2vec_format("/content/word2vec.txt")


In [None]:
import gzip

In [None]:

!zip -r /content/file.zip /content/word2vec.txt

In [None]:
# Build a spaCy model directory from the exported vectors. Point --vectors-loc
# at the text file written by save_word2vec_format above: no cell ever creates
# a `word2vec.txt.gz` (the archive step produces /content/file.zip).
!python -m spacy init-model en ./data/spacy.word2vec.model --vectors-loc /content/word2vec.txt

In [None]:
nlp = spacy.load('en', vectors='./data/spacy.word2vec.model/')

In [None]:
doc = nlp(str(third_step))

In [None]:
for entity in doc.ents:
  print(entity.label_, ' | ', entity.text)

#new 

In [None]:
# Read the corpus as text. Opening in binary mode returned `bytes`, and the
# later str(text_full) call then produced the bytes repr ("b'...'" with
# literal backslash escapes) instead of the actual text. Undecodable bytes are
# replaced rather than aborting the read.
with open("/content/USETHIS.txt", 'r', encoding='utf-8', errors='replace') as corpus:
  text_full = corpus.read()

In [None]:
from nltk.tokenize import word_tokenize


In [None]:
text_for_spacy = word_tokenize(str(text_full))

In [None]:
# NOTE(review): each element of text_for_spacy is a single token string, so
# pos_tag is called once per word here (presumably tagging its individual
# characters — verify). The next cell recomputes tag_list from the whole token
# list in one call and overwrites this result.
tag_list = [nltk.pos_tag(w) for w in text_for_spacy]

In [None]:
tag_list = nltk.pos_tag(text_for_spacy)

In [None]:
tag_list

In [None]:
import matplotlib
matplotlib.use('Agg')

In [None]:
# Chunk grammar handed to nltk.RegexpParser: one or more verb/noun tags,
# anything in between, ending on a noun tag. Written as a raw string so the
# `\w` sequences are not treated as (invalid) Python string escapes.
NP = r"NP: {(<V\w+>|<NN\w?>)+.*<NN\w?>}"


In [None]:
chunkr = nltk.RegexpParser(NP)


In [None]:
import matplotlib
matplotlib.use('Agg')

In [None]:
import matplotlib.pyplot as plt
plt.switch_backend('agg')

In [None]:
import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt

In [None]:
import nltk
nltk.download('maxent_ne_chunker')
nltk.download('words')

In [None]:
from nltk import word_tokenize, pos_tag, ne_chunk
from nltk import RegexpParser
from nltk import Tree
import pandas as pd

def get_continuous_chunks(text, chunk_func=ne_chunk):
    """Extract the distinct chunk phrases found in ``text``.

    ``text`` is word-tokenized, POS-tagged and handed to ``chunk_func``.
    The leaves of directly adjacent ``Tree`` nodes in the result are merged
    into one space-separated phrase.

    Args:
        text: Raw sentence to analyse.
        chunk_func: Callable that receives the POS-tagged tokens and returns
            an iterable mixing ``Tree`` nodes with plain tagged tokens.
            Defaults to ``ne_chunk``.

    Returns:
        List of phrases in order of first appearance, without duplicates.
    """
    chunked = chunk_func(pos_tag(word_tokenize(text)))
    continuous_chunk = []
    current_chunk = []

    for subtree in chunked:
        if isinstance(subtree, Tree):
            current_chunk.append(" ".join(token for token, pos in subtree.leaves()))
            continue
        if current_chunk:
            named_entity = " ".join(current_chunk)
            if named_entity not in continuous_chunk:
                continuous_chunk.append(named_entity)
            # Always start a fresh phrase here. Resetting only for new phrases
            # let a repeated phrase bleed into the next one.
            current_chunk = []

    # A phrase that runs up to the very end of the input has no following
    # non-Tree token to trigger the flush above, so emit it here.
    if current_chunk:
        named_entity = " ".join(current_chunk)
        if named_entity not in continuous_chunk:
            continuous_chunk.append(named_entity)

    return continuous_chunk

In [None]:
# Two sample sentences to try the chunk extractor on.
df = pd.DataFrame({
    'text': [
        'This is a foo, bar sentence with New York city.',
        'Another bar foo Washington DC thingy with Bruce Wayne.',
    ],
})

# One list of extracted phrases per sentence.
df['text'].apply(get_continuous_chunks)