In [1]:
import nltk

In [2]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to /home/harry/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [3]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /home/harry/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [4]:
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /home/harry/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


True

In [21]:
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.stem import WordNetLemmatizer

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [22]:
import pandas as pd
import numpy as np
import sqlite3
import dill as pickle

In [23]:
con = sqlite3.connect("../cord.db")
df = pd.read_sql_query("SELECT title, abstract, authors, body_text FROM cord19 ", con)

In [24]:
df.head()

Unnamed: 0,title,abstract,authors,body_text
0,Clinical features of culture-proven Mycoplasma...,OBJECTIVE: This retrospective chart review des...,"Madani, Tariq A; Al-Ghamdi, Aisha A",
1,Nitric oxide: a pro-inflammatory mediator in l...,Inflammatory diseases of the respiratory tract...,"Vliet, Albert van der; Eiserich, Jason P; Cros...",
2,Surfactant protein-D and pulmonary host defense,Surfactant protein-D (SP-D) participates in th...,"Crouch, Erika C",
3,Role of endothelin-1 in lung disease,Endothelin-1 (ET-1) is a 21 amino acid peptide...,"Fagan, Karen A; McMurtry, Ivan F; Rodman, David M",
4,Gene expression in epithelial cells in respons...,Respiratory syncytial virus (RSV) and pneumoni...,"Domachowske, Joseph B; Bonville, Cynthia A; Ro...",


In [25]:
# tokenizer = nltk.RegexpTokenizer(r"\w+")
# documents = df['abstract'].str.lower().apply(tokenizer.tokenize)

In [26]:
# nltk_stop_words = nltk.corpus.stopwords.words('english')
# documents_without_stop_words = []
# for document in documents :
#     documents_without_stop_words.append([word for word in document if word not in nltk_stop_words])

In [27]:
# wordnet_lemmatizer = WordNetLemmatizer()
# for i, document in enumerate(documents_without_stop_words) :
#     documents_without_stop_words[i] = [wordnet_lemmatizer.lemmatize(word) for word in document]

In [28]:
titles = df['title']
authors = df['authors']
abstracts = df['abstract']
body_texts = df['body_text']

In [29]:
def preprocess_text(text):
    tokenizer = nltk.RegexpTokenizer(r'[A-Za-z]+')
    tokens = tokenizer.tokenize(text)
 
    nltk_stop_words = nltk.corpus.stopwords.words('english')
    tokens = [token for token in tokens if token not in nltk_stop_words]

    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(t.lower()) for t in tokens]

    return tokens

In [30]:
vectorizer = TfidfVectorizer(analyzer=preprocess_text, min_df=40)

In [31]:
document_tf_idf_fit = vectorizer.fit(titles + abstracts + body_texts)

title_tf_idf = vectorizer.transform(titles)
document_tf_idf = vectorizer.transform(abstracts + body_texts)

In [32]:
feature_names = vectorizer.get_feature_names()
dense = document_tf_idf.todense().tolist()
tfidf = pd.DataFrame(dense, columns=feature_names)
tfidf.head()

Unnamed: 0,a,ability,able,absence,abundance,abundant,access,accompanied,according,account,...,x,y,year,yeast,yet,yield,young,zika,zikv,zoonotic
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.045744,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [33]:
with open('../data/vectorizer.pickle', 'wb') as file:
    pickle.dump(vectorizer, file)

In [34]:
with open('../data/title_tf_idf.pickle', 'wb') as file:
    pickle.dump(title_tf_idf, file)
    
with open('../data/document_tf_idf.pickle', 'wb') as file:
    pickle.dump(document_tf_idf, file)