In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load:

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os

# Walk the Kaggle input tree and print the full path of every file found.
for folder, _subdirs, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        print(os.path.join(folder, file_name))

# Any results you write to the current directory are saved as output.

In [None]:
import glob
import json

import matplotlib.pyplot as plt
plt.style.use('ggplot')

# Locate the dataset root and the metadata table stored beneath it.
root_path = '/kaggle/input/CORD-19-research-challenge/'
metadata_path = f'{root_path}/metadata.csv'

# Identifier-like columns are read with dtype str rather than an inferred
# numeric dtype.
id_columns = ('pubmed_id', 'Microsoft Academic Paper ID', 'doi')
meta_df = pd.read_csv(metadata_path, dtype=dict.fromkeys(id_columns, str))
meta_df.head()

meta_df.info()

# Recursively collect every JSON file below the dataset root.
json_pattern = f'{root_path}/**/*.json'
all_json = glob.glob(json_pattern, recursive=True)
len(all_json)

############
#Helper function
############

#file reader class

class FileReader:
    """Parse one JSON paper file into an id, an abstract and a body text.

    Parameters
    ----------
    file_path : str
        Path of the JSON file to read.

    Attributes
    ----------
    paper_id
        Value stored under ``'paper_id'``, or ``''`` when the key is absent.
    abstract : str
        The ``'text'`` of every entry under ``'abstract'``, joined with
        newlines; ``''`` when the section is missing or not a list.
    body_text : str
        Same as ``abstract`` but for the ``'body_text'`` section.

    Raises
    ------
    OSError
        If the file cannot be opened.
    json.JSONDecodeError
        If the file does not contain valid JSON.
    """

    def __init__(self, file_path):
        # Be explicit about the encoding so decoding does not depend on the
        # platform default.
        with open(file_path, encoding='utf-8') as file:
            content = json.load(file)

        # A top-level value that is not an object has none of the expected
        # keys; treat it like an empty document instead of failing.
        if not isinstance(content, dict):
            content = {}

        self.paper_id = content.get('paper_id', '')
        self.abstract = self._join_section(content, 'abstract')
        self.body_text = self._join_section(content, 'body_text')

    @staticmethod
    def _join_section(content, key):
        """Return the newline-joined 'text' values of the list stored at ``key``."""
        entries = content.get(key)
        if not isinstance(entries, list):
            return ''
        # Skip malformed entries individually rather than discarding the
        # remainder of the section.
        texts = [
            entry['text']
            for entry in entries
            if isinstance(entry, dict) and isinstance(entry.get('text'), str)
        ]
        return '\n'.join(texts)

    def __repr__(self):
        return f'{self.paper_id}:{self.abstract[:200]}... {self.body_text[:200]}...'
# Sanity check: parse the first discovered file and show its preview string.
first_row = FileReader(all_json[0])
print(f'{first_row!r}')


# Helper function: inserts a <br> line break between words whenever the running
# character count exceeds the given length. This is for the interactive plot, so
# that the hover tool text fits on the screen.

def get_breaks(content, length):
    """Insert ``<br>`` breaks into ``content`` once enough characters accumulate.

    The text is split on single spaces and rebuilt word by word while a
    running total of word lengths is kept (separators are not counted).
    When adding a word pushes the total above ``length``, that word is
    joined with ``<br>`` instead of a space and the total restarts at zero.

    Parameters
    ----------
    content : str
        Text to wrap.
    length : int
        Character budget that triggers a break.

    Returns
    -------
    str
        The rebuilt text. No separator is emitted before the first word, so
        the result never starts with a stray space or ``<br>``.

    Raises
    ------
    AttributeError
        If ``content`` has no ``split`` method (for example a NaN value).
    """
    words = content.split(' ')
    pieces = []
    total_chars = 0

    for position, word in enumerate(words):
        total_chars += len(word)
        if total_chars > length:
            separator = "<br>"
            total_chars = 0
        else:
            separator = " "
        # The counter is updated for the first word as well, so later break
        # positions are unaffected; only its leading separator is dropped.
        if position:
            pieces.append(separator)
        pieces.append(word)

    # A single join avoids rebuilding the string on every iteration.
    return "".join(pieces)



########################
# Load the data into a DataFrame
########################

# Using the helper functions, read the articles into a
# DataFrame that is easy to work with:
    
# Column-wise accumulator; its keys are the columns of the final DataFrame.
dict_ = {'paper_id': [], 'abstract': [], 'body_text': [], 'authors': [], 'title': [], 'journal': [], 'abstract_summary': []}

# Index the metadata by SHA once so each paper is found with a hash lookup
# instead of scanning the whole 'sha' column per file. Keeping the first
# occurrence matches taking `.values[0]` of a boolean-mask selection.
meta_by_sha = (
    meta_df.dropna(subset=['sha'])
    .drop_duplicates(subset='sha', keep='first')
    .set_index('sha')
)

# Report progress about every 1% of the files; max(1, ...) avoids a
# modulo-by-zero when there are fewer than 100 files.
progress_step = max(1, len(all_json) // 100)

for idx, entry in enumerate(all_json):
    if idx % progress_step == 0:
        print(f'Processing index: {idx} of {len(all_json)}')
    content = FileReader(entry)

    # no metadata, skip this paper
    if content.paper_id not in meta_by_sha.index:
        continue
    meta_data = meta_by_sha.loc[content.paper_id]

    dict_['paper_id'].append(content.paper_id)
    dict_['abstract'].append(content.abstract)
    dict_['body_text'].append(content.body_text)

    # also create a column for the summary of abstract to be used in a plot
    abstract_words = content.abstract.split(' ')
    if len(content.abstract) == 0:
        # no abstract provided
        dict_['abstract_summary'].append("Not provided.")
    elif len(abstract_words) > 100:
        # abstract is too long for the plot: keep the first 100 words and append ...
        summary = get_breaks(' '.join(abstract_words[:100]), 40)
        dict_['abstract_summary'].append(summary + "...")
    else:
        # abstract is short enough
        summary = get_breaks(content.abstract, 40)
        dict_['abstract_summary'].append(summary)

    authors_raw = meta_data['authors']
    if isinstance(authors_raw, str):
        authors = authors_raw.split(';')
        if len(authors) > 2:
            # more than 2 authors may not fit in the plot: keep the first 2 and append ...
            dict_['authors'].append(". ".join(authors[:2]) + "...")
        else:
            # authors will fit in plot
            dict_['authors'].append(". ".join(authors))
    else:
        # missing value: store it unchanged
        dict_['authors'].append(authors_raw)

    # add the title information, add breaks when needed
    title_raw = meta_data['title']
    if isinstance(title_raw, str):
        dict_['title'].append(get_breaks(title_raw, 40))
    else:
        # title was not provided: store the missing value unchanged
        dict_['title'].append(title_raw)

    # add the journal information
    dict_['journal'].append(meta_data['journal'])

df_covid = pd.DataFrame(dict_, columns=['paper_id','abstract', 'body_text', 'authors', 'title', 'journal', 'abstract_summary'])
df_covid.head()







In [None]:
!pip install langdetect

In [None]:
# The column-wise accumulator is no longer referenced; drop the reference.
dict_ = None

# Whitespace-delimited word counts for both text columns.
for text_col, count_col in (('abstract', 'abstract_word_count'),
                            ('body_text', 'body_word_count')):
    df_covid[count_col] = df_covid[text_col].apply(lambda text: len(text.strip().split()))
df_covid.head()

df_covid.info()

# Remove rows that repeat an earlier (abstract, body_text) pair.
df_covid.drop_duplicates(subset=['abstract', 'body_text'], inplace=True)

df_covid['abstract'].describe(include='all')
df_covid['body_text'].describe(include='all')
df_covid.info()

# It looks like we didn't have duplicates. Instead, it was articles without abstracts.

# Drop rows that contain null values.
df_covid.dropna(inplace=True)
df_covid.info()



#removing punctuation from each text
import re

# Matches every character that is not an ASCII letter, a digit or whitespace.
# The upper-case range must be A-Z: a range written as A-z would also cover
# [ \ ] ^ _ and ` (they sit between 'Z' and 'a' in ASCII) and leave them in
# the text. A raw string keeps \s from being read as a string escape.
NON_ALNUM_PATTERN = re.compile(r'[^a-zA-Z0-9\s]')

df_covid['body_text'] = df_covid['body_text'].apply(lambda x: NON_ALNUM_PATTERN.sub('', x))
df_covid['abstract'] = df_covid['abstract'].apply(lambda x: NON_ALNUM_PATTERN.sub('', x))

#convert to lower text
def lower_case(input_str):
    """Return ``input_str`` converted to lower case."""
    return input_str.lower()

df_covid['body_text'] = df_covid['body_text'].apply(lower_case)
df_covid['abstract'] = df_covid['abstract'].apply(lower_case)

#labelling language
from langdetect import DetectorFactory, LangDetectException, detect

# langdetect is randomised unless a seed is set; fix it so repeated runs
# label the same titles the same way.
DetectorFactory.seed = 0


def _detect_language(title):
    """Return the detected language code of ``title``, or 'unknown' on failure.

    ``detect`` raises ``LangDetectException`` when it finds no usable
    features in the text; such rows are labelled 'unknown' instead of
    aborting the whole ``apply``.
    """
    try:
        return detect(title)
    except LangDetectException:
        return 'unknown'


df_covid['langue'] = df_covid['title'].apply(_detect_language)

# df_covid.to_csv(r'/kaggle/input/CORD-19-research-challenge/df_covid_lang_labels.csv', index=False)


#keeping only english language:
df_covid = df_covid.loc[df_covid['langue'] == 'en']
# df_covid.to_csv(r'/kaggle/input/CORD-19-research-challenge/df_covid_en_only.csv', index=False)


# Drop every listed column and flatten what is left into a plain list.
text = df_covid.drop(["paper_id", "abstract", "abstract_word_count", "body_word_count", "authors", "title", "journal", "abstract_summary", "langue"], axis=1)
text_arr = text.stack().tolist()



df_covid.info()





In [None]:
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from gensim.models import Doc2Vec
import gensim

In [None]:
LabeledSentence1 = gensim.models.doc2vec.TaggedDocument

# TaggedDocument takes a list of word tokens. Passing the raw string would
# make the model iterate over single characters, so split each text on
# whitespace first. Each document is tagged with its position.
all_content_train = [
    LabeledSentence1(em.split(), [j])
    for j, em in enumerate(df_covid['body_text'].values)
]

# Keep the running counter's final value available under its original name.
j = len(all_content_train)
# Report the total once instead of printing a line per document.
print("Number of texts processed: ", j)

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(df_covid['body_text'].values)
# fit_transform already returns the transform of this same corpus, so reuse
# it rather than vectorising every document a second time.
X2 = X

In [None]:
from sklearn.decomposition import TruncatedSVD

# Reduce the TF-IDF matrix to 100 components. The default solver is
# randomized, so a fixed random_state makes the result repeatable.
clf = TruncatedSVD(n_components=100, random_state=42)
Xpca = clf.fit_transform(X)

# pca = PCA(n_components=100).fit(X2)
# datapoint = pca.transform(X2)


In [None]:
# Build the model without a corpus, then train exactly once. Passing the
# corpus to the constructor already trains the model, and a second train()
# call ending at a negative learning rate (end_alpha=-0.016) would be
# invalid. The learning rate decays from alpha to min_alpha over the epochs.
# `vector_size` is the current name of the deprecated `size` argument.
d2v_model = Doc2Vec(vector_size=100, window=10, min_count=500, workers=7, dm=1,
                    alpha=0.025, min_alpha=0.001, epochs=10)
d2v_model.build_vocab(all_content_train)
d2v_model.train(all_content_train, total_examples=d2v_model.corpus_count, epochs=d2v_model.epochs)

In [None]:
# vectors_docs is the current name of the deprecated doctag_syn0 alias and is
# the attribute used for the abstract model later in this notebook.
X_doc2vec = d2v_model.docvecs.vectors_docs
# Place the SVD features and the doc2vec features side by side, one row per document.
X_out = np.concatenate((Xpca, X_doc2vec), axis=1)

In [None]:
# Create the estimator in this cell so it runs on a fresh kernel; it must not
# rely on a model that is only defined further down the notebook.
kmeans_model = KMeans(n_clusters=9, init='k-means++', max_iter=100, random_state=42)
l = kmeans_model.fit_predict(X_out)
# List form of the same cluster assignments.
labels = l.tolist()

# Two-component PCA projection of the clustered features, for plotting.
pca = PCA(n_components=2).fit(X_out)
datapoint = pca.transform(X_out)

In [None]:

from matplotlib import pyplot as plt
import seaborn as sns

# sns settings
sns.set(rc={'figure.figsize':(15,15)})

# Colour by `l`, the cluster assignments produced together with `datapoint`
# by the preceding fit_predict call, so colours and points always match.
# colors
palette = sns.color_palette("bright", len(set(l)))

# plot
# x and y are passed by keyword: newer seaborn releases no longer accept them
# positionally.
sns.scatterplot(x=datapoint[:, 0], y=datapoint[:, 1], hue=l, legend='full', palette=palette)
# The 2-D coordinates come from PCA, so the title says PCA rather than t-SNE.
# The output file name is left unchanged in case something downstream reads it.
plt.title("PCA Covid-19 Articles - Clustered(K-Means) - TF-IDF + doc2vec body text")
plt.savefig("/kaggle/working/t-sne_covid19_label_TFIDF_Doc2Vec.png")
plt.show()


In [None]:
# Cluster the combined feature matrix into nine groups. K-Means starts from
# random centroids, so a fixed random_state makes the assignment repeatable.
kmeans_model = KMeans(n_clusters=9, init='k-means++', max_iter=100, random_state=42)
kmeans_model.fit(X_out)
labels = kmeans_model.labels_.tolist()

In [None]:
# Refit the existing K-Means model on Xpca alone and rebuild the
# two-component projection from the same matrix.
l = kmeans_model.fit_predict(Xpca)
pca = PCA(n_components=2)
pca.fit(Xpca)
datapoint = pca.transform(Xpca)

In [None]:
kmeans_model = KMeans(n_clusters=9, init='k-means++', max_iter=100, random_state=42)
# The fitted estimator is deliberately not bound to X: that name holds the
# TF-IDF matrix, and overwriting it would break a re-run of the SVD cell.
# vectors_docs is the current name of the deprecated doctag_syn0 alias.
kmeans_model.fit(d2v_model.docvecs.vectors_docs)
labels = kmeans_model.labels_.tolist()

In [None]:
# Printing one label per document floods the output; show a short sample and
# the size of each cluster instead.
print(labels[:20])
pd.Series(labels).value_counts().sort_index()

In [None]:
# Fetch the document vectors once; vectors_docs is the current name of the
# deprecated doctag_syn0 alias.
doc_vectors = d2v_model.docvecs.vectors_docs

l = kmeans_model.fit_predict(doc_vectors)
# fit_predict refits the model, so refresh `labels` to keep it in step with
# the model's current centroids.
labels = l.tolist()

pca = PCA(n_components=2).fit(doc_vectors)
datapoint = pca.transform(doc_vectors)

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline

plt.figure
# label1 = ["#FFFF00", "#008000", "#0000FF", “#800080”]
# color = [label1[i] for i in labels]
plt.scatter(datapoint[:, 0], datapoint[:, 1] )

centroids = kmeans_model.cluster_centers_
centroidpoint = pca.transform(centroids)
plt.scatter(centroidpoint[:, 0], centroidpoint[:, 1], marker='^', s=150, c="#000000")
plt.show()

In [None]:
import gensim

def read_corpus(df, column, tokens_only=False):
    """Lazily tokenise a text column, optionally wrapping rows as tagged documents.

    Arguments
    ---------
        df: pd.DataFrame
            frame that holds the text
        column: str
            name of the text column
        tokens_only: bool
            when True, yield the bare token lists; otherwise yield a
            TaggedDocument tagged with the row's position in the column

    Yields
    ------
        list or TaggedDocument
            one item per row of ``df[column]``, in order
    """
    row_position = 0
    for raw_text in df[column]:
        token_list = gensim.parsing.preprocess_string(raw_text)
        if not tokens_only:
            # Training data carries its position as the document tag.
            yield gensim.models.doc2vec.TaggedDocument(token_list, [row_position])
        else:
            yield token_list
        row_position += 1


In [None]:


import random

# Draw a reproducible 70% sample of the articles and turn their abstracts
# into tagged training documents.
frac_of_articles = 0.7
train_df = df_covid.sample(frac=frac_of_articles, random_state=42)
train_corpus = [tagged_doc for tagged_doc in read_corpus(train_df, 'abstract')]



In [None]:


# using distributed memory model
# `verbose` is not a Doc2Vec parameter: it has no effect where it is tolerated
# and is rejected where unknown keywords are not accepted, so it is not passed.
# gensim reports training progress through the standard `logging` module.
model = gensim.models.doc2vec.Doc2Vec(dm=1, vector_size=100, min_count=2, epochs=100, seed=42, workers=3)
model.build_vocab(train_corpus)
model.train(train_corpus, total_examples=model.corpus_count, epochs=model.epochs)



In [None]:
def get_doc_vector(doc):
    """Return the vector inferred for the raw text ``doc``.

    The text is tokenised with ``gensim.parsing.preprocess_string`` and the
    tokens are passed to the notebook-level ``model``'s ``infer_vector``.
    """
    return model.infer_vector(gensim.parsing.preprocess_string(doc))

In [None]:

# Vectors learned for the training abstracts.
abstract_vectors = model.docvecs.vectors_docs
# NOTE(review): list_of_tasks is not defined in the visible part of the
# notebook - confirm it is created before this cell runs.
array_of_tasks = list(map(get_doc_vector, list_of_tasks))