In [1]:
%load_ext dotenv
%dotenv

In [2]:
# Imports for the whole notebook (the raw data itself is fetched from the GCP bucket by Preprocesser below)
import sys
import numpy as np
np.set_printoptions(threshold=1000)

import pandas as pd
from nltk.corpus import stopwords

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans

from PIL import Image
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator

import matplotlib.pyplot as plt
%matplotlib inline

import os
import glob
import json
import tempfile
from datetime import datetime

from google.cloud import storage


In [3]:
# Notebook-wide pandas display settings.
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 50)
pd.set_option('display.width', 100)
# Show cell contents without truncation. None is the supported "no limit"
# value; the legacy -1 sentinel was deprecated in pandas 1.0 and is rejected
# by later releases, while pandas older than 1.0 only accepts an integer.
try:
    pd.set_option('display.max_colwidth', None)
except ValueError:
    pd.set_option('display.max_colwidth', -1)

In [4]:
# The bucket name for the location of the data is in the .env file.
# Indexing os.environ directly raises KeyError if the variable is not set.
BUCKET_NAME = os.environ['BUCKET_NAME']
# Object name inside the bucket; Preprocesser downloads
# gs://{BUCKET_NAME}/{JSON_FILE} and reads it with pd.read_pickle.
JSON_FILE = 'all_json_new.pkl'

In [5]:
class Preprocesser:
    """Load the pickled corpus from the bucket and prepare its text columns.

    Construction downloads ``gs://{BUCKET_NAME}/{JSON_FILE}``, reads it with
    ``pd.read_pickle`` into ``self.df_full`` and immediately applies
    ``key_slice`` and ``npi_slice``, printing the frame's shape after each
    stage.  The remaining methods overwrite or add columns on
    ``self.df_full``.
    """

    def __init__(self):
        # NOTE(review): 'childcare closers' and 'restricing travel' look like
        # typos for 'childcare closures' / 'restricting travel'. They are left
        # untouched because correcting them changes which rows are selected.
        self.keywords = [
            'nonpharmaceutical intervention',
            'handwashing',
            'wipe down',
            'public health',
            'social behavior',
            'community spread',
            'containment',
            'business closures',
            'basic reproduction number',
            'infection attack rate',
            'lockdown',
            'disease natural history',
            'incident command system',
            'emergency operations',
            'joint information center',
            'social distancing',
            'childcare closers',
            'travel advisory',
            'travel warning',
            'isolation',
            'quarantine',
            'mass gathering cancellations',
            'school closures',
            'facility closures',
            'evacuation',
            'relocation',
            'restricing travel',
            'travel ban',
            'patient cohort',
            'npi']
        # npi_slice keeps rows with strictly MORE than this many keyword hits.
        self.occurances_minimum = 4
        self.df_full = self._download_frame()
        print(self.df_full.shape)
        self.key_slice()
        print(self.df_full.shape)
        self.npi_slice()
        print(self.df_full.shape)

    @staticmethod
    def _download_frame():
        """Download the pickle from the bucket and return the unpickled object."""
        storage_client = storage.Client()
        # The context manager closes the temporary file once it has been read.
        with tempfile.TemporaryFile() as temp:
            storage_client.download_blob_to_file(f"gs://{BUCKET_NAME}/{JSON_FILE}", temp)
            # Rewind so the read starts at the first downloaded byte rather
            # than wherever the write left the cursor.
            temp.seek(0)
            # NOTE(review): unpickling can execute code embedded in the file;
            # only point this at a bucket whose contents are trusted.
            return pd.read_pickle(temp, compression=None)

    def key_slice(self):
        """Keep rows whose ``body_text`` contains at least one keyword.

        Rows with a missing ``body_text`` are dropped (``na=False``).
        ``reset_index()`` keeps the previous index as a new ``index`` column.
        """
        pattern = '|'.join(self.keywords)
        has_keyword = self.df_full['body_text'].str.contains(pattern, na=False, regex=True)
        self.df_full = self.df_full[has_keyword].reset_index()

    def npi_slice(self):
        """Keep rows whose total keyword count exceeds ``occurances_minimum``.

        The count is the sum of ``str.count`` over every keyword, so
        overlapping keywords are each counted.
        """
        def get_count(text):
            return sum(text.count(keyword) for keyword in self.keywords)

        # Counting on the column (not row-wise over the frame) also behaves
        # sensibly when the frame is empty.
        counts = self.df_full['body_text'].apply(get_count)
        # .copy() makes the result an independent frame so the later column
        # assignments do not trigger chained-assignment warnings.
        self.df_full = self.df_full[counts > self.occurances_minimum].copy()

    def remove_stopwords(self, columns):
        """Remove NLTK English stopwords from each column in ``columns``.

        Values are cast to ``str`` first, split on whitespace, filtered
        (case-sensitively) and re-joined with single spaces.
        """
        # A set gives constant-time membership tests for every word.
        stop = set(stopwords.words('english'))
        for col in columns:
            self.df_full[col] = self.df_full[col].astype(str).apply(
                lambda text: ' '.join(word for word in text.split() if word not in stop))

    def to_tfidf(self, columns):
        """Add a ``<col>_tfidf`` column holding one dense TF-IDF vector per row.

        A separate ``TfidfVectorizer`` is fitted for every column.
        """
        for col in columns:
            tfidfv = TfidfVectorizer()
            self.df_full[col + '_tfidf'] = list(tfidfv.fit_transform(self.df_full[col]).toarray())

    def remove_punc(self, columns):
        """Delete every run of characters that is not an ASCII letter or whitespace."""
        for col in columns:
            # regex=True is explicit because newer pandas treats the pattern
            # as a literal string by default; the raw string avoids the
            # invalid '\s' escape.
            self.df_full[col] = self.df_full[col].str.replace(r'[^a-zA-Z\s]+', '', regex=True)
        
def display_wordcloud(text):
    """Generate a word cloud from ``text`` and show it in a new figure without axes."""
    cloud = WordCloud(max_font_size=50, max_words=100, background_color='white')
    rendered = cloud.generate(text)
    plt.figure()
    plt.imshow(rendered, interpolation='bilinear')
    plt.axis("off")
    plt.show()



In [6]:
def pca_apply(df, columns, n_comp, random_state=None):
    """Return a copy of ``df`` with a ``<col>_pca`` column for each input column.

    Args:
        df: frame whose ``columns`` each hold one equal-length vector per row.
        columns: names of the vector columns to reduce.
        n_comp: number of principal components to keep.
        random_state: optional seed forwarded to ``PCA`` so that runs using a
            randomized solver are reproducible. ``None`` keeps the previous
            unseeded behaviour.

    Returns:
        A new DataFrame; ``df`` itself is not modified.
    """
    new_df = df.copy()
    for col in columns:
        # A fresh PCA is fitted independently for every column.
        pca = PCA(n_components=n_comp, random_state=random_state)
        matrix = np.stack(df[col].to_numpy())
        new_df[col + '_pca'] = list(pca.fit_transform(matrix))
    return new_df

def apply_scaler(df, columns):
    """Return a copy of ``df`` with a ``<col>_scaled`` column per input column.

    Each column's per-row vectors are stacked into one matrix and passed
    through a freshly fitted ``StandardScaler``; ``df`` is left untouched.
    """
    scaled_df = df.copy()
    for name in columns:
        matrix = np.stack(df[name].to_numpy())
        standardised = StandardScaler().fit_transform(matrix)
        scaled_df[name + '_scaled'] = list(standardised)
    return scaled_df

def cluster(df, columns, clust_nums, random_state=None):
    """Return a copy of ``df`` with a ``<col>_clusterID`` column per input column.

    Args:
        df: frame whose ``columns`` each hold one equal-length vector per row.
        columns: names of the vector columns to cluster.
        clust_nums: number of clusters passed to ``KMeans``.
        random_state: optional seed forwarded to ``KMeans``. K-means starts
            from random centroids, so cluster ids differ between runs unless
            a seed is given. ``None`` keeps the previous unseeded behaviour.

    Returns:
        A new DataFrame; ``df`` itself is not modified.
    """
    new_df = df.copy()
    for col in columns:
        # Each column gets its own independently fitted model.
        kmeans = KMeans(n_clusters=clust_nums, random_state=random_state)
        matrix = np.stack(df[col].to_numpy())
        new_df[col + "_clusterID"] = list(kmeans.fit_predict(matrix))
    return new_df


In [7]:
# Instantiating downloads the pickle, then runs key_slice() and npi_slice();
# the constructor prints df_full.shape after each of the three steps.
prepr = Preprocesser()

(3851, 8)
(3851, 9)
(1157, 9)


In [8]:
# Clean both text columns with Preprocesser.remove_punc; the columns in prepr.df_full are overwritten.
prepr.remove_punc(['body_text','abstract'])

In [9]:
# Drop NLTK English stopwords from both columns (also overwritten in prepr.df_full).
prepr.remove_stopwords(['body_text', 'abstract'])

In [10]:
# Adds body_text_tfidf and abstract_tfidf: one dense TF-IDF vector per row, with a separate vectorizer fitted per column.
prepr.to_tfidf(['body_text', 'abstract'])

In [11]:
# Project each TF-IDF column onto 50 principal components; results land in *_tfidf_pca columns of a new frame.
pca_df = pca_apply(prepr.df_full, ['abstract_tfidf','body_text_tfidf'], 50)

In [12]:
# Standardise the PCA vectors with StandardScaler; results land in *_tfidf_pca_scaled columns.
scaled_df = apply_scaler(pca_df,['abstract_tfidf_pca','body_text_tfidf_pca'])

In [13]:
# K-means with 10 clusters, fitted separately for abstracts and body text; ids land in *_scaled_clusterID columns.
clustered_df = cluster(scaled_df, ['abstract_tfidf_pca_scaled', 'body_text_tfidf_pca_scaled'], 10)

In [14]:
# Number of documents assigned to each body-text cluster, largest first.
clustered_df.body_text_tfidf_pca_scaled_clusterID.value_counts()

6    403
7    394
1    89 
5    87 
4    62 
3    46 
0    34 
8    22 
2    14 
9    6  
Name: body_text_tfidf_pca_scaled_clusterID, dtype: int64

In [15]:
# Titles of the documents assigned to body-text cluster 6.
clustered_df.loc[clustered_df.body_text_tfidf_pca_scaled_clusterID == 6, ['title']]

Unnamed: 0,title
11,substantial undocumented infection facilitates the rapid dissemination of novel coronavirus (covid-19)
33,spatially explicit modeling of 2019-ncov epidemic trend based on mobile phone data in mainland china
35,real-time estimation of the risk of death from novel coronavirus (covid-19) infection: inference using exported cases
47,estimation of the epidemic properties of the 2019 novel coronavirus: a mathematical modeling study
49,"title: the novel coronavirus, 2019-ncov, is highly contagious and more infectious than initially estimated"
104,g-quadruplex stabilization in the ions and maltose transporters inhibit salmonella enterica growth and virulence
134,early dynamics of transmission and control of covid-19: a mathematical modelling study
136,"estimating number of global importations of covid-19 from wuhan, risk of transmission outside mainland china and covid-19 introduction index between countries outside mainland china"
142,title: incorporating human movement data to improve epidemiological estimates for 2019-ncov
164,title analysis of the epidemic growth of the early 2019-ncov outbreak using internationally confirmed cases


In [16]:
# Rows remaining after both keyword filters (same as the last shape printed by the constructor).
len(prepr.df_full)

1157

In [17]:
## authors do not seem to be repeating within their clusters
# print(len(meta_arr[4]['cluster']))
# meta_arr[4]['cluster'].author_list.value_counts()

In [18]:
import nltk
nltk.download('punkt')
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, word_tokenize
from collections import Counter
import re

[nltk_data] Downloading package punkt to /Users/bvs002/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [19]:
## Explore Clusters
class Explore_Cluster:
    """Split a clustered frame per cluster id and derive title keywords.

    Attributes set by the constructor:
        clustered_df: the frame that was passed in.
        cluster_count: number of distinct ids in the cluster column.
        clusters: list of DataFrames; position ``i`` holds the rows whose
            cluster id equals ``i``.
        meta_arr: one dict per cluster with ``'size'``, ``'cluster'`` and
            ``'keywords'`` (most frequent title tokens that are not among
            the globally most frequent ones).
        top_limit: how many globally frequent title tokens are excluded.
        top_title_tokens: those excluded tokens, most frequent first.
    """

    def __init__(self, clusetered_df, cluster_col='body_text_tfidf_pca_scaled_clusterID',
                 top_limit=30, keyword_count=5):
        """Build the per-cluster frames and keyword lists.

        Args:
            clusetered_df: frame with a ``title`` column and ``cluster_col``.
                The misspelled parameter name is kept so existing keyword
                callers keep working.
            cluster_col: name of the column holding integer cluster ids.
            top_limit: number of globally most common title tokens to
                exclude from every cluster's keywords.
            keyword_count: number of keywords kept per cluster.
        """
        # Use the argument, not a notebook-level global of a similar name.
        self.clustered_df = clusetered_df
        self.cluster_count = clusetered_df[cluster_col].nunique()
        # Boolean masks select each cluster in one vectorised pass; ids are
        # assumed to run from 0 to cluster_count - 1, as before.
        self.clusters = [
            clusetered_df[clusetered_df[cluster_col] == cluster_id]
            for cluster_id in range(self.cluster_count)
        ]
        self.meta_arr = [{'size': len(cluster_df), 'cluster': cluster_df} for cluster_df in self.clusters]
        self.top_limit = top_limit
        self.top_title_tokens = self.get_top_title_tokens()

        common_tokens = set(self.top_title_tokens)
        for meta in self.meta_arr:
            candidates = [
                word for word in self.tokenize_and_clean(meta['cluster']['title'])
                if word not in common_tokens
            ]
            meta['keywords'] = [token for token, _ in Counter(candidates).most_common(keyword_count)]

    def get_top_title_tokens(self):
        """Return the ``top_limit`` most frequent title tokens over all clusters."""
        all_title_tokens = []
        for meta in self.meta_arr:
            all_title_tokens.extend(self.tokenize_and_clean(meta['cluster']['title']))
        return [token for token, _ in Counter(all_title_tokens).most_common(self.top_limit)]

    def tokenize_and_clean(self, titles):
        """Tokenize ``titles`` and drop stopwords, numeric tokens and punctuation.

        Args:
            titles: iterable of title strings; non-string entries (such as
                missing values) are skipped.

        Returns:
            A flat list with the remaining tokens of every title, in order.
        """
        # Load the stopword list once per call, not once per token.
        stop_words = set(stopwords.words('english'))
        dropped_tokens = {',', '(', ')', '"', ':', '``', '.', '?'}
        title_tokens = []
        for title in titles:
            if not isinstance(title, str):
                continue
            # Same pattern as before, written as a raw string. Note that
            # `\|:` matches the two-character sequence "|:"; a lone ':' is
            # removed afterwards through dropped_tokens.
            title = re.sub(r'(/|\|:|&|#|-|\.)', '', title)
            title_tokens.extend(
                word for word in word_tokenize(title)
                if word not in stop_words
                and not word.isnumeric()
                and word not in dropped_tokens
            )
        return title_tokens
        
            

In [20]:
# Build the per-cluster frames and title keywords from the clustered frame.
ec = Explore_Cluster(clustered_df)

In [24]:
# Print the title keywords found for every cluster, in cluster-id order.
for cluster_id, meta in enumerate(ec.meta_arr):
    print(f'cluster: {cluster_id}')
    print(meta['keywords'])
    print('-------------------------')

cluster: 0
['2019ncov', 'severe', 'acute', 'sarscov2', 'clinical']
-------------------------
cluster: 1
['preparedness', 'h1n1', 'bmc', 'crosssectional', 'interventions']
-------------------------
cluster: 2
['porcine', 'diarrhea', 'isolation', 'swine', 'spraydried']
-------------------------
cluster: 3
['zoonotic', 'global', 'one', 'antimicrobial', 'resistance']
-------------------------
cluster: 4
['2019ncov', 'cases', 'title', 'wuhan', 'based']
-------------------------
cluster: 5
['cells', 'cell', 'isolation', 'primary', 'epithelial']
-------------------------
cluster: 6
['patients', 'middle', 'east', 'clinical', 'isolation']
-------------------------
cluster: 7
['surveillance', 'emergency', 'case', 'acute', 'global']
-------------------------
cluster: 8
['africa', 'west', 'model', 'patients', 'preparedness']
-------------------------
cluster: 9
['zika', 'detection', 'reverse', 'genetic', 'biological']
-------------------------


In [22]:
# The most frequent title tokens across all clusters (ec.top_limit of them); these are excluded from the per-cluster keywords.
print(ec.top_title_tokens)

['health', 'virus', 'disease', 'infectious', 'influenza', 'respiratory', 'infection', 'coronavirus', 'transmission', 'public', 'pandemic', 'outbreak', 'china', 'diseases', 'novel', 'epidemic', 'ebola', 'sars', 'covid19', 'response', 'human', 'analysis', 'syndrome', 'viruses', 'study', 'emerging', 'control', 'viral', 'review', 'risk']
