In [1]:
# Load the dotenv IPython extension and invoke it.
# NOTE(review): presumably this reads a local .env file into os.environ; BUCKET_NAME is
# read from the environment in a later cell.
%load_ext dotenv
%dotenv

In [2]:
# Imports: GCP bucket access, text vectorisation, clustering and plotting
import sys
import numpy as np
np.set_printoptions(threshold=1000)

import pandas as pd
from nltk.corpus import stopwords

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans

from PIL import Image
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator

import matplotlib.pyplot as plt
%matplotlib inline

import os
import glob
import json
import tempfile
from datetime import datetime

from google.cloud import storage


In [3]:
# Widen pandas' display limits so long titles and wide frames are shown in full.
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 50)
pd.set_option('display.width', 100)
# None disables column-width truncation. The previous value of -1 is deprecated
# in pandas 1.x and rejected by newer releases.
pd.set_option('display.max_colwidth', None)

In [4]:
# The bucket name for the location of the data is in the .env file
# Bucket holding the data; read from the environment (raises KeyError if BUCKET_NAME is unset).
BUCKET_NAME = os.environ['BUCKET_NAME']
# Object name inside that bucket; Preprocesser downloads it and loads it with pd.read_pickle.
JSON_FILE = 'all_json_new.pkl'

In [5]:
class Preprocesser:
    """Download the article corpus and prepare its text columns.

    Constructing an instance downloads the pickled DataFrame named by the
    module-level ``BUCKET_NAME`` / ``JSON_FILE`` into ``self.df_full`` and then
    narrows it twice: ``key_slice`` keeps rows whose ``body_text`` mentions any
    keyword, and ``npi_slice`` keeps rows whose total keyword count exceeds
    ``self.occurances_minimum``. The frame's shape is printed after the load
    and after each filter. The remaining methods clean or vectorise columns of
    ``self.df_full`` in place.
    """

    def __init__(self):
        # Search terms, matched literally (case-sensitively) against body_text.
        # NOTE(review): 'childcare closers' and 'restricing travel' look like typos for
        # 'childcare closures' / 'restricting travel' and will not match the correctly
        # spelled phrases. Left unchanged because fixing them alters the filter result.
        self.keywords = [
            'nonpharmaceutical intervention',
            'handwashing',
            'wipe down',
            'public health',
            'social behavior',
            'community spread',
            'containment',
            'business closures',
            'basic reproduction number',
            'infection attack rate',
            'lockdown',
            'disease natural history',
            'incident command system',
            'emergency operations',
            'joint information center',
            'social distancing',
            'childcare closers',
            'travel advisory',
            'travel warning',
            'isolation',
            'quarantine',
            'mass gathering cancellations',
            'school closures',
            'facility closures',
            'evacuation',
            'relocation',
            'restricing travel',
            'travel ban',
            'patient cohort',
            'npi']
        # Rows are kept only when their keyword count is strictly greater than this.
        self.occurances_minimum = 4
        storage_client = storage.Client()
        # The context manager guarantees the temporary file is closed (and removed)
        # once the frame has been loaded.
        with tempfile.TemporaryFile() as temp:
            storage_client.download_blob_to_file(f"gs://{BUCKET_NAME}/{JSON_FILE}", temp)
            # Rewind so the unpickler reads from the first byte rather than from
            # wherever the download left the file position.
            temp.seek(0)
            # NOTE(security): unpickling can execute arbitrary code; only load
            # objects from a bucket you trust.
            self.df_full = pd.read_pickle(temp, compression=None)
        print(self.df_full.shape)
        self.key_slice()
        print(self.df_full.shape)
        self.npi_slice()
        print(self.df_full.shape)

    def key_slice(self):
        """Keep rows whose ``body_text`` contains at least one keyword.

        Missing ``body_text`` values count as non-matching. The index is reset
        without ``drop=True``, so the previous index is kept as an ``index`` column.
        """
        pattern = '|'.join(self.keywords)
        mentions_keyword = self.df_full['body_text'].str.contains(pattern, na=False, regex=True)
        self.df_full = self.df_full[mentions_keyword].reset_index()

    def npi_slice(self):
        """Keep rows whose total keyword occurrences exceed ``occurances_minimum``.

        Occurrences are non-overlapping substring counts summed over all
        keywords. Non-string ``body_text`` values count as zero occurrences.
        """
        def get_count(text):
            if not isinstance(text, str):
                return 0
            return sum(text.count(keyword) for keyword in self.keywords)

        counts = self.df_full['body_text'].map(get_count)
        # .copy() makes the result an independent frame so the later in-place
        # column assignments do not trigger SettingWithCopy warnings.
        self.df_full = self.df_full[counts > self.occurances_minimum].copy()

    def remove_stopwords(self, columns):
        """Remove NLTK English stop words from each of ``columns`` in place.

        Values are cast to ``str`` first, split on whitespace, filtered
        (case-sensitively) and re-joined with single spaces.
        """
        # A set gives constant-time membership tests for every word checked.
        stop = set(stopwords.words('english'))
        for col in columns:
            self.df_full[col] = self.df_full[col].astype(str).apply(
                lambda text: ' '.join(word for word in text.split() if word not in stop))

    def to_tfidf(self, columns):
        """Add a ``<col>_tfidf`` column for each of ``columns``.

        A separate ``TfidfVectorizer`` is fitted per column; each row stores
        its dense TF-IDF vector as a NumPy array.
        """
        for col in columns:
            tfidfv = TfidfVectorizer()
            self.df_full[col + '_tfidf'] = list(tfidfv.fit_transform(self.df_full[col]).toarray())

    def remove_punc(self, columns):
        """Delete every character that is not an ASCII letter or whitespace.

        Operates in place on each of ``columns``.
        """
        for col in columns:
            # regex=True is explicit because recent pandas treats the pattern as a
            # literal string by default, which would make this call a no-op.
            self.df_full[col] = self.df_full[col].str.replace(r'[^a-zA-Z\s]+', '', regex=True)
        
def display_wordcloud(text):
    """Show a word cloud built from ``text`` in a new matplotlib figure.

    The cloud uses at most 100 words, a maximum font size of 50 and a white
    background; axes are hidden.
    """
    cloud_builder = WordCloud(max_font_size=50, max_words=100, background_color='white')
    cloud_image = cloud_builder.generate(text)
    plt.figure()
    plt.imshow(cloud_image, interpolation='bilinear')
    plt.axis("off")
    plt.show()



In [6]:
def pca_apply(df, columns, n_comp, random_state=None):
    """Return a copy of ``df`` with a ``<col>_pca`` column for each of ``columns``.

    Each source column must hold one equal-length vector per row. The vectors
    are stacked into a matrix, reduced with a PCA fitted for that column only,
    and written back one reduced vector per row.

    Args:
        df: input DataFrame; it is not modified.
        columns: names of the vector-valued columns to reduce.
        n_comp: number of principal components to keep.
        random_state: optional seed forwarded to ``PCA`` so results are
            reproducible when a randomized solver is used. The default
            ``None`` keeps the previous unseeded behaviour.
    """
    new_df = df.copy()
    for col in columns:
        pca = PCA(n_components=n_comp, random_state=random_state)
        features = np.stack(df[col].to_numpy())
        new_df[col + '_pca'] = list(pca.fit_transform(features))
    return new_df

def apply_scaler(df, columns):
    """Return a copy of ``df`` with a ``<col>_scaled`` column for each of ``columns``.

    The per-row vectors of each column are stacked into a matrix and passed
    through a ``StandardScaler`` fitted for that column only; ``df`` is left
    untouched.
    """
    result = df.copy()
    for name in columns:
        matrix = np.stack(df[name].to_numpy())
        standardised = StandardScaler().fit_transform(matrix)
        result[name + '_scaled'] = list(standardised)
    return result

def cluster(df, columns, clust_nums, random_state=None):
    """Return a copy of ``df`` with a ``<col>_clusterID`` column for each of ``columns``.

    Each source column must hold one equal-length vector per row. A separate
    K-means model is fitted per column and its label for every row is stored.

    Args:
        df: input DataFrame; it is not modified.
        columns: names of the vector-valued columns to cluster.
        clust_nums: number of clusters for K-means.
        random_state: optional seed forwarded to ``KMeans``. Without one,
            cluster labels and memberships can differ between runs. The
            default ``None`` keeps the previous unseeded behaviour.
    """
    new_df = df.copy()
    for col in columns:
        kmeans = KMeans(n_clusters=clust_nums, random_state=random_state)
        features = np.stack(df[col].to_numpy())
        new_df[col + "_clusterID"] = list(kmeans.fit_predict(features))
    return new_df


In [7]:
# Download the corpus and apply both keyword filters; the constructor prints the
# frame's shape after the load and after each filter.
prepr = Preprocesser()

(3851, 8)
(3851, 9)
(1157, 9)


In [8]:
# Strip everything except ASCII letters and whitespace from both text columns (in place).
prepr.remove_punc(['body_text','abstract'])

In [9]:
# Drop NLTK English stop words from both text columns (in place).
prepr.remove_stopwords(['body_text', 'abstract'])

In [10]:
# Add 'body_text_tfidf' and 'abstract_tfidf' columns, each from its own fitted vectorizer.
prepr.to_tfidf(['body_text', 'abstract'])

In [11]:
# Reduce each TF-IDF column to 50 principal components ('<col>_pca').
pca_df = pca_apply(prepr.df_full, ['abstract_tfidf','body_text_tfidf'], 50)

In [12]:
# Standardise the PCA features ('<col>_scaled').
scaled_df = apply_scaler(pca_df,['abstract_tfidf_pca','body_text_tfidf_pca'])

In [13]:
# Run K-means with 10 clusters separately on the abstract and body-text features.
clustered_df = cluster(scaled_df, ['abstract_tfidf_pca_scaled', 'body_text_tfidf_pca_scaled'], 10)

In [14]:
# Number of documents per body-text cluster.
clustered_df.body_text_tfidf_pca_scaled_clusterID.value_counts()

1    760
5    147
0    54 
9    44 
8    40 
4    37 
3    30 
7    19 
2    14 
6    12 
Name: body_text_tfidf_pca_scaled_clusterID, dtype: int64

In [15]:
# Titles of the documents in body-text cluster 6.
# NOTE(review): no seed is fixed for the clustering call above, so which documents
# receive label 6 can change between runs.
clustered_df[clustered_df.body_text_tfidf_pca_scaled_clusterID == 6][['title']]

Unnamed: 0,title
445,"lack of cross-protection against mycoplasma haemofelis infection and signs of enhancement in ""candidatus mycoplasma turicensis""-recovered cats"
518,isolation and identification of feline herpesvirus type 1 from a south china tiger in china
623,environmental contamination and hygienic measures after feline calicivirus field strain infections of cats in a research facility
870,feline calicivirus and other respiratory pathogens in cats with feline calicivirus- related symptoms and in clinically healthy cats in switzerland
1415,co-infection with feline retrovirus is related to changes in immunological parameters of cats with sporotrichosis
2022,infectious disease prevalence and factors associated with upper respiratory infection in cats following relocation
2173,stimulation with a class a cpg oligonucleotide enhances resistance to infection with feline viruses from five different families
2211,reversal of the progression of fatal coronavirus infection in cats by a broad- spectrum coronavirus protease inhibitor
2706,retroviral dna-the silent winner: blood transfusion containing latent feline leukemia provirus causes infection and disease in naïve recipient cats
2803,highly suspected cases of salmonellosis in two cats fed with a commercial raw meat- based diet: health risks to animals and zoonotic implications


In [16]:
# Number of documents left after both keyword filters.
len(prepr.df_full)

1157

In [86]:
## authors do not seem to be repeating within their clusters
# print(len(meta_arr[4]['cluster']))
# meta_arr[4]['cluster'].author_list.value_counts()

In [322]:
import nltk
nltk.download('punkt')
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, word_tokenize
from collections import Counter
import re

[nltk_data] Downloading package punkt to /Users/bvs002/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [335]:
## Explore Clusters
class Explore_Cluster:
    """Summarise each body-text cluster by its most distinctive title words.

    The frame is split by the ``body_text_tfidf_pca_scaled_clusterID`` column.
    The ``top_limit`` most frequent title tokens across all clusters are
    treated as uninformative; for every cluster the five most frequent
    remaining title tokens are stored under ``meta_arr[i]['keywords']``.

    Attributes:
        clustered_df: the frame passed to the constructor.
        cluster_count: number of distinct cluster labels.
        clusters: one sub-frame per cluster label, in ascending label order.
        meta_arr: one dict per cluster with ``size``, ``cluster`` and ``keywords``.
        top_limit: how many globally frequent title tokens are excluded.
        top_title_tokens: the excluded tokens, most frequent first.
    """

    def __init__(self, clusetered_df):
        # The parameter keeps its original (misspelled) name so existing keyword
        # callers still work. It is now actually used; previously the body read
        # a global named `clustered_df` and ignored the argument.
        clustered_df = clusetered_df
        labels = clustered_df['body_text_tfidf_pca_scaled_clusterID']
        cluster_ids = sorted(labels.dropna().unique())

        self.clustered_df = clustered_df
        self.cluster_count = len(cluster_ids)
        # Boolean masks replace a full iterrows() scan per cluster.
        self.clusters = [clustered_df[labels == cluster_id] for cluster_id in cluster_ids]
        self.meta_arr = [{'size': len(cluster_df), 'cluster': cluster_df} for cluster_df in self.clusters]
        self.top_limit = 30
        self.top_title_tokens = self.get_top_title_tokens()

        common_tokens = set(self.top_title_tokens)
        for meta in self.meta_arr:
            distinctive = [word for word in self.tokenize_and_clean(meta['cluster'].title)
                           if word not in common_tokens]
            meta['keywords'] = [word for word, _ in Counter(distinctive).most_common(5)]

    def get_top_title_tokens(self):
        """Return the ``top_limit`` most frequent title tokens over all clusters."""
        all_title_tokens = []
        for meta in self.meta_arr:
            all_title_tokens.extend(self.tokenize_and_clean(meta['cluster'].title))
        return [word for word, _ in Counter(all_title_tokens).most_common(self.top_limit)]

    def tokenize_and_clean(self, titles):
        """Tokenise ``titles`` and return the flat list of retained tokens.

        The characters ``/ | : & # - .`` are deleted before tokenising. English
        stop words, purely numeric tokens and a few punctuation tokens are then
        dropped. Non-string titles (e.g. missing values) are skipped.
        """
        # Built once per call instead of once per token.
        stop_words = set(stopwords.words('english'))
        dropped_tokens = {',', '(', ')', '"', ':', '``', '.', '?'}
        title_tokens = []
        for title in titles:
            if not isinstance(title, str):
                continue
            # A character class removes each symbol on its own. The previous
            # alternation contained `\|:`, which only matched the two-character
            # sequence "|:" and so left lone '|' and ':' in place.
            title = re.sub(r'[/|:&#\-.]', '', title)
            title_tokens.extend(
                word for word in word_tokenize(title)
                if word not in stop_words
                and not word.isnumeric()
                and word not in dropped_tokens
            )
        return title_tokens
        
            

In [336]:
# Build the per-cluster keyword summaries for the body-text clustering.
ec = Explore_Cluster(clustered_df)

In [337]:
# Print each cluster's keyword list, followed by a separator line.
for meta in ec.meta_arr:
    print(meta['keywords'])
    print('-------------------------')

['2019ncov', 'cases', 'title', 'based', 'interval']
-------------------------
['middle', 'east', 'detection', 'patients', 'acute']
-------------------------
['porcine', 'diarrhea', 'isolation', 'swine', 'spraydried']
-------------------------
['antibody', 'antibodies', 'vaccines', 'cell', 'vaccine']
-------------------------
['avian', 'global', 'h7n9', 'h1n1', 'markets']
-------------------------
['model', 'west', 'africa', 'outbreaks', 'preparedness']
-------------------------
['cats', 'feline', 'calicivirus', 'mycoplasma', 'isolation']
-------------------------
['bronchitis', 'avian', 'detection', 'recombinant', 'isolated']
-------------------------
['bats', 'zoonotic', 'pathogens', 'bat', 'potential']
-------------------------
['data', 'estimation', '2019ncov', 'clinical', 'medicine']
-------------------------


In [319]:
# Title tokens excluded from the per-cluster keywords because they are the most frequent overall.
# NOTE(review): the recorded output has 20 tokens and a hyphenated 'covid-19', while the class
# above uses top_limit = 30 and strips '-'; this cell's execution count (319) also precedes the
# class cell (335). The output appears stale — re-run after Restart & Run All.
print(ec.top_title_tokens)

['health', 'virus', 'disease', 'infectious', 'influenza', 'respiratory', 'infection', 'coronavirus', 'transmission', 'public', 'pandemic', 'outbreak', 'china', 'diseases', 'novel', 'epidemic', 'ebola', 'covid-19', 'sars', 'analysis']
