In [190]:
# Load the dotenv IPython extension and read variables from the local .env file
# into the environment (BUCKET_NAME is read from os.environ further down).
%load_ext dotenv
%dotenv

The dotenv extension is already loaded. To reload it, use:
  %reload_ext dotenv


In [219]:
# Imports: data handling, text vectorisation/clustering, plotting, and GCP storage access
import sys
import numpy as np
np.set_printoptions(threshold=1000)

import pandas as pd
from nltk.corpus import stopwords

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans

from PIL import Image
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator

import matplotlib.pyplot as plt
%matplotlib inline

import os
import glob
import json
import tempfile
from datetime import datetime

from google.cloud import storage


In [192]:
# The bucket name for the location of the data is in the .env file.
# Indexing os.environ directly raises KeyError if BUCKET_NAME is not set.
BUCKET_NAME = os.environ['BUCKET_NAME']
# Object name, inside that bucket, of the pickled DataFrame loaded by Preprocesser.
JSON_FILE = 'all_json_new.pkl'

In [193]:
class Preprocesser:
    """Load the pickled article DataFrame from GCS and clean its text columns.

    On construction the object named by ``JSON_FILE`` is downloaded from the
    ``BUCKET_NAME`` bucket, unpickled into ``self.df_full`` and narrowed to
    the rows whose ``body_text`` mentions one of ``KEYWORDS``. The remaining
    methods overwrite or add columns on ``self.df_full`` in place.
    """

    # Phrases that mark an article as relevant. They are joined into a single
    # regex alternation and matched as plain substrings of ``body_text``.
    # NOTE(review): matching is case-sensitive and 'npi' also matches inside
    # longer words -- confirm that the corpus is lower-cased and that this
    # breadth is intended.
    KEYWORDS = (
        'incident command system',
        'emergency operations',
        'joint information center',
        'social distancing',
        'childcare closures',
        'travel advisory',
        'travel warning',
        'isolation',
        'quarantine',
        'mass gathering cancellations',
        'school closures',
        'facility closures',
        'evacuation',
        'relocation',
        'restricting travel',
        'travel ban',
        'patient cohort',
        'npi',
    )

    def __init__(self):
        """Download and unpickle the DataFrame, then apply the keyword filter.

        Prints the DataFrame shape before and after filtering.
        """
        storage_client = storage.Client()
        # The context manager closes the temporary file even when the download
        # or the unpickling raises.
        with tempfile.TemporaryFile() as temp:
            storage_client.download_blob_to_file(f"gs://{BUCKET_NAME}/{JSON_FILE}", temp)
            # Writing the blob advances the file position; rewind explicitly so
            # unpickling starts at byte 0 instead of relying on pandas to seek.
            temp.seek(0)
            # NOTE(review): unpickling can execute arbitrary code -- only point
            # this at buckets whose contents are trusted.
            self.df_full = pd.read_pickle(temp, compression=None)
        print(self.df_full.shape)
        self.key_slice()
        print(self.df_full.shape)

    def key_slice(self):
        """Keep only rows whose ``body_text`` contains any of ``KEYWORDS``.

        Rows with a missing ``body_text`` are dropped (``na=False``).
        """
        pattern = '|'.join(self.KEYWORDS)
        mask = self.df_full['body_text'].str.contains(pattern, na=False, regex=True)
        # reset_index() without drop=True keeps the previous index as a new
        # 'index' column, so the frame gains one column here.
        self.df_full = self.df_full[mask].reset_index()

    def remove_stopwords(self, columns):
        """Remove NLTK English stop words from each of the given text columns.

        Values are cast to ``str`` first, split on whitespace, filtered and
        re-joined with single spaces.
        """
        # A set gives constant-time membership tests for every word checked.
        stop = set(stopwords.words('english'))
        for col in columns:
            self.df_full[col] = self.df_full[col].astype(str).apply(
                lambda text: ' '.join(word for word in text.split() if word not in stop)
            )

    def to_tfidf(self, columns):
        """Add a ``<col>_tfidf`` column holding one dense TF-IDF vector per row.

        A separate ``TfidfVectorizer`` is fitted for every column.
        """
        for col in columns:
            tfidfv = TfidfVectorizer()
            # toarray() densifies the sparse matrix so that each row can be
            # stored as one array inside a single DataFrame cell.
            self.df_full[col + '_tfidf'] = list(tfidfv.fit_transform(self.df_full[col]).toarray())

    def remove_punc(self, columns):
        """Delete every run of characters that is not an ASCII letter or whitespace."""
        for col in columns:
            # regex=True is required: since pandas 2.0 str.replace treats the
            # pattern as a literal string by default and would remove nothing.
            self.df_full[col] = self.df_full[col].str.replace(r'[^a-zA-Z\s]+', '', regex=True)
        
def display_wordcloud(text):
    """Render a word cloud of ``text`` in a new matplotlib figure.

    The cloud uses a white background, at most 100 words and a maximum font
    size of 50; the axes are hidden before the figure is shown.
    """
    cloud_options = dict(max_font_size=50, max_words=100, background_color='white')
    cloud = WordCloud(**cloud_options).generate(text)

    plt.figure()
    plt.imshow(cloud, interpolation='bilinear')
    plt.axis("off")
    plt.show()



In [239]:
def pca_apply(df, columns, n_components=50, random_state=None):
    """Project vector-valued columns onto their leading principal components.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is copied, never modified.
    columns : iterable of str
        Columns whose cells each hold one feature vector
        (assumed to be equal-length 1-D arrays so they can be stacked).
    n_components : int, default 50
        Number of principal components to keep for every column.
    random_state : int or None, default None
        Forwarded to ``PCA``; pass an integer to make the result repeatable
        when a randomized solver is selected.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with one extra ``<col>_pca`` column per input column.
    """
    new_df = df.copy()
    for col in columns:
        # A separate PCA is fitted for each column.
        pca = PCA(n_components=n_components, random_state=random_state)
        features = np.stack(df[col].to_numpy())
        # list(...) splits the 2-D result back into one array per row.
        new_df[col + '_pca'] = list(pca.fit_transform(features))
    return new_df

def apply_scaler(df, columns):
    """Standardise vector-valued columns and store them as ``<col>_scaled``.

    ``df`` is copied rather than modified. For each column the per-row
    vectors are stacked into one array, passed through a freshly fitted
    ``StandardScaler`` and written back as one array per row.
    """
    scaled_df = df.copy()
    for name in columns:
        stacked = np.stack(df[name].to_numpy())
        standardised = StandardScaler().fit_transform(stacked)
        scaled_df[name + '_scaled'] = list(standardised)
    return scaled_df

def cluster(df, columns, clust_nums, random_state=None):
    """Assign a k-means cluster label to every row, once per feature column.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is copied, never modified.
    columns : iterable of str
        Columns whose cells each hold one feature vector
        (assumed to be equal-length 1-D arrays so they can be stacked).
    clust_nums : int
        Number of clusters passed to ``KMeans``.
    random_state : int or None, default None
        Forwarded to ``KMeans``. Without a fixed seed the centroid
        initialisation is random, so cluster IDs can differ between runs;
        pass an integer to make the labels reproducible.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with one extra ``<col>_clusterID`` column per input column.
    """
    new_df = df.copy()
    for col in columns:
        # A separate model is fitted for each column.
        kmeans = KMeans(n_clusters=clust_nums, random_state=random_state)
        features = np.stack(df[col].to_numpy())
        new_df[col + "_clusterID"] = list(kmeans.fit_predict(features))
    return new_df


In [195]:
# Download the pickled DataFrame and apply the keyword filter; the two shapes
# printed are the frame before and after that filter.
prepr = Preprocesser()

(3851, 8)
(3851, 9)


In [196]:
# Strip non-letter, non-whitespace characters from both text columns (in place).
prepr.remove_punc(['body_text','abstract'])

In [197]:
# Drop NLTK English stop words from both text columns (in place).
prepr.remove_stopwords(['body_text', 'abstract'])

In [198]:
# Add body_text_tfidf and abstract_tfidf columns, one TF-IDF vector per row.
prepr.to_tfidf(['body_text', 'abstract'])

In [241]:
# Reduce each TF-IDF column with PCA; results are stored in new <col>_pca columns.
pca_df = pca_apply(prepr.df_full, ['abstract_tfidf','body_text_tfidf'])

In [244]:
# Standardise the PCA features; results are stored in new <col>_scaled columns.
scaled_df = apply_scaler(pca_df,['abstract_tfidf_pca','body_text_tfidf_pca'])

In [245]:
# Run k-means with 50 clusters on each scaled column; labels are stored in
# new <col>_clusterID columns.
clustered_df = cluster(scaled_df, ['abstract_tfidf_pca_scaled', 'body_text_tfidf_pca_scaled'], 50)

In [247]:
# Number of articles in each body-text cluster.
clustered_df.body_text_tfidf_pca_scaled_clusterID.value_counts()

21    661
47    202
18    172
12    124
39    109
15    102
38    101
48     98
1      98
36     96
41     94
0      88
17     86
43     86
42     85
49     82
8      76
5      76
23     72
3      72
24     70
46     69
11     69
4      68
2      66
37     66
20     61
28     61
32     57
44     52
45     52
30     51
14     47
22     45
35     43
13     43
31     40
27     38
34     35
19     32
26     29
40     25
10     24
25     24
16     23
7      20
6      19
29     16
9      13
33     13
Name: body_text_tfidf_pca_scaled_clusterID, dtype: int64

In [253]:
# Titles of the articles assigned to body-text cluster 12.
clustered_df[clustered_df.body_text_tfidf_pca_scaled_clusterID == 12].title

219     revitalization of integrated disease surveilla...
241     correspondence to: communicable disease contro...
357       incorporating one health into medical education
405     uganda public health fellowship program's cont...
419     acquired immunity and asymptomatic reservoir i...
434     european monitoring systems and data for asses...
483     between securitisation and neglect: managing e...
539     evaluation of applied public health emergency ...
615     the past, present, and future of public health...
639                                                      
664     building international genomics collaboration ...
692     strengthening national disease surveillance an...
728     pandemic influenza preparedness and health sys...
731     comparing national infectious disease surveill...
766     what makes health systems resilient against in...
774                                                      
848     promoting public health legal preparedness for...
852     cyberc