In [1]:
%load_ext dotenv
%dotenv

In [2]:
# get raw data from GCP bucket
import sys
import numpy as np
np.set_printoptions(threshold=1000)

import pandas as pd
from nltk.corpus import stopwords

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans

from PIL import Image
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator

import matplotlib.pyplot as plt
%matplotlib inline

import os
import glob
import json
import tempfile
from datetime import datetime

from google.cloud import storage


In [28]:
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 50)
pd.set_option('display.width', 100)
pd.set_option('display.max_colwidth', -1)

In [3]:
# The bucket name for the location of the data is in the .env file
BUCKET_NAME = os.environ['BUCKET_NAME']
JSON_FILE = 'all_json_new.pkl'

In [4]:
class Preprocesser:
    def __init__(self):
        storage_client = storage.Client()
        temp = tempfile.TemporaryFile()
        storage_client.download_blob_to_file(f"gs://{BUCKET_NAME}/{JSON_FILE}", temp)
        self.df_full = pd.read_pickle(temp, compression=None)
        print(self.df_full.shape)
        self.key_slice()
        print(self.df_full.shape)
    
    def key_slice(self):
        keywords = ['incident command system',
            'emergency operations',
            'joint information center',
            'social distancing',
            'childcare closers',
            'travel advisory',
            'travel warning',
            'isolation',
            'quarentine',
            'mass gathering cancellations',
            'school closures',
            'facility closures',
            'evacuation',
            'relocation',
            'restricing travel',
            'travel ban',
            'patient cohort',
            'npi']
        self.df_full = self.df_full[self.df_full['body_text'].str.contains('|'.join(keywords), na=False, regex=True)].reset_index()
    
    def remove_stopwords(self,columns):
        stop = stopwords.words('english')
        for col in columns:
            self.df_full[col] = self.df_full[col].astype(str).apply(lambda x: ' '.join([word for word in x.split() if word not in (stop)]))

    def to_tfidf(self,columns):
        for col in columns:
            tfidfv = TfidfVectorizer()
            self.df_full[col + '_tfidf'] = list(tfidfv.fit_transform(self.df_full[col]).toarray())
            
    def remove_punc(self, columns):
        for col in columns:
            self.df_full[col] = self.df_full[col].str.replace('[^a-zA-Z\s]+','')
        
def display_wordcloud(text):
    wordcloud = WordCloud(max_font_size=50, max_words=100, background_color='white').generate(text)
    plt.figure()
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.axis("off")
    plt.show()



In [36]:
def pca_apply(df, columns, n_comp):
    new_df = df.copy()
    for col in columns:
        pca = PCA(n_components=n_comp)
        new_df[col+'_pca'] = list(pca.fit_transform(np.stack(df[col].to_numpy())))
    return new_df

def apply_scaler(df, columns):
    new_df = df.copy()
    for col in columns:
        scaler = StandardScaler()
        new_df[col + '_scaled'] = list(scaler.fit_transform(np.stack(df[col].to_numpy())))
    return new_df

def cluster(df, columns, clust_nums):
    new_df = df.copy()
    for col in columns:
        kmeans = KMeans(n_clusters = clust_nums)
        new_df[col + "_clusterID"] = list(kmeans.fit_predict(np.stack(df[col].to_numpy())))
    return new_df


In [6]:
prepr = Preprocesser()

(3851, 8)
(3851, 9)


In [7]:
prepr.remove_punc(['body_text','abstract'])

In [8]:
prepr.remove_stopwords(['body_text', 'abstract'])

In [9]:
prepr.to_tfidf(['body_text', 'abstract'])

In [10]:
pca_df = pca_apply(prepr.df_full, ['abstract_tfidf','body_text_tfidf'], 50)

In [11]:
scaled_df = apply_scaler(pca_df,['abstract_tfidf_pca','body_text_tfidf_pca'])

In [37]:
clustered_df = cluster(scaled_df, ['abstract_tfidf_pca_scaled', 'body_text_tfidf_pca_scaled'], 5)

In [38]:
clustered_df.body_text_tfidf_pca_scaled_clusterID.value_counts()

1    3464
0    306 
4    30  
3    27  
2    24  
Name: body_text_tfidf_pca_scaled_clusterID, dtype: int64

In [41]:
clustered_df[clustered_df.body_text_tfidf_pca_scaled_clusterID == 2][['title']]

Unnamed: 0,title
744,clinical and molecular investigation of a canine distemper outbreak and vector-borne infections in a group of rescue dogs imported from hungary to switzerland
780,a monoclonal antibody-based copro-elisa kit for canine echinococcosis to support the paho effort for hydatid disease control in south america
862,the serological and virological investigation of canine adenovirus infection on the dogs
927,performance of lbsap vaccine after intradermal challenge with l. infantum and saliva of lu. longipalpis: immunogenicity and parasitological evaluation
1021,experimental infection of dogs with a feline endogenous retrovirus rd-114
1049,demographics and economic burden of un-owned cats and dogs in the uk: results of a 2010 census stavisky et al. demographics and economic burden of un-owned cats and dogs in the uk: results of a 2010 census
1674,"detection and genetic characterization of canine astroviruses in pet dogs in guangxi, china"
1811,"genotyping and pathobiologic characterization of canine parvovirus circulating in nanjing, china"
2637,preliminary studies on isolates of clostridium difficile from dogs and exotic pets
2836,the fecal microbiota and unconjugated fecal bile acids in dogs with diabetes mellitus escg-o-2 impact of antibiotic administration on fecal bacterial groups potentially associated with dysbiosis in kittens escg-o-3 fecal microbial metabolism is altered in dogs with chronic enteropathy escg-o-4 the pug breed demonstrates a worse response to treatment of protein-losing enteropathy than other breeds of dog escg-o-6 dogs with acute haemorrhagic diarrhoea syndrome not receiving antibiotics have a good prognosis despite initial high ahds-score and systemic inflammation escg-o-7 faecal bile acid profiles in dogs with acute haemorrhagic diarrhoea syndrome over time and compared to healthy dogs escg-o-8 long-term consequences of acute hemorrhagic diarrhea syndrome in dogs esvc-o-1 acute effect of oral pimobendan on left atrial function and mitral valve regurgitation severity in dogs with stage b2 myxomatous mitral valve disease -a pilot study esvc-o-2 retrospective evaluation of the safety and tolerability of pimobendan in cats with obstructive versus nonobstructive hypertrophic cardiomyopathy
