In [1]:
%load_ext dotenv
%dotenv

In [2]:
# get raw data from GCP bucket
import sys
import numpy as np
np.set_printoptions(threshold=1000)

import pandas as pd
from nltk.corpus import stopwords

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans

from PIL import Image
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator

import matplotlib.pyplot as plt
%matplotlib inline

import os
import glob
import json
import tempfile
from datetime import datetime

from google.cloud import storage


In [3]:
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 50)
pd.set_option('display.width', 100)
pd.set_option('display.max_colwidth', -1)

In [4]:
# The bucket name for the location of the data is in the .env file
BUCKET_NAME = os.environ['BUCKET_NAME']
JSON_FILE = 'all_json_new.pkl'

In [5]:
class Preprocesser:
    def __init__(self):
        self.keywords = ['incident command system',
            'emergency operations',
            'joint information center',
            'social distancing',
            'childcare closers',
            'travel advisory',
            'travel warning',
            'isolation',
            'quarantine',
            'mass gathering cancellations',
            'school closures',
            'facility closures',
            'evacuation',
            'relocation',
            'restricing travel',
            'travel ban',
            'patient cohort',
            'npi']
        self.occurances_minimum = 4
        storage_client = storage.Client()
        temp = tempfile.TemporaryFile()
        storage_client.download_blob_to_file(f"gs://{BUCKET_NAME}/{JSON_FILE}", temp)
        self.df_full = pd.read_pickle(temp, compression=None)
        print(self.df_full.shape)
        self.key_slice()
        print(self.df_full.shape)
        self.npi_slice()
        print(self.df_full.shape)
    
    def key_slice(self):
        self.df_full = self.df_full[self.df_full['body_text'].str.contains('|'.join(self.keywords), na=False, regex=True)].reset_index()
        
    def npi_slice(self):
        def get_count (row):
            return sum([row['body_text'].count(keyword) for keyword in self.keywords])
        self.df_full = self.df_full[self.df_full.apply(get_count, axis=1) > self.occurances_minimum]
        
    def remove_stopwords(self,columns):
        stop = stopwords.words('english')
        for col in columns:
            self.df_full[col] = self.df_full[col].astype(str).apply(lambda x: ' '.join([word for word in x.split() if word not in (stop)]))

    def to_tfidf(self,columns):
        for col in columns:
            tfidfv = TfidfVectorizer()
            self.df_full[col + '_tfidf'] = list(tfidfv.fit_transform(self.df_full[col]).toarray())
            
    def remove_punc(self, columns):
        for col in columns:
            self.df_full[col] = self.df_full[col].str.replace('[^a-zA-Z\s]+','')
        
def display_wordcloud(text):
    wordcloud = WordCloud(max_font_size=50, max_words=100, background_color='white').generate(text)
    plt.figure()
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.axis("off")
    plt.show()



In [6]:
def pca_apply(df, columns, n_comp):
    new_df = df.copy()
    for col in columns:
        pca = PCA(n_components=n_comp)
        new_df[col+'_pca'] = list(pca.fit_transform(np.stack(df[col].to_numpy())))
    return new_df

def apply_scaler(df, columns):
    new_df = df.copy()
    for col in columns:
        scaler = StandardScaler()
        new_df[col + '_scaled'] = list(scaler.fit_transform(np.stack(df[col].to_numpy())))
    return new_df

def cluster(df, columns, clust_nums):
    new_df = df.copy()
    for col in columns:
        kmeans = KMeans(n_clusters = clust_nums)
        new_df[col + "_clusterID"] = list(kmeans.fit_predict(np.stack(df[col].to_numpy())))
    return new_df


In [7]:
prepr = Preprocesser()

(3851, 8)
(3851, 9)
(697, 9)


In [8]:
prepr.remove_punc(['body_text','abstract'])

In [9]:
prepr.remove_stopwords(['body_text', 'abstract'])

In [10]:
prepr.to_tfidf(['body_text', 'abstract'])

In [11]:
pca_df = pca_apply(prepr.df_full, ['abstract_tfidf','body_text_tfidf'], 50)

In [12]:
scaled_df = apply_scaler(pca_df,['abstract_tfidf_pca','body_text_tfidf_pca'])

In [13]:
clustered_df = cluster(scaled_df, ['abstract_tfidf_pca_scaled', 'body_text_tfidf_pca_scaled'], 5)

In [14]:
clustered_df.body_text_tfidf_pca_scaled_clusterID.value_counts()

3    375
2    273
0    19 
1    18 
4    12 
Name: body_text_tfidf_pca_scaled_clusterID, dtype: int64

In [15]:
clustered_df[clustered_df.body_text_tfidf_pca_scaled_clusterID == 2][['title']]

Unnamed: 0,title
0,"assessing spread risk of wuhan novel coronavirus within and beyond china, january-april 2020: a travel network-based modelling study"
33,spatially explicit modeling of 2019-ncov epidemic trend based on mobile phone data in mainland china
90,contacts in context: large-scale setting-specific social mixing matrices from the bbc pandemic project
124,virological assessment of hospitalized cases of coronavirus disease 2019 *equal contribution **senior authors with equal contribution
136,"estimating number of global importations of covid-19 from wuhan, risk of transmission outside mainland china and covid-19 introduction index between countries outside mainland china"
167,evaluation of the clinical characteristics of suspected or confirmed cases of covid-19 during home care with isolation: a new retrospective analysis based on o2o
169,incubation period and other epidemiological characteristics of 2019 novel coronavirus infections with right truncation: a statistical analysis of publicly available case data
186,comparative genomics reveals signature regions used to develop a robust and sensitive multiplex taqman real-time qpcr assay to detect the genus dickeya and dickeya dianthicola
193,"public exposure to live animals, behavioural change, and support in containment measures in response to covid-19 outbreak: a population-based cross sectional survey in china"
198,the impact of social distancing and epicenter lockdown on the covid-19 epidemic in mainland china: a data-driven seiqr model study


In [16]:
for i in range(15):
    print(prepr.df_full.iloc[i].total_keywords_present)

AttributeError: 'Series' object has no attribute 'total_keywords_present'

In [None]:
len(prepr.df_full)