In [50]:
%load_ext dotenv
%dotenv

The dotenv extension is already loaded. To reload it, use:
  %reload_ext dotenv


In [104]:
# get raw data from GCP bucket
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from PIL import Image
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator

import matplotlib.pyplot as plt
%matplotlib inline

import os
import glob
import json
from datetime import datetime

from google.cloud import storage


In [111]:
# The bucket name for the location of the data is in the .env file
BUCKET_NAME = os.environ['BUCKET_NAME']
JSON_FILE = 'all_json_new.pkl'

In [130]:
class Preprocesser:
    def __init__(self):
        storage_client = storage.Client()
        temp = tempfile.TemporaryFile()
        storage_client.download_blob_to_file(f"gs://{BUCKET_NAME}/{JSON_FILE}", temp)
        self.df_full = pd.read_pickle(temp, compression=None)
        print(self.df_full.shape)
        self.key_slice()
        print(self.df_full.shape)
    
    def key_slice(self):
        keywords = ['incident command system',
            'emergency operations',
            'joint information center',
            'social distancing',
            'childcare closers',
            'travel advisory',
            'travel warning',
            'isolation',
            'qarentine',
            'mass gathering cancellations',
            'school closures',
            'facility closures',
            'facility closures',
            'evacuation',
            'relocation',
            'restricing travel',
            'travel ban',
            'patient cohort',
            'npi']
        self.df_full = self.df_full[self.df_full['body_text'].str.contains('|'.join(keywords), na=False, regex=True)].reset_index()
    
    def remove_stopwords(self,columns):
        stop = stopwords.words('english')
        for col in columns:
            self.df_full[col] = self.df_full[col].astype(str).apply(lambda x: ' '.join([word for word in x.split() if word not in (stop)]))

    def to_tfidf(self,columns):
        for col in columns:
            tfidfv = TfidfVectorizer()
            self.df_full[col + '_tfidf'] = tfidfv.fit_transform(self.df_full[col])
            
    def remove_punc(self, columns):
        for col in columns:
            self.df_full[col] = self.df_full[col].str.replace('[^\w\s]','')
        
def display_wordcloud(text):
    wordcloud = WordCloud(max_font_size=50, max_words=100, background_color='white').generate(text)
    plt.figure()
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.axis("off")
    plt.show()

In [131]:
prepr = Preprocesser()

(3851, 8)
(3851, 9)


In [125]:
# prepr.df_full

In [123]:
prepr.remove_stopwords(['abstract'])

In [126]:
prepr.remove_punc(['abstract'])

In [127]:
prepr.df_full

Unnamed: 0,author_list,body_text,paper_id,title,sha,abstract,publish_time,doi
0,"[Shengjie Lai, Isaac I Bogoch, Nick W Ruktanon...","in december 2019, a cluster of patients with p...",013d9d1cba8a54d5d3718c229b812d7cf91b6c89,assessing spread risk of wuhan novel coronavir...,,,,
1,"[Giancarlo De Luca, Kim Van Kerckhove, Pietro ...",they affect seasonal influenza dynamics. the m...,02201e4601ab0eb70b6c26480cf2bfeae2625193,the impact of regular school closure on season...,,,,
2,"[Evans K Lodge, Annakate M Schatz, John M Drake]","estimation is complicated, however, by efforts...",029c1c588047f1d612a219ee15494d2d19ff7439,protective population behavior change in outbr...,029c1c588047f1d612a219ee15494d2d19ff7439,outbreaks emerging infections lack effective d...,2020-01-28,doi.org/10.1101/2020.01.27.921536
3,"[Ru Liu, Xiaoyan Ming, Ou Xu, Jianli Zhou, Hui...","as clinic experiences increases, clinicians be...",0562f70516579d557cd1486000bb7aac5ccec2a1,association of cardiovascular manifestations w...,,,,
4,"[Leon Danon, Ellen Brooks-Pollock, Mick Bailey...","an outbreak of a novel coronavirus, recently r...",05d99c07db59b6948e39bfa62c2cbbf62944059a,a spatial model of covid-19 transmission in en...,,,,
5,"[Laurent Hébert-Dufresne, Benjamin M Althouse,...",the basic reproductive number -r 0 -is one of ...,06c1b3535b83251cf92c01258b5048beeab7a460,beyond r 0 : the importance of contact tracing...,,,,
6,"[Taylor Chin, Caroline O Buckee, Ayesha S Mahmud]",3 . cc-by-nc-nd 4.0 international license it i...,090b6c8b3df30bc248221869f673a2d970caa1b9,quantifying the success of measles vaccination...,,,,
7,"[Sarah Krieg, Fabian Pott, Laura Eckei, Maud V...",. cc-by-nc-nd 4.0 international license author...,09c9fcabc66a106e01ef42247cbd86b6d85bd67f,mono-adp-ribosylation by artd10 restricts chik...,,,,
8,"[Lea Gaucherand, Brittany K Porter, Summer K S...",many viruses globally shut off host gene expre...,09ec8daa8e32168d92d05b86de1784c639685fb4,the influenza a virus endoribonuclease pa-x us...,,,,
9,"[Shenglan Shang, Jiaqi Wu, Xiaoli Li, Xin Liu,...",sepsis is a leading cause of death worldwide (...,0a32446730827ad8152c6a61e4738e4e0b231412,artesunate interacts with vitamin d receptor t...,,,,
