In [1]:
%load_ext dotenv
%dotenv

In [2]:
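# Imports and configuration. NOTE: the data is read from a local pickle at the end of this cell; nothing in this cell downloads from the GCP bucket.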
import sys
import numpy as np
np.set_printoptions(threshold=1000)

import pandas as pd
from nltk.corpus import stopwords

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans

from PIL import Image
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator

import matplotlib.pyplot as plt
%matplotlib inline

import os
import glob
import json
import tempfile
from datetime import datetime

from google.cloud import storage

# pd.set_option('display.max_rows', 500)
# pd.set_option('display.max_columns', 50)
# pd.set_option('display.width', 100)
# pd.set_option('display.max_colwidth', -1)

# The bucket name for the location of the data is in the .env file
# (presumably loaded into the environment by the %dotenv magic above).
# A missing variable raises KeyError here.
BUCKET_NAME = os.environ['BUCKET_NAME']
# CSV of labels; Preprocesser reads it and joins it onto the papers by title.
DATA_FILE = 'labeled_npi.csv'
# Papers frame loaded from a pickle in the working directory.
# NOTE(review): unpickling can execute arbitrary code embedded in the file,
# so only load a data.pkl that comes from a trusted source.
df_pkl = pd.read_pickle("data.pkl")

In [3]:
class Preprocesser:
    """Filter a papers DataFrame by NPI keywords and prepare its text columns.

    On construction the input frame is copied, reduced to rows whose
    ``abstract`` mentions at least one keyword, inner-joined on ``title`` with
    the labels read from ``DATA_FILE``, and restricted to rows with a non-null
    ``isNPI`` value. The working frame is ``self.df_full``; the cleaning and
    vectorising methods below modify it in place.
    """

    def __init__(self, df):
        """Build the keyword-filtered, labelled working frame.

        Args:
            df: DataFrame with at least ``abstract`` and ``title`` columns.
                It is copied, so the caller's frame is left untouched.

        Side effects:
            Reads ``DATA_FILE`` with ``pd.read_csv`` and prints the shape of
            the working frame before and after each keyword filter.
        """
        self.df_labels = pd.read_csv(DATA_FILE)
        # Phrases used both for the contains-filter and for occurrence counting.
        # NOTE(review): 'childcare closers' looks like a typo for
        # 'childcare closures' — left unchanged because editing it changes
        # which rows match; confirm and fix deliberately.
        self.keywords = ['incident command system',
                         'emergency operations',
                         'joint information center',
                         'social distancing',
                         'childcare closers',
                         'travel advisory',
                         'travel warning',
                         'isolation',
                         'quarantine',
                         'mass gathering cancellations',
                         'school closures',
                         'facility closures',
                         'evacuation',
                         'relocation',
                         'restricting travel',
                         'travel ban',
                         'patient cohort',
                         'npi']
        # Minimum total number of keyword occurrences an abstract must have.
        self.occurances_minimum = 1
        self.df_full = df.copy()
        print(self.df_full.shape)
        self.key_slice()
        print(self.df_full.shape)
        self.npi_slice()
        print(self.df_full.shape)
        # Attach the labels and keep only rows that actually have one.
        self.df_full = self.df_full.merge(self.df_labels, on="title", how="inner")
        self.df_full = self.df_full.loc[self.df_full.isNPI.notna()]

    def key_slice(self):
        """Keep only rows whose ``abstract`` contains at least one keyword.

        The keywords are joined into a single regex alternation. Matching is
        case-sensitive, rows with a missing abstract are dropped (``na=False``)
        and the index is reset.
        """
        pattern = '|'.join(self.keywords)
        mask = self.df_full['abstract'].str.contains(pattern, na=False, regex=True)
        self.df_full = self.df_full[mask].reset_index(drop=True)

    def npi_slice(self):
        """Keep rows whose abstract has at least ``occurances_minimum`` keyword hits.

        Occurrences are counted as literal, case-sensitive substrings and
        summed over all keywords.
        """
        def get_count(text):
            # A missing abstract (NaN/None) has no occurrences; counting it as
            # zero avoids an AttributeError when this method is used on its own.
            if not isinstance(text, str):
                return 0
            return sum(text.count(keyword) for keyword in self.keywords)

        # Map over the single column instead of building a Series per row.
        counts = self.df_full['abstract'].map(get_count)
        self.df_full = self.df_full[counts >= self.occurances_minimum]

    def remove_stopwords(self, columns):
        """Drop NLTK English stop words from each of ``columns`` in place.

        Values are cast to ``str`` first, split on whitespace and re-joined
        with single spaces. The comparison is case-sensitive.
        """
        # A set gives constant-time membership tests for every word checked.
        stop = set(stopwords.words('english'))
        for col in columns:
            self.df_full[col] = self.df_full[col].astype(str).apply(
                lambda text: ' '.join(word for word in text.split() if word not in stop))

    def to_tfidf(self, columns):
        """Add a ``<col>_tfidf`` column holding one dense TF-IDF vector per row.

        A fresh ``TfidfVectorizer`` is fit on each column independently.
        """
        for col in columns:
            tfidfv = TfidfVectorizer()
            # toarray() densifies the matrix so each row can be stored as an array.
            self.df_full[col + '_tfidf'] = list(tfidfv.fit_transform(self.df_full[col]).toarray())

    def remove_punc(self, columns):
        """Strip every character that is not an ASCII letter or whitespace.

        This removes digits as well as punctuation. ``regex=True`` is passed
        explicitly: without it, pandas versions whose default is literal
        matching would look for the pattern text itself and change nothing.
        """
        for col in columns:
            self.df_full[col] = self.df_full[col].str.replace(r'[^a-zA-Z\s]+', '', regex=True)
        
def display_wordcloud(text):
    """Render a word cloud built from ``text`` in a new matplotlib figure.

    The cloud uses a white background, at most 100 words and a maximum font
    size of 50; the axes are hidden and the figure is shown immediately.
    """
    cloud = WordCloud(background_color='white', max_words=100, max_font_size=50)
    rendered = cloud.generate(text)
    plt.figure()
    plt.imshow(rendered, interpolation='bilinear')
    plt.axis("off")
    plt.show()



In [4]:
def pca_apply(df, columns, n_comp, random_state=None):
    """Reduce array-valued columns to ``n_comp`` principal components each.

    Args:
        df: DataFrame whose ``columns`` hold one array per row; the arrays in
            a column must share a shape so they can be stacked into a matrix.
        columns: Names of the columns to reduce. A separate PCA is fit for
            each column.
        n_comp: Forwarded to ``PCA(n_components=...)``.
        random_state: Optional seed forwarded to ``PCA``. Passing an int makes
            the result repeatable when scikit-learn selects a randomized SVD
            solver; the default ``None`` keeps the previous behaviour.

    Returns:
        A copy of ``df`` with one added ``<col>_pca`` column per input column.
        ``df`` itself is not modified.
    """
    new_df = df.copy()
    for col in columns:
        pca = PCA(n_components=n_comp, random_state=random_state)
        # Stack the per-row arrays into a 2-D (rows x features) matrix.
        features = np.stack(df[col].to_numpy())
        # list(...) splits the result back into one array per row.
        new_df[col + '_pca'] = list(pca.fit_transform(features))
    return new_df

def apply_scaler(df, columns):
    """Standardise array-valued columns and store them as ``<col>_scaled``.

    For every name in ``columns`` the per-row arrays are stacked into a
    matrix, passed through a freshly fitted ``StandardScaler`` and written
    back row by row. Returns a copy; ``df`` is not modified.
    """
    scaled = df.copy()
    for name in columns:
        matrix = np.stack(df[name].to_numpy())
        standardized = StandardScaler().fit_transform(matrix)
        scaled[name + '_scaled'] = list(standardized)
    return scaled


In [5]:
# Build the keyword-filtered, labelled frame. The constructor prints the
# frame's shape before and after each keyword filter; in the recorded run all
# three shapes are identical, i.e. neither filter dropped a row.
data_obj = Preprocesser(df_pkl)

(1612, 6)
(1612, 6)
(1612, 6)


In [6]:
# Class balance of the isNPI label among the rows kept by Preprocesser.
data_obj.df_full.isNPI.value_counts()

0.0    420
1.0    187
Name: isNPI, dtype: int64

In [7]:
# Feature pipeline. The three Preprocesser calls modify data_obj.df_full in
# place (clean both text columns, then add a <col>_tfidf column for each);
# pca_apply and apply_scaler each return a new frame.
data_obj.remove_punc(['body_text','abstract'])
data_obj.remove_stopwords(['body_text', 'abstract'])
data_obj.to_tfidf(['body_text', 'abstract'])
# Reduce each TF-IDF column to 50 principal components, then standardise them.
pca_df = pca_apply(data_obj.df_full, ['abstract_tfidf','body_text_tfidf'], 50)
scaled_df = apply_scaler(pca_df,['abstract_tfidf_pca','body_text_tfidf_pca'])

In [8]:
# List the columns available after the feature pipeline.
scaled_df.columns

Index(['paper_id', 'title', 'author_list', 'abstract', 'body_text', 'doi',
       'Unnamed: 0', 'Unnamed: 0.1', 'isNPI', 'body_text_tfidf',
       'abstract_tfidf', 'abstract_tfidf_pca', 'body_text_tfidf_pca',
       'abstract_tfidf_pca_scaled', 'body_text_tfidf_pca_scaled'],
      dtype='object')

In [27]:
import xgboost as xgb
from sklearn.model_selection import train_test_split

In [28]:
# Features: the standardised PCA components of the body-text TF-IDF, stacked
# into a 2-D matrix. Target: the isNPI label.
X = np.stack(scaled_df["body_text_tfidf_pca_scaled"].to_numpy())
y = scaled_df["isNPI"]
# 80/20 split, stratified on the label, with a fixed seed.
# NOTE(review): TF-IDF, PCA and the scaler were all fit on the full frame
# before this split, so the held-out rows influenced the feature transforms —
# confirm this is acceptable for the reported evaluation.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Wrap both splits, with their labels, in xgboost DMatrix objects.
dtrain = xgb.DMatrix(X_train, label=y_train)
dtest = xgb.DMatrix(X_test, label=y_test)

In [33]:
# Sanity check: number of feature columns in the training matrix.
dtrain.num_col()

50

In [32]:
# Sanity check: number of feature columns in the test matrix.
dtest.num_col()

50

In [37]:
# Booster settings passed to xgb.train: tree depth 2, eta 1, logistic
# objective for the binary label, AUC as the evaluation metric.
param = {
    'max_depth': 2,
    'eta': 1,
    'objective': 'binary:logistic',
    'eval_metric': 'auc',
}

In [38]:
# Number of boosting rounds.
num_round = 100
# Evaluation sets reported after every round, as (DMatrix, name) pairs.
evallist = [(dtest, 'eval'), (dtrain, 'train')]
# NOTE(review): in the recorded log the train AUC climbs towards 1.0 while the
# eval AUC stays around 0.93 from roughly round 10 onwards, and no early
# stopping is configured — consider whether 100 rounds is more than needed.
bst = xgb.train(param, dtrain, num_round, evallist)

[0]	eval-auc:0.87907	train-auc:0.88473
[1]	eval-auc:0.90555	train-auc:0.91041
[2]	eval-auc:0.91651	train-auc:0.94459
[3]	eval-auc:0.92841	train-auc:0.95935
[4]	eval-auc:0.91917	train-auc:0.96899
[5]	eval-auc:0.92121	train-auc:0.97431
[6]	eval-auc:0.92998	train-auc:0.98291
[7]	eval-auc:0.92199	train-auc:0.98690
[8]	eval-auc:0.93045	train-auc:0.99030
[9]	eval-auc:0.93327	train-auc:0.99076
[10]	eval-auc:0.93515	train-auc:0.99240
[11]	eval-auc:0.93390	train-auc:0.99510
[12]	eval-auc:0.92575	train-auc:0.99645
[13]	eval-auc:0.93734	train-auc:0.99738
[14]	eval-auc:0.93484	train-auc:0.99797
[15]	eval-auc:0.93108	train-auc:0.99851
[16]	eval-auc:0.93515	train-auc:0.99885
[17]	eval-auc:0.92920	train-auc:0.99937
[18]	eval-auc:0.92199	train-auc:0.99917
[19]	eval-auc:0.92857	train-auc:0.99943
[20]	eval-auc:0.92763	train-auc:0.99971
[21]	eval-auc:0.93264	train-auc:0.99973
[22]	eval-auc:0.93108	train-auc:0.99967
[23]	eval-auc:0.93327	train-auc:0.99965
[24]	eval-auc:0.93640	train-auc:0.99965
[25]	eval-