# Introduction

Two subgroups of code:
1. Featuere Extraction from the Annotations (Data Turk)
1. Train classifiers based on Featuere Extraction.

In [1]:
from platform import python_version
print("VERSION: ", python_version()) # expect 3.7.0

VERSION:  3.7.0


In [2]:
import pandas as pd
import numpy as np
import json
import string
import matplotlib.pyplot as plt

# zoomies
import dask.dataframe as dd
from dask.multiprocessing import get
from multiprocessing import cpu_count

# custom data loading functions
import load_data
import clean_data

from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, LinearSVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import VotingClassifier
from imblearn.over_sampling import SMOTE

from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import roc_auc_score

import keras
from keras.preprocessing import text, sequence
from keras.models import Sequential
from keras.layers import Dense, Conv2D, MaxPooling2D, Dropout, Flatten, BatchNormalization

Using TensorFlow backend.


In [3]:
# dirty spaCy error workaround:
get_ipython().config.get('IPKernelApp', {})['parent_appname'] = ""

In [4]:
annotations ='../data/data_turk/Annotations04-02-19.json'

In [5]:
df = load_data.getJSONData(annotations)
df.head()

Unnamed: 0,annotation,fileID,text
0,permission_statement,370,"""yes, i consent parent/legal guardian signature:"""
1,NON_permission_statement.,370,/ my child has already had dtpa vaccination i ...
2,NON_permission_statement.,370,"""prior to administering the vaccine(s), the nu..."
3,NON_permission_statement.,370,""". - within 2 business days of immunisation, i..."
4,NON_permission_statement.,370,mobile email pre vaccination checklist (please...


### Convert Class Labels

In [6]:
to  = 'label'
field = 'annotation'
    
df[to] = df.apply(lambda row:clean_data.convertAnnotationtoBinary(row, field), axis =1)   

In [7]:
# quick summary of the data 
print('positive class:', df['label'].sum())
print('total: ', len(df))
print('ratio: ', df['label'].sum()/len(df))

positive class: 354
total:  2775
ratio:  0.12756756756756757


# Feature Extraction

### Simple Count Features

In [8]:
df['char_count'] = df['text'].apply(len)
df['word_count'] = df['text'].apply(lambda x: len(x.split()))
df['word_density'] = df['char_count'] / (df['word_count']+1)
df['punctuation_count'] = df['text'].apply(lambda x: len("".join(_ for _ in x if _ in string.punctuation))) 

In [9]:
pos_family = {
    'noun' : ['NN','NNS','NNP','NNPS'],
    'pron' : ['PRP','PRP$','WP','WP$'],
    'verb' : ['VB','VBD','VBG','VBN','VBP','VBZ'],
    'adj' :  ['JJ','JJR','JJS'],
    'adv' : ['RB','RBR','RBS','WRB']
}

# function to check and get the part of speech tag count of a words in a given sentence
def check_pos_tag(x, flag):
    cnt = 0
    try:
        wiki = textblob.TextBlob(x)
        for tup in wiki.tags:
            ppo = list(tup)[1]
            if ppo in pos_family[flag]:
                cnt += 1
    except:
        pass
    return cnt

df['noun_count'] = df['text'].apply(lambda x: check_pos_tag(x, 'noun'))
df['verb_count'] = df['text'].apply(lambda x: check_pos_tag(x, 'verb'))
df['adj_count'] = df['text'].apply(lambda x: check_pos_tag(x, 'adj'))
df['adv_count'] = df['text'].apply(lambda x: check_pos_tag(x, 'adv'))
df['pron_count'] = df['text'].apply(lambda x: check_pos_tag(x, 'pron'))

In [10]:
nCores = cpu_count()
print(nCores) # just 4 for my machine

4


### Convert Raw Text to spaCy for more feature extraction

In [11]:
convertFrom = 'text'
convertTo = 'textDOC'

df[convertTo] = dd.from_pandas(df,npartitions=nCores).\
   map_partitions(
      lambda df : df.apply(
         lambda x :clean_data.getDocObjects(x, convertFrom),axis=1)).\
   compute(scheduler='threads')

df.head()

Unnamed: 0,annotation,fileID,text,label,char_count,word_count,word_density,punctuation_count,noun_count,verb_count,adj_count,adv_count,pron_count,textDOC
0,permission_statement,370,"""yes, i consent parent/legal guardian signature:""",1,49,6,7.0,5,0,0,0,0,0,"("", yes, ,, i, consent, parent, /, legal, guar..."
1,NON_permission_statement.,370,/ my child has already had dtpa vaccination i ...,0,117,23,4.875,1,0,0,0,0,0,"(/, my, child, has, already, had, dtpa, vaccin..."
2,NON_permission_statement.,370,"""prior to administering the vaccine(s), the nu...",0,127,20,6.047619,6,0,0,0,0,0,"("", prior, to, administering, the, vaccine(s, ..."
3,NON_permission_statement.,370,""". - within 2 business days of immunisation, i...",0,310,55,5.535714,7,0,0,0,0,0,"("", ., -, within, 2, business, days, of, immun..."
4,NON_permission_statement.,370,mobile email pre vaccination checklist (please...,0,388,64,5.969231,3,0,0,0,0,0,"(mobile, email, pre, vaccination, checklist, (..."


### Sentence Vectors

In [12]:
def getSentenceVectors(row):
    """ get spaCy vectors for each sent """
    
    sent = row['textDOC']
    
    return (sent.vector)

df['sent_vec'] = df.apply(lambda row: getSentenceVectors(row), axis=1)

df.head()

Unnamed: 0,annotation,fileID,text,label,char_count,word_count,word_density,punctuation_count,noun_count,verb_count,adj_count,adv_count,pron_count,textDOC,sent_vec
0,permission_statement,370,"""yes, i consent parent/legal guardian signature:""",1,49,6,7.0,5,0,0,0,0,0,"("", yes, ,, i, consent, parent, /, legal, guar...","[0.003146667, 0.22139668, -0.3070803, -0.03093..."
1,NON_permission_statement.,370,/ my child has already had dtpa vaccination i ...,0,117,23,4.875,1,0,0,0,0,0,"(/, my, child, has, already, had, dtpa, vaccin...","[-0.059527863, 0.09820726, -0.17988098, -0.005..."
2,NON_permission_statement.,370,"""prior to administering the vaccine(s), the nu...",0,127,20,6.047619,6,0,0,0,0,0,"("", prior, to, administering, the, vaccine(s, ...","[0.039237, 0.07936116, -0.18690301, 0.00432068..."
3,NON_permission_statement.,370,""". - within 2 business days of immunisation, i...",0,310,55,5.535714,7,0,0,0,0,0,"("", ., -, within, 2, business, days, of, immun...","[0.039489683, 0.18623652, -0.23708865, -0.0131..."
4,NON_permission_statement.,370,mobile email pre vaccination checklist (please...,0,388,64,5.969231,3,0,0,0,0,0,"(mobile, email, pre, vaccination, checklist, (...","[-0.07928554, 0.17612715, -0.18236749, 0.03185..."


### Noun_Chunks

In [13]:
def getNounChunks(row):
    """ get spaCy noun_chunks for each sent """
    
    chunks = []
    
    sent = row['textDOC']
    for chnk in list(sent.noun_chunks):
        chunks.append(chnk.text)
        
    return chunks

df['noun_chunks'] = df.apply(lambda row: getNounChunks(row), axis=1)
df.head()

Unnamed: 0,annotation,fileID,text,label,char_count,word_count,word_density,punctuation_count,noun_count,verb_count,adj_count,adv_count,pron_count,textDOC,sent_vec,noun_chunks
0,permission_statement,370,"""yes, i consent parent/legal guardian signature:""",1,49,6,7.0,5,0,0,0,0,0,"("", yes, ,, i, consent, parent, /, legal, guar...","[0.003146667, 0.22139668, -0.3070803, -0.03093...",[i]
1,NON_permission_statement.,370,/ my child has already had dtpa vaccination i ...,0,117,23,4.875,1,0,0,0,0,0,"(/, my, child, has, already, had, dtpa, vaccin...","[-0.059527863, 0.09820726, -0.17988098, -0.005...","[my child, dtpa vaccination, i, my child, only..."
2,NON_permission_statement.,370,"""prior to administering the vaccine(s), the nu...",0,127,20,6.047619,6,0,0,0,0,0,"("", prior, to, administering, the, vaccine(s, ...","[0.039237, 0.07936116, -0.18690301, 0.00432068...","[the student, the information]"
3,NON_permission_statement.,370,""". - within 2 business days of immunisation, i...",0,310,55,5.535714,7,0,0,0,0,0,"("", ., -, within, 2, business, days, of, immun...","[0.039489683, 0.18623652, -0.23708865, -0.0131...","[2 business days, immunisation, i, i, a signed..."
4,NON_permission_statement.,370,mobile email pre vaccination checklist (please...,0,388,64,5.969231,3,0,0,0,0,0,"(mobile, email, pre, vaccination, checklist, (...","[-0.07928554, 0.17612715, -0.18236749, 0.03185...","[mobile email, pre vaccination checklist, any ..."


In [14]:
mlb = MultiLabelBinarizer()
one_hot_chunks = pd.DataFrame(mlb.fit_transform(df.pop('noun_chunks')),
                          columns=mlb.classes_,
                          index=df.index)

In [15]:
one_hot_chunks = one_hot_chunks.drop(['label'], axis=1)

In [16]:
df = pd.concat([df, one_hot_chunks], axis=1)
df.head()

Unnamed: 0,annotation,fileID,text,label,char_count,word_count,word_density,punctuation_count,noun_count,verb_count,...,"your x-rays, photographs",your/his/her authorization,your/his/her permission,your/your child s consent,your/your surviving child s blood sample,your/your surviving child s participation,yours,yourself,|,|wi||
0,permission_statement,370,"""yes, i consent parent/legal guardian signature:""",1,49,6,7.0,5,0,0,...,0,0,0,0,0,0,0,0,0,0
1,NON_permission_statement.,370,/ my child has already had dtpa vaccination i ...,0,117,23,4.875,1,0,0,...,0,0,0,0,0,0,0,0,0,0
2,NON_permission_statement.,370,"""prior to administering the vaccine(s), the nu...",0,127,20,6.047619,6,0,0,...,0,0,0,0,0,0,0,0,0,0
3,NON_permission_statement.,370,""". - within 2 business days of immunisation, i...",0,310,55,5.535714,7,0,0,...,0,0,0,0,0,0,0,0,0,0
4,NON_permission_statement.,370,mobile email pre vaccination checklist (please...,0,388,64,5.969231,3,0,0,...,0,0,0,0,0,0,0,0,0,0


In [17]:
vec_column = ['vec_posi' + str(i) for i in range(0,300)]
vec_df = pd.DataFrame(df['sent_vec'].values.tolist(), columns=vec_column)
df = pd.concat([df, vec_df], axis=1)
# df.head()

### Drop Columns 

In [18]:
columns_to_drop = [
    'text',
    'annotation',
    'sent_vec',
    'textDOC'
]

In [19]:
df = df.drop(columns_to_drop, axis=1)
df.head()

Unnamed: 0,fileID,label,char_count,word_count,word_density,punctuation_count,noun_count,verb_count,adj_count,adv_count,...,vec_posi290,vec_posi291,vec_posi292,vec_posi293,vec_posi294,vec_posi295,vec_posi296,vec_posi297,vec_posi298,vec_posi299
0,370,1,49,6,7.0,5,0,0,0,0,...,-0.043365,-0.003434,-0.100498,0.056136,0.129285,0.052305,0.020964,0.044213,-0.012429,-0.02865
1,370,0,117,23,4.875,1,0,0,0,0,...,-0.089253,0.095976,-0.054682,0.099267,0.218827,-0.010469,-0.055917,-0.056714,0.183427,0.140657
2,370,0,127,20,6.047619,6,0,0,0,0,...,-0.106708,0.085487,0.052715,-0.048474,0.033206,-0.106255,-0.094731,-0.027678,0.068143,0.050415
3,370,0,310,55,5.535714,7,0,0,0,0,...,-0.118413,0.158914,0.053155,-0.013494,0.151714,-0.076653,-0.057089,-0.04908,0.096872,0.089844
4,370,0,388,64,5.969231,3,0,0,0,0,...,-0.156598,0.104328,-0.023339,-0.023068,0.15323,-0.122438,-0.105815,-0.102243,0.122092,0.108252


# Classifier Training and Eval

### Train - Test Split

In [20]:
train, test = train_test_split(df,stratify=df['label'],test_size=0.3, random_state=1729)

In [21]:
y_train = train['label']
y_test = test['label']

X_train = train.drop(['label', 'fileID'], axis=1)
X_test = test.drop(['label', 'fileID'], axis=1)

In [22]:
sum(y_train)

248

In [23]:
sm = SMOTE(random_state=1729, ratio = 1.0)
X_train, y_train = sm.fit_sample(X_train, y_train)

In [24]:
sum(y_train)

1694

In [25]:
models = []

models.append(("KNeighbors",
               KNeighborsClassifier(weights='distance')))

models.append(("LogisticRegression",
               LogisticRegression(solver='liblinear',
                                  max_iter=1000000,
                                  class_weight={1:.21},
                                  penalty='l1')))

dtBase = DecisionTreeClassifier(max_depth=10, 
                               max_features=1000,
                               class_weight={1:.2})

models.append(("DecisionTree",dtBase))

rdfBase = RandomForestClassifier(n_estimators=1000,
                                class_weight={1:.2})

models.append(("RandomForest",rdfBase))


rf2 = RandomForestClassifier(n_estimators=1000, 
                             criterion='gini',
                             max_depth=10,
                             class_weight={1:.2})

models.append(("RandomForest-2",rf2))

models.append(("BaggingClassifier",
               BaggingClassifier(DecisionTreeClassifier(max_depth=3),
                                n_estimators=100,
                                max_features=1000)))

models.append(("BaggingClassifier-2",
               BaggingClassifier(dtBase,
                                n_estimators=100,
                                max_features=1000)))

# models.append(("AdaBoostClassifier",
#                AdaBoostClassifier(DecisionTreeClassifier(max_depth=1),
#                                   algorithm="SAMME",
#                                   n_estimators=1000)))
               
# models.append(("AdaBoostClassifier-2",
#                AdaBoostClassifier(dtBase,
#                                   algorithm="SAMME",
#                                   n_estimators=100)))

# models.append(("GradientBoostingClassifier",
#                GradientBoostingClassifier(n_estimators=100, 
#                                           max_leaf_nodes=4, 
#                                           max_depth = 10)))

In [26]:
result_rows = []
count = 1
results_frame = pd.DataFrame()

predictionsList = []
fitted_models = []

for name, model in models:
    model.fit(X_train, y_train)
    
    # store models in a list, in case we want 'em again
    fitted_models.append((name, model))
    
    # get predictions
    prediction_vec = model.predict(X_test)
    
    # save predictions - useful for further ensemble methods/voters
    predictionsList.append((name, prediction_vec))
    
    # print a classifaction report for each model
    report = classification_report(y_test, prediction_vec)
    print(' #### ', name, '\n', report, '\n')
    
    # store result in data frame
    results_frame.loc[count, 'Model'] = name
    results_frame.loc[count, 'Accuracy'] = accuracy_score(y_test, prediction_vec)
    results_frame.loc[count, 'Precision'] = precision_score(y_test, prediction_vec)
    results_frame.loc[count, 'AUCROC'] = roc_auc_score(y_test, prediction_vec)

    # increment data frame 
    count = count + 1

 ####  KNeighbors 
               precision    recall  f1-score   support

           0       0.90      0.55      0.68       727
           1       0.15      0.57      0.24       106

   micro avg       0.55      0.55      0.55       833
   macro avg       0.53      0.56      0.46       833
weighted avg       0.80      0.55      0.62       833
 

 ####  LogisticRegression 
               precision    recall  f1-score   support

           0       0.90      0.96      0.93       727
           1       0.49      0.27      0.35       106

   micro avg       0.87      0.87      0.87       833
   macro avg       0.70      0.62      0.64       833
weighted avg       0.85      0.87      0.86       833
 

 ####  DecisionTree 
               precision    recall  f1-score   support

           0       0.89      0.96      0.92       727
           1       0.40      0.17      0.24       106

   micro avg       0.86      0.86      0.86       833
   macro avg       0.64      0.57      0.58       833


In [27]:
# add majority class classifier for base-line comparison
majority_class = np.zeros(len(y_test))

ac = accuracy_score(y_test, majority_class)
pr = precision_score(y_test, majority_class)
roc = roc_auc_score(y_test, majority_class)

results_frame.loc[count, 'Model'] = 'Naive Baseline - MajorityClassClassifier'
results_frame.loc[count, 'Accuracy'] = ac
results_frame.loc[count, 'Precision'] = pr
results_frame.loc[count, 'AUCROC'] = roc

  'precision', 'predicted', average, warn_for)


In [28]:
results_frame.head(len(results_frame))

Unnamed: 0,Model,Accuracy,Precision,AUCROC
1,KNeighbors,0.54982,0.154242,0.556747
2,LogisticRegression,0.871549,0.491525,0.61616
3,DecisionTree,0.861945,0.4,0.566336
4,RandomForest,0.877551,0.535714,0.623628
5,RandomForest-2,0.881152,1.0,0.533019
6,BaggingClassifier,0.806723,0.327044,0.671693
7,BaggingClassifier-2,0.883553,0.764706,0.55857
8,Naive Baseline - MajorityClassClassifier,0.872749,0.0,0.5


## (OBSOLETE)
Neural Network Classifier Code (keras). No Need for it now, but may be useful in the event that BERT training time is too costly.

In [29]:
# print(X_train.shape)
# print(X_test.shape)

In [30]:
# y_train = keras.utils.to_categorical(y_train, num_classes=2)
# y_test = keras.utils.to_categorical(y_test, num_classes=2)

In [31]:
# # Create model
# model = Sequential()
# model.add(Dense(units=1000, activation='relu', input_dim=5491))
# model.add(Dense(units=500, activation='relu'))
# model.add(Dense(units=70, activation='relu'))
# model.add(Dense(units=2, activation='softmax'))

# model.compile(loss='categorical_crossentropy',
#               optimizer=keras.optimizers.Adagrad(lr=0.01),
#               metrics=['accuracy'])

# # Train
# mod = model.fit(X_train, y_train, validation_data=(X_test,y_test), 
#           epochs=20, batch_size=10)

In [32]:
# %matplotlib inline
# plt.plot(mod.history['acc'])
# plt.plot(mod.history['val_acc'])
# plt.title('model accuracy')
# plt.ylabel('accuracy')
# plt.xlabel('epoch')
# plt.legend(['train', 'test'], loc='upper left')
# plt.show()

In [33]:
# plt.plot(mod.history['loss'])
# plt.plot(mod.history['val_loss'])
# plt.title('model loss')
# plt.ylabel('loss')
# plt.xlabel('epoch')
# plt.legend(['train', 'test'], loc='upper left')
# plt.show()