# Introduction

Two subgroups of code:
1. Feature Extraction from the Annotations (Data Turk)
1. Train classifiers based on the extracted features.

In [1]:
# Print the interpreter version so the saved output records which Python this
# notebook was last run under.
from platform import python_version
print("VERSION: ", python_version()) # expect 3.7.0

VERSION:  3.7.0


In [2]:
import pandas as pd
import numpy as np
import json
import string
import matplotlib.pyplot as plt
from importlib import reload

# zoomies
import dask.dataframe as dd
from dask.multiprocessing import get
from multiprocessing import cpu_count

# custom data loading functions
import load_data
import clean_data
import custom_feature_extraction
import custom_keras_metrics

from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, LinearSVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import VotingClassifier
from imblearn.over_sampling import SMOTE

from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import roc_auc_score

from tensorflow.metrics import auc as tf_auc
from tensorflow import local_variables_initializer
import keras
import keras.backend as K
from keras.preprocessing import text, sequence
from keras.models import Sequential
from keras.layers import Dense, Conv1D, \
        MaxPooling1D, LSTM, Flatten, BatchNormalization

Using TensorFlow backend.


In [3]:
# dirty spaCy error workaround:
# Sets 'parent_appname' to an empty string on the kernel's 'IPKernelApp' config
# section.
# NOTE(review): if .get() falls back to the `{}` default, the assignment lands
# on a temporary dict and changes nothing — confirm the section always exists.
get_ipython().config.get('IPKernelApp', {})['parent_appname'] = ""

In [4]:
# Annotation export passed to load_data.getJSONData in the next cell. The path
# is relative, so it resolves against the notebook's working directory.
annotations ='../data/data_turk/Annotations04-05-19.json'

In [5]:
# Reload load_data so edits made to the module since the kernel started are
# picked up, then load the annotation file into `df` and preview the first rows.
reload(load_data)
df = load_data.getJSONData(annotations)
df.head()

Unnamed: 0,annotation,fileID,text
0,permission_statement,1,i give my permission for photographs/audio/vid...
1,NON_permission_statement.,1,"""(if applicable, add) information about indivi..."
2,NON_permission_statement.,1,this consent form will be filed securely in an...
3,NON_permission_statement.,1,"""[include circumstances, if any, where partial..."
4,permission_statement,1,"""i hereby certify that to the best of my knowl..."


### Convert Class Labels

In [6]:
# Column holding the raw annotation string, and the column that receives the
# class label derived from it.
field = 'annotation'
to = 'label'


def _row_to_label(row):
    """Derive one row's class label from its annotation via clean_data."""
    return clean_data.convertAnnotationtoBinary(row, field)


df[to] = df.apply(_row_to_label, axis=1)

In [7]:
# quick summary of the data: class balance and how many texts are distinct
n_rows = len(df)
n_positive = df['label'].sum()
n_unique_texts = len(set(df['text']))

print('positive class:', n_positive)
print('total: ', n_rows)
print('total unique: ', n_unique_texts)
print('ratio: ', n_positive/n_rows)

positive class: 579
total:  2468
total unique:  2343
ratio:  0.2346029173419773


# Feature Extraction

### Simple Count Features

In [8]:
def _n_words(text):
    """Number of whitespace-separated tokens in ``text``."""
    return len(text.split())


def _n_punctuation(text):
    """Number of characters of ``text`` that appear in ``string.punctuation``."""
    return sum(1 for ch in text if ch in string.punctuation)


text_col = df['text']

df['char_count'] = text_col.apply(len)
df['word_count'] = text_col.apply(_n_words)
# +1 in the denominator keeps the ratio finite for a text with zero words.
df['word_density'] = df['char_count'] / (df['word_count'] + 1)
df['punctuation_count'] = text_col.apply(_n_punctuation)

In [9]:
# One '<family>_count' column per part-of-speech family, created in this order:
# noun, verb, adj, adv, pron.
# NOTE(review): the saved previews further down show 0 in every one of these
# columns for the first rows — worth checking check_pos_tag's output.
for pos_family in ('noun', 'verb', 'adj', 'adv', 'pron'):
    df[pos_family + '_count'] = df['text'].apply(
        # bind the family as a default so each lambda keeps its own value
        lambda x, family=pos_family: custom_feature_extraction.check_pos_tag(x, family))

In [10]:
# CPU count reported by multiprocessing; used below as the number of dask
# partitions.
nCores = cpu_count()
print(nCores) # just 4 for my machine

4


### Convert Raw Text to spaCy for more feature extraction

In [11]:
# Column the raw text is read from, and the column that will hold whatever
# clean_data.getDocObjects returns for each row.
convertFrom = 'text'
convertTo = 'textDOC'

# Split the frame into nCores dask partitions, apply getDocObjects row by row
# inside each partition, and gather the result using the threaded scheduler.
# The `df` parameter of the outer lambda is the partition; it shadows the
# notebook-level `df` only inside that lambda.
df[convertTo] = dd.from_pandas(df,npartitions=nCores).\
   map_partitions(
      lambda df : df.apply(
         lambda x :clean_data.getDocObjects(x, convertFrom),axis=1)).\
   compute(scheduler='threads')

df.head()

Unnamed: 0,annotation,fileID,text,label,char_count,word_count,word_density,punctuation_count,noun_count,verb_count,adj_count,adv_count,pron_count,textDOC
0,permission_statement,1,i give my permission for photographs/audio/vid...,1,124,19,6.2,3,0,0,0,0,0,"(i, give, my, permission, for, photographs, /,..."
1,NON_permission_statement.,1,"""(if applicable, add) information about indivi...",0,144,23,6.0,7,0,0,0,0,0,"("", (, if, applicable, ,, add, ), information,..."
2,NON_permission_statement.,1,this consent form will be filed securely in an...,0,61,11,5.083333,1,0,0,0,0,0,"(this, consent, form, will, be, filed, securel..."
3,NON_permission_statement.,1,"""[include circumstances, if any, where partial...",0,216,37,5.684211,7,0,0,0,0,0,"("", [, include, circumstances, ,, if, any, ,, ..."
4,permission_statement,1,"""i hereby certify that to the best of my knowl...",1,186,31,5.8125,7,0,0,0,0,0,"("", i, hereby, certify, that, to, the, best, o..."


### Sentence Vectors
https://spacy.io/usage/vectors-similarity

[1] Models for the spaCy Natural Language Processing (NLP) library: explosion/spacy-models. Explosion, 2019.

_"English multi-task CNN trained on OntoNotes, with GloVe vectors trained on Common Crawl. Assigns word vectors, context-specific token vectors, POS tags, dependency parse and named entities."_

In [12]:
# Pick up any edits made to the feature-extraction module before using it.
reload(custom_feature_extraction)

# Row-wise: store whatever getSentenceVectors returns for each row in 'sent_vec'.
df['sent_vec'] = df.apply(custom_feature_extraction.getSentenceVectors, axis=1)

df.head()

Unnamed: 0,annotation,fileID,text,label,char_count,word_count,word_density,punctuation_count,noun_count,verb_count,adj_count,adv_count,pron_count,textDOC,sent_vec
0,permission_statement,1,i give my permission for photographs/audio/vid...,1,124,19,6.2,3,0,0,0,0,0,"(i, give, my, permission, for, photographs, /,...","[-0.029749835, 0.12945084, -0.16006051, 0.0601..."
1,NON_permission_statement.,1,"""(if applicable, add) information about indivi...",0,144,23,6.0,7,0,0,0,0,0,"("", (, if, applicable, ,, add, ), information,...","[-0.064744644, 0.179507, -0.275384, -0.0227434..."
2,NON_permission_statement.,1,this consent form will be filed securely in an...,0,61,11,5.083333,1,0,0,0,0,0,"(this, consent, form, will, be, filed, securel...","[0.0638755, -0.007266668, -0.09555017, 0.10957..."
3,NON_permission_statement.,1,"""[include circumstances, if any, where partial...",0,216,37,5.684211,7,0,0,0,0,0,"("", [, include, circumstances, ,, if, any, ,, ...","[-0.14210404, 0.21598653, -0.22471203, -0.0546..."
4,permission_statement,1,"""i hereby certify that to the best of my knowl...",1,186,31,5.8125,7,0,0,0,0,0,"("", i, hereby, certify, that, to, the, best, o...","[-0.020474927, 0.16413753, -0.1526114, -0.0300..."


### Noun_Chunks

In [13]:
# Row-wise: store whatever getNounChunks returns for each row in 'noun_chunks',
# then preview the frame.
df['noun_chunks'] = df.apply(custom_feature_extraction.getNounChunks, axis=1)
df.head()

Unnamed: 0,annotation,fileID,text,label,char_count,word_count,word_density,punctuation_count,noun_count,verb_count,adj_count,adv_count,pron_count,textDOC,sent_vec,noun_chunks
0,permission_statement,1,i give my permission for photographs/audio/vid...,1,124,19,6.2,3,0,0,0,0,0,"(i, give, my, permission, for, photographs, /,...","[-0.029749835, 0.12945084, -0.16006051, 0.0601...","[i, my permission, photographs/audio/video rec..."
1,NON_permission_statement.,1,"""(if applicable, add) information about indivi...",0,144,23,6.0,7,0,0,0,0,0,"("", (, if, applicable, ,, add, ), information,...","[-0.064744644, 0.179507, -0.275384, -0.0227434...","[information, individuals, organizations, you,..."
2,NON_permission_statement.,1,this consent form will be filed securely in an...,0,61,11,5.083333,1,0,0,0,0,0,"(this, consent, form, will, be, filed, securel...","[0.0638755, -0.007266668, -0.09555017, 0.10957...","[this consent form, an official area]"
3,NON_permission_statement.,1,"""[include circumstances, if any, where partial...",0,216,37,5.684211,7,0,0,0,0,0,"("", [, include, circumstances, ,, if, any, ,, ...","[-0.14210404, 0.21598653, -0.22471203, -0.0546...","[circumstances, partial payment, no payment, i..."
4,permission_statement,1,"""i hereby certify that to the best of my knowl...",1,186,31,5.8125,7,0,0,0,0,0,"("", i, hereby, certify, that, to, the, best, o...","[-0.020474927, 0.16413753, -0.1526114, -0.0300...","[i, my knowledge, the person, who, this consen..."


In [14]:
# Reload again so the following cells run against the current on-disk version
# of custom_feature_extraction.
reload(custom_feature_extraction)

<module 'custom_feature_extraction' from '/Users/milk/Desktop/git/permission_statement_extraction/notebooks/custom_feature_extraction.py'>

### Convert chunks to one hot

In [15]:
# Replace df with the frame returned by convertNounChunkstoOneHot.
# NOTE(review): df is overwritten, so re-running this cell feeds the already
# converted frame back into the helper — confirm the helper tolerates that.
df = custom_feature_extraction.convertNounChunkstoOneHot(df)
# df.head()

### Convert embeddings to columns (one numeric `vec_posi*` column per vector dimension, not a true one-hot encoding)

In [16]:
# Replace df with the frame returned by convertVectoOneHot. The saved preview
# shows the result carrying float-valued vec_posi* columns.
df = custom_feature_extraction.convertVectoOneHot(df)
df.head()

Unnamed: 0,annotation,fileID,text,label,char_count,word_count,word_density,punctuation_count,noun_count,verb_count,...,vec_posi290,vec_posi291,vec_posi292,vec_posi293,vec_posi294,vec_posi295,vec_posi296,vec_posi297,vec_posi298,vec_posi299
0,permission_statement,1,i give my permission for photographs/audio/vid...,1,124,19,6.2,3,0,0,...,-0.236962,0.049824,0.063998,-0.000753,0.165136,-0.012342,0.01537,0.058096,-0.013941,0.123927
1,NON_permission_statement.,1,"""(if applicable, add) information about indivi...",0,144,23,6.0,7,0,0,...,-0.268256,0.110001,-0.037784,-0.06626,0.008676,-0.105688,-0.123062,-0.156906,0.128169,0.060226
2,NON_permission_statement.,1,this consent form will be filed securely in an...,0,61,11,5.083333,1,0,0,...,-0.166543,0.088511,0.05547,-0.265956,0.026951,-0.099905,0.018878,-0.014592,0.047887,0.186848
3,NON_permission_statement.,1,"""[include circumstances, if any, where partial...",0,216,37,5.684211,7,0,0,...,-0.190944,0.043685,-0.074062,-0.096643,0.058032,-0.028758,-0.141962,-0.111375,0.104723,0.044907
4,permission_statement,1,"""i hereby certify that to the best of my knowl...",1,186,31,5.8125,7,0,0,...,-0.160743,0.076126,0.034459,-0.084644,0.055868,0.013227,0.035661,0.04439,0.051542,0.043067


### Drop Columns 

In [17]:
# Columns removed from df before the train/test split.
columns_to_drop = [
    'text',        # raw statement text
    'annotation',  # raw annotation string; 'label' was derived from it
    'sent_vec',    # per-row result of getSentenceVectors
    'textDOC'      # per-row result of clean_data.getDocObjects
]

In [18]:
# Drop the columns listed in columns_to_drop and preview what is left.
# NOTE: df is overwritten, so running this cell a second time asks pandas to
# drop labels that are already gone.
df = df.drop(columns_to_drop, axis=1)
df.head()

Unnamed: 0,fileID,label,char_count,word_count,word_density,punctuation_count,noun_count,verb_count,adj_count,adv_count,...,vec_posi290,vec_posi291,vec_posi292,vec_posi293,vec_posi294,vec_posi295,vec_posi296,vec_posi297,vec_posi298,vec_posi299
0,1,1,124,19,6.2,3,0,0,0,0,...,-0.236962,0.049824,0.063998,-0.000753,0.165136,-0.012342,0.01537,0.058096,-0.013941,0.123927
1,1,0,144,23,6.0,7,0,0,0,0,...,-0.268256,0.110001,-0.037784,-0.06626,0.008676,-0.105688,-0.123062,-0.156906,0.128169,0.060226
2,1,0,61,11,5.083333,1,0,0,0,0,...,-0.166543,0.088511,0.05547,-0.265956,0.026951,-0.099905,0.018878,-0.014592,0.047887,0.186848
3,1,0,216,37,5.684211,7,0,0,0,0,...,-0.190944,0.043685,-0.074062,-0.096643,0.058032,-0.028758,-0.141962,-0.111375,0.104723,0.044907
4,1,1,186,31,5.8125,7,0,0,0,0,...,-0.160743,0.076126,0.034459,-0.084644,0.055868,0.013227,0.035661,0.04439,0.051542,0.043067


### Training Data Split


In [19]:
# Hold out 30% of the rows as the test set. The split is stratified on 'label'
# and uses a fixed random_state so it is repeatable.
train, test = train_test_split(df,stratify=df['label'],test_size=0.3, random_state=1729)

In [20]:
# 'label' is the target; 'fileID' is excluded from the model inputs as well
# (presumably because it only identifies the source file — verify).
non_feature_columns = ['label', 'fileID']

y_train, y_test = train['label'], test['label']

X_train = train.drop(non_feature_columns, axis=1)
X_test = test.drop(non_feature_columns, axis=1)

In [21]:
# Sum of the training labels, shown here before any resampling.
sum(y_train)

405

### SMOTE
Citation:
[1] N. V. Chawla, K. W. Bowyer, L. O. Hall, and W. P. Kegelmeyer, “SMOTE: Synthetic Minority Over-sampling Technique,” Journal of Artificial Intelligence Research, vol. 16, pp. 321–357, Jun. 2002.

_"method of over-sampling the minority (abnormal) class and under-sampling the majority (normal) class can achieve better classifier performance (in ROC space) than only under-sampling the majority class. This paper also shows that a combination of our method of over-sampling the minority class and under-sampling the majority class can achieve better classifier performance (in ROC space) than varying the loss ratios in Ripper or class priors in Naive Bayes. Our method of over-sampling the minority class involves creating synthetic minority class examples."_

In [22]:
# Oversample the training split only; X_test / y_test are not touched.
# X_train and y_train are overwritten with the resampled data.
# NOTE(review): newer imbalanced-learn releases rename `ratio` to
# `sampling_strategy` and `fit_sample` to `fit_resample` — confirm against the
# installed version before upgrading.
sm = SMOTE(random_state=1729, ratio = 1.0)
X_train, y_train = sm.fit_sample(X_train, y_train)

In [23]:
# Sum of the training labels again, now on the resampled y_train.
sum(y_train)

1322

## Baseline Classifiers

In [24]:
# Baseline model zoo: a list of (display name, unfitted estimator) pairs that
# the next cell fits and scores in order.
#
# Every estimator that draws random numbers is given the same seed as the
# train/test split and SMOTE (1729), so the results table is repeatable from a
# fresh kernel. KNeighborsClassifier takes no seed.
#
# NOTE(review): several estimators pass class_weight={1: ~0.2}, i.e. an explicit
# weight for class 1 only, even though the training data was already balanced
# by SMOTE. Class 0 is not listed — confirm how the installed scikit-learn
# treats the missing key and that the down-weighting is intended.
SEED = 1729

models = []

models.append(("KNeighbors",
               KNeighborsClassifier(weights='distance')))

models.append(("LogisticRegression",
               LogisticRegression(solver='liblinear',
                                  max_iter=1000000,
                                  class_weight={1:.21},
                                  penalty='l1',
                                  random_state=SEED)))

# dtBase is both a stand-alone model and the base estimator of
# "BaggingClassifier-2" below.
dtBase = DecisionTreeClassifier(max_depth=10,
                                max_features=9,
                                class_weight={1:.2},
                                random_state=SEED)

models.append(("DecisionTree",dtBase))

rdfBase = RandomForestClassifier(n_estimators=1000,
                                 class_weight={1:.2},
                                 random_state=SEED)

models.append(("RandomForest",rdfBase))


# Same forest as above, with the tree depth capped at 10.
rf2 = RandomForestClassifier(n_estimators=1000,
                             criterion='gini',
                             max_depth=10,
                             class_weight={1:.2},
                             random_state=SEED)

models.append(("RandomForest-2",rf2))

models.append(("BaggingClassifier",
               BaggingClassifier(DecisionTreeClassifier(max_depth=3,
                                                        random_state=SEED),
                                 n_estimators=100,
                                 max_features=9,
                                 random_state=SEED)))

models.append(("BaggingClassifier-2",
               BaggingClassifier(dtBase,
                                 n_estimators=100,
                                 max_features=9,
                                 random_state=SEED)))

# Boosted decision stumps (max_depth=1).
models.append(("AdaBoostClassifier",
               AdaBoostClassifier(DecisionTreeClassifier(max_depth=1,
                                                         random_state=SEED),
                                  algorithm="SAMME",
                                  n_estimators=1000,
                                  random_state=SEED)))

models.append(("GradientBoostingClassifier",
               GradientBoostingClassifier(n_estimators=100,
                                          max_leaf_nodes=4,
                                          max_depth = 10,
                                          random_state=SEED)))

In [25]:
# Fit every baseline on the training data and score it on the test split.
# Results are written into results_frame one row at a time, so an interrupted
# run still keeps the rows of the models that finished.
result_rows = []   # not read by any visible cell; kept so the name still exists
count = 1          # 1-based row label in results_frame; later cells keep using it
results_frame = pd.DataFrame()

predictionsList = []
fitted_models = []


def _positive_class_scores(fitted, X):
    """Return a continuous score for the positive class (label 1).

    Prefers ``predict_proba`` (column 1), then ``decision_function``, and only
    falls back to the hard 0/1 predictions when the estimator offers neither.
    """
    if hasattr(fitted, 'predict_proba'):
        return fitted.predict_proba(X)[:, 1]
    if hasattr(fitted, 'decision_function'):
        return fitted.decision_function(X)
    return fitted.predict(X)


for name, model in models:
    model.fit(X_train, y_train)

    # store models in a list, in case we want 'em again
    fitted_models.append((name, model))

    # hard 0/1 predictions, used for accuracy and precision
    prediction_vec = model.predict(X_test)

    # save predictions - useful for further ensemble methods/voters
    predictionsList.append((name, prediction_vec))

    # ROC AUC ranks examples by score; feeding it thresholded 0/1 labels
    # reduces the curve to a single operating point, so use scores instead.
    score_vec = _positive_class_scores(model, X_test)

#     # print a classification report for each model
#     report = classification_report(y_test, prediction_vec)
#     print(' #### ', name, '\n', report, '\n')

    # store result in data frame
    # NOTE(review): the saved output shows a precision-related warning from
    # this loop — possibly a model that predicts no positives; worth checking.
    results_frame.loc[count, 'Model'] = name
    results_frame.loc[count, 'Accuracy'] = accuracy_score(y_test, prediction_vec)
    results_frame.loc[count, 'Precision'] = precision_score(y_test, prediction_vec)
    results_frame.loc[count, 'AUCROC'] = roc_auc_score(y_test, score_vec)

    # move on to the next row of the results frame
    count = count + 1

  'precision', 'predicted', average, warn_for)


KeyboardInterrupt: 

In [None]:
# Naive reference row: a "classifier" that always predicts class 0, scored with
# the same three metrics as the fitted models.
majority_class = np.zeros(len(y_test))

baseline_row = {
    'Model': 'Naive Baseline - MajorityClassClassifier',
    'Accuracy': accuracy_score(y_test, majority_class),
    'Precision': precision_score(y_test, majority_class),
    'AUCROC': roc_auc_score(y_test, majority_class),
}

for column, value in baseline_row.items():
    results_frame.loc[count, column] = value

# advance the row label for whatever is appended next
count += 1

In [None]:
# Display the whole results table (head(n) with n = number of rows).
results_frame.head(len(results_frame))

## Neural Network (to migrate to models)
Neural Network Classifier Code (keras). No Need for it now, but may be useful in the event that BERT training time is too costly.

In [None]:
# Number of input features (second dimension of X_train; used as the network's
# input_dim below) and the full shape of the test matrix.
print(X_train.shape[1])
print(X_test.shape)

In [None]:
# One-hot encode the 0/1 targets into two columns for the softmax output layer.
# y_train / y_test are overwritten, so only encode while they are still 1-D:
# running to_categorical on the already encoded (n, 2) arrays would produce an
# (n, 2, 2) array and break model.fit. The guard makes this cell safe to re-run.
if np.ndim(y_train) == 1:
    y_train = keras.utils.to_categorical(y_train, num_classes=2)
if np.ndim(y_test) == 1:
    y_test = keras.utils.to_categorical(y_test, num_classes=2)

In [None]:
# Width of the input layer = number of feature columns.
dim = X_train.shape[1]

# Create model: a stack of fully connected layers ending in a 2-way softmax.
# NOTE(review): two of the hidden layers use softmax activations, which is
# unusual for hidden layers — confirm this is intended.
# A Conv1D / MaxPooling1D pair and an Adagrad(lr=0.0001) optimizer were tried
# earlier and are currently disabled.
model = Sequential([
    Dense(units=1000, activation='relu', input_dim=dim),
    Dense(units=5000, activation='softmax'),
    Dense(units=50, activation='relu'),
    Dense(units=50, activation='softmax'),
    Dense(units=5000, activation='relu'),
    Dense(units=2, activation='softmax'),
])

# Metrics reported per epoch; the custom ones come from custom_keras_metrics.
tracked_metrics = [
    'accuracy',
    custom_keras_metrics.keras_precision,
    custom_keras_metrics.keras_recall,
    custom_keras_metrics.keras_auc,
]

model.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=tracked_metrics)

# Train; the test split doubles as the validation set, so every 'val_*' entry
# in mod.history is a test-set figure.
mod = model.fit(
    X_train,
    y_train,
    validation_data=(X_test, y_test),
    epochs=20,
    batch_size=7,
)

In [None]:
# summary() prints the layer table itself and returns None, so wrapping it in
# print() only appended a stray "None" line to the output.
model.summary()

In [None]:
%matplotlib inline
plt.plot(mod.history['keras_precision'])
plt.plot(mod.history['val_keras_precision'])
plt.title('model keras_precision')
plt.ylabel('precision')
plt.xlabel('epoch')
plt.legend(['train', 'test'], loc='upper left')
plt.show()

In [None]:
# Loss per epoch for the training data and for the validation data passed to
# model.fit.
fig, ax = plt.subplots()
for history_key in ('loss', 'val_loss'):
    ax.plot(mod.history[history_key])
ax.set_title('model loss')
ax.set_ylabel('loss')
ax.set_xlabel('epoch')
ax.legend(['train', 'test'], loc='upper left')
plt.show()

In [None]:
# Add the network to the comparison table using its final-epoch metrics.
# Every other row in results_frame is scored on the test split, and the test
# split was passed to model.fit as validation data, so all three figures are
# taken from the 'val_*' history entries. Accuracy and Precision previously
# came from the training history while AUCROC came from validation.
# `count` is not advanced here, so re-running the cell overwrites the same row.
results_frame.loc[count, 'Model'] = 'Simple ANN'
results_frame.loc[count, 'Accuracy'] = mod.history['val_acc'][-1]
results_frame.loc[count, 'Precision'] = mod.history['val_keras_precision'][-1]
results_frame.loc[count, 'AUCROC'] = mod.history['val_keras_auc'][-1]

# Display the whole results table.
results_frame.head(len(results_frame))