## Annotation Feature Extraction
Extract a number of features from each annotation and export CSV files for training classifiers.

In [1]:
# # here's how to get qt content in another notebook
# %run 'hueristic_extraction.ipynb'

In [2]:
from platform import python_version
# Report the interpreter version this kernel runs on; the recorded run shows 3.7.0.
print("VERSION: ", python_version()) # expect 3.7.0

VERSION:  3.7.0


In [3]:
import json
from pprint import pprint
import os
import pandas as pd
import numpy as np

# zoomies
import dask.dataframe as dd
from dask.multiprocessing import get
from multiprocessing import cpu_count

from sklearn import model_selection, preprocessing, linear_model, naive_bayes, metrics, svm
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn import decomposition, ensemble
import xgboost,textblob, string
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.model_selection import train_test_split

from keras.preprocessing import text, sequence

# custom data loading functions
import load_data
import clean_data

Using TensorFlow backend.


In [5]:
# dirty spaCy error workaround:
# blank out 'parent_appname' in the 'IPKernelApp' section of the running IPython config.
# If the config has no 'IPKernelApp' section, .get() returns a throwaway dict and
# this line has no effect.
# NOTE(review): the exact spaCy error this avoids is not recorded here — confirm it is still needed.
get_ipython().config.get('IPKernelApp', {})['parent_appname'] = ""

In [6]:
# Input files, given relative to this notebook.
# `test` points at a dummy-data file (per its name). It is not loaded below, and
# the name `test` is rebound to the held-out split by train_test_split further down.
test = '../data/data_turk/dummy_data.json'
# the annotation export that is loaded into `df`
annotations ='../data/data_turk/Annotations03-29-19.json'

## Import Data

In [7]:
# Load the annotation file through the project's load_data helper. The preview
# shows `annotation`, `fileID` and `text` columns.
df = load_data.getJSONData(annotations)
df.head()

Unnamed: 0,annotation,fileID,text
0,NON_permission_statement.,370,/ my child has already had dtpa vaccination i ...
1,NON_permission_statement.,370,mobile email pre vaccination checklist (please...
2,NON_permission_statement.,490,date\tsignature of child/adolescent participan...
3,NON_permission_statement.,490,all you have to do is tell us you want to stop.
4,permission_statement,490,"""if you sign your name below, it means that yo..."


## Convert Labels to Binary

In [8]:
field = 'annotation'   # column holding the raw annotation string
to = 'label'           # column that receives the binary label

# apply() hands every row to the helper and forwards `field` as its second
# positional argument.
df[to] = df.apply(clean_data.convertAnnotationtoBinary, axis=1, args=(field,))

In [9]:
df.head()

Unnamed: 0,annotation,fileID,text,label
0,NON_permission_statement.,370,/ my child has already had dtpa vaccination i ...,0
1,NON_permission_statement.,370,mobile email pre vaccination checklist (please...,0
2,NON_permission_statement.,490,date\tsignature of child/adolescent participan...,0
3,NON_permission_statement.,490,all you have to do is tell us you want to stop.,0
4,permission_statement,490,"""if you sign your name below, it means that yo...",1


## Quick Overview

In [10]:
# class balance: number of positive rows, total rows, and their ratio
n_positive = df['label'].sum()
n_total = len(df)

print('positive class:', n_positive)
print('total: ', n_total)
print('ratio: ', n_positive / n_total)

positive class: 238
total:  2063
ratio:  0.11536597188560349


## Simple Features

In [11]:
texts = df['text']
punctuation_marks = frozenset(string.punctuation)

# length of each text in characters and in whitespace-separated words
df['char_count'] = texts.apply(len)
df['word_count'] = texts.apply(lambda sentence: len(sentence.split()))
# the +1 keeps the ratio finite for texts that contain no words
df['word_density'] = df['char_count'] / (df['word_count'] + 1)
# number of characters that are ASCII punctuation
df['punctuation_count'] = texts.apply(
    lambda sentence: sum(1 for ch in sentence if ch in punctuation_marks))

## POS Counts

In [12]:
# part-of-speech tags grouped into coarse families, keyed by family name
pos_family = {
    'noun' : ['NN','NNS','NNP','NNPS'],
    'pron' : ['PRP','PRP$','WP','WP$'],
    'verb' : ['VB','VBD','VBG','VBN','VBP','VBZ'],
    'adj' :  ['JJ','JJR','JJS'],
    'adv' : ['RB','RBR','RBS','WRB']
}

def check_pos_tag(x, flag):
    """Count the words of ``x`` whose part-of-speech tag belongs to one family.

    Parameters
    ----------
    x : str
        Text to tag with TextBlob.
    flag : str
        Key of ``pos_family`` naming the family to count ('noun', 'verb', ...).

    Returns
    -------
    int
        Number of tagged words whose tag is listed under ``pos_family[flag]``.
        0 is returned (with a warning) when the text cannot be tagged.

    Raises
    ------
    KeyError
        If ``flag`` is not a key of ``pos_family``.
    """
    # Resolve the family before the try block, so a misspelled flag fails
    # loudly instead of being swallowed and reported as a count of 0.
    family_tags = pos_family[flag]

    try:
        tags = textblob.TextBlob(x).tags
    except Exception as exc:
        # Stay best-effort (one bad row must not abort feature extraction),
        # but surface the failure. The message carries no row text, so the
        # default warning filter reports a repeated failure only once.
        import warnings
        warnings.warn("check_pos_tag: POS tagging failed, counting 0 ({!r})".format(exc))
        return 0

    # each entry of `tags` is a (word, tag) pair
    return sum(1 for _, tag in tags if tag in family_tags)

def _count_pos_families(text):
    """Tag ``text`` once and count its words for every family in ``pos_family``.

    Returns a dict mapping family name to count. When the text cannot be
    tagged, every count is 0 (the same best-effort result check_pos_tag gives)
    and a warning is emitted.
    """
    counts = dict.fromkeys(pos_family, 0)
    try:
        tags = textblob.TextBlob(text).tags
    except Exception as exc:
        import warnings
        warnings.warn("POS tagging failed, counting 0 for every family ({!r})".format(exc))
        return counts
    for _, tag in tags:
        for family, family_tags in pos_family.items():
            if tag in family_tags:
                counts[family] += 1
    return counts

# One tagging pass per row: the previous five separate applies re-tagged every
# text once per family. Column order is kept: noun, verb, adj, adv, pron.
pos_families = ['noun', 'verb', 'adj', 'adv', 'pron']
pos_counts = pd.DataFrame(df['text'].apply(_count_pos_families).tolist(),
                          index=df.index, columns=pos_families)
for family in pos_families:
    df[family + '_count'] = pos_counts[family]

In [13]:
df.head()

Unnamed: 0,annotation,fileID,text,label,char_count,word_count,word_density,punctuation_count,noun_count,verb_count,adj_count,adv_count,pron_count
0,NON_permission_statement.,370,/ my child has already had dtpa vaccination i ...,0,117,23,4.875,1,8,8,1,3,2
1,NON_permission_statement.,370,mobile email pre vaccination checklist (please...,0,388,64,5.969231,3,18,17,7,4,2
2,NON_permission_statement.,490,date\tsignature of child/adolescent participan...,0,108,16,6.352941,1,11,0,1,0,0
3,NON_permission_statement.,490,all you have to do is tell us you want to stop.,0,47,12,3.615385,1,0,6,0,0,3
4,permission_statement,490,"""if you sign your name below, it means that yo...",1,96,19,4.8,4,3,5,0,0,4


In [14]:
df['label'].sum()

238

## Convert to spaCy object

In [15]:
# number of CPUs reported by multiprocessing; used below as the dask partition count
nCores = cpu_count()
print(nCores) # just 4 for my machine

4


In [16]:
convertFrom = 'text'
convertTo = 'textDOC'

def _docs_for_partition(partition):
    """Run the project's getDocObjects helper on every row of one partition."""
    return partition.apply(
        lambda row: clean_data.getDocObjects(row, convertFrom), axis=1)

# Split the frame into one partition per core, convert each partition with the
# threaded scheduler, then gather the pieces into a single pandas result.
partitioned = dd.from_pandas(df, npartitions=nCores)
df[convertTo] = partitioned.map_partitions(_docs_for_partition).compute(scheduler='threads')

df.head()

Unnamed: 0,annotation,fileID,text,label,char_count,word_count,word_density,punctuation_count,noun_count,verb_count,adj_count,adv_count,pron_count,textDOC
0,NON_permission_statement.,370,/ my child has already had dtpa vaccination i ...,0,117,23,4.875,1,8,8,1,3,2,"(/, my, child, has, already, had, dtpa, vaccin..."
1,NON_permission_statement.,370,mobile email pre vaccination checklist (please...,0,388,64,5.969231,3,18,17,7,4,2,"(mobile, email, pre, vaccination, checklist, (..."
2,NON_permission_statement.,490,date\tsignature of child/adolescent participan...,0,108,16,6.352941,1,11,0,1,0,0,"(date, \t, signature, of, child, /, adolescent..."
3,NON_permission_statement.,490,all you have to do is tell us you want to stop.,0,47,12,3.615385,1,0,6,0,0,3,"(all, you, have, to, do, is, tell, us, you, wa..."
4,permission_statement,490,"""if you sign your name below, it means that yo...",1,96,19,4.8,4,3,5,0,0,4,"("", if, you, sign, your, name, below, ,, it, m..."


Notes:
- looks like vector is always the same size, while tensor is variable
- noun_chunks look critical (perhaps for the next stage).

In [17]:
def getSentenceVectors(row):
    """ return the `vector` attribute of the object stored under row['textDOC'] """
    return row['textDOC'].vector

# axis=1 hands each row to getSentenceVectors; its return value is stored per row
df['sent_vec'] = df.apply(getSentenceVectors, axis=1)

df.head()

Unnamed: 0,annotation,fileID,text,label,char_count,word_count,word_density,punctuation_count,noun_count,verb_count,adj_count,adv_count,pron_count,textDOC,sent_vec
0,NON_permission_statement.,370,/ my child has already had dtpa vaccination i ...,0,117,23,4.875,1,8,8,1,3,2,"(/, my, child, has, already, had, dtpa, vaccin...","[-0.059527863, 0.09820726, -0.17988098, -0.005..."
1,NON_permission_statement.,370,mobile email pre vaccination checklist (please...,0,388,64,5.969231,3,18,17,7,4,2,"(mobile, email, pre, vaccination, checklist, (...","[-0.07928554, 0.17612715, -0.18236749, 0.03185..."
2,NON_permission_statement.,490,date\tsignature of child/adolescent participan...,0,108,16,6.352941,1,11,0,1,0,0,"(date, \t, signature, of, child, /, adolescent...","[0.03099135, 0.10967306, -0.26234573, -0.08817..."
3,NON_permission_statement.,490,all you have to do is tell us you want to stop.,0,47,12,3.615385,1,0,6,0,0,3,"(all, you, have, to, do, is, tell, us, you, wa...","[-0.0023037991, 0.18009607, -0.3351283, -0.047..."
4,permission_statement,490,"""if you sign your name below, it means that yo...",1,96,19,4.8,4,3,5,0,0,4,"("", if, you, sign, your, name, below, ,, it, m...","[-0.0071864375, 0.22320907, -0.26941213, 0.008..."


In [18]:
def getNounChunks(row):
    """ list the `.text` of every noun chunk of the object in row['textDOC'] """
    return [chunk.text for chunk in row['textDOC'].noun_chunks]

# one list of noun-chunk texts per row
df['noun_chunks'] = df.apply(getNounChunks, axis=1)

# Continue on an independent copy. A plain `df2 = df` makes both names point at
# the same frame, so the `df2.pop('noun_chunks')` in a later cell would also
# remove the column from `df`.
df2 = df.copy()

In [19]:
# binarizer used below to turn each row's list of noun chunks into 0/1 indicator columns
mlb = MultiLabelBinarizer()

In [20]:
# pop() removes 'noun_chunks' from df2 while handing back the column, so this
# cell cannot be run twice without rebuilding df2 first.
chunk_lists = df2.pop('noun_chunks')

# one 0/1 column per distinct noun chunk, rows kept on df's index
chunk_matrix = mlb.fit_transform(chunk_lists)
one_hot_chunks = pd.DataFrame(chunk_matrix, index=df.index, columns=mlb.classes_)

In [21]:
# Noun-chunk texts become column names, so a chunk such as "label" collides
# with a column that already exists in df2 and would duplicate it after the
# concat. Drop every colliding indicator column instead of hard-coding 'label'.
# This also avoids a KeyError when no chunk is named "label" or the cell is re-run.
clashing = [name for name in one_hot_chunks.columns if name in df2.columns]
one_hot_chunks = one_hot_chunks.drop(columns=clashing)

In [23]:
# append the noun-chunk indicator columns to the right of the existing columns
df2 = pd.concat([df2, one_hot_chunks], axis='columns')
df2.head()

Unnamed: 0,annotation,fileID,text,label,char_count,word_count,word_density,punctuation_count,noun_count,verb_count,...,your vaccinations,your willingness,your wound,your written authorization,your/his/her authorization,your/his/her permission,your/your child s consent,yourself,|,|wi||
0,NON_permission_statement.,370,/ my child has already had dtpa vaccination i ...,0,117,23,4.875,1,8,8,...,0,0,0,0,0,0,0,0,0,0
1,NON_permission_statement.,370,mobile email pre vaccination checklist (please...,0,388,64,5.969231,3,18,17,...,0,0,0,0,0,0,0,0,0,0
2,NON_permission_statement.,490,date\tsignature of child/adolescent participan...,0,108,16,6.352941,1,11,0,...,0,0,0,0,0,0,0,0,0,0
3,NON_permission_statement.,490,all you have to do is tell us you want to stop.,0,47,12,3.615385,1,0,6,...,0,0,0,0,0,0,0,0,0,0
4,permission_statement,490,"""if you sign your name below, it means that yo...",1,96,19,4.8,4,3,5,...,0,0,0,0,0,0,0,0,0,0


In [24]:
df2['label'].sum()

238

In [25]:
# Read the vector width from the data instead of hard-coding it. 300 (the
# previous constant) is only the fallback for an empty frame.
vec_dim = len(df['sent_vec'].iloc[0]) if len(df) else 300
vec_column = ['vec_posi' + str(i) for i in range(vec_dim)]

# index=df.index keeps the rows aligned in the concat below even when df does
# not carry the default 0..n-1 index (one_hot_chunks is built the same way).
vec_df = pd.DataFrame(df['sent_vec'].values.tolist(), columns=vec_column, index=df.index)
df2 = pd.concat([df2, vec_df], axis=1)
df2.head()

Unnamed: 0,annotation,fileID,text,label,char_count,word_count,word_density,punctuation_count,noun_count,verb_count,...,vec_posi290,vec_posi291,vec_posi292,vec_posi293,vec_posi294,vec_posi295,vec_posi296,vec_posi297,vec_posi298,vec_posi299
0,NON_permission_statement.,370,/ my child has already had dtpa vaccination i ...,0,117,23,4.875,1,8,8,...,-0.089253,0.095976,-0.054682,0.099267,0.218827,-0.010469,-0.055917,-0.056714,0.183427,0.140657
1,NON_permission_statement.,370,mobile email pre vaccination checklist (please...,0,388,64,5.969231,3,18,17,...,-0.156598,0.104328,-0.023339,-0.023068,0.15323,-0.122438,-0.105815,-0.102243,0.122092,0.108252
2,NON_permission_statement.,490,date\tsignature of child/adolescent participan...,0,108,16,6.352941,1,11,0,...,-0.006131,0.152385,0.002633,0.056515,0.129164,0.014515,-0.094671,-0.095217,-0.050106,-0.022726
3,NON_permission_statement.,490,all you have to do is tell us you want to stop.,0,47,12,3.615385,1,0,6,...,-0.186385,0.085673,-0.17904,-0.034618,0.136212,-0.118155,-0.117411,-0.014329,0.160683,0.124818
4,permission_statement,490,"""if you sign your name below, it means that yo...",1,96,19,4.8,4,3,5,...,-0.104249,0.099975,0.013024,-0.101493,0.068803,-0.150346,-0.19177,0.027561,0.084476,0.0534


In [26]:
df2['label'].sum()

238

In [27]:
# columns removed from df2 before the feature table is split and exported
columns_to_drop = ['text', 'annotation', 'sent_vec', 'textDOC']

In [28]:
# feature table without the columns listed in columns_to_drop
new_df = df2.drop(columns=columns_to_drop)
new_df.head()

Unnamed: 0,fileID,label,char_count,word_count,word_density,punctuation_count,noun_count,verb_count,adj_count,adv_count,...,vec_posi290,vec_posi291,vec_posi292,vec_posi293,vec_posi294,vec_posi295,vec_posi296,vec_posi297,vec_posi298,vec_posi299
0,370,0,117,23,4.875,1,8,8,1,3,...,-0.089253,0.095976,-0.054682,0.099267,0.218827,-0.010469,-0.055917,-0.056714,0.183427,0.140657
1,370,0,388,64,5.969231,3,18,17,7,4,...,-0.156598,0.104328,-0.023339,-0.023068,0.15323,-0.122438,-0.105815,-0.102243,0.122092,0.108252
2,490,0,108,16,6.352941,1,11,0,1,0,...,-0.006131,0.152385,0.002633,0.056515,0.129164,0.014515,-0.094671,-0.095217,-0.050106,-0.022726
3,490,0,47,12,3.615385,1,0,6,0,0,...,-0.186385,0.085673,-0.17904,-0.034618,0.136212,-0.118155,-0.117411,-0.014329,0.160683,0.124818
4,490,1,96,19,4.8,4,3,5,0,0,...,-0.104249,0.099975,0.013024,-0.101493,0.068803,-0.150346,-0.19177,0.027561,0.084476,0.0534


In [29]:
df2['label'].sum()

238

In [30]:
# sanity check: print 'label' once for every column carrying exactly that name
for matching in (name for name in new_df.columns if name == 'label'):
    print(matching)

label


In [31]:
# 70/30 split, stratified so both parts keep the label ratio. The fixed
# random_state makes the exported split identical on every run.
train, test = train_test_split(new_df, stratify=new_df['label'], test_size=0.3, random_state=42)


In [32]:
def saveDFtoFile(df, path):
    """ save the dataframe as csv

    Writes ``df`` to ``path`` as a comma-separated file with a header row and
    without the index column. Missing parent directories of ``path`` are
    created first, so exporting into a fresh checkout does not fail.
    """
    parent = os.path.dirname(path)
    if parent:  # a bare file name has no directory part to create
        os.makedirs(parent, exist_ok=True)

    df.to_csv(path, sep=',',index=False, header=True)


In [33]:
train_path = '../data/simple_classier_inputs/train.csv'
test_path = '../data/simple_classier_inputs/test.csv'

# write the training split first, then the held-out split
for frame, destination in ((train, train_path), (test, test_path)):
    saveDFtoFile(frame, destination)