## Config

In [1]:
# # here's how to get qt content in another notebook
# %run 'hueristic_extraction.ipynb'

In [2]:
# Report the interpreter running this notebook (it was written against 3.7.0).
from platform import python_version

running_version = python_version()
print("VERSION: ", running_version)

VERSION:  3.7.0


In [3]:
import json
from pprint import pprint
import os
import pandas as pd
import numpy as np

# zoomies
import dask.dataframe as dd
from dask.multiprocessing import get
from multiprocessing import cpu_count

from sklearn import model_selection, preprocessing, linear_model, naive_bayes, metrics, svm
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn import decomposition, ensemble
import xgboost,textblob, string

from keras.preprocessing import text, sequence
from keras import layers, models, optimizers

# custom data loading functions
import load_data
import clean_data

Using TensorFlow backend.


In [4]:
# Report the interpreter running this notebook (it was written against 3.7.0).
# NOTE(review): an identical version check already appears in an earlier cell.
from platform import python_version

running_version = python_version()
print("VERSION: ", running_version)

VERSION:  3.7.0


In [5]:
# Dirty workaround for a spaCy error: blank out `parent_appname` on the
# kernel's IPKernelApp config section (an empty dict is used if the section
# is absent).
_kernel_app_config = get_ipython().config.get('IPKernelApp', {})
_kernel_app_config['parent_appname'] = ""

In [6]:
# Relative path of the JSON file handed to load_data.getJSONData in the next code cell.
test = '../data/data_turk/dummy_data.json'

## Import Data

In [7]:
# Load the file at `test` through the project's load_data helper, then preview the result.
df = load_data.getJSONData(test)
df.head()

Unnamed: 0,annotation,fileID,text
0,NON_permission_statement.,370,/ my child has already had dtpa vaccination i ...
1,NON_permission_statement.,490,all you have to do is tell us you want to stop.
2,NON_permission_statement.,490,tufts medical center tufts university departme...
3,permission_statement,387,"""if you agree to being audiotaped but feel unc..."
4,NON_permission_statement.,387,you will be given a copy of this form to keep ...


## Convert Labels to Binary

In [8]:
# `field` is the column holding the raw annotation; `to` is the column that
# receives the value returned by clean_data.convertAnnotationtoBinary.
field = 'annotation'
to = 'label'

# Row-wise apply: each row is passed to the helper together with `field`.
df[to] = df.apply(clean_data.convertAnnotationtoBinary, axis=1, args=(field,))

In [9]:
# Preview the frame again now that the label column has been assigned.
df.head()

Unnamed: 0,annotation,fileID,text,label
0,NON_permission_statement.,370,/ my child has already had dtpa vaccination i ...,0
1,NON_permission_statement.,490,all you have to do is tell us you want to stop.,0
2,NON_permission_statement.,490,tufts medical center tufts university departme...,0
3,permission_statement,387,"""if you agree to being audiotaped but feel unc...",1
4,NON_permission_statement.,387,you will be given a copy of this form to keep ...,0


## Quick Overview

In [10]:
# Class balance: sum of the label column, number of rows, and their ratio.
n_positive = df['label'].sum()
n_rows = len(df)

print('positive class:', n_positive)
print('total: ', n_rows)
print('ratio: ', n_positive/n_rows)

positive class: 116
total:  520
ratio:  0.2230769230769231


## Simple Features

In [11]:
def _word_count(text):
    """Return the number of whitespace-separated tokens in `text`."""
    return len(text.split())


def _punctuation_count(text):
    """Return how many characters of `text` are listed in string.punctuation."""
    return sum(1 for ch in text if ch in string.punctuation)


# Surface-level features computed from the raw text column.
df['char_count'] = df['text'].apply(len)
df['word_count'] = df['text'].apply(_word_count)
# The +1 keeps the denominator non-zero for texts that contain no words.
df['word_density'] = df['char_count'].div(df['word_count'].add(1))
df['punctuation_count'] = df['text'].apply(_punctuation_count)

## POS Counts

In [12]:
# Part-of-speech tags grouped into the coarse families counted below.
pos_family = {
    'noun' : ['NN','NNS','NNP','NNPS'],
    'pron' : ['PRP','PRP$','WP','WP$'],
    'verb' : ['VB','VBD','VBG','VBN','VBP','VBZ'],
    'adj' :  ['JJ','JJR','JJS'],
    'adv' : ['RB','RBR','RBS','WRB']
}

def check_pos_tag(x, flag):
    """Count the words of `x` whose part-of-speech tag belongs to one family.

    Parameters
    ----------
    x : str
        Text to tag with TextBlob. Any non-string value (for example a
        missing value) is counted as 0.
    flag : str
        Key of `pos_family` selecting the tag family to count.

    Returns
    -------
    int
        Number of (word, tag) pairs whose tag is listed in `pos_family[flag]`.
        0 is returned when `x` is not a string or when tagging fails; in the
        latter case a RuntimeWarning is emitted so the failure is visible.

    Raises
    ------
    KeyError
        If `flag` is not a key of `pos_family`.
    """
    import warnings  # local import keeps this cell self-contained

    # Look the family up outside the try block so that a misspelled flag is
    # reported instead of silently producing a column of zeros.
    wanted_tags = pos_family[flag]

    if not isinstance(x, str):
        return 0

    try:
        tags = textblob.TextBlob(x).tags
    except Exception as err:
        # Stay best-effort (count 0), but surface the problem: a missing
        # tagger corpus would otherwise zero out every feature unnoticed.
        warnings.warn(
            "check_pos_tag: POS tagging failed ({}: {}); counting 0 for this text"
            .format(type(err).__name__, err),
            RuntimeWarning,
            stacklevel=2,
        )
        return 0

    return sum(1 for tup in tags if tup[1] in wanted_tags)

# One `<family>_count` column per POS family, created in the same order as before.
for family in ('noun', 'verb', 'adj', 'adv', 'pron'):
    df[family + '_count'] = df['text'].apply(check_pos_tag, flag=family)

In [44]:
# Preview the frame with the POS count columns.
# NOTE(review): this cell's execution count (44) is out of sequence and its saved
# output already contains the textDOC column, which is only assigned in a later
# cell -- re-run the notebook top to bottom to refresh the output.
df.head()

Unnamed: 0,annotation,fileID,text,label,char_count,word_count,word_density,punctuation_count,noun_count,verb_count,adj_count,adv_count,pron_count,textDOC
0,NON_permission_statement.,370,/ my child has already had dtpa vaccination i ...,0,117,23,4.875,1,8,8,1,3,2,"(/, my, child, has, already, had, dtpa, vaccin..."
1,NON_permission_statement.,490,all you have to do is tell us you want to stop.,0,47,12,3.615385,1,0,6,0,0,3,"(all, you, have, to, do, is, tell, us, you, wa..."
2,NON_permission_statement.,490,tufts medical center tufts university departme...,0,214,32,6.484848,4,15,6,2,0,3,"(tufts, medical, center, tufts, university, de..."
3,permission_statement,387,"""if you agree to being audiotaped but feel unc...",1,136,24,5.44,4,5,4,1,1,2,"("", if, you, agree, to, being, audiotaped, but..."
4,NON_permission_statement.,387,you will be given a copy of this form to keep ...,0,67,15,4.1875,1,3,3,1,0,2,"(you, will, be, given, a, copy, of, this, form..."


## Convert to spaCy object

In [13]:
# CPU count from multiprocessing; used as the dask partition count in the next code cell.
nCores = cpu_count()
print(nCores) # just 4 for my machine

4


In [14]:
convertFrom = 'text'
convertTo = 'textDOC'


def _row_to_doc(row):
    """Hand one row and the source column name to clean_data.getDocObjects."""
    return clean_data.getDocObjects(row, convertFrom)


def _partition_to_docs(partition):
    """Apply `_row_to_doc` to every row of a single dask partition."""
    return partition.apply(_row_to_doc, axis=1)


# Split the frame into nCores partitions, convert each partition row by row,
# and collect the result with the threaded scheduler.
df[convertTo] = (
    dd.from_pandas(df, npartitions=nCores)
    .map_partitions(_partition_to_docs)
    .compute(scheduler='threads')
)

df.head()

Unnamed: 0,annotation,fileID,text,label,char_count,word_count,word_density,punctuation_count,noun_count,verb_count,adj_count,adv_count,pron_count,textDOC
0,NON_permission_statement.,370,/ my child has already had dtpa vaccination i ...,0,117,23,4.875,1,8,8,1,3,2,"(/, my, child, has, already, had, dtpa, vaccin..."
1,NON_permission_statement.,490,all you have to do is tell us you want to stop.,0,47,12,3.615385,1,0,6,0,0,3,"(all, you, have, to, do, is, tell, us, you, wa..."
2,NON_permission_statement.,490,tufts medical center tufts university departme...,0,214,32,6.484848,4,15,6,2,0,3,"(tufts, medical, center, tufts, university, de..."
3,permission_statement,387,"""if you agree to being audiotaped but feel unc...",1,136,24,5.44,4,5,4,1,1,2,"("", if, you, agree, to, being, audiotaped, but..."
4,NON_permission_statement.,387,you will be given a copy of this form to keep ...,0,67,15,4.1875,1,3,3,1,0,2,"(you, will, be, given, a, copy, of, this, form..."


Notes:
- `vector` always has the same shape, `(300,)`, whereas the first dimension of `tensor` varies from sentence to sentence (the second is always 384).
- `noun_chunks` look critical (perhaps for the next stage).

In [53]:
# (caption, getter) pairs printed for each parsed sentence, in this order.
# Disabled debug entries: sent.ents, sent.vocab, sent.vector, dir(sent).
_doc_fields = (
    ('noun_chunks: ', lambda doc: list(doc.noun_chunks)),
    ('vector.shape: ', lambda doc: doc.vector.shape),
    ('vector_norm: ', lambda doc: doc.vector_norm),
    ('tensor.shape: ', lambda doc: doc.tensor.shape),
)

for sent in df['textDOC'].head(100):
    print('------')
    for caption, getter in _doc_fields:
        print(caption, getter(sent))
    print()

------
noun_chunks:  [my child, dtpa vaccination, i, my child, only vaccine]
vector.shape:  (300,)
vector_norm:  3.1957632766068182
tensor.shape:  (23, 384)

------
noun_chunks:  [you, us, you]
vector.shape:  (300,)
vector_norm:  4.019685360797594
tensor.shape:  (13, 384)

------
noun_chunks:  [research title, study principal investigator, we, you, part, a research study, we]
vector.shape:  (300,)
vector_norm:  3.1050915619781208
tensor.shape:  (37, 384)

------
noun_chunks:  [you, any time, the interview, i, the recorder, your request]
vector.shape:  (300,)
vector_norm:  3.307398487308692
tensor.shape:  (28, 384)

------
noun_chunks:  [you, a copy, this form, your own records]
vector.shape:  (300,)
vector_norm:  3.704410080913383
tensor.shape:  (16, 384)

------
noun_chunks:  [compensation, information, payment, other types, compensation, method, timing, payment, you, this study, you, a $20 gift card, you, the interview]
vector.shape:  (300,)
vector_norm:  3.0437940659954026
tensor.sh

## Count Vecs

In [15]:
# # create a count vectorizer object 
# count_vect = CountVectorizer(analyzer='word', token_pattern=r'\w{1,}')
# count_vect_fit = count_vect.fit_transform(df['text'])

# count_vec_df = pd.DataFrame(data=count_vect_fit.toarray()[0:,0:], 
#                          columns = count_vect.get_feature_names())

# df = pd.concat([df, count_vec_df], axis=1, sort=False)
# df.head()

In [16]:
# # create a tf-idf vectorizer object
# tfidf_vect = TfidfVectorizer(analyzer='word', token_pattern=r'\w{1,}', max_features=5000)
# tfidf_vec = tfidf_vect.fit_transform(df['text'])

# tfidf_df = pd.DataFrame(data=tfidf_vec.toarray()[0:,0:], 
#                          columns = count_vect.get_feature_names())

# df = pd.concat([df, tfidf_df], axis=1, sort=False)
# df.head()

In [17]:
# # label encode the target variable 
# encoder = preprocessing.LabelEncoder()
# train_y = encoder.fit_transform(train_y)
# valid_y = encoder.fit_transform(valid_y)

In [18]:
# # ngram level tf-idf 
# tfidf_vect_ngram = TfidfVectorizer(analyzer='word', token_pattern=r'\w{1,}', ngram_range=(2,3), max_features=5000)
# tfidf_vect_ngram.fit(df['text'])
# xtrain_tfidf_ngram =  tfidf_vect_ngram.transform(train_x)
# xvalid_tfidf_ngram =  tfidf_vect_ngram.transform(valid_x)

# # characters level tf-idf
# tfidf_vect_ngram_chars = TfidfVectorizer(analyzer='char', token_pattern=r'\w{1,}', ngram_range=(2,3), max_features=5000)
# tfidf_vect_ngram_chars.fit(df['text'])
# xtrain_tfidf_ngram_chars =  tfidf_vect_ngram_chars.transform(train_x) 
# xvalid_tfidf_ngram_chars =  tfidf_vect_ngram_chars.transform(valid_x) 

## Train-Test Split

In [19]:
# # split the dataset into training and validation datasets 
# train_x, valid_x, train_y, valid_y = model_selection.train_test_split(df['text'], df['label'],
#                                                                      test_size=.2,
#                                                                      random_state=42)

In [20]:
# # train a LDA Model
# lda_model = decomposition.LatentDirichletAllocation(n_components=20, learning_method='online', max_iter=20)
# X_topics = lda_model.fit_transform(xtrain_count)
# topic_word = lda_model.components_ 
# vocab = count_vect.get_feature_names()

# # view the topic models
# n_top_words = 10
# topic_summaries = []
# for i, topic_dist in enumerate(topic_word):
#     topic_words = numpy.array(vocab)[numpy.argsort(topic_dist)][:-(n_top_words+1):-1]
#     topic_summaries.append(' '.join(topic_words))

In [21]:
# """
# ## NOTE: 
# `minTermFrequencyThreshold = 0` will result in all 
# possible n_grams and will not scale as input size or 
# ngramSize increases. However, it is the most robust 
# representation of the sentence, and is worth exploring for the time being...
# """

# ngramSize = 5
# maxTermFrequencyThreshold = .8
# minTermFrequencyThreshold = .001

# def getTDIDFMatrix(corpus, ngram_range, max_df, min_df):
#     """ return td-idf matrix and terms """
    
#     tfidf_vectorizer = TfidfVectorizer(use_idf=True, 
#                                        ngram_range=(1,ngram_range),
#                                        max_df=max_df,min_df=min_df)
    
#     tfidf_matrix = tfidf_vectorizer.fit_transform(corpus)
#     terms = tfidf_vectorizer.get_feature_names()
    
#     return tfidf_matrix, terms

# # save to a variable 
# tdidf_matrix, tdidf_terms = getTDIDFMatrix(corpus, 
#                                            ngramSize, 
#                                            maxTermFrequencyThreshold,
#                                            minTermFrequencyThreshold)

# # # print tests
# # print('\nfirst few terms:')
# # [print(" ", x) for x in tdidf_terms[:10]]

# print('\nNumber of terms:', len(tdidf_terms))