## Config

In [15]:
import json
from pprint import pprint
import os
import pandas as pd
from sklearn import model_selection, preprocessing, linear_model, naive_bayes, metrics, svm
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn import decomposition, ensemble
import xgboost, numpy, textblob, string
from keras.preprocessing import text, sequence
from keras import layers, models, optimizers

In [14]:
from platform import python_version
print("VERSION: ", python_version()) # expect 3.7.0

VERSION:  3.7.0


In [2]:
test = '../input_data/data_turk/dummy_data.json'

## Import Data

In [5]:
new_rows = []

with open(test) as f:
    for line in f: # need to load each line as a separate json object
        dat_dict = json.loads(line)
        
        content = dat_dict['content'].split("]],")[1]
        annotation = dat_dict['annotation']['labels'][0]
        
        row = {
            'annotation':annotation,
            'text':content
        }
        
        new_rows.append(row)
        
df = pd.DataFrame(new_rows)

df.head()

Unnamed: 0,annotation,text
0,NON_permission_statement.,/ my child has already had dtpa vaccination i ...
1,NON_permission_statement.,all you have to do is tell us you want to stop.
2,NON_permission_statement.,tufts medical center tufts university departme...
3,permission_statement,"""if you agree to being audiotaped but feel unc..."
4,NON_permission_statement.,you will be given a copy of this form to keep ...


## Convert Labels to Binary

In [16]:
to  = 'label'

def convertAnnotationtoBinary(row):
    """  convert to (0,1), where 1 = permission_statement """
    
    if str(row['annotation']).__contains__('NON'):
        return 0
    else:
        return 1
    
df[to] = df.apply(lambda row:convertAnnotationtoBinary(row), axis =1)   

In [6]:
df.head()

Unnamed: 0,annotation,text,label
0,NON_permission_statement.,"""if we can predict disease, we may be better a...",0
1,NON_permission_statement.,we will swipe the inside of the mouth once wit...,0
2,NON_permission_statement.,the irb also reviews research to make sure the...,0
3,NON_permission_statement.,"""if the test is canceled after the sequencing ...",0
4,NON_permission_statement.,"""during this time, we will ask you to make [nu...",0


## Quick Overview

In [7]:
print('positive class:', df['label'].sum())
print('total: ', len(df))
print('ratio: ', df['label'].sum()/len(df))

positive class: 116
total:  520
ratio:  0.2230769230769231


### Train - Test Split

## Simple Features

In [44]:
df['char_count'] = df['text'].apply(len)
df['word_count'] = df['text'].apply(lambda x: len(x.split()))
df['word_density'] = df['char_count'] / (df['word_count']+1)
df['punctuation_count'] = df['text'].apply(lambda x: len("".join(_ for _ in x if _ in string.punctuation))) 

## POS Counts

In [45]:
pos_family = {
    'noun' : ['NN','NNS','NNP','NNPS'],
    'pron' : ['PRP','PRP$','WP','WP$'],
    'verb' : ['VB','VBD','VBG','VBN','VBP','VBZ'],
    'adj' :  ['JJ','JJR','JJS'],
    'adv' : ['RB','RBR','RBS','WRB']
}

# function to check and get the part of speech tag count of a words in a given sentence
def check_pos_tag(x, flag):
    cnt = 0
    try:
        wiki = textblob.TextBlob(x)
        for tup in wiki.tags:
            ppo = list(tup)[1]
            if ppo in pos_family[flag]:
                cnt += 1
    except:
        pass
    return cnt

df['noun_count'] = df['text'].apply(lambda x: check_pos_tag(x, 'noun'))
df['verb_count'] = df['text'].apply(lambda x: check_pos_tag(x, 'verb'))
df['adj_count'] = df['text'].apply(lambda x: check_pos_tag(x, 'adj'))
df['adv_count'] = df['text'].apply(lambda x: check_pos_tag(x, 'adv'))
df['pron_count'] = df['text'].apply(lambda x: check_pos_tag(x, 'pron'))

## Count Vecs

In [48]:
# create a count vectorizer object 
count_vect = CountVectorizer(analyzer='word', token_pattern=r'\w{1,}')
count_vect_fit = count_vect.fit_transform(df['text'])

count_vec_df = pd.DataFrame(data=count_vect_fit.toarray()[0:,0:], 
                         columns = count_vect.get_feature_names())

df = pd.concat([df, count_vec_df], axis=1, sort=False)
df.head()

Unnamed: 0,annotation,text,label,char_count,word_count,word_density,punctuation_count,noun_count,verb_count,adj_count,...,years,yes,yet,york,you,your,yourself,yy,zilavy,text.1
0,NON_permission_statement.,/ my child has already had dtpa vaccination i ...,0,117,23,4.875,1,8,8,1,...,0,0,0,0,0,0,0,0,0,1.0
1,NON_permission_statement.,all you have to do is tell us you want to stop.,0,47,12,3.615385,1,0,6,0,...,0,0,0,0,2,0,0,0,0,1.0
2,NON_permission_statement.,tufts medical center tufts university departme...,0,214,32,6.484848,4,15,6,2,...,0,0,0,0,1,0,0,0,0,
3,permission_statement,"""if you agree to being audiotaped but feel unc...",1,136,24,5.44,4,5,4,1,...,0,0,0,0,1,1,0,0,0,
4,NON_permission_statement.,you will be given a copy of this form to keep ...,0,67,15,4.1875,1,3,3,1,...,0,0,0,0,1,1,0,0,0,


In [49]:
# create a count vectorizer object 
tfidf_vect = TfidfVectorizer(analyzer='word', token_pattern=r'\w{1,}', max_features=5000)
tfidf_vec = tfidf_vect.fit_transform(df['text'])

tfidf_df = pd.DataFrame(data=tfidf_vec.toarray()[0:,0:], 
                         columns = count_vect.get_feature_names())

df = pd.concat([df, tfidf_df], axis=1, sort=False)
df.head()

Unnamed: 0,annotation,text,label,char_count,word_count,word_density,punctuation_count,noun_count,verb_count,adj_count,...,yes,yet,york,you,your,yourself,yy,zilavy,text.1,text.2
0,NON_permission_statement.,/ my child has already had dtpa vaccination i ...,0,117,23,4.875,1,8,8,1,...,0,0,0,0,0,0,0,0,1.0,1.0
1,NON_permission_statement.,all you have to do is tell us you want to stop.,0,47,12,3.615385,1,0,6,0,...,0,0,0,2,0,0,0,0,1.0,1.0
2,NON_permission_statement.,tufts medical center tufts university departme...,0,214,32,6.484848,4,15,6,2,...,0,0,0,1,0,0,0,0,,1.0
3,permission_statement,"""if you agree to being audiotaped but feel unc...",1,136,24,5.44,4,5,4,1,...,0,0,0,1,1,0,0,0,,
4,NON_permission_statement.,you will be given a copy of this form to keep ...,0,67,15,4.1875,1,3,3,1,...,0,0,0,1,1,0,0,0,,


In [None]:
# # label encode the target variable 
# encoder = preprocessing.LabelEncoder()
# train_y = encoder.fit_transform(train_y)
# valid_y = encoder.fit_transform(valid_y)

In [10]:
# ngram level tf-idf 
tfidf_vect_ngram = TfidfVectorizer(analyzer='word', token_pattern=r'\w{1,}', ngram_range=(2,3), max_features=5000)
tfidf_vect_ngram.fit(df['text'])
xtrain_tfidf_ngram =  tfidf_vect_ngram.transform(train_x)
xvalid_tfidf_ngram =  tfidf_vect_ngram.transform(valid_x)

# characters level tf-idf
tfidf_vect_ngram_chars = TfidfVectorizer(analyzer='char', token_pattern=r'\w{1,}', ngram_range=(2,3), max_features=5000)
tfidf_vect_ngram_chars.fit(df['text'])
xtrain_tfidf_ngram_chars =  tfidf_vect_ngram_chars.transform(train_x) 
xvalid_tfidf_ngram_chars =  tfidf_vect_ngram_chars.transform(valid_x) 

## Train-Test Split

In [None]:
# split the dataset into training and validation datasets 
train_x, valid_x, train_y, valid_y = model_selection.train_test_split(df['text'], df['label'],
                                                                     test_size=.2,
                                                                     random_state=42)

In [15]:
# # train a LDA Model
# lda_model = decomposition.LatentDirichletAllocation(n_components=20, learning_method='online', max_iter=20)
# X_topics = lda_model.fit_transform(xtrain_count)
# topic_word = lda_model.components_ 
# vocab = count_vect.get_feature_names()

# # view the topic models
# n_top_words = 10
# topic_summaries = []
# for i, topic_dist in enumerate(topic_word):
#     topic_words = numpy.array(vocab)[numpy.argsort(topic_dist)][:-(n_top_words+1):-1]
#     topic_summaries.append(' '.join(topic_words))

In [16]:
# """
# ## NOTE: 
# `minTermFrequencyThreshold = 0` will result in all 
# possible n_grams and will not scale as input size or 
# ngramSize increases. However, it is the most robust 
# representation of the sentence, and is worth exploring for the time being...
# """

# ngramSize = 5
# maxTermFrequencyThreshold = .8
# minTermFrequencyThreshold = .001

# def getTDIDFMatrix(corpus, ngram_range, max_df, min_df):
#     """ return td-idf matrix and terms """
    
#     tfidf_vectorizer = TfidfVectorizer(use_idf=True, 
#                                        ngram_range=(1,ngram_range),
#                                        max_df=max_df,min_df=min_df)
    
#     tfidf_matrix = tfidf_vectorizer.fit_transform(corpus)
#     terms = tfidf_vectorizer.get_feature_names()
    
#     return tfidf_matrix, terms

# # save to a variable 
# tdidf_matrix, tdidf_terms = getTDIDFMatrix(corpus, 
#                                            ngramSize, 
#                                            maxTermFrequencyThreshold,
#                                            minTermFrequencyThreshold)

# # # print tests
# # print('\nfirst few terms:')
# # [print(" ", x) for x in tdidf_terms[:10]]

# print('\nNumber of terms:', len(tdidf_terms))

NameError: name 'corpus' is not defined