In [1]:
from platform import python_version
print("VERSION: ", python_version()) # expect 3.7.0

VERSION:  3.7.3


In [37]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from importlib import reload
from matplotlib.pyplot import figure
import datetime
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import roc_auc_score
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

# keras/tensorflow
from tensorflow.metrics import auc as tf_auc
from tensorflow import local_variables_initializer
import keras
import keras.backend as K
from keras.preprocessing import text, sequence
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Conv1D, Conv2D, \
        MaxPooling1D, LSTM, Flatten, BatchNormalization,Embedding,Reshape, Dropout


# Local custom data loading functions
import load_data
import clean_data
import custom_feature_extraction
import custom_keras_metrics

In [3]:
# dirty spaCy error workaround:
get_ipython().config.get('IPKernelApp', {})['parent_appname'] = ""

### Load annoations

In [4]:
annotations ='../data/data_turk/Annotations04-09-19.json'
df = load_data.getJSONData(annotations)

to  = 'label'
field = 'annotation'
df[to] = df.apply(lambda row:clean_data.convertAnnotationtoBinary(row, field), axis =1)   
df.head()

Unnamed: 0,annotation,fileID,text,label
0,NON_permission_statement.,1,a copy of this entire consent form will be giv...,0
1,permission_statement,1,i give my permission for photographs/audio/vid...,1
2,NON_permission_statement.,1,language for required recordings: the research...,0
3,NON_permission_statement.,1,"""(if applicable, add) information about indivi...",0
4,NON_permission_statement.,1,this consent form will be filed securely in an...,0


### Data Pre-processing

In [27]:
slim2 = df[['label', 'text']] # subset dataframe to make it easier to work with

In [28]:
vocabulary_size = 1000
maxlen = 100

tokenizer = Tokenizer(num_words= vocabulary_size)
tokenizer.fit_on_texts(slim2['text'])
sequences = tokenizer.texts_to_sequences(slim2['text'])
data = pad_sequences(sequences, maxlen=maxlen)

In [29]:
slim2['data'] = data.tolist()
# slim2['data'].head(10)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [30]:
new_col_names = ['seq_posi' + str(i) for i in range(0,maxlen)]
pos_seq_df = pd.DataFrame(slim2['data'].values.tolist(), columns=new_col_names)
slim2 = pd.concat([slim2, pos_seq_df], axis=1)
# slim2.head()

### Train-Test Split


In [31]:
text_train, text_test = train_test_split(slim2,
                               stratify=slim2['label'],
                               test_size=0.3, 
                               random_state=1729)

### Convert Labels to Categorical

In [32]:
label_train = keras.utils.to_categorical(text_train['label'], num_classes=2)
label_test = keras.utils.to_categorical(text_test['label'], num_classes=2)

## Baseline Models

In [38]:
models = []

dtBase = DecisionTreeClassifier(max_depth=10, 
                               max_features=9,
                               class_weight={1:.2})

models.append(("DecisionTree",dtBase))

rdfBase = RandomForestClassifier(n_estimators=1000,
                                class_weight={1:.2})

models.append(("RandomForest",rdfBase))

models.append(("SVM",SVC(gamma='auto')))
models.append(("LogisticRegression", LogisticRegression(solver='liblinear',
                                  max_iter=1000,
                                  penalty='l1')))


In [54]:
fitted_models = []
new_rows = []

for name, model in models:
    model.fit(text_train[new_col_names], text_train['label'])
    fitted_models.append((name, model))
    prediction_vec = model.predict(text_test[new_col_names])
    row = {
        'Model:': name,
        'Accuracy:': accuracy_score(text_test['label'], prediction_vec),
        'Precision:': precision_score(text_test['label'], prediction_vec),
        'ROC:': roc_auc_score(text_test['label'], prediction_vec)
    }
    
    new_rows.append(row)

baseline_results_df = pd.DataFrame(new_rows)

In [55]:
# Naive baseline
majority_class = np.zeros(len(text_test['label']))

count = len(baseline_results_df) + 2

ac = accuracy_score(text_test['label'], majority_class)
pr = precision_score(text_test['label'], majority_class)
roc = roc_auc_score(text_test['label'], majority_class)

baseline_results_df.loc[count, 'Model:'] = 'Naive Majority Class Classifier'
baseline_results_df.loc[count, 'Accuracy:'] = ac
baseline_results_df.loc[count, 'Precision:'] = pr
baseline_results_df.loc[count, 'ROC:'] = roc

cols = ['Model:', 'Accuracy:', 'Precision:', 'ROC:']

baseline_results_df = baseline_results_df[cols]
baseline_results_df.head(len(baseline_results_df))

  'precision', 'predicted', average, warn_for)


Unnamed: 0,Model:,Accuracy:,Precision:,ROC:
0,DecisionTree,0.768054,0.238095,0.500799
1,RandomForest,0.790994,0.642857,0.557773
2,SVM,0.796092,0.866667,0.547433
3,LogisticRegression,0.772302,0.35,0.506255
6,Naive Majority Class Classifier,0.7774,0.0,0.5


## Original CNN (Model 1)

In [14]:
cnn_1 = Sequential()
cnn_1.add(Embedding(vocabulary_size, 100, input_length=maxlen))
cnn_1.add(Dropout(0.2))
cnn_1.add(Conv1D(64, 5, activation='relu'))
cnn_1.add(MaxPooling1D(pool_size=4))
cnn_1.add(LSTM(100))
cnn_1.add(Dense(2, activation='softmax'))
cnn_1.compile(loss='binary_crossentropy', 
               optimizer='adam', 
                metrics=['accuracy', 
                    custom_keras_metrics.keras_precision, 
                    custom_keras_metrics.keras_recall, 
                    custom_keras_metrics.keras_auc])

fit_cnn_1 = cnn_1.fit(text_train[new_col_names], 
                  label_train, 
                  validation_data=(text_test[new_col_names],label_test), 
                  epochs=5,
                  shuffle=True,
                  batch_size=3)
    

Train on 2746 samples, validate on 1177 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


## CNN 2 - Without LSTM

In [20]:
cnn_2 = Sequential()
cnn_2.add(Embedding(vocabulary_size, 100, input_length=maxlen))
cnn_2.add(Dropout(0.2))
cnn_2.add(Conv1D(64, 5, activation='relu'))
cnn_2.add(MaxPooling1D(pool_size=4))
cnn_2.add(Flatten())
cnn_2.add(Dense(2, activation='softmax'))
cnn_2.compile(loss='binary_crossentropy', 
               optimizer='adam', 
                metrics=['accuracy', 
                    custom_keras_metrics.keras_precision, 
                    custom_keras_metrics.keras_recall, 
                    custom_keras_metrics.keras_auc])

fit_cnn_2 = cnn_2.fit(text_train[new_col_names], 
                  label_train, 
                  validation_data=(text_test[new_col_names],label_test), 
                  epochs=5,
                  shuffle=True,
                  batch_size=3)

Train on 2746 samples, validate on 1177 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


## CNN 3 - LSTM without Convolution

In [24]:
cnn_3 = Sequential()
cnn_3.add(Embedding(vocabulary_size, 100, input_length=maxlen))
cnn_3.add(Dropout(0.2))
cnn_3.add(LSTM(100))
cnn_3.add(Dense(2, activation='softmax'))
cnn_3.compile(loss='binary_crossentropy', 
               optimizer='adam', 
                metrics=['accuracy', 
                    custom_keras_metrics.keras_precision, 
                    custom_keras_metrics.keras_recall, 
                    custom_keras_metrics.keras_auc])

fit_cnn_3 = cnn_3.fit(text_train[new_col_names], 
                  label_train, 
                  validation_data=(text_test[new_col_names],label_test), 
                  epochs=5,
                  shuffle=True,
                  batch_size=3)

Train on 2746 samples, validate on 1177 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


## Autoencoder

In [16]:
dim = text_train[new_col_names].shape[1]

auto_encoder_1 = Sequential()
auto_encoder_1.add(Dense(units=100, activation='relu', input_dim=dim))
auto_encoder_1.add(Dense(units=500, activation='softmax'))
auto_encoder_1.add(Dense(units=10, activation='softmax'))
auto_encoder_1.add(Dense(units=1000, activation='relu'))
auto_encoder_1.add(Dense(units=500, activation='relu'))
auto_encoder_1.add(Dense(units=2, activation='softmax'))

auto_encoder_1.compile(loss='binary_crossentropy',
              optimizer=keras.optimizers.Adagrad(lr=0.001), 
              metrics=['accuracy', 
                       custom_keras_metrics.keras_precision, 
                       custom_keras_metrics.keras_recall, 
                       custom_keras_metrics.keras_auc])

fit_auto_encoder_1 = auto_encoder_1.fit(text_train[new_col_names], 
                  label_train, 
                  validation_data=(text_test[new_col_names],label_test), 
                epochs=5,
                batch_size=3,
                shuffle=True)

Train on 2746 samples, validate on 1177 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
