In [1]:
!git clone https://github.com/adobe-research/deft_corpus.git

Cloning into 'deft_corpus'...
remote: Enumerating objects: 894, done.[K
remote: Counting objects: 100% (894/894), done.[K
remote: Compressing objects: 100% (462/462), done.[K
remote: Total 2196 (delta 601), reused 669 (delta 424), pack-reused 1302[K
Receiving objects: 100% (2196/2196), 42.39 MiB | 5.42 MiB/s, done.
Resolving deltas: 100% (1386/1386), done.


In [2]:
!unzip src.zip

Archive:  src.zip
   creating: scripts/
  inflating: scripts/__init__.py     
  inflating: scripts/task1_converter.py  
   creating: source/
  inflating: source/__init__.py      
  inflating: source/classifiers.py   
  inflating: source/data_loader.py   
  inflating: source/text_vectorizers.py  
  inflating: Data Loading and Preparation.ipynb  
  inflating: README.md               


# Loading The Data

In [0]:
from source.data_loader import DeftCorpusLoader

In [0]:
loader = DeftCorpusLoader('deft_corpus/data')

In [0]:
train_df, dev_df = loader.load_classification_data()

In [75]:
train_df.head()

Unnamed: 0,Sentence,HasDef
0,6110 . Defining obscenity has been something ...,0
1,"Into the early twentieth century , written wo...",0
2,"In 1973 , the Supreme Court established the M...",1
3,"Miller v. California , 413 U.S. 15 ( 1973 ) .",0
4,"However , the application of this standard ha...",0


# Imports

In [0]:
import pandas as pd
import numpy as np
from tqdm import tqdm
from gensim.models import Doc2Vec
from gensim.models import Word2Vec
from sklearn import utils
from sklearn.model_selection import train_test_split
import gensim
from sklearn.linear_model import LogisticRegression
from gensim.models.doc2vec import TaggedDocument
import re
import seaborn as sns
import matplotlib.pyplot as plt
import tensorflow as tf
from tensorflow.keras.layers import LSTM, Bidirectional
from tensorflow.keras.layers import Embedding
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras.layers import Dense, Activation, GlobalMaxPooling1D, Dropout, Flatten
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Data Preprocessing

In [0]:
loader.preprocess_data(train_df)

In [0]:
loader.clean_data(train_df)

In [0]:
loader.preprocess_data(dev_df)

In [0]:
loader.clean_data(dev_df)

In [106]:
train_df.head()

Unnamed: 0,Sentence,HasDef,Parsed
0,6110 . Defining obscenity has been something ...,0,"[defining, obscenity, challenge, court, suprem..."
1,"Into the early twentieth century , written wo...",0,"[early, 20, century, write, work, frequently, ..."
2,"In 1973 , the Supreme Court established the M...",1,"[supreme, court, establish, miller, test, deci..."
5,"In particular , the concept of “ contemporary...",0,"[particular, concept, contemporary, community,..."
6,6113 . Free expression includes the right to ...,0,"[free, expression, include, right, assemble, p..."


In [0]:
vocab = np.unique([y for x in train_df['Parsed'] for y in x])

In [0]:
vocab_size = len(vocab)

In [0]:
max_length = np.max([np.count_nonzero(x) for x in train_df['Parsed']])

In [0]:
avg_length = int(np.ceil(np.average([np.count_nonzero(x) for x in train_df['Parsed']])))

In [110]:
len(train_df['HasDef'])

16165

In [0]:
train_positive_class_length = np.count_nonzero([x for x in train_df['HasDef'] if x == 1])
train_negative_class_length = np.abs(len(train_df['HasDef']) - train_positive_class_length)

In [0]:
MAX_NB_WORDS = vocab_size    # max no. of words for tokenizer
MAX_SEQUENCE_LENGTH = avg_length # max length of each entry (sentence), including padding
EMBEDDING_DIM = 100      # embedding dimensions for word vectors (word2vec/GloVe)
GLOVE_DIR = "glove.6B."+str(EMBEDDING_DIM)+"d.txt"

# Doc2Vec Model

## Buliding Vocab and Training

In [0]:
train_tagged = train_df.apply(lambda x: TaggedDocument(words= x['Parsed'], tags= str(x['HasDef'])), axis=1)

In [0]:
dev_tagged = dev_df.apply(lambda x: TaggedDocument(words= x['Parsed'], tags= str(x['HasDef'])), axis=1)

In [20]:
train_tagged.values

array([TaggedDocument(words=['defining', 'obscenity', 'challenge', 'court', 'supreme', 'court', 'justice', 'potter', 'stewart', 'famously', 'say', 'obscenity', 'have', 'watch', 'pornography', 'supreme', 'court', 'build', 'know'], tags='0'),
       TaggedDocument(words=['early', '20', 'century', 'write', 'work', 'frequently', 'ban', 'obscene', 'include', 'work', 'note', 'author', 'james', 'joyce', 'henry', 'miller', 'today', 'rare', 'court', 'uphold', 'obscenity', 'charge', 'write', 'material'], tags='0'),
       TaggedDocument(words=['supreme', 'court', 'establish', 'miller', 'test', 'decide', 'obscene', 'average', 'person', 'apply', 'contemporary', 'community', 'standard', 'find', 'work', 'take', 'appeal', 'prurient', 'interest', 'b', 'work', 'depict', 'describe', 'patently', 'offensive', 'way', 'sexual', 'conduct', 'specifically', 'define', 'applicable', 'state', 'law', 'c', 'work', 'take', 'lack', 'literary', 'artistic', 'political', 'scientific', 'value'], tags='1'),
       ...,
  

In [21]:
model_dbow = Doc2Vec(dm=0, vector_size=300, negative=5, hs=0, min_count=2, sample = 0, epochs=30)
model_dbow.build_vocab([x for x in tqdm(train_tagged.values)])

100%|██████████| 16165/16165 [00:00<00:00, 1645334.02it/s]


In [22]:
model_dmm = Doc2Vec(dm=1, dm_mean=1, vector_size=300, window=10, negative=5, min_count=1, workers=5, alpha=0.065, min_alpha=0.065, epochs=30)
model_dmm.build_vocab([x for x in tqdm(train_tagged.values)])

100%|██████████| 16165/16165 [00:00<00:00, 1604831.57it/s]


In [23]:
%%time
model_dbow.train(utils.shuffle([x for x in tqdm(train_tagged.values)]), total_examples=len(train_tagged.values), epochs=model_dbow.epochs)

100%|██████████| 16165/16165 [00:00<00:00, 949858.84it/s]


CPU times: user 40.9 s, sys: 6.86 s, total: 47.8 s
Wall time: 30 s


In [24]:
model_dmm.train(utils.shuffle([x for x in tqdm(train_tagged.values)]), total_examples=len(train_tagged.values), epochs=model_dmm.epochs)

100%|██████████| 16165/16165 [00:00<00:00, 1290389.28it/s]


In [0]:
def vec_for_learning(model, tagged_docs):
    labels, vectors = zip(*[(doc.tags[0], model.infer_vector(doc.words, epochs=model.epochs)) for doc in tagged_docs.values])
    return labels, vectors

In [26]:
len(train_tagged)

16165

In [0]:
y_train_dbow, X_train_dbow = vec_for_learning(model_dbow, train_tagged)
y_dev_dbow, X_dev_dbow = vec_for_learning(model_dbow, dev_tagged)

In [0]:
y_train_dmm, X_train_dmm = vec_for_learning(model_dmm, train_tagged)
y_dev_dmm, X_dev_dmm = vec_for_learning(model_dmm, dev_tagged)

## Infernece Step

### Naive Bayes

In [30]:
from sklearn.metrics import classification_report
from sklearn.naive_bayes import GaussianNB

nb = GaussianNB()
nb.fit(X_train_dbow, y_train_dbow)

y_pred_dbow = nb.predict(X_dev_dbow)

print('Dev Classification Report:\n {}'.format(classification_report(y_dev_dbow, y_pred_dbow)))

Dev Classification Report:
               precision    recall  f1-score   support

           0       0.80      0.76      0.78       509
           1       0.59      0.64      0.61       271

    accuracy                           0.72       780
   macro avg       0.69      0.70      0.70       780
weighted avg       0.73      0.72      0.72       780



### Linear SVC

In [31]:
from sklearn.svm import LinearSVC
from sklearn.metrics import classification_report

svc = LinearSVC(class_weight='balanced')
svc.fit(X_train_dbow, y_train_dbow)
y_pred_dbow_svc = svc.predict(X_dev_dbow)


print('Dev accuracy %s' % accuracy_score(y_dev_dbow, y_pred_dbow_svc))
print('Dev classification report:\n {}'.format(classification_report(y_dev_dbow, y_pred_dbow_svc)))


Dev accuracy 0.7064102564102565
Dev F1 score:
               precision    recall  f1-score   support

           0       0.79      0.75      0.77       509
           1       0.57      0.63      0.60       271

    accuracy                           0.71       780
   macro avg       0.68      0.69      0.68       780
weighted avg       0.71      0.71      0.71       780



### Logistic Regresssion

In [33]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

logreg = LogisticRegression(class_weight='balanced')
logreg.fit(X_train_dbow, y_train_dbow)
y_pred_dbow_logreg = logreg.predict(X_dev_dbow)

print('Dev accuracy %s' % accuracy_score(y_dev_dbow, y_pred_dbow_logreg))
print('Dev classification report:\n {}'.format(classification_report(y_dev_dbow, y_pred_dbow_logreg)))


Dev accuracy 0.7115384615384616
Dev classification report:
               precision    recall  f1-score   support

           0       0.79      0.75      0.77       509
           1       0.58      0.63      0.60       271

    accuracy                           0.71       780
   macro avg       0.69      0.69      0.69       780
weighted avg       0.72      0.71      0.71       780

