In [0]:
# Fetch the DEFT corpus repository into ./deft_corpus; the loader below reads 'deft_corpus/data'.
# NOTE(review): no commit/tag is pinned, so the data depends on the repository's current HEAD.
!git clone https://github.com/adobe-research/deft_corpus.git

Cloning into 'deft_corpus'...
remote: Enumerating objects: 894, done.[K
remote: Counting objects: 100% (894/894), done.[K
remote: Compressing objects: 100% (462/462), done.[K
remote: Total 2196 (delta 601), reused 669 (delta 424), pack-reused 1302[K
Receiving objects: 100% (2196/2196), 42.39 MiB | 5.42 MiB/s, done.
Resolving deltas: 100% (1386/1386), done.


In [0]:
!unzip src.zip

Archive:  src.zip
   creating: scripts/
  inflating: scripts/__init__.py     
  inflating: scripts/task1_converter.py  
   creating: source/
  inflating: source/__init__.py      
  inflating: source/classifiers.py   
  inflating: source/data_loader.py   
  inflating: source/text_vectorizers.py  
  inflating: Data Loading and Preparation.ipynb  
  inflating: README.md               


# Loading The Data

In [0]:
from source.data_loader import DeftCorpusLoader

In [0]:
# Point the project loader at the data directory of the cloned deft_corpus repository.
loader = DeftCorpusLoader('deft_corpus/data')

In [0]:
# Load the train and dev splits of the sentence-classification data.
# The train frame displays a `Sentence` text column and a 0/1 `HasDef` label column.
train_df, dev_df = loader.load_classification_data()

In [0]:
# Peek at the first rows of the raw (not yet preprocessed) training frame.
train_df.head()

Unnamed: 0,Sentence,HasDef
0,6110 . Defining obscenity has been something ...,0
1,"Into the early twentieth century , written wo...",0
2,"In 1973 , the Supreme Court established the M...",1
3,"Miller v. California , 413 U.S. 15 ( 1973 ) .",0
4,"However , the application of this standard ha...",0


# Imports

In [0]:
import pandas as pd
import numpy as np
from tqdm import tqdm
from gensim.models import Doc2Vec
from gensim.models import Word2Vec
from sklearn import utils
from sklearn.model_selection import train_test_split
import gensim
from sklearn.linear_model import LogisticRegression
from gensim.models.doc2vec import TaggedDocument
import re
import seaborn as sns
import matplotlib.pyplot as plt
import tensorflow as tf
from tensorflow.keras.layers import LSTM, Bidirectional
from tensorflow.keras.layers import Embedding
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras.layers import Dense, Activation, GlobalMaxPooling1D, Dropout, Flatten
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Data Preprocessing

In [0]:
# Called for its side effect on train_df (the return value is not used).
# After this and the clean_data call, train_df displays a new `Parsed` token-list column;
# presumably it is added here — confirm in source/data_loader.py.
loader.preprocess_data(train_df)

In [0]:
# Called for its side effect on train_df (the return value is not used).
# The head() shown afterwards no longer contains index 3 and 4, so rows appear to be
# dropped in place — presumably by this call; confirm in source/data_loader.py.
loader.clean_data(train_df)

In [0]:
# Same preprocessing call as for the training split, applied to dev_df (return value unused).
loader.preprocess_data(dev_df)

In [0]:
# Same cleaning call as for the training split, applied to dev_df (return value unused).
loader.clean_data(dev_df)

In [0]:
# Inspect the training frame after preprocessing/cleaning (it now shows a `Parsed` column).
train_df.head()

Unnamed: 0,Sentence,HasDef,Parsed
0,6110 . Defining obscenity has been something ...,0,"[defining, obscenity, challenge, court, suprem..."
1,"Into the early twentieth century , written wo...",0,"[early, 20, century, write, work, frequently, ..."
2,"In 1973 , the Supreme Court established the M...",1,"[supreme, court, establish, miller, test, deci..."
5,"In particular , the concept of “ contemporary...",0,"[particular, concept, contemporary, community,..."
6,6113 . Free expression includes the right to ...,0,"[free, expression, include, right, assemble, p..."


In [0]:
# Sorted array of the distinct tokens found across all parsed training sentences.
vocab = np.unique(
    [token for sentence_tokens in train_df['Parsed'] for token in sentence_tokens]
)

In [0]:
# Number of distinct training tokens (vocab is the 1-D array returned by np.unique).
vocab_size = vocab.shape[0]

In [0]:
# Length, in tokens, of the longest parsed training sentence.
# len() counts every token; np.count_nonzero only counts tokens that numpy
# considers non-zero (e.g. it skips empty strings) and converts each token
# list to an array just to measure it.
max_length = np.max([len(tokens) for tokens in train_df['Parsed']])

In [0]:
# Mean parsed-sentence length in tokens, rounded up to a whole number.
# len() counts every token; np.count_nonzero only counts tokens that numpy
# considers non-zero (e.g. it skips empty strings), which would bias the mean.
avg_length = int(np.ceil(np.mean([len(tokens) for tokens in train_df['Parsed']])))

In [0]:
# Number of training rows left after the preprocessing/cleaning calls above.
len(train_df['HasDef'])

16165

In [0]:
# Class balance of the training labels: rows with HasDef == 1 form the positive
# class; every remaining row is counted as negative.
train_positive_class_length = int((train_df['HasDef'] == 1).sum())
train_negative_class_length = np.abs(
    train_df['HasDef'].size - train_positive_class_length
)

In [0]:
# --- Word-vector settings ---
EMBEDDING_DIM = 100  # dimensionality of the word vectors (word2vec/GloVe)
# File name of the GloVe vectors for that dimensionality (a file name, despite "DIR").
GLOVE_DIR = "glove.6B." + str(EMBEDDING_DIM) + "d.txt"

# --- Tokenizer / padding settings ---
# Vocabulary cap for the tokenizer: the number of distinct training tokens.
# NOTE(review): if this is passed as Keras Tokenizer(num_words=...), only the
# num_words-1 most frequent words are kept — confirm whether +1 is intended.
MAX_NB_WORDS = vocab_size
# Length of each entry (sentence), including padding.
# NOTE(review): this is the *average* parsed length, so longer sentences would
# be truncated if it is used as the padding length — confirm this is intended.
MAX_SEQUENCE_LENGTH = avg_length

# SBERT Pretrained

BERT (Bidirectional Encoder Representations from Transformers) is a language representation model introduced in a paper by researchers at Google AI Language. It caused a stir in the machine learning community by presenting state-of-the-art results on a wide variety of NLP tasks.
Out of the box, BERT produces rather poor sentence embeddings. The Sentence-BERT (SBERT) paper fine-tunes BERT / RoBERTa / DistilBERT / ALBERT / XLNet with a siamese or triplet network structure to produce semantically meaningful sentence embeddings that can be used in unsupervised scenarios. Here, the embeddings are used to encode the sentences, and a Naive Bayes classifier trained on them is used to classify the dev dataset.

In [0]:
!pip install -U sentence-transformers

In [0]:
# Rebuild one space-separated string per training sentence from its token list.
train_parsed_sentences = list(map(" ".join, train_df['Parsed']))

In [0]:
# Rebuild one space-separated string per dev sentence from its token list.
dev_parsed_sentences = list(map(" ".join, dev_df['Parsed']))

In [0]:
from sentence_transformers import SentenceTransformer
# Load the pretrained 'bert-large-nli-mean-tokens' sentence-embedding model.
# The recorded run shows a download of about 1.24 GB for it.
model = SentenceTransformer('bert-large-nli-mean-tokens')

100%|██████████| 1.24G/1.24G [02:18<00:00, 8.99MB/s]


In [0]:
# Encode the space-joined training sentences with the SBERT model loaded above.
# NOTE(review): probably one of the slowest cells (large model, whole training split);
# consider caching the result to keep a full re-run feasible.
train_sbert_embeddings = model.encode(train_parsed_sentences)

In [0]:
# Encode the space-joined dev sentences with the same SBERT model.
dev_sbert_embeddings = model.encode(dev_parsed_sentences)

In [0]:
# Sanity check: the number of embeddings should equal the number of training rows.
len(train_sbert_embeddings)

16165

## Naive Bayes

In [0]:
from sklearn.metrics import classification_report
from sklearn.naive_bayes import GaussianNB

# Fit a Gaussian Naive Bayes classifier on the training SBERT embeddings,
# using the HasDef column as the target (fit() returns the fitted estimator).
nb = GaussianNB().fit(train_sbert_embeddings, train_df['HasDef'].values)

# Predict a label for every dev-sentence embedding.
y_pred = nb.predict(dev_sbert_embeddings)

# Per-class precision / recall / F1 of the predictions against the dev labels.
print(
    'Dev classification report:\n {}'.format(
        classification_report(dev_df['HasDef'].values, y_pred)
    )
)

Dev classification report:
               precision    recall  f1-score   support

           0       0.75      0.60      0.67       509
           1       0.46      0.63      0.53       271

    accuracy                           0.61       780
   macro avg       0.60      0.61      0.60       780
weighted avg       0.65      0.61      0.62       780

