In [1]:
import ast
from itertools import chain

import numpy as np
import pandas as pd
from joblib import dump
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS

from utils.lemma_tokenizer import LemmaTokenizer

In [19]:
# Load the raw dataset; the first CSV column becomes the row index.
df = pd.read_csv(filepath_or_buffer='./data/glaw/glaw_data.csv', index_col=0)
# Preview the first five rows.
df.head(n=5)

Unnamed: 0,candidates,labels
0,['CO-BRANDING AND ADVERTISING AGREEMENT THIS C...,"[0, 0, 0, 1, 0]"
1,['EXHIBIT 4.25 INFORMATION IN THIS EXHIBIT IDE...,"[0, 0, 1, 0]"
2,['Offices of the Joint Venture. The principal ...,"[0, 0, 1]"
3,['Exhibit 10.31 PURSUANT TO 17 C.F.R. § 240.24...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0]"
4,['25. APPLICABLE LAW This Agreement shall be g...,"[1, 0, 0, 0]"


In [20]:
def _parse_literal(value):
    """Parse a stringified Python literal; return non-string values unchanged.

    ``ast.literal_eval`` only accepts literals (lists, numbers, strings, ...),
    so unlike ``eval`` it cannot execute code embedded in the CSV file.
    Values that are not strings (e.g. already-parsed lists) pass through,
    which keeps this cell safe to re-run on the same frame.
    """
    return ast.literal_eval(value) if isinstance(value, str) else value


# The CSV stores every list as its string representation; turn both columns
# back into real Python lists.
df['candidates'] = df['candidates'].apply(_parse_literal)
df['labels'] = df['labels'].apply(_parse_literal)
print(type(df.iloc[0]['candidates']))
print(type(df.iloc[0]['labels']))

<class 'list'>
<class 'list'>


In [4]:
# Version tag interpolated into the file names of the saved artifacts
# (data_headers_{data_version}.sav and data_vectorized_{data_version}.sav).
data_version = 0

In [5]:
# Token pattern: whole words made of at least two lowercase ASCII letters.
pattern_token = r'(?u)(\b[a-z]{2,}\b)'
tokenizer = LemmaTokenizer(pattern_token)

# Run every built-in English stop word through the tokenizer's lemmatizer
# (presumably so they match the lemmatized tokens the tokenizer emits --
# confirm against utils.lemma_tokenizer), then add the extra word 'shall'.
stop_words_lematized = [
    *map(tokenizer.lemmatizer.lemmatize, ENGLISH_STOP_WORDS),
    'shall',
]

# Binary bag-of-words over unigrams and bigrams; terms appearing in fewer
# than 10 documents are dropped from the vocabulary.
vectorizer = CountVectorizer(
    binary=True,
    min_df=10,
    ngram_range=(1, 2),
    stop_words=stop_words_lematized,
    lowercase=True,
    strip_accents='unicode',
    tokenizer=tokenizer,
)

In [6]:
# Flatten the per-document candidate lists into a single stream of texts.
# The stream is a one-shot iterator: it is exhausted once the fit is done.
all_text_list = chain(*df['candidates'])
# fit() returns the vectorizer itself, so the vocabulary size can be read
# straight off the result.
len(vectorizer.fit(all_text_list).vocabulary_)



4135

In [21]:
# Vectorize the candidates of every document, drop candidates with too few
# vocabulary features, and keep only documents that still contain a positive
# label afterwards.
#
# The filtered labels are collected and written back to ``df`` explicitly
# after the loop. Assigning to the row yielded by ``DataFrame.iterrows()`` is
# not a reliable way to update the frame: pandas documents that the row may be
# a copy, in which case the write is silently lost.
#
# This cell shortens the label lists stored in ``df``; reload and re-parse
# ``df`` before running it a second time.
data_vectorized = []
labels_updated = []
for doc_candidate_list, doc_labels in zip(df['candidates'], df['labels']):
    labels = np.asarray(doc_labels)

    text_vectorized = vectorizer.transform(doc_candidate_list)
    if text_vectorized.shape[0] != labels.shape[0]:
        raise ValueError(
            'Number of candidates and labels differ for a document; '
            'reload df before re-running this cell.'
        )

    # Keep only candidates with more than one non-zero vocabulary feature.
    mask = text_vectorized.getnnz(axis=1) > 1
    text_vectorized = text_vectorized[mask]
    labels = labels[mask]

    if 1 in labels:
        # only if at least 1 positive example present
        data_vectorized.append(text_vectorized)
        # tolist() yields plain Python ints, so the labels serialize to CSV
        # as "[0, 1, ...]" regardless of how NumPy prints its scalar types.
        labels_updated.append(labels.tolist())
    else:
        # NOTE(review): documents without a surviving positive keep their
        # original labels in df but get no entry in data_vectorized, so df
        # can hold more rows than data_vectorized -- confirm that downstream
        # code accounts for this when pairing the two.
        labels_updated.append(doc_labels)

df['labels'] = pd.Series(labels_updated, index=df.index, dtype=object)
len(data_vectorized)

[0, 0, 1, 0]
[0, 0, 1]


429

In [22]:
# Persist the fitted vocabulary under a version-tagged file name.
vectorizer_params = dict(vocabulary_=vectorizer.vocabulary_)
dump(
    value=vectorizer_params,
    filename=f'./data/glaw/data_headers_{data_version}.sav',
)

['./data/glaw/data_headers_0.sav']

In [23]:
# Persist the list of per-document sparse matrices under a version-tagged
# file name.
dump(
    value=data_vectorized,
    filename=f'./data/glaw/data_vectorized_{data_version}.sav',
)

['./data/glaw/data_vectorized_0.sav']

In [24]:
# Write only the labels column (plus the row index) to disk.
# NOTE(review): unlike the .sav artifacts, this path does not include
# data_version -- confirm that is intended.
df[['labels']].to_csv('./data/glaw/labels.csv')