In [1]:
import warnings
warnings.filterwarnings('ignore')
import json
import pandas as pd
from tqdm.notebook import tqdm
import numpy as np
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split
from collections import Counter
import re
from lightgbm import LGBMClassifier

In [2]:
PUNCT_PATTERN = "-/():.,;"
YEAR_PATTERN = r"\b(19|20)\d{2}\b"  # 1900 - 2099
DDMM_PATTERN = r"\d{2}/\d{2}"  # 00/00 - 99/99

punct_remover = str.maketrans(PUNCT_PATTERN, " " * len(PUNCT_PATTERN))


def words_from_values(values):
    """
    Extract words from values
    """
    
    return [y for y in sum([x.lower().translate(punct_remover).split() for x in values], []) if y and not re.search(r"\d", y)]


def get_words(data):
    """
    Collect all the words extracted from values for the whole provided data
    """

    words = []
    for obj in tqdm(data):
        words.extend(words_from_values(obj['values']))
        
    return words


def words_counter(data):
    """
    Count popularity of words of provided data
    """

    words = get_words(data)
    
    return {x: y / len(words) for x, y in Counter(words).items()}

### Data reading and preparation

In [3]:
# Reading data
with open('./document-standardization-training-dataset.txt', 'r', encoding="utf8") as f:
    data = [json.loads(x) for x in f.readlines()]

In [4]:
# Concat all row of the data
data_all = []
for row in tqdm(data):
    data_all.extend(row)

HBox(children=(FloatProgress(value=0.0, max=14932.0), HTML(value='')))




In [5]:
# Get a subset of the data for easier demostration
sample_sz = 150000
data_sampled = np.random.choice(data_all, size=sample_sz, replace=False)

Simplify data structure

```json
{'values': [
    {'value':'a'}, 
    {'value': 'b'}
  ],
  'type': 'HEADERS'
}

to

{'values': ['a', 'b'], 'type': 1}
```

In [6]:
data_sampled = [{'values':[x['value'] for x in x['values']], 'type': x['type'] == 'HEADERS'} for x in data_sampled]

### Data analysis and feature extraction

At first glance, several noticeable characteristics can be observed in the data, which distinguish HEADERS from other objects.
1. Sparcity: amount of empty values
2. Number of floats
3. Number of long ints
4. Size is not distinguishing by itself, but working other features it could make sence for boosting algorithm

We are starting with collecting that features

<i>* It is important to note that all the features can be collected using a single loop. However, for presentation purposes, and better explanation, we have separated them into multiple loops.</i>

In [7]:
# Creating empty dataframe to collect features
df = pd.DataFrame()

#### Collecting first features

In [8]:
types = []
sparsity = []
floats_N = []
long_ints = []
size = []
for row in tqdm(data_sampled):
    types.append(row['type'])
    
    values = np.array(row['values'])
    
    size.append(len(values))
    
    sparsity.append((values == '').sum() / len(values))
    
    floats_N.append(sum(np.vectorize(lambda x: x.replace('.', '', 1).isdigit() and '.' in x)(values)) / len(values))
    
    long_ints.append(sum(np.vectorize(lambda x: x.replace('.', '', 1).isdigit() and len(x) > 4)(values)) / len(values))
    
df['y'] = types
df['sparsity'] = sparsity
df['floats_N'] = floats_N
df['long_ints'] = long_ints
df['size'] = size

HBox(children=(FloatProgress(value=0.0, max=150000.0), HTML(value='')))




#### Linguistic patterns detection
Second thing to come in mind is to find additional linguistic patterns distinguishing HEADERS from others. First of all separate data to HEADERS and not HEADERS

In [9]:
# Objects with type='HEADERS'
data_headers = [x for x in data_sampled if x['type']]
# Other objects
data_others = [x for x in data_sampled if not x['type']]

Then get words popularity in each sectors

In [10]:
header_words = words_counter(data_headers)

HBox(children=(FloatProgress(value=0.0, max=4378.0), HTML(value='')))




In [11]:
other_words = words_counter(data_others)

HBox(children=(FloatProgress(value=0.0, max=145622.0), HTML(value='')))




Then we calculate the weight of the word in sense of how it correspond to HEADERS by following formula:

$ph$ - word popularity in headers

$po$ - word popularity in others
$$
weight = \frac{ph-po}{ph+po}
$$

In [12]:
all_words = set(get_words(data_sampled))

HBox(children=(FloatProgress(value=0.0, max=150000.0), HTML(value='')))




In [13]:
word_weights = {}
for word in all_words:
    h = header_words.get(word, 0)
    o = other_words.get(word, 0) 
    word_weights[word] = (h - o) / (h + o)

Finally we get mean, max and std vaues of weights of all words or the object

In [14]:
word_weights_mean = []
word_weights_max = []
word_weights_std = []
for row in tqdm(data_sampled):
    values = np.array(row['values'])
    
    word_weights_ = np.array([word_weights.get(word, 0) for word in words_from_values(values)])
    if len(word_weights_) > 0:
        word_weights_mean.append(word_weights_.mean())
        word_weights_max.append(word_weights_.max())
        word_weights_std.append(word_weights_.std())
    else:
        word_weights_mean.append(np.nan)
        word_weights_max.append(np.nan)
        word_weights_std.append(np.nan)
        
df['word_weights_mean'] = word_weights_mean
df['word_weights_max'] = word_weights_max
df['word_weights_std'] = word_weights_std


HBox(children=(FloatProgress(value=0.0, max=150000.0), HTML(value='')))




#### Dates pattern detection
Moving forward, we notice that headers object contains dates in format of 'yyyy' and 'dd/mm'. So we add two features more denoting amount of dates in the values

In [15]:
years_N = []
ddmm_N = []
for row in tqdm(data_sampled):
    values = np.array(row['values'])
        
    string = ' '.join(values)
    if len(string) > 0:
        years_N.append(len(re.findall(YEAR_PATTERN, string)) / len(string))
        ddmm_N.append(len(re.findall(DDMM_PATTERN, string)) / len(string))
    else:
        years_N.append(np.nan)
        ddmm_N.append(np.nan)
        
df['years_N'] = years_N
df['ddmm_N'] = ddmm_N

HBox(children=(FloatProgress(value=0.0, max=150000.0), HTML(value='')))




### Training the model
It is known that gradient boosting algorithms performs better for such kind of data. We get LightGMB classifier as fastest performing for such kind of data 

In [16]:
X = df.drop('y', axis=1).drop('size', axis=1)
y = df['y']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [17]:
clf = LGBMClassifier(boosting_type='gbdt', num_leaves=20, max_depth=10, learning_rate=0.1, n_estimators=500)
clf.fit(X_train, y_train)

### Testing
We get simple accuracy score for testing our result (true / all). As we have strongly inbalanced data it's reasonable to compare result with dummy classifier, which just predict no HEADERS for all objects

In [18]:
y_pred = clf.predict(X_test)

In [19]:
acc = accuracy_score(y_test, y_pred)
acc_fake = accuracy_score(y_test, [False]* len(y_test))
print("Accuracy:", acc)
print('Dummy Acc.:', acc_fake)

Accuracy: 0.9940333333333333
Dummy Acc.: 0.9714


Also it's reasonable to test accuracy only for HEADERS, i.e. how much HEADERS was correctly classified

In [20]:
print(accuracy_score([1] * y_test.sum(), y_pred[y_test]))

0.8578088578088578


For better understanding of our model performance, precission, recall and f1-score could be calculated

In [21]:
print('precision', precision_score(y_test, y_pred))
print('recall', recall_score(y_test, y_pred))
print('f1-score', f1_score(y_test, y_pred))

precision 0.9281210592686002
recall 0.8578088578088578
f1-score 0.8915808600847971


It is important to note that for specific needs and requirements, it may be necessary to use different evaluation metrics to ensure that the model meets the desired objectives. For example, if correctly classifying all HEADERS is critical, while the correct classification of other labels is less important, then the model's balance may need to be adjusted accordingly to optimize for this specific metric.

### Future improvements

1. Like in the majority of machine learning models, increasing the size of the training data can lead to improved performance and reduced overfitting. This is because a larger dataset provides a more diverse range of examples for the model to learn from, helping it to better generalize to new, unseen data.
2. Basic data object descriptions improve data understanding and meaningful pattern identification, leading to better performance and more accurate predictions.
3. Using advanced functions and aggregation methods (other than max, mean, std) to calculate words' weights.
4. Search linguistic patterns not only to words but also to word2vecs, so the model will better work in case of unseen words.
5. Search linguistic patterns not to words but to values. Better cleanup teqniques should be implemented.
6. Create word classes (e.g. Jan, Feb, ... to class month) and search linguistic patterns in classes.
7. Continuosly browsing other models, playing with parameters and fine tuning the model