In [1]:
import datasets
from datasets import load_dataset, ClassLabel
from transformers import AutoTokenizer
from transformers import AutoModelForSequenceClassification
from transformers import DistilBertForSequenceClassification, Trainer, TrainingArguments
from transformers import pipeline
import evaluate

import numpy as np
import pandas as pd

In [2]:
# pip install transformers datasets accelerate evaluate scikit-learn

In [3]:
df = pd.read_csv("train_sections_data.csv", encoding_errors='ignore')

In [4]:
df = df.iloc[:, 0:10]

In [5]:
df

Unnamed: 0,Text,IsBold,IsItalic,IsUnderlined,Left,Right,Top,Bottom,FontType,Label
0,NOTES TO THE ACCOUNTS FOR THE 52 WEEK PERIOD E...,True,False,False,49.5,544.0,67.3,96.1,New Times Roman,1
1,DERIVATIVE INSTRUMENTS,False,False,False,41.5,140.2,49.0,54.7,New Times Roman,1
2,-6,False,False,False,113.5,122.2,409.3,414.9,New Times Roman,0
3,The following performance graph and related in...,False,False,False,112.9,706.9,339.0,360.3,New Times Roman,0
4,Typical service offerings include supply chain...,False,False,False,33.1,808.5,328.2,369.9,New Times Roman,0
...,...,...,...,...,...,...,...,...,...,...
14210,The scheme's assets comprise a Friends Provide...,False,False,False,36.6,553.1,575.5,594.9,New Times Roman,0
14211,Transaction and Litigation Costs,False,True,True,42.7,151.4,91.0,96.7,New Times Roman,1
14212,The weighted-average fair value of options gra...,False,False,False,33.1,806.9,106.1,121.3,New Times Roman,0
14213,The analysis of gross contractual cash flow di...,False,False,False,42.6,553.1,667.0,687.0,New Times Roman,0


In [6]:
df.columns[df.isna().any()].tolist()

['Text']

In [7]:
df = df.dropna()

In [12]:
df

Unnamed: 0,Text,IsBold,IsItalic,IsUnderlined,Left,Right,Top,Bottom,FontType,Label
0,NOTES TO THE ACCOUNTS FOR THE 52 WEEK PERIOD E...,True,False,False,49.5,544.0,67.3,96.1,New Times Roman,1
1,DERIVATIVE INSTRUMENTS,False,False,False,41.5,140.2,49.0,54.7,New Times Roman,1
2,-6,False,False,False,113.5,122.2,409.3,414.9,New Times Roman,0
3,The following performance graph and related in...,False,False,False,112.9,706.9,339.0,360.3,New Times Roman,0
4,Typical service offerings include supply chain...,False,False,False,33.1,808.5,328.2,369.9,New Times Roman,0
...,...,...,...,...,...,...,...,...,...,...
14210,The scheme's assets comprise a Friends Provide...,False,False,False,36.6,553.1,575.5,594.9,New Times Roman,0
14211,Transaction and Litigation Costs,False,True,True,42.7,151.4,91.0,96.7,New Times Roman,1
14212,The weighted-average fair value of options gra...,False,False,False,33.1,806.9,106.1,121.3,New Times Roman,0
14213,The analysis of gross contractual cash flow di...,False,False,False,42.6,553.1,667.0,687.0,New Times Roman,0


In [9]:
df['FontType'].unique()

array(['New Times Roman'], dtype=object)

In [15]:
from sklearn.utils import resample

df_majority = df[df.Label==0]
df_minority = df[df.Label==1]


df_minority_upsampled = resample(df_minority,
                                 replace=True,
                                 n_samples=len(df_majority),
                                 random_state=42)


df_upsampled = pd.concat([df_majority, df_minority_upsampled])


df_upsampled.Label.value_counts()

0    9888
1    9888
Name: Label, dtype: int64

In [16]:
df = df_upsampled

In [17]:
df['updatedText'] = ''

In [18]:
df

Unnamed: 0,Text,IsBold,IsItalic,IsUnderlined,Left,Right,Top,Bottom,FontType,Label,updatedText
2,-6,False,False,False,113.5,122.2,409.3,414.9,New Times Roman,0,
3,The following performance graph and related in...,False,False,False,112.9,706.9,339.0,360.3,New Times Roman,0,
4,Typical service offerings include supply chain...,False,False,False,33.1,808.5,328.2,369.9,New Times Roman,0,
5,A substantial portion of contract and administ...,False,False,False,33.1,808.6,165.5,189.2,New Times Roman,0,
6,PMA applications must be supported by valid sc...,False,False,False,112.4,724.9,261.4,336.2,New Times Roman,0,
...,...,...,...,...,...,...,...,...,...,...,...
6816,Overview,True,False,False,112.9,144.0,300.7,306.2,New Times Roman,1,
5362,OTHER ASSETS,True,False,False,105.0,179.4,547.1,553.9,New Times Roman,1,
6614,Consolidation,True,False,False,33.1,78.4,322.3,327.8,New Times Roman,1,
4655,Long-lived Assets,False,False,False,34.3,93.2,366.0,371.7,New Times Roman,1,


In [19]:
bold_titel, bold_text = [], []
italic_titel, italic_text = [], []
underlined_titel, underlined_text = [], []

for i, row in df.iterrows():
    text = row['Text']
    if row['IsBold']:
        text += ' bold'
    if row['IsItalic']:
        text += ' italic'
    if row['IsUnderlined']:
        text += ' underlined'

    df.loc[i, 'updatedText'] = text

    if row['IsBold'] and row['Label']:
        bold_titel.append(i)
    elif row['IsBold'] and not row['Label']:
        bold_text.append(i)

    if row['IsItalic'] and row['Label']:
        italic_titel.append(i)
    elif row['IsItalic'] and not row['Label']:
        italic_text.append(i)

    if row['IsUnderlined'] and row['Label']:
        underlined_titel.append(i)
    elif row['IsUnderlined'] and not row['Label']:
        underlined_text.append(i)

In [20]:
print(len(bold_titel), len(bold_text))
print(len(italic_titel), len(italic_text))
print(len(underlined_titel), len(underlined_text))

8347 219
2251 215
306 11


In [21]:
df.iloc[13579]['updatedText']

'ANNUAL REPORT bold'

In [22]:
dataset = datasets.Dataset.from_pandas(df)

In [23]:
dataset.features

{'Text': Value(dtype='string', id=None),
 'IsBold': Value(dtype='bool', id=None),
 'IsItalic': Value(dtype='bool', id=None),
 'IsUnderlined': Value(dtype='bool', id=None),
 'Left': Value(dtype='float64', id=None),
 'Right': Value(dtype='float64', id=None),
 'Top': Value(dtype='float64', id=None),
 'Bottom': Value(dtype='float64', id=None),
 'FontType': Value(dtype='string', id=None),
 'Label': Value(dtype='int64', id=None),
 'updatedText': Value(dtype='string', id=None),
 '__index_level_0__': Value(dtype='int64', id=None)}

In [24]:
title_class = ClassLabel(num_classes = 2)

In [25]:
dataset = dataset.cast_column("Label", title_class)

Casting the dataset:   0%|          | 0/19776 [00:00<?, ? examples/s]

In [26]:
dataset = dataset.rename_column("Label", "label")

In [27]:
dataset.features

{'Text': Value(dtype='string', id=None),
 'IsBold': Value(dtype='bool', id=None),
 'IsItalic': Value(dtype='bool', id=None),
 'IsUnderlined': Value(dtype='bool', id=None),
 'Left': Value(dtype='float64', id=None),
 'Right': Value(dtype='float64', id=None),
 'Top': Value(dtype='float64', id=None),
 'Bottom': Value(dtype='float64', id=None),
 'FontType': Value(dtype='string', id=None),
 'label': ClassLabel(names=['0', '1'], id=None),
 'updatedText': Value(dtype='string', id=None),
 '__index_level_0__': Value(dtype='int64', id=None)}

In [28]:
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [29]:
def tokenize_function(examples):
    return tokenizer(examples["updatedText"], padding="max_length", truncation=True)

In [30]:
tokenized_datasets = dataset.map(tokenize_function, batched=True)

Map:   0%|          | 0/19776 [00:00<?, ? examples/s]

In [31]:
tokenized_datasets

Dataset({
    features: ['Text', 'IsBold', 'IsItalic', 'IsUnderlined', 'Left', 'Right', 'Top', 'Bottom', 'FontType', 'label', 'updatedText', '__index_level_0__', 'input_ids', 'attention_mask'],
    num_rows: 19776
})

In [32]:
train_valid = tokenized_datasets.train_test_split(test_size=0.2, shuffle=True, stratify_by_column="label")

In [33]:
train_valid

DatasetDict({
    train: Dataset({
        features: ['Text', 'IsBold', 'IsItalic', 'IsUnderlined', 'Left', 'Right', 'Top', 'Bottom', 'FontType', 'label', 'updatedText', '__index_level_0__', 'input_ids', 'attention_mask'],
        num_rows: 15820
    })
    test: Dataset({
        features: ['Text', 'IsBold', 'IsItalic', 'IsUnderlined', 'Left', 'Right', 'Top', 'Bottom', 'FontType', 'label', 'updatedText', '__index_level_0__', 'input_ids', 'attention_mask'],
        num_rows: 3956
    })
})

In [34]:
model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=2)

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.bias', 'pre_classifier.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [35]:
accuracy = evaluate.load("accuracy")

Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

In [36]:
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return accuracy.compute(predictions=predictions, references=labels)

In [37]:
training_args = TrainingArguments(
    output_dir='./results',          # output directory
    num_train_epochs=5,              # total number of training epochs
    per_device_train_batch_size=32,  # batch size per device during training
    per_device_eval_batch_size=64,   # batch size for evaluation
    warmup_steps=300,                # number of warmup steps for learning rate scheduler
    weight_decay=0.01,               # strength of weight decay
    logging_dir='./logs',            # directory for storing logs
    logging_steps=10,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
)

In [38]:
trainer = Trainer(
    model=model,                         # the instantiated 🤗 Transformers model to be trained
    args=training_args,                  # training arguments, defined above
    tokenizer=tokenizer,
    train_dataset=train_valid['train'],         # training dataset
    eval_dataset=train_valid['test'],             # evaluation dataset
    compute_metrics=compute_metrics,
)

In [39]:
trainer.train()

You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Accuracy
1,0.0761,0.093216,0.971183
2,0.077,0.062024,0.979272
3,0.0616,0.054022,0.98458
4,0.0439,0.064816,0.983064
5,0.0486,0.067861,0.983316


TrainOutput(global_step=2475, training_loss=0.0673183066458112, metrics={'train_runtime': 4035.4714, 'train_samples_per_second': 19.601, 'train_steps_per_second': 0.613, 'total_flos': 1.04781712336896e+16, 'train_loss': 0.0673183066458112, 'epoch': 5.0})

In [40]:
df_test = pd.read_csv("test_sections_data.csv", encoding_errors='ignore')

In [41]:
df_test

Unnamed: 0,Text,IsBold,IsItalic,IsUnderlined,Left,Right,Top,Bottom,FontType,Label
0,Employee Involvement,True,False,False,77.9,175.6,425.3,434.2,New Times Roman,1
1,Income Taxes,True,True,False,33.1,79.6,284.8,290.6,New Times Roman,1
2,,False,False,False,77.9,81.3,346.1,355.0,New Times Roman,0
3,the Audit & Remuneration Committee,False,True,False,124.8,247.3,380.3,385.9,New Times Roman,0
4,Industry Overview,True,False,False,33.1,93.5,266.5,272.0,New Times Roman,1
...,...,...,...,...,...,...,...,...,...,...
1574,The retail apparel industry is highly competit...,False,False,False,33.1,808.5,68.3,109.9,New Times Roman,0
1575,Gary Barrett joined AngioDynamics in May 2014 ...,False,False,False,33.1,803.7,262.8,286.4,New Times Roman,0
1576,Each of our products is offered to customers o...,False,False,False,33.1,770.1,184.1,199.4,New Times Roman,0
1577,"Includes corporate debt instruments (74%), mor...",False,False,False,128.5,728.7,242.4,257.6,New Times Roman,0


In [42]:
df_test.columns[df_test.isna().any()].tolist()

['Text']

In [43]:
df_test = df_test.dropna()

In [44]:
df_test

Unnamed: 0,Text,IsBold,IsItalic,IsUnderlined,Left,Right,Top,Bottom,FontType,Label
0,Employee Involvement,True,False,False,77.9,175.6,425.3,434.2,New Times Roman,1
1,Income Taxes,True,True,False,33.1,79.6,284.8,290.6,New Times Roman,1
3,the Audit & Remuneration Committee,False,True,False,124.8,247.3,380.3,385.9,New Times Roman,0
4,Industry Overview,True,False,False,33.1,93.5,266.5,272.0,New Times Roman,1
5,Lonza Agreement,True,True,False,112.4,169.8,307.3,313.1,New Times Roman,1
...,...,...,...,...,...,...,...,...,...,...
1574,The retail apparel industry is highly competit...,False,False,False,33.1,808.5,68.3,109.9,New Times Roman,0
1575,Gary Barrett joined AngioDynamics in May 2014 ...,False,False,False,33.1,803.7,262.8,286.4,New Times Roman,0
1576,Each of our products is offered to customers o...,False,False,False,33.1,770.1,184.1,199.4,New Times Roman,0
1577,"Includes corporate debt instruments (74%), mor...",False,False,False,128.5,728.7,242.4,257.6,New Times Roman,0


In [45]:
df_test['FontType'].unique()

array(['New Times Roman'], dtype=object)

In [46]:
df_test['updatedText'] = ''

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_test['updatedText'] = ''


In [47]:
bold_titel, bold_text = [], []
italic_titel, italic_text = [], []
underlined_titel, underlined_text = [], []

for i, row in df_test.iterrows():
    text = row['Text']
    if row['IsBold']:
        text += ' bold'
    if row['IsItalic']:
        text += ' italic'
    if row['IsUnderlined']:
        text += ' underlined'

    df_test.loc[i, 'updatedText'] = text

    if row['IsBold'] and row['Label']:
        bold_titel.append(i)
    elif row['IsBold'] and not row['Label']:
        bold_text.append(i)

    if row['IsItalic'] and row['Label']:
        italic_titel.append(i)
    elif row['IsItalic'] and not row['Label']:
        italic_text.append(i)

    if row['IsUnderlined'] and row['Label']:
        underlined_titel.append(i)
    elif row['IsUnderlined'] and not row['Label']:
        underlined_text.append(i)

In [48]:
print(len(bold_titel), len(bold_text))
print(len(italic_titel), len(italic_text))
print(len(underlined_titel), len(underlined_text))

350 19
113 21
17 2


In [49]:
test_dataset = datasets.Dataset.from_pandas(df_test)

In [50]:
test_dataset

Dataset({
    features: ['Text', 'IsBold', 'IsItalic', 'IsUnderlined', 'Left', 'Right', 'Top', 'Bottom', 'FontType', 'Label', 'updatedText', '__index_level_0__'],
    num_rows: 1496
})

In [51]:
title_class = ClassLabel(num_classes = 2)
test_dataset = test_dataset.cast_column("Label", title_class)

Casting the dataset:   0%|          | 0/1496 [00:00<?, ? examples/s]

In [52]:
test_dataset = test_dataset.rename_column("Label", "label")

In [53]:
test_tokenized_datasets = test_dataset.map(tokenize_function, batched=True)

Map:   0%|          | 0/1496 [00:00<?, ? examples/s]

In [54]:
test_tokenized_datasets

Dataset({
    features: ['Text', 'IsBold', 'IsItalic', 'IsUnderlined', 'Left', 'Right', 'Top', 'Bottom', 'FontType', 'label', 'updatedText', '__index_level_0__', 'input_ids', 'attention_mask'],
    num_rows: 1496
})

In [55]:
model.eval()

DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
 

In [56]:
trainer.predict(test_tokenized_datasets)

PredictionOutput(predictions=array([[-3.0535157 ,  2.9586034 ],
       [-3.249946  ,  3.2051532 ],
       [ 1.1665748 , -0.99915504],
       ...,
       [ 4.6252027 , -3.8159842 ],
       [ 4.5212407 , -3.8145194 ],
       [ 1.0959167 , -0.9361274 ]], dtype=float32), label_ids=array([1, 1, 0, ..., 0, 0, 1]), metrics={'test_loss': 0.088981993496418, 'test_accuracy': 0.9725935828877005, 'test_runtime': 25.9455, 'test_samples_per_second': 57.659, 'test_steps_per_second': 0.925})

In [None]:
pipe = pipeline(task= 'sentiment-analysis', model="/content/results/checkpoint-1700")

In [None]:
pipe

<transformers.pipelines.text_classification.TextClassificationPipeline at 0x7ac41a53b5b0>

In [None]:
output = pipe("Employee Involvement bold")

In [None]:
output

[{'label': 'LABEL_1', 'score': 0.9976553916931152}]