# Labeling ESA tables with VEC labels

In [3]:
import pandas as pd
import numpy as np
import os
from snorkel.labeling import labeling_function
from snorkel.labeling import LabelingFunction
from snorkel.labeling.lf.nlp import nlp_labeling_function
from snorkel.preprocess import preprocessor
from snorkel.labeling import PandasLFApplier
from snorkel.labeling.apply.dask import DaskLFApplier
from sklearn.model_selection import train_test_split
import spacy
from spacy.matcher import Matcher
import numpy as np

from tqdm.auto import tqdm

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

from transformers import BertTokenizerFast, BertModel, BertTokenizer, AdamW, get_linear_schedule_with_warmup
from transformers import AutoModelForSequenceClassification

import pytorch_lightning as pl
from pytorch_lightning.metrics.functional import accuracy, f1, auroc
from pytorch_lightning.callbacks import ModelCheckpoint, EarlyStopping, BackboneFinetuning, ModelPruning
RANDOM_SEED = 42
pl.seed_everything(RANDOM_SEED)

# weights and biases
import wandb

# lightning plus wandb
from pytorch_lightning.loggers import WandbLogger

Global seed set to 42


In [4]:
!nvidia-smi

Mon Jun  7 20:10:55 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 465.89       Driver Version: 465.89       CUDA Version: 11.3     |
|-------------------------------+----------------------+----------------------+
| GPU  Name            TCC/WDDM | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA Quadro R... WDDM  | 00000000:01:00.0  On |                  N/A |
| N/A   60C    P8     9W /  N/A |   1223MiB /  6144MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [21]:
esa_df = pd.read_csv('./data/esa_index_with_table_text_no_labels.csv')

In [22]:
esa_df.drop(columns=['Unnamed: 0', 'Index'], inplace=True)

In [23]:
esa_df.head()

Unnamed: 0,Title,Content Type,Application Name,Application Short Name,Application Filing Date,Company Name,Commodity,File Name,ESA Folder URL,Document Number,...,PDF Page Number,PDF Page Count,PDF Size,PDF Outline,Download folder name,Zipped Project Link,Missing CSV,CSV Filename,csvFileName,csvText
0,TABLE 3 SUMMARY OF AQUATICS FIELD WORK AND ABO...,Table,Application for North Montney Project,North Montney,11/8/2013,NOVA Gas Transmission Ltd.,Gas,B2-16 ESA_Appendix_G_Part1of4 (A3Q6H2),https://apps.cer-rec.gc.ca/REGDOCS/Item/LoadRe...,A3Q6H2,...,14,48.0,5.87,No,nrthmntn,http://www.cer-rec.gc.ca/esa-ees/nrthmntn.zip,False,nrthmntn_table-3-summary-of-aquatics-field-wor...,1059614_14_lattice-v_1.csv,Survey Date Aboriginal Communities De...
1,TABLE 4 SUMMARY OF WATERCOURSE CROSSINGS ALONG...,Table,Application for North Montney Project,North Montney,11/8/2013,NOVA Gas Transmission Ltd.,Gas,B2-16 ESA_Appendix_G_Part1of4 (A3Q6H2),https://apps.cer-rec.gc.ca/REGDOCS/Item/LoadRe...,A3Q6H2,...,17,48.0,5.87,No,nrthmntn,http://www.cer-rec.gc.ca/esa-ees/nrthmntn.zip,False,nrthmntn_table-4-summary-of-watercourse-crossi...,1059614_17_lattice-v_1.csv,Field Site No Name Legal Location ...
2,TABLE 4 SUMMARY OF WATERCOURSE CROSSINGS ALONG...,Table,Application for North Montney Project,North Montney,11/8/2013,NOVA Gas Transmission Ltd.,Gas,B2-16 ESA_Appendix_G_Part1of4 (A3Q6H2),https://apps.cer-rec.gc.ca/REGDOCS/Item/LoadRe...,A3Q6H2,...,18,48.0,5.87,No,nrthmntn,http://www.cer-rec.gc.ca/esa-ees/nrthmntn.zip,False,nrthmntn_table-4-summary-of-watercourse-crossi...,1059614_18_lattice-v_1.csv,Field Site No Name Legal Location ...
3,TABLE 4 SUMMARY OF WATERCOURSE CROSSINGS ALONG...,Table,Application for North Montney Project,North Montney,11/8/2013,NOVA Gas Transmission Ltd.,Gas,B2-16 ESA_Appendix_G_Part1of4 (A3Q6H2),https://apps.cer-rec.gc.ca/REGDOCS/Item/LoadRe...,A3Q6H2,...,19,48.0,5.87,No,nrthmntn,http://www.cer-rec.gc.ca/esa-ees/nrthmntn.zip,False,nrthmntn_table-4-summary-of-watercourse-crossi...,1059614_19_lattice-v_1.csv,Field Site No Name Legal Location ...
4,TABLE 4 SUMMARY OF WATERCOURSE CROSSINGS ALONG...,Table,Application for North Montney Project,North Montney,11/8/2013,NOVA Gas Transmission Ltd.,Gas,B2-16 ESA_Appendix_G_Part1of4 (A3Q6H2),https://apps.cer-rec.gc.ca/REGDOCS/Item/LoadRe...,A3Q6H2,...,20,48.0,5.87,No,nrthmntn,http://www.cer-rec.gc.ca/esa-ees/nrthmntn.zip,False,nrthmntn_table-4-summary-of-watercourse-crossi...,1059614_20_lattice-v_1.csv,Field Site No Name Legal Location ...


In [24]:
esa_df['csvText'] = esa_df['csvText'].replace(' +', ' ', regex=True)

In [25]:
esa_df['csvText'].head()

0     Survey Date Aboriginal Communities Detail Jul...
1     Field Site No Name Legal Location UTM Coordin...
2     Field Site No Name Legal Location UTM Coordin...
3     Field Site No Name Legal Location UTM Coordin...
4     Field Site No Name Legal Location UTM Coordin...
Name: csvText, dtype: object

In [39]:
snorkel_df = esa_df[['Title', 'csvText']]
snorkel_df.rename(columns={"csvText": "text"}, inplace=True)
snorkel_df['label'] = -1

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().rename(
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  snorkel_df['label'] = -1


In [40]:
snorkel_df["text"] = snorkel_df["Title"] + " " + snorkel_df["text"]
snorkel_df.drop(columns=["Title"], inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  snorkel_df["text"] = snorkel_df["Title"] + " " + snorkel_df["text"]


In [41]:
snorkel_df.head()

Unnamed: 0,text,label
0,TABLE 3 SUMMARY OF AQUATICS FIELD WORK AND ABO...,-1
1,TABLE 4 SUMMARY OF WATERCOURSE CROSSINGS ALONG...,-1
2,TABLE 4 SUMMARY OF WATERCOURSE CROSSINGS ALONG...,-1
3,TABLE 4 SUMMARY OF WATERCOURSE CROSSINGS ALONG...,-1
4,TABLE 4 SUMMARY OF WATERCOURSE CROSSINGS ALONG...,-1


Here we create 4 dataset splits. What we call `train_ft_df` (Train Fine-Tune Set), `val_ft_df` (Validation Fine_tune Set), and `test_df` (Test Set) will contain labeled data. At the moment the examples are not labeled, but we will label them in Label Studio. The `train_df` is the unlabeled dataset we will be using Snorkel and a BERT model to label.

*   `fine_tune_df`: Labeled data we use for fine-tuning the bert model before putting model into a Snorkel Labeling Function.
*   `val_df`: This is our small dataset split we will use to measure the performance of the bert model trained with `fine_tune_df`.
*    `train_df`: As mentioned, the `train_df` is the unlabeled dataset we will be using Snorkel to label.
*    `test_df`: This is our dataset split for testing Snorkel's Label Model to get an estimate on how many labels were correctly labeled in our `train_df`.

In [45]:
from sklearn.model_selection import train_test_split
train_df, df_for_labeling = train_test_split(snorkel_df, test_size=1000, random_state=42, shuffle=True)

In [46]:
df_for_labeling

Unnamed: 0,text,label
8889,TABLE 8A.2 REASONABLY FORSEEABLE DEVELOPMENTS ...,-1
27753,TABLE 1 RESOURCE-SPECIFIC MITIGATION TABLE Lo...,-1
4165,TABLE D4 HOPE TO BURNABY OBSERVED PLANT SPECIE...,-1
22455,TABLE 3.2 CONSULTATION ACTIVITIES WITH PROVINC...,-1
9544,"Table 5.21-8: Major Proposed, Approved, and Re...",-1
...,...,...
14748,Table D-1 Soil Polygon Attributes � New Pipeli...,-1
16597,Table 6-11 ACIMS-Recorded Occurrences of Plant...,-1
3922,"TABLE 7.2.4-2 POTENTIAL EFFECTS, MITIGATION ME...",-1
13838,Table H1-9 Exposure Point Concentrations (mg/k...,-1


In [47]:
df_for_labeling.to_csv('./mydata/esa_index_for_labeling.csv')

We will now go into Label Studio and label 500 examples. We will use these labels to create weak labels with Snorkel. Essentially, we won't need to label the entire dataset manually if we use Snorkel. However, we will need to look at the least confident examples and potentially relabel any mistakes.

In [None]:
labeled_df = pd.read_csv('./mydata/esa_index_labeled.csv')

In [None]:
train_df, val_ft_df = train_test_split(train_df, test_size=100, random_state=42, shuffle=True)
train_df, test_df = train_test_split(train_df, test_size=500, random_state=42, shuffle=True)