In [1]:
# Load packages.
import os
import sys
import time

import torch
import torch.nn as nn
import torch.nn.functional as F
import transformers
from transformers import BertModel, BertTokenizer, BertConfig
from transformers import AdamW, get_linear_schedule_with_warmup

import pandas as pd
import numpy as np
import json

import tqdm
from simpletransformers.language_modeling import LanguageModelingModel, LanguageModelingArgs





  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Read the processed records and drop every row that contains a missing value.
data = pd.read_parquet("../data/Processed_records.parquet").dropna()
data.head()


Unnamed: 0,features_properties_id,features_properties_title_en,features_properties_description_en,features_properties_keywords_en,metadata_en,metadata_en_processed,metadata_en_preprocessed_token
0,000183ed-8864-42f0-ae43-c4313a860720,"Principal Mineral Areas, Producing Mines, and ...",This dataset is produced and published annuall...,"mineralization, mineral occurrences, mines, hy...","Principal Mineral Areas, Producing Mines, and ...","principal mineral areas, producing mines, oil ...",principal mineral areas producing mines oil ga...
1,7f245e4d-76c2-4caa-951a-45d1d2051333,"Canadian Digital Elevation Model, 1945-2011",This collection is a legacy product that is no...,"Canada, Earth Sciences, elevation, relief, geo...","Canadian Digital Elevation Model, 1945-2011 Th...","canadian digital elevation model, 1945-2011 co...",canadian digital elevation model collection le...
2,085024ac-5a48-427a-a2ea-d62af73f2142,Canada's National Earthquake Scenario Catalogue,"The National Earthquake Scenario Catalogue, pr...","Emergency preparedness, Earth sciences, Earthq...",Canada's National Earthquake Scenario Catalogu...,canada's national earthquake scenario catalogu...,canada national earthquake scenario catalogue ...
3,03ccfb5c-a06e-43e3-80fd-09d4f8f69703,Temporal Series of the National Air Photo Libr...,"Note: To visualize the data in the viewer, zoo...","Mosaic, Aerial photography, Access to informat...",Temporal Series of the National Air Photo Libr...,temporal series national air photo library (na...,temporal series national air photo library nap...
4,488faf70-b50b-4749-ac1c-a1fd44e06f11,Indigenous Mining Agreements,The Indigenous Mining Agreements dataset provi...,"Indigenous, First Nations, Métis, Indigenous a...",Indigenous Mining Agreements The Indigenous Mi...,indigenous mining agreements indigenous mining...,indigenous mining agreements indigenous mining...


In [3]:
# Load the training table from CSV (path is relative to the notebook's working directory).
# NOTE(review): unlike the other data files in this notebook, this one is not read from ../data — confirm its location.
df = pd.read_csv("df_training_full.csv")
df.head()

Unnamed: 0,features_properties_id,features_properties_title_en,metadata_en_processed
0,000183ed-8864-42f0-ae43-c4313a860720,"Principal Mineral Areas, Producing Mines, and ...","principal mineral areas, producing mines, oil ..."
1,7f245e4d-76c2-4caa-951a-45d1d2051333,"Canadian Digital Elevation Model, 1945-2011","canadian digital elevation model, 1945-2011 co..."
2,085024ac-5a48-427a-a2ea-d62af73f2142,Canada's National Earthquake Scenario Catalogue,canada's national earthquake scenario catalogu...
3,03ccfb5c-a06e-43e3-80fd-09d4f8f69703,Temporal Series of the National Air Photo Libr...,temporal series national air photo library (na...
4,488faf70-b50b-4749-ac1c-a1fd44e06f11,Indigenous Mining Agreements,indigenous mining agreements indigenous mining...


In [4]:
# Divide the dataset into training and test sets.
# 90% of the rows are sampled for training; the rows whose index labels were not sampled form the test set.
train_set = df.sample(frac=0.9, random_state=42)  # Seed fixed to 42 for reproducibility.
test_set = df.drop(train_set.index)

In [5]:
# Create the input files for simpletransformers. To fine-tune a language model, each sample
# should be a row in a text file.
def _write_one_sample_per_line(path, texts):
    """Write every entry of ``texts`` to ``path`` as exactly one line.

    Runs of whitespace inside an entry (including embedded line breaks) are
    collapsed to a single space, so an entry can never spill over several
    lines and be read back as several samples.

    Args:
        path: Destination text file; it is overwritten if it already exists.
        texts: Iterable of entries; each one is converted with ``str``.
    """
    # NOTE(review): the file is written with the platform default encoding. If an explicit
    # encoding is added here, set the same value in the model args' `encoding` so that
    # writer and reader agree — confirm.
    with open(path, 'w') as f:
        for item in texts:
            f.write("%s\n" % " ".join(str(item).split()))


# Store the 'metadata_en_processed' column of the training set in a text file.
_write_one_sample_per_line('../data/simpletransformer_lm_train.txt', train_set['metadata_en_processed'])

# Store the test set in a text file.
_write_one_sample_per_line('../data/simpletransformer_lm_test.txt', test_set['metadata_en_processed'])



## Simple Transformers code for sanity check.

In [6]:
# Start from the library's default language-modeling arguments and override a few of them.
model_args = LanguageModelingArgs()
model_args.reprocess_input_data = True
model_args.overwrite_output_dir = True
model_args.num_train_epochs = 1  # a single epoch: this section is only a sanity check
model_args.dataset_type = "simple"

In [7]:
# Dump every model argument as indented JSON, then report how many arguments there are.
print(json.dumps(vars(model_args), indent=2))
print(len(vars(model_args)))

{
  "adafactor_beta1": null,
  "adafactor_clip_threshold": 1.0,
  "adafactor_decay_rate": -0.8,
  "adafactor_eps": [
    1e-30,
    0.001
  ],
  "adafactor_relative_step": true,
  "adafactor_scale_parameter": true,
  "adafactor_warmup_init": true,
  "adam_betas": [
    0.9,
    0.999
  ],
  "adam_epsilon": 1e-08,
  "best_model_dir": "outputs/best_model",
  "cache_dir": "cache_dir/",
  "config": {},
  "cosine_schedule_num_cycles": 0.5,
  "custom_layer_parameters": [],
  "custom_parameter_groups": [],
  "dataloader_num_workers": 0,
  "do_lower_case": false,
  "dynamic_quantize": false,
  "early_stopping_consider_epochs": false,
  "early_stopping_delta": 0,
  "early_stopping_metric": "eval_loss",
  "early_stopping_metric_minimize": true,
  "early_stopping_patience": 3,
  "encoding": null,
  "eval_batch_size": 8,
  "evaluate_during_training": false,
  "evaluate_during_training_silent": true,
  "evaluate_during_training_steps": 2000,
  "evaluate_during_training_verbose": false,
  "evaluate_

In [8]:
# Paths of the language-modelling train and test files
# (the same two paths the earlier export cell writes to).
train_file = "../data/simpletransformer_lm_train.txt"
test_file = "../data/simpletransformer_lm_test.txt"

In [12]:
# Import in this cell so it also runs on a fresh kernel: the top import cell does not
# import requests, and without this line the print below raises NameError.
import requests

print(requests.__version__)

2.27.1


In [13]:
import requests
# NOTE(review): this does not set a certificate path; it blanks CURL_CA_BUNDLE, which is
# commonly used as a workaround to switch off TLS certificate verification in requests.
# Confirm this is intended, and prefer pointing the variable at a real CA bundle.
os.environ['CURL_CA_BUNDLE'] = ''
# Connectivity check only (the response is discarded). The timeout keeps the cell from
# hanging indefinitely when the host cannot be reached.
requests.get('https://www.huggingface.co', timeout=30)
print(requests.__version__)

2.27.1




In [14]:
# Use the GPU whenever CUDA is available.
use_cuda = torch.cuda.is_available()
# Build a "bert" language model initialised from the "bert-base-cased" checkpoint,
# configured with the model_args prepared earlier.
# NOTE(review): the metadata_en_processed samples displayed earlier are lower-cased, while
# this checkpoint is cased — confirm the cased checkpoint is intended.
model = LanguageModelingModel(
    "bert", "bert-base-cased", args=model_args, use_cuda=use_cuda
)
print("Running on GPU: {}".format(use_cuda))

Downloading (…)solve/main/vocab.txt: 100%|██████████| 213k/213k [00:00<00:00, 23.8MB/s]
To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Downloading (…)okenizer_config.json: 100%|██████████| 29.0/29.0 [00:00<00:00, 2.43kB/s]
Downloading (…)lve/main/config.json: 100%|██████████| 570/570 [00:00<00:00, 44.0kB/s]
Downloading model.safetensors: 100%|██████████| 436M/436M [00:06<00:00, 66.0MB/s] 
Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'bert.pooler.dense.bias']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertFo

Running on GPU: True




In [15]:
# Fine-tune on the training file, passing the test file as the evaluation file.
model.train_model(train_file, eval_file=test_file)

# Evaluate on the test file and keep the returned metrics for display in the next cell.
result = model.eval_model(test_file)

100%|██████████| 6438/6438 [00:21<00:00, 302.09it/s]
100%|██████████| 9072/9072 [00:00<00:00, 93527.49it/s]
Epochs 0/1. Running Loss:    2.2163: 100%|██████████| 1134/1134 [02:18<00:00,  8.21it/s]
Epoch 1 of 1: 100%|██████████| 1/1 [02:23<00:00, 143.04s/it]
100%|██████████| 715/715 [00:13<00:00, 52.80it/s]
100%|██████████| 1074/1074 [00:00<00:00, 63421.22it/s]
Running Evaluation: 100%|██████████| 135/135 [00:07<00:00, 17.81it/s]


In [16]:
# Display the evaluation metrics returned by eval_model.
result

{'eval_loss': 2.0003846917991286, 'perplexity': tensor(7.3919)}

## PyTorch / Hugging Face fine-tuning code.

In [None]:
from transformers import AutoTokenizer, AutoModel
import torch
from sklearn.metrics.pairwise import cosine_similarity

# Load the BERT model
# NOTE(review): this rebinds `model`, which an earlier cell bound to the LanguageModelingModel,
# and it loads the stock 'bert-base-uncased' checkpoint rather than the fine-tuned output —
# confirm both are intended.
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
model = AutoModel.from_pretrained('bert-base-uncased')

# Function to calculate embeddings
def calculate_embedding(text):
    """Return the [CLS] embedding of ``text`` as a NumPy array.

    Args:
        text: Text handed to the module-level ``tokenizer`` (truncation and
            padding enabled).

    Returns:
        The first-token slice ``last_hidden_state[:, 0, :]`` of the
        module-level ``model`` output, converted to NumPy. Presumably of shape
        (1, hidden_size) for a single string — TODO confirm.
    """
    inputs = tokenizer(text, return_tensors='pt', truncation=True, padding=True)
    # Inference only: disable autograd so no computation graph is built and kept per call.
    with torch.no_grad():
        outputs = model(**inputs)
    # we take the embedding of the [CLS] token (first position of the sequence)
    return outputs.last_hidden_state[:, 0, :].numpy()

# Calculate embeddings for each text
df['embeddings'] = df['metadata_en_processed'].apply(calculate_embedding)

# Calculate cosine similarity
similarity_matrix = cosine_similarity(np.vstack(df['embeddings']))


def _top_similar_positions(row, own_position, k=5):
    """Return the positions of the ``k`` highest scores in ``row``, excluding ``own_position``.

    The row's own entry is removed explicitly instead of assuming it ranks
    first: with tied scores (e.g. duplicate texts) another row can be sorted
    ahead of it, and slicing off "the first one" would then drop a real
    neighbour and keep the row itself.
    """
    ranked = np.argsort(-row)
    return ranked[ranked != own_position][:k]


# Find top 5 most similar texts for each text.
# Labels are looked up on df.index directly, which avoids building a re-ordered copy of the
# whole DataFrame for every row.
df['top_5_similar'] = [
    list(df.index[_top_similar_positions(row, position)])
    for position, row in enumerate(similarity_matrix)
]