In [1]:
import sys

# Put the project root on the import path so project-local packages can be
# imported by later cells (presumably including `nama` -- confirm).
# Guarded so that re-running this cell does not keep appending duplicate
# entries to sys.path.
if '/thesis' not in sys.path:
    sys.path.append('/thesis')

In [2]:
import json
import pickle

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import pkgutil

In [3]:
import nama
import torch
from nama.embedding_similarity import EmbeddingSimilarityModel
from nama.embedding_similarity import load_similarity_model

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
# Device handle for the CUDA GPU; passed to sim.to() in a later cell.
cuda = torch.device('cuda')

In [5]:
# Report the CUDA environment as seen by PyTorch: availability, device count,
# and the name of the currently selected device.
print(torch.cuda.is_available())
print(torch.cuda.device_count())
# torch.cuda.current_device() initialises CUDA and raises when no device can
# be used, so only query the device name when CUDA is actually available.
# This keeps the diagnostic cell informative instead of crashing.
if torch.cuda.is_available():
    print(torch.cuda.get_device_name(torch.cuda.current_device()))
else:
    print('no CUDA device available')

True
1
NVIDIA GeForce RTX 3060


In [6]:
# Build the training frame in a single chain: read the pipe-delimited CSV,
# discard rows with any missing value, keep rows whose 'target' is shorter
# than 30 characters, then draw a fixed-seed sample of 500000 rows.
# NOTE(review): the displayed frame contains an 'Unnamed: 0' column, which
# looks like a row index that was written into the CSV -- confirm whether it
# should be read with index_col=0 or dropped.
df = (
    pd.read_csv('/thesis/data/nama_embeddings_train.csv', sep='|', encoding='utf-8')
    .dropna()
    .loc[lambda frame: frame['target'].str.len() < 30]
    .sample(500000, random_state=1000)
)
df

Unnamed: 0,group,target
132628,q907743,szathmary eors
177308,q832328,gunther xl de schwarzbourg
602750,q1619336,nicoll scrub national park
320408,q8022668,wilma lee cooper
523379,q1353201,canadian rockies
...,...,...
161369,q3054600,enrica merlova
778674,q6476332,llyn janet
716900,q21711227,"al-dana, maarrat al-nu'man"
749463,q6673582,abhainn longai


In [7]:
# Hand the sampled frame to nama, naming 'target' as the string column and
# 'group' as the group column.
training_data = nama.from_df(df, string_column='target', group_column='group')

In [8]:
# Create the similarity model with its default constructor arguments.
sim = EmbeddingSimilarityModel()
# Move the model to the CUDA device. Kept as its own statement rather than
# chained onto the constructor: whether .to() returns the model is not
# visible from this notebook.
sim.to(cuda)

In [9]:
# Training hyperparameters, forwarded to sim.train() as keyword arguments.
# NOTE(review): no torch/numpy random seed is set in the visible cells before
# training, so a re-run is not guaranteed to reproduce the same model.
train_kwargs = dict(
    max_epochs=10,
    warmup_frac=0.2,
    transformer_lr=1e-5,
    score_lr=30,
    use_counts=False,
    batch_size=128,
    early_stopping=False,
)

# Train on the prepared data with progress output enabled; the returned
# history frame is written to CSV in a later cell.
history_df = sim.train(training_data, verbose=True, **train_kwargs)

Embedding strings: 100%|██████████| 475184/475184 [06:43<00:00, 1179.11it/s]
training epoch 0: 100%|██████████| 3713/3713 [48:48<00:00,  1.27it/s]
Embedding strings: 100%|██████████| 475184/475184 [06:46<00:00, 1169.40it/s]
training epoch 1: 100%|██████████| 3713/3713 [48:44<00:00,  1.27it/s]
Embedding strings: 100%|██████████| 475184/475184 [06:45<00:00, 1170.52it/s]
training epoch 2: 100%|██████████| 3713/3713 [48:46<00:00,  1.27it/s]
Embedding strings: 100%|██████████| 475184/475184 [06:46<00:00, 1169.54it/s]
training epoch 3: 100%|██████████| 3713/3713 [48:45<00:00,  1.27it/s]
Embedding strings: 100%|██████████| 475184/475184 [06:46<00:00, 1169.90it/s]
training epoch 4: 100%|██████████| 3713/3713 [48:44<00:00,  1.27it/s]
Embedding strings: 100%|██████████| 475184/475184 [06:46<00:00, 1169.98it/s]
training epoch 5: 100%|██████████| 3713/3713 [48:45<00:00,  1.27it/s]
Embedding strings: 100%|██████████| 475184/475184 [06:46<00:00, 1169.93it/s]
training epoch 6: 100%|██████████| 3713/3

In [10]:
# Save the trained model under the project's models directory.
sim.save('/thesis/models/nama_embeddings_lrg_10epochs')

In [11]:
# Write the training history alongside the saved model: pipe-delimited,
# without the DataFrame index column.
history_df.to_csv(
    '/thesis/models/nama_embeddings_lrg_10epochs_hist.csv',
    index=False,
    sep='|',
)