In [11]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')
import seaborn as sns

In [2]:
!wget  http://noisy-text.github.io/2017/files/wnut17train.conll

--2021-04-17 05:40:50--  http://noisy-text.github.io/2017/files/wnut17train.conll
Resolving noisy-text.github.io (noisy-text.github.io)... 185.199.108.153, 185.199.109.153, 185.199.110.153, ...
Connecting to noisy-text.github.io (noisy-text.github.io)|185.199.108.153|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 493781 (482K) [application/octet-stream]
Saving to: ‘wnut17train.conll.1’


2021-04-17 05:40:50 (10.0 MB/s) - ‘wnut17train.conll.1’ saved [493781/493781]



In [1]:
from pathlib import Path
import re

def read_wnut(file_path):
    file_path = Path(file_path)

    raw_text = file_path.read_text().strip()
    print(raw_text)
    raw_docs = re.split(r'\n\t?\n', raw_text)
    token_docs = []
    tag_docs = []
    for doc in raw_docs:
        tokens = []
        tags = []
        for line in doc.split('\n'):
            token, tag = line.split('\t')
            tokens.append(token)
            tags.append(tag)
        token_docs.append(tokens)
        tag_docs.append(tags)

    return token_docs, tag_docs

In [2]:
texts, tags = read_wnut('wnut17train.conll')

@paulwalk	O
It	O
's	O
the	O
view	O
from	O
where	O
I	O
'm	O
living	O
for	O
two	O
weeks	O
.	O
Empire	B-location
State	I-location
Building	I-location
=	O
ESB	B-location
.	O
Pretty	O
bad	O
storm	O
here	O
last	O
evening	O
.	O
	
From	O
Green	O
Newsfeed	O
:	O
AHFA	B-group
extends	O
deadline	O
for	O
Sage	O
Award	O
to	O
Nov	O
.	O
5	O
http://tinyurl.com/24agj38	O
	
Pxleyes	B-corporation
Top	O
50	O
Photography	O
Contest	O
Pictures	O
of	O
August	O
2010	O
...	O
http://bit.ly/bgCyZ0	O
#photography	O
	
today	O
is	O
my	O
last	O
day	O
at	O
the	O
office	O
.	O
	
4Dbling	B-person
's	O
place	O
til	O
monday	O
,	O
party	O
party	O
party	O
.	O
&lt;	O
3	O
	
watching	O
the	O
VMA	B-creative-work
pre-show	O
again	O
lol	O
it	O
was	O
n't	O
even	O
a	O
good	O
show	O
the	O
first	O
time	O
...	O
so	O
bored	O
!	O
	
27	O
followers	O
!	O
30	O
followers	O
is	O
my	O
goal	O
for	O
today	O
!	O
	
This	O
is	O
the	O
2nd	O
hospital	O
ive	O
been	O
in	O
today	O
,	O
but	O
ive	O
just	O
seen	O
a	O
doctor	O
who	O
was	O
an	O
older	O
versio

In [3]:
print(texts[0][10:17], tags[0][10:17], sep='\n')

['for', 'two', 'weeks', '.', 'Empire', 'State', 'Building']
['O', 'O', 'O', 'O', 'B-location', 'I-location', 'I-location']


In [4]:
from sklearn.model_selection import train_test_split
train_texts, val_texts, train_tags, val_tags = train_test_split(texts, tags, test_size=.2)

In [5]:
unique_tags = set(tag for doc in tags for tag in doc)
tag2id = {tag: id for id, tag in enumerate(unique_tags)}
id2tag = {id: tag for tag, id in tag2id.items()}

In [108]:
id2tag

{0: 'B-location',
 1: 'I-group',
 2: 'B-person',
 3: 'B-corporation',
 4: 'I-person',
 5: 'O',
 6: 'B-group',
 7: 'B-creative-work',
 8: 'I-corporation',
 9: 'I-product',
 10: 'I-location',
 11: 'B-product',
 12: 'I-creative-work'}

In [7]:
from transformers import AlbertTokenizerFast
tokenizer = AlbertTokenizerFast.from_pretrained('albert-base-v2)
train_encodings = tokenizer(train_texts, is_split_into_words=True, return_offsets_mapping=True, padding=True, truncation=True)
val_encodings = tokenizer(val_texts, is_split_into_words=True, return_offsets_mapping=True, padding=True, truncation=True)

In [8]:
val_encodings

{'input_ids': [[101, 16409, 2312, 4911, 112, 188, 1400, 1143, 1176, 119, 119, 8413, 131, 120, 120, 189, 119, 1884, 120, 150, 11185, 2036, 1183, 2087, 3048, 1183, 1527, 2101, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [101, 128, 6929, 8618, 1107, 1103, 1314, 1989, 1105, 1175, 1209, 1129, 1167, 4911, 119, 20612, 1114, 8413, 131, 120, 120, 189, 119, 1884, 120, 180, 1643, 2924, 2107, 2924, 1477, 2137, 1306, 1942, 1643, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [101, 137, 1142, 14082, 1960, 2390, 118, 155, 10583, 111, 1821, 1643, 132, 141, 10583, 17641, 1338, 1149, 1120, 1269, 1159, 118, 4534, 3234, 1144, 1654, 1104, 1343, 8916, 112, 188, 5741, 8050, 1112, 137, 15278, 2271, 13422, 122, 120, 123, 102, 0, 0, 0, 0, 0, 0, 0, 0,

In [12]:
def encode_tags(tags, encodings):
    labels = [[tag2id[tag] for tag in doc] for doc in tags]
    encoded_labels = []
    for doc_labels, doc_offset in zip(labels, encodings.offset_mapping):
        doc_enc_labels = np.ones(len(doc_offset),dtype=int) * -100
        arr_offset = np.array(doc_offset)

     
        doc_enc_labels[(arr_offset[:,0] == 0) & (arr_offset[:,1] != 0)] = doc_labels
        encoded_labels.append(doc_enc_labels.tolist())

    return encoded_labels

In [13]:
train_labels = encode_tags(train_tags, train_encodings)
val_labels = encode_tags(val_tags, val_encodings)

In [16]:
import torch

class WNUTDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

In [17]:
train_encodings.pop("offset_mapping")
val_encodings.pop("offset_mapping")
train_dataset = WNUTDataset(train_encodings, train_labels)
val_dataset = WNUTDataset(val_encodings, val_labels)

In [18]:
val_dataset.encodings

{'input_ids': [[101, 16409, 2312, 4911, 112, 188, 1400, 1143, 1176, 119, 119, 8413, 131, 120, 120, 189, 119, 1884, 120, 150, 11185, 2036, 1183, 2087, 3048, 1183, 1527, 2101, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [101, 128, 6929, 8618, 1107, 1103, 1314, 1989, 1105, 1175, 1209, 1129, 1167, 4911, 119, 20612, 1114, 8413, 131, 120, 120, 189, 119, 1884, 120, 180, 1643, 2924, 2107, 2924, 1477, 2137, 1306, 1942, 1643, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [101, 137, 1142, 14082, 1960, 2390, 118, 155, 10583, 111, 1821, 1643, 132, 141, 10583, 17641, 1338, 1149, 1120, 1269, 1159, 118, 4534, 3234, 1144, 1654, 1104, 1343, 8916, 112, 188, 5741, 8050, 1112, 137, 15278, 2271, 13422, 122, 120, 123, 102, 0, 0, 0, 0, 0, 0, 0, 0,

In [19]:
from transformers import AlbertForTokenClassification
model = AlbertForTokenClassification.from_pretrained('albert-base-v2',num_labels=len(unique_tags))

Some weights of the model checkpoint at distilbert-base-cased were not used when initializing DistilBertForTokenClassification: ['vocab_transform.weight', 'vocab_transform.bias', 'vocab_layer_norm.weight', 'vocab_layer_norm.bias', 'vocab_projector.weight', 'vocab_projector.bias']
- This IS expected if you are initializing DistilBertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForTokenClassification were not initialized from the model checkpoint at distilbert-base-cased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this 

In [20]:
from torch.utils.data import DataLoader
from transformers import AdamW
from tqdm import tqdm

In [67]:
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
model.to(device)
model.train()

train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)

optim = AdamW(model.parameters(), lr=5e-5)

for epoch in range(1):
    loop = tqdm(train_loader, leave=True)
    for batch in loop:
        optim.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs[0]
        loss.backward()
        optim.step()
        loop.set_description(f'Epoch {epoch}')
        loop.set_postfix(loss=loss.item())




  0%|          | 0/170 [00:00<?, ?it/s][A[A[A


Epoch 0:   0%|          | 0/170 [00:04<?, ?it/s][A[A[A


Epoch 0:   0%|          | 0/170 [00:04<?, ?it/s, loss=0.0239][A[A[A


Epoch 0:   1%|          | 1/170 [00:04<14:02,  4.98s/it, loss=0.0239][A[A[A


Epoch 0:   1%|          | 1/170 [00:09<14:02,  4.98s/it, loss=0.0239][A[A[A


Epoch 0:   1%|          | 1/170 [00:09<14:02,  4.98s/it, loss=0.00201][A[A[A


Epoch 0:   1%|          | 2/170 [00:09<13:47,  4.93s/it, loss=0.00201][A[A[A


Epoch 0:   1%|          | 2/170 [00:14<13:47,  4.93s/it, loss=0.00201][A[A[A


Epoch 0:   1%|          | 2/170 [00:14<13:47,  4.93s/it, loss=0.00163][A[A[A


Epoch 0:   2%|▏         | 3/170 [00:14<13:31,  4.86s/it, loss=0.00163][A[A[A


Epoch 0:   2%|▏         | 3/170 [00:19<13:31,  4.86s/it, loss=0.00163][A[A[A


Epoch 0:   2%|▏         | 3/170 [00:19<13:31,  4.86s/it, loss=0.0144] [A[A[A


Epoch 0:   2%|▏         | 4/170 [00:19<13:16,  4.80s/it, loss=0.0144][A[A[

In [69]:
logits_train=torch.argmax(outputs.logits, dim=1)

In [109]:
texts = [tokenizer.decode(ids) for ids in  input_ids]

In [71]:
texts

['[CLS] @ sjonrefur Tuesday - Fight Night, Wednesday - MvC3 Fight Club in Chicago. Pringles, baby! Hope you get better by then! [SEP] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD]',
 "[CLS] # ff @ eops to cheer him up, he's not feeling so hot today. ( and send him some virtual soup ) [SEP] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD]",
 "[CLS] @ GottaLaff don't do the crime if you can't serve the time. ; ) [SEP] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PA

In [41]:
from  torch import nn

In [113]:
model = model.to(device)
model.eval()
val_loader = DataLoader(val_dataset, batch_size=16)
pred_label = []
label_list=[]
for batch in val_loader:
    with torch.no_grad():
         input_ids = batch['input_ids'].to(device)
         attention_mask = batch['attention_mask'].to(device)
         labels = batch['labels'].to(device)
         outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
    loss = outputs.loss
    logits_label=torch.argmax(outputs.logits, dim=2)
    logits_label=logits_label.detach().cpu().numpy()
    label_ids=labels.to('cpu').numpy()   

In [88]:
logits_label.shape

(7, 13)

In [117]:
texts 

['[CLS] Happy New Year to all. May the year bring peace joy and love. May our societal advancement catch up to our technological ones. [SEP] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD]',
 "[CLS] @ HackinTimSeeley A world without bare legs is a terrible place indeed! Here's to 6 + more years of bare legged cassie [SEP] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD]",
 '[CLS] Going to sta

In [168]:

for j in logits_label:
    for i in j:
        print(id2tag[i],end = " ")
    print()  

O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O 
O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O B-person B-person O O O B-person B-person 
O O O O O O O O O O O O O O O O O O B-person O O O O O O O O O O O O O O O B-person B-person B-person B-person B-person O O O O O O O O O O O B-person B-person B-person B-person B-person B-person B-person O B-person O O O O O O O B-person B-person B-person B-person I-person O O O B-person O O O O O O O O B-person B-person O B-person O O O 
O O O O B-person I-person I-person I-person O O O O O O B-group I-group O O O O O O O O O O O O O B-group O B-group B-group I-group O O B-location O O O O O O O O O O O O O O O O O O O O O O O B-group O B-group B-group O O O O O O O O O O O O O B-group B-group B-group