# NER

In [1]:
import torch
from torch import nn
import csv

In [2]:
import pandas as pd
df = pd.read_csv('NER dataset.csv',encoding='latin1')
train, test = df.values[:80000], df.values[80000:90000]
df.head()

Unnamed: 0,Sentence #,Word,POS,Tag
0,Sentence: 1,Thousands,NNS,O
1,,of,IN,O
2,,demonstrators,NNS,O
3,,have,VBP,O
4,,marched,VBN,O


In [3]:
Word = list(set(df['Word']))
POS = list(set(df['POS']))
Tag = list(set(df['Tag']))

In [4]:
from torch.utils.data import Dataset

class MyDataset(Dataset):
    def __init__(self, df,Word,POS,Tag):
        self.data = df.tolist()
        self.dataX = []
        self.dataY = []
        for i, d in enumerate(self.data):
            self.dataX.append([])
            self.dataY.append([])
            self.dataX[-1].append(Word.index(self.data[i][1]))
            for j in range(1, 3):
                if i - j > 0 and i + j < len(self.data):
                    self.dataX[-1].append(Word.index(self.data[i-j][1]))
                    self.dataX[-1].append(Word.index(self.data[i+j][1]))
                else:
                    self.dataX[-1].append(-1)
                    self.dataX[-1].append(-1)
                
            self.dataY[-1].append(POS.index(d[2]))
            self.dataY[-1].append(Tag.index(d[3]))
        self.dataX = torch.tensor(self.dataX)
        self.dataY = torch.tensor(self.dataY)
    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        if torch.is_tensor(idx):
            idx = idx.tolist()
        X = self.dataX[idx]
        Y = self.dataY[idx][0]
        Z = self.dataY[idx][1]

        return X, Y, Z

In [5]:
class Model(nn.Module):
    def __init__(self,input = 5, N_class: int = 1000):
        super().__init__()
        self.layer1 = nn.Linear(input, 10)
        self.relu = nn.ReLU(inplace=True)
        self.layer2 = nn.Linear(10, 20)
        self.layer3 = nn.Linear(20, 40)
        self.layer4 = nn.Linear(40, N_class)
        self.Softmax = nn.Softmax(dim=1)

    def forward(self, x):
        x = self.layer1(x)
        x = self.relu(x)
        x = self.layer2(x)
        x = self.relu(x)
        x = self.layer3(x)
        x = self.relu(x)
        x = self.layer4(x)
        x = self.Softmax(x)
        return x

In [6]:
Word = list(set(df['Word']))
POS = list(set(df['POS']))
Tag = list(set(df['Tag']))
test_dataset = MyDataset(test,Word,POS,Tag)
train_dataset = MyDataset(train,Word,POS,Tag)

In [7]:
batch_size = 10
num_workers = 0

from torch.utils.data import DataLoader
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, num_workers=num_workers, pin_memory=True, drop_last=True, shuffle=True,)
val_dataloader = DataLoader(test_dataset, batch_size=batch_size, num_workers=num_workers, pin_memory=True, drop_last=True, shuffle=False)


In [16]:
lr = 1
model = Model(5, len(Tag))
import torch.optim as optim
optimizer =optim.AdamW(model.parameters(), lr = lr)
criterion=torch.nn.CrossEntropyLoss()

In [17]:
from tqdm import tqdm
num_epochs = 10
phases = ['T', 'V']
for epoch in range(num_epochs):
    for phase in phases:
        loss_ = 0
        if phase == 'T':
            model.train()  # Установить модель в режим обучения
            dataloader = train_dataloader
            # dataloader = val_dataloader
        else:
            model.eval()   #Установить модель в режим оценки
            dataloader = val_dataloader
        for data in tqdm(dataloader, total = len(dataloader)):
            inputs, P, labels = data
            inputs = inputs.to(torch.float32)

            optimizer.zero_grad()
            with torch.set_grad_enabled(phase == 'T'):
                classification_out = model(inputs)
                loss = criterion(classification_out, labels)
                loss_ += loss.item()
                if phase == 'T':
                    loss.backward()
                    optimizer.step()
        print(f"Epoch: {epoch + 1}, phase: {phase}, loss: {loss_ / len(dataloader)}")


100%|██████████| 8000/8000 [00:13<00:00, 586.47it/s]


Epoch: 1, phase: T, loss: 2.1697379734516145


100%|██████████| 1000/1000 [00:00<00:00, 2075.88it/s]


Epoch: 1, phase: V, loss: 2.0918005650043487


100%|██████████| 8000/8000 [00:14<00:00, 571.18it/s]


Epoch: 2, phase: T, loss: 2.0809505575597287


100%|██████████| 1000/1000 [00:00<00:00, 2046.21it/s]


Epoch: 2, phase: V, loss: 2.0918005650043487


100%|██████████| 8000/8000 [00:14<00:00, 546.45it/s]


Epoch: 3, phase: T, loss: 2.080950557589531


100%|██████████| 1000/1000 [00:00<00:00, 2050.41it/s]


Epoch: 3, phase: V, loss: 2.0918005650043487


100%|██████████| 8000/8000 [00:14<00:00, 552.48it/s]


Epoch: 4, phase: T, loss: 2.0809505569934843


100%|██████████| 1000/1000 [00:00<00:00, 2084.09it/s]


Epoch: 4, phase: V, loss: 2.0918005650043487


100%|██████████| 8000/8000 [00:14<00:00, 547.15it/s]


Epoch: 5, phase: T, loss: 2.08095055770874


100%|██████████| 1000/1000 [00:00<00:00, 1958.32it/s]


Epoch: 5, phase: V, loss: 2.0918005650043487


100%|██████████| 8000/8000 [00:14<00:00, 555.72it/s]


Epoch: 6, phase: T, loss: 2.0809505578577516


100%|██████████| 1000/1000 [00:00<00:00, 2013.36it/s]


Epoch: 6, phase: V, loss: 2.0918005650043487


100%|██████████| 8000/8000 [00:14<00:00, 558.12it/s]


Epoch: 7, phase: T, loss: 2.080950556874275


100%|██████████| 1000/1000 [00:00<00:00, 1973.76it/s]


Epoch: 7, phase: V, loss: 2.0918005650043487


100%|██████████| 8000/8000 [00:14<00:00, 537.63it/s]


Epoch: 8, phase: T, loss: 2.0809505591094495


100%|██████████| 1000/1000 [00:00<00:00, 2058.83it/s]


Epoch: 8, phase: V, loss: 2.0918005650043487


100%|██████████| 8000/8000 [00:14<00:00, 557.34it/s]


Epoch: 9, phase: T, loss: 2.0809505566358566


100%|██████████| 1000/1000 [00:00<00:00, 2084.52it/s]


Epoch: 9, phase: V, loss: 2.0918005650043487


100%|██████████| 8000/8000 [00:14<00:00, 557.38it/s]


Epoch: 10, phase: T, loss: 2.0809505569636824


100%|██████████| 1000/1000 [00:00<00:00, 2071.59it/s]

Epoch: 10, phase: V, loss: 2.0918005650043487



