In [14]:
# Mount Google Drive inside the Colab VM so later cells can reach files stored there.
from google.colab import drive

GDRIVE_MOUNT_POINT = '/content/gdrive'
drive.mount(GDRIVE_MOUNT_POINT)

# If the mount is stale, force a remount instead:
# drive.mount("/content/gdrive", force_remount=True)

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [None]:
#Library installations
!pip install torchtext

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
#imports
import sys
import os

# Directory on the mounted Drive that holds the project's helper modules
# (imported further down in this cell). Add it to the import path only once so
# that re-running the cell does not pile up duplicate sys.path entries.
py_file_location = "/content/gdrive/MyDrive/SEO Schema.org/Code/"
if os.path.abspath(py_file_location) not in sys.path:
    sys.path.append(os.path.abspath(py_file_location))

import numpy as np
import pandas as pd
import spacy
import torch
import torch.nn as nn
import torch.optim as optim

from torchtext.legacy.data import Field, LabelField
from torchtext.legacy.data import TabularDataset
from torchtext.legacy.data import BucketIterator

from sklearn.metrics import confusion_matrix



import clsLSTM
import ExportFeatureSet


In [None]:
# Load the English spaCy pipeline; only its tokenizer is used below.
# The "en" shortcut link was removed in spaCy v3, so the pipeline is loaded by its
# package name instead.
# NOTE(review): assumes the `en_core_web_sm` package is installed in this runtime —
# if not, run `python -m spacy download en_core_web_sm` first.
spacy_en = spacy.load("en_core_web_sm")

def tokenize(text):
    """Split `text` into a list of token strings using the spaCy English tokenizer."""
    return [tok.text for tok in spacy_en.tokenizer(text)]



# Run on the GPU when one is visible, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Text column: tokenized with `tokenize`, lower-cased and mapped through a vocabulary.
TEXT = Field(
    sequential=True,
    use_vocab=True,
    tokenize=tokenize,
    lower=True,
)
# Label column: no vocabulary lookup (use_vocab=False), tensors are created as torch.long.
LABEL = LabelField(dtype=torch.long, use_vocab=False)

# CSV column name -> (attribute name exposed on each batch, field that processes it).
fields = {"text": ("txt", TEXT), "label": ("lbl", LABEL)}

# NOTE(review): the relative path is resolved against the kernel's working
# directory — confirm it points at the dataset folder.
train_data, test_data = TabularDataset.splits(
    path="../DataSet/Dictionary/",
    train="Strong_Train.csv",
    test="Strong_Test.csv",
    format="csv",
    fields=fields,
)

# Vocabularies are built from the training split only.
TEXT.build_vocab(train_data, max_size=10000, min_freq=1)
LABEL.build_vocab(train_data)

# For LSTM: one example per batch, no sorting.
train_iterator, test_iterator = BucketIterator.splits(
    (train_data, test_data),
    batch_size=1,
    device=device,
    sort=False,
)

In [None]:
# Hyperparameters
input_size = len(TEXT.vocab)   # number of entries in the text vocabulary
hidden_size = 512
num_layers = 2
embedding_size = 100
learning_rate = 0.005
num_epochs = 1
num_classes = 7                # class ids 0..6 are used for the confusion matrix below

# Initialize network
# `import clsLSTM` binds the *module*, and calling a module raises
# "TypeError: 'module' object is not callable". Resolve the model class from the
# module first; if `clsLSTM` is already a class (e.g. imported with
# `from clsLSTM import clsLSTM`), the fallback keeps the original behaviour.
# NOTE(review): assumes the class defined in clsLSTM.py is itself named `clsLSTM` — confirm.
_model_cls = getattr(clsLSTM, "clsLSTM", clsLSTM)
model = _model_cls(input_size, embedding_size, hidden_size, num_layers, num_classes).to(device)

# Cross-entropy loss, optimised with Adam over all model parameters.
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

In [None]:
# Mean loss per epoch for each split.
_loss = {'train': [], 'test': []}
# Predicted / true class ids from the most recent evaluation pass (for the confusion
# matrix). They are cleared at the start of every evaluation pass so that, with
# num_epochs > 1, the matrix reflects the final model only rather than a mix of epochs.
y_pred = []
y_true = []

for epoch in range(num_epochs):
  # ---------------- training pass ----------------
  model.train()  # enable training-mode behaviour (e.g. dropout) if the model has any
  epoch_losses = []
  for batch in train_iterator:
    # Get data to cuda if possible
    data = batch.txt.to(device=device)
    targets = batch.lbl.to(device=device)

    # forward
    scores = model(data)
    loss = criterion(scores, targets)

    # backward
    optimizer.zero_grad()
    loss.backward()

    # gradient descent
    optimizer.step()

    epoch_losses.append(loss.item())

  # Guard against an empty iterator: report NaN instead of triggering a NumPy warning.
  train_loss = float(np.mean(epoch_losses)) if epoch_losses else float('nan')
  _loss['train'].append(train_loss)
  print('train loss on epoch {} : {:.3f}'.format(epoch, train_loss))

  # ---------------- evaluation pass ----------------
  model.eval()  # switch off training-only behaviour so the metrics are deterministic
  test_losses = []
  _totalPredictions = 0
  _correctPredictions = 0
  y_pred.clear()
  y_true.clear()
  with torch.no_grad():  # no gradients are needed (and no optimizer call) while evaluating
    for batch in test_iterator:
      data = batch.txt.to(device=device)
      targets = batch.lbl.to(device=device)

      prediction = model(data)
      # Class scores are on dim 1; compare per example so any batch size works,
      # not only batch_size=1.
      predicted = prediction.argmax(dim=1)

      _totalPredictions += targets.numel()
      _correctPredictions += int((predicted == targets).sum())

      loss = criterion(prediction, targets)
      test_losses.append(loss.item())

      y_pred.extend(predicted.tolist())
      y_true.extend(targets.tolist())

  test_loss = float(np.mean(test_losses)) if test_losses else float('nan')
  _loss['test'].append(test_loss)
  print('test loss on epoch {}: {:.3f}'.format(epoch, test_loss))

  # Accuracy: correct / total; NaN when the test set is empty (avoids ZeroDivisionError).
  _accuracy = _correctPredictions / _totalPredictions if _totalPredictions else float('nan')
  print('{} / {} = {:.3f}'.format(_correctPredictions, _totalPredictions, _accuracy))
  

In [None]:
# Confusion matrix of true vs. predicted labels collected during evaluation.
#
# Label legend (label -> entity):
#   0 -> address
#   1 -> personName
#   2 -> email
#   3 -> contact
#   4 -> title
#   5 -> domain
#   6 -> qualification

class_ids = list(range(7))  # [0, 1, 2, 3, 4, 5, 6]
confusion_matrix(y_true, y_pred, labels=class_ids)
