In [None]:
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader, random_split
from tqdm.notebook import tqdm, trange
import numpy as np
import matplotlib.pyplot as plt

In [None]:
# Use the first CUDA device when one is available, otherwise stay on the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print("Running on the GPU" if device.type == "cuda" else "Running on the CPU")

In [None]:
# Mount Google Drive so the pre-split CSV files are reachable from this Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

# Folder holding the train/validation/test CSVs; each path is the folder plus a file suffix.
data_dir = '/content/drive/MyDrive/Project/us-patent-phrase-to-phrase-matching'
train_path, val_path, test_path = (
    data_dir + suffix
    for suffix in ('/train_data.csv', '/val_data.csv', '/test_data.csv')
)

In [None]:
class PatentDataset(Dataset):
    """In-memory dataset of phrase-pair features and similarity classes.

    The CSV read by ``__init__`` must contain a ``score`` column, the embedding
    columns ``anchor_<i>`` and ``target_<i>`` for every ``i`` in
    ``range(EMBEDDING_DIM)``, and any number of further columns, which are
    passed through unchanged as context features.
    """

    # Number of anchor_<i> / target_<i> columns expected in the CSV.
    EMBEDDING_DIM = 768
    # Similarity score -> class index used as the classification target.
    SCORE_TO_CLASS = {0.0: 0, 0.25: 1, 0.5: 2, 0.75: 3, 1.0: 4}

    def process_data(self, data_df):
        """Turn a raw frame into a feature matrix and class labels.

        The input frame is NOT modified.

        Args:
            data_df: DataFrame with the columns described on the class.

        Returns:
            ``(X, y)`` where ``X`` is the element-wise product of the anchor and
            target embeddings followed by the remaining (context) columns in
            their original order, and ``y`` is an int64 array of class indices.

        Raises:
            KeyError: if a required column is missing or a score is not one of
                the keys of ``SCORE_TO_CLASS``.
        """
        # Vectorised lookup instead of a row-wise apply over every column.
        labels = data_df['score'].astype(float).map(self.SCORE_TO_CLASS)
        unmapped = labels.isna()
        if unmapped.any():
            bad_scores = data_df.loc[unmapped, 'score'].unique().tolist()
            raise KeyError('score values without a class mapping: %r' % (bad_scores,))
        y = labels.to_numpy(dtype=np.int64)

        anchor_cols = ['anchor_' + str(i) for i in range(self.EMBEDDING_DIM)]
        target_cols = ['target_' + str(i) for i in range(self.EMBEDDING_DIM)]
        # Everything that is neither label nor embedding is a context feature.
        # 'class' is excluded because that name is reserved for the label.
        reserved = {'score', 'class', *anchor_cols, *target_cols}
        context_cols = [col for col in data_df.columns if col not in reserved]

        # Combine the two embeddings into a single vector per row.
        aggr_data = np.multiply(
            data_df[anchor_cols].to_numpy(),
            data_df[target_cols].to_numpy(),
        )
        context_data = data_df[context_cols].to_numpy()
        X = np.concatenate((aggr_data, context_data), axis=1)
        return X, y

    def __init__(self, data_path):
        """Load the CSV at ``data_path`` and keep features/labels as tensors."""
        super(PatentDataset, self).__init__()

        data_df = pd.read_csv(data_path)
        X, y = self.process_data(data_df)

        # float64 features / int64 labels.
        self.X = torch.from_numpy(X).double()
        self.y = torch.from_numpy(y).long()
        print(self.X.dtype)
        print('X.shape', self.X.shape, 'y.shape', self.y.shape)
        self.len = X.shape[0]

    def __getitem__(self, index):
        """Return the ``(features, label)`` pair(s) at ``index``."""
        if torch.is_tensor(index):
            # Convert tensor indices to plain Python ints/lists before indexing.
            index = index.tolist()
        return self.X[index], self.y[index]

    def __len__(self):
        """Number of rows loaded from the CSV."""
        return self.len

In [None]:
# Load the three pre-split CSVs (train, validation, test — in that order) into memory.
train_set, val_set, test_set = (
    PatentDataset(csv_path) for csv_path in (train_path, val_path, test_path)
)

In [None]:
# Mini-batch loaders of 32 rows each; only the training split is reshuffled every epoch.
train_loader, val_loader, test_loader = (
    DataLoader(dataset=split, batch_size=32, shuffle=reshuffle)
    for split, reshuffle in ((train_set, True), (val_set, False), (test_set, False))
)

In [None]:
class Model(nn.Module):
    """Six-layer fully connected classifier with ReLU between layers.

    Args:
        in_features: Width of one input row. Defaults to 874, the value this
            notebook was written for (presumably 768 aggregated embedding
            features plus the context columns — confirm against the CSV).
        num_classes: Number of output logits. Defaults to 5.

    The layers keep the attribute names ``fc1`` … ``fc6`` and are created in
    that order, so parameter initialisation and ``state_dict`` keys are the
    same as before when the defaults are used.
    """

    def __init__(self, in_features=874, num_classes=5):
        super(Model, self).__init__()
        self.fc1 = nn.Linear(in_features, 400)
        self.fc2 = nn.Linear(400, 600)
        self.fc3 = nn.Linear(600, 800)
        self.fc4 = nn.Linear(800, 500)
        self.fc5 = nn.Linear(500, 200)
        self.fc6 = nn.Linear(200, num_classes)
        # Reports the parameter dtypes at construction time (before any later cast).
        print(self.fc2.weight.dtype, self.fc2.bias.dtype, self.fc3.bias.dtype, self.fc4.bias.dtype, self.fc5.bias.dtype, self.fc6.bias.dtype)

    def forward(self, x):
        """Return raw (un-normalised) class logits for a batch of rows ``x``."""
        for hidden in (self.fc1, self.fc2, self.fc3, self.fc4, self.fc5):
            x = F.relu(hidden(x))
        # No activation on the output layer: the loss/argmax work on logits.
        return self.fc6(x)

In [None]:
# Build the network, cast its parameters to float64 so they match the float64
# feature tensors produced by the dataset, then move it to the selected device.
model = Model()
model = model.double().to(device)

# Total number of scalar parameters in the network.
print(sum(param.numel() for param in model.parameters()))

In [None]:
def plot_loss(loss, title, xlabel, ylabel):
    """Plot a sequence of loss values against their 0-based position and show it.

    Args:
        loss: Sequence of loss values, one per step on the x-axis.
        title: Figure title.
        xlabel: Label for the x-axis.
        ylabel: Label for the y-axis.
    """
    positions = range(len(loss))
    plt.plot(positions, loss)
    plt.title(title)
    plt.xlabel(xlabel)
    plt.ylabel(ylabel)
    plt.show()

In [None]:
# Adam over every model parameter, learning rate 0.001.
optimizer = optim.Adam(params=model.parameters(), lr=1e-3)
# Multi-class cross entropy computed on the raw logits returned by the model.
criterion = nn.CrossEntropyLoss()

In [None]:
# --- Training configuration ---------------------------------------------------
epoch_count = 100
epoch_train_loss = []  # mean training loss of each epoch
epoch_val_loss = []    # mean validation loss of each epoch
# Number of consecutive non-improving epochs tolerated before stopping.
# Infinity disables early stopping; set an integer to enable it.
# (The `np.Inf` alias was removed in NumPy 2.0, hence the lowercase spelling.)
patience = np.inf
patience_lost = 0

for epoch in range(1, epoch_count + 1):
    print('Epoch', epoch)

    # ---- training pass ----
    model.train()
    batch_train_loss = []
    for (X, y) in tqdm(train_loader, desc='Training epoch ' + str(epoch), leave=False):
        X, y = X.to(device), y.to(device)
        optimizer.zero_grad()
        pred = model(X)
        loss = criterion(pred, y)
        loss.backward()
        optimizer.step()
        batch_train_loss.append(loss.item())
    epoch_train_loss.append(np.mean(batch_train_loss))
    # Trailing space keeps "Train loss" and "Val loss" apart on the shared line.
    print('Train loss: %.3f' % epoch_train_loss[-1], flush=True, end=' ')

    # ---- validation pass (no gradients, eval mode) ----
    batch_val_loss = []
    model.eval()
    with torch.no_grad():
        for (X, y) in tqdm(val_loader, desc='Validation epoch', leave=False):
            X, y = X.to(device), y.to(device)
            pred = model(X)
            loss = criterion(pred, y)
            batch_val_loss.append(loss.item())
    epoch_val_loss.append(np.mean(batch_val_loss))
    print('Val loss: %.3f' % epoch_val_loss[-1], flush=True)

    # ---- early stopping ----
    # NOTE: improvement is measured against the previous epoch, not the best
    # validation loss seen so far.
    if epoch > 1:
        if epoch_val_loss[-1] >= epoch_val_loss[-2]:
            patience_lost += 1
            # `>=` so the loop still stops if the counter ever passes the limit.
            if patience_lost >= patience:
                break
        else:
            patience_lost = 0

In [None]:
# One figure per split: training curve first, then validation.
for _losses, _title, _ylabel in (
    (epoch_train_loss, 'Train cross entropy loss across epochs', 'Train CrossEntropyLoss'),
    (epoch_val_loss, 'Validation cross entropy loss across epochs', 'Val CrossEntropyLoss'),
):
    plot_loss(_losses, _title, 'Epoch', _ylabel)

In [None]:
from scipy.special import softmax
from sklearn.metrics import classification_report

def calc_metrics(dataset, model):
    """Print a per-class classification report for ``model`` on ``dataset``.

    The whole dataset is pushed through the model in a single forward pass.

    Args:
        dataset: Object exposing the feature tensor ``X`` and label tensor ``y``.
        model: Network returning one row of class logits per input row.

    Returns:
        None. The report is printed.
    """
    # Evaluate in eval mode, then put the model back into whatever mode the
    # caller had it in so this function has no lasting effect on training.
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            logits = model(dataset.X.to(device))
    finally:
        model.train(was_training)

    # Softmax is monotonic, so the arg-max of the logits is already the
    # predicted class; no normalisation is needed.
    y_pred = logits.argmax(dim=1).cpu().numpy()
    # Labels are only consumed on the host, so they never need to visit the device.
    y_true = dataset.y.cpu().numpy()

    report = classification_report(y_true, y_pred)
    print(report)

In [None]:
# Report classification metrics for every split, in train / validation / test order.
for _heading, _split in (
    ('Train evaluation', train_set),
    ('Validation evaluation', val_set),
    ('Test evaluation', test_set),
):
    print(_heading)
    calc_metrics(_split, model)