In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

# import numpy as np # linear algebra
# import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

# import os
# for dirname, _, filenames in os.walk('/kaggle/input'):
#     for filename in filenames:
#         print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import time
from scipy import stats
from tqdm import tqdm

import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader

!pip install transformers
!pip install sentencepiece
import tokenizers
import transformers
from transformers import AutoTokenizer, AutoModel, AutoConfig, AutoModelForSequenceClassification

[0m

In [3]:
model_name = 'microsoft/deberta-v3-small'
tokenizer = AutoTokenizer.from_pretrained('../input/patentmatching', use_fast=True)

In [4]:
df_test = pd.read_csv('../input/us-patent-phrase-to-phrase-matching/test.csv')

In [5]:
context_mapping_df = pd.read_csv('../input/patentmatching/titles.csv')
context_mapping = {}
for code, context in zip(context_mapping_df['code'], context_mapping_df['title']):
    context_mapping[code] = context

context_title_mapping = {"A" : "Human Necessities", 
      "B" : "Operations and Transport",
      "C" : "Chemistry and Metallurgy",
      "D" : "Textiles",
      "E" : "Fixed Constructions",
      "F" : "Mechanical Engineering",
      "G" : "Physics",
      "H" : "Electricity",
      "Y" : "Emerging Cross-Sectional Technologies"}

df_test['context_text'] = df_test['context'].apply(lambda x: context_mapping[x].lower())
df_test['context_title'] = df_test['context'].apply(lambda x: context_title_mapping[x[0]].lower())

df_test['text'] = df_test['anchor'] + '[SEP]' + df_test['target'] + '[SEP]' + df_test['context_text']

In [6]:
label_mapping = {0.0: 0, 0.25: 1, 0.5: 2, 0.75: 3, 1.0: 4}
reverse_label_mapping = {}
for k, v in label_mapping.items():
    reverse_label_mapping[v] = k

In [7]:
class TestPatentDataset(Dataset):
    def __init__(self, df, tokenizer, max_len):
        self.texts = df['text'].values.tolist()
        self.max_len = max_len
    def __len__(self):
        return len(self.texts)
    def __getitem__(self, idx):
        inputs = tokenizer(self.texts[idx], padding='max_length', max_length = self.max_len, truncation=True, return_tensors="pt")
        for k, v in inputs.items():
            inputs[k] = v.squeeze(0)
        return inputs

In [8]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
batch_size = 16
max_len = 100

In [9]:
test_dataset = TestPatentDataset(df_test, tokenizer, max_len)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False, num_workers=2)

In [10]:
class CustomModel(nn.Module):
    def __init__(self, model_name):
        super().__init__()
        self.model = AutoModel.from_pretrained('../input/patentmatching')
        self.fc_dropout = nn.Dropout()
        self.fc = nn.Linear(768, 5)
        self.attention = nn.Sequential(
            nn.Linear(768, 512),
            nn.Tanh(),
            nn.Linear(512, 1),
            nn.Softmax(dim=1)
        )
    def forward(self, inputs):
        outputs = self.model(**inputs)
        last_hidden_states = outputs[0]
        # feature = torch.mean(last_hidden_states, 1)
        weights = self.attention(last_hidden_states)
        feature = torch.sum(weights * last_hidden_states, dim=1)
        output = self.fc(self.fc_dropout(feature))
        return output

In [11]:
model = CustomModel(model_name)
model.load_state_dict(torch.load(f'../input/patentmatching/patent_model_2.pth'))
model = model.to(device)

In [12]:
model.eval()
test_preds = []
for inputs in tqdm(test_loader):
    with torch.no_grad():
        for k, v in inputs.items():
            inputs[k] = v.to(device)
        y_preds = model(inputs)
        test_preds.append(y_preds.argmax(dim=-1).to('cpu').detach().numpy())
predictions = np.concatenate(test_preds)

100%|██████████| 3/3 [00:01<00:00,  2.25it/s]


In [13]:
submission_score = [reverse_label_mapping[i] for i in predictions]
submission_df = df_test[['id']]
submission_df['score'] = submission_score
submission_df.to_csv('./submission.csv', index=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until
