In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from termcolor import colored
from wordcloud import WordCloud, STOPWORDS
import plotly.express as px
import bq_helper
from bq_helper import BigQueryHelper

import warnings
# Silence every Python warning for the rest of the session (this also hides
# deprecation notices and pandas alignment warnings raised by later cells).
warnings.filterwarnings("ignore")

The goal is to build a model that matches phrases in order to extract contextual information. In this competition, you will train your models on a novel semantic similarity dataset to extract relevant information by matching key phrases in patent documents. Determining the semantic similarity between phrases is critically important during the patent search and examination process to determine if an invention has been described before. For example, if one invention claims "television set" and a prior publication describes "TV set", a model would ideally recognize these are the same and assist a patent attorney or examiner in retrieving relevant documents.

In [None]:
# Read the competition's training and test splits from the Kaggle input directory.
train_df = pd.read_csv(
    "../input/us-patent-phrase-to-phrase-matching/train.csv"
)
test_df = pd.read_csv(
    "../input/us-patent-phrase-to-phrase-matching/test.csv"
)

Looking at 10 randomly sampled rows

In [None]:
# Draw 10 random rows to get a feel for the data.
train_df.sample(n=10)

We have **Anchor** and **Target**: Anchor is the patent phrase and Target is the phrase to be matched against it. The **Score** indicates how closely they match, ranging from 0 (not at all matching) to 1 (identical match).

In [None]:
# Print the column names, dtypes, and non-null counts of the training frame.
train_df.info()

In [None]:
# Count missing values per column.
train_df.isna().sum()

In [None]:
# Rows that repeat an earlier row once the `id` column is ignored.
is_repeat = train_df.drop(columns="id").duplicated()
train_df.loc[is_repeat]

Observations:
1. No missing values in the training data
2. No duplicate values in the training data

In [None]:
# Number of distinct anchor phrases.
train_df['anchor'].nunique()

In [None]:
# The 20 anchors that appear in the most training rows.
train_df['anchor'].value_counts().head(n=20)

In [None]:
# Show the rows for one specific anchor whose target mentions "base"
# (case-insensitive).
pattern = 'base'
mask = train_df['target'].str.contains(pattern, case=False, na=False)
# Combine both conditions into a single mask over the full frame. Indexing the
# already-filtered query result with a full-length mask only works because
# pandas re-aligns the boolean Series, which it flags with a
# "Boolean Series key will be reindexed" warning.
train_df[(train_df['anchor'] == 'component composite coating') & mask]

In [None]:
# Word cloud over the anchor phrases. Every non-null row contributes its anchor,
# so anchors that occur in many rows weigh more.
anchor_desc = train_df['anchor'].dropna().values
stopwords = set(STOPWORDS)
cloud_maker = WordCloud(
    width=1000,
    height=600,
    background_color='white',
    min_font_size=4,
    stopwords=stopwords,
)
wordcloud = cloud_maker.generate(" ".join(anchor_desc))

plt.figure(figsize=(8, 8))
plt.imshow(wordcloud)
plt.axis("off")
plt.show()

In [None]:
# Number of whitespace-separated words in each anchor.
train_df['anchor_len'] = train_df['anchor'].str.split().str.len()

# List the distinct anchors made of five and of four words.
five_word_anchors = train_df.query('anchor_len == 5')['anchor'].unique()
four_word_anchors = train_df.query('anchor_len == 4')['anchor'].unique()
print(f"Anchors with a maximum length of 5: \n{five_word_anchors}")
print(f"\nAnchors with a maximum length of 4: \n{four_word_anchors}")

In [None]:
# How many rows have an anchor of each word count.
train_df['anchor_len'].value_counts()

Observations:
1. Anchors are at most 5 words long
2. Most anchors are 2 words long

In [None]:
# Flag the rows whose anchor contains at least one digit, store the flag as a
# column, and count the flagged anchors.
pattern = '[0-9]'
mask = train_df['anchor'].str.contains(pattern, na=False)
train_df['num_anchor'] = mask
train_df.loc[mask, 'anchor'].value_counts()

Observations:
1. Only 4 observations with numbers in it

In [None]:
# Number of distinct target phrases.
train_df['target'].nunique()

In [None]:
# The 20 targets that appear in the most training rows.
train_df['target'].value_counts().head(n=20)

In [None]:
# Word cloud over the target phrases, built from every non-null row.
target_desc = train_df['target'].dropna().values
stopwords = set(STOPWORDS)
cloud_maker = WordCloud(
    width=1000,
    height=600,
    background_color='white',
    min_font_size=4,
    stopwords=stopwords,
)
wordcloud = cloud_maker.generate(" ".join(target_desc))

plt.figure(figsize=(8, 8))
plt.imshow(wordcloud)
plt.axis("off")
plt.show()

In [None]:
# Number of whitespace-separated words in each target, then the distribution
# of those word counts.
target_word_counts = train_df['target'].str.split().str.len()
train_df['target_len'] = target_word_counts
train_df['target_len'].value_counts()

Observations:
1. The longest target is 15 words long

In [None]:
# Print the distinct targets for each of the largest word counts. Only the
# first heading is printed without a leading blank line.
for position, n_words in enumerate((15, 13, 12, 11)):
    lead = "" if position == 0 else "\n"
    matches = train_df.query(f'target_len == {n_words}')['target'].unique()
    print(f"{lead}Target with a maximum length of {n_words}: \n{matches}")

In [None]:
# Flag the rows whose target contains at least one digit, store the flag as a
# column, and count the flagged targets.
pattern = '[0-9]'
mask = train_df['target'].str.contains(pattern, na=False)
train_df['num_target'] = mask
train_df.loc[mask, 'target'].value_counts()

Observations:
1. There are 112 observations which contain numbers

## Model
We could represent the input to the model as something like "TEXT1: abatement; TEXT2: eliminating process". We'll need to add the context to this too. In Pandas, we just use `+` to concatenate string columns:

In [None]:
# Build the single text field fed to the model: context, target, and anchor,
# each introduced by its own tag.
train_df['input'] = (
    'TEXT1: ' + train_df['context']
    + ' ;TEXT2: ' + train_df['target']
    + ' ;ANC1: ' + train_df['anchor']
)

In [None]:
# Peek at the first few assembled input strings.
train_df['input'].head()

## Tokenization
Transformers works with a `Dataset` object (from the `datasets` library) to hold the data

In [None]:
from datasets import Dataset, DatasetDict
# Wrap the training DataFrame (including the helper columns added above) in a Dataset.
ds = Dataset.from_pandas(train_df)

In [None]:
# Show the Dataset's repr to see how the DataFrame is represented.
ds

Obviously a deep learning model cannot take text as input. It takes numbers as input. So we need to do two things:
* Tokenization: Splitting each text into words/tokens
* Numericalization: Converting each token into numbers

In [None]:
# Name of the pretrained checkpoint; used to load both the tokenizer and the model.
model_nm = 'microsoft/deberta-v3-small'

In [None]:
# AutoTokenizer creates the tokenizer appropriate for the checkpoint named by model_nm.
from transformers import AutoModelForSequenceClassification, AutoTokenizer
tokz = AutoTokenizer.from_pretrained(model_nm)

In [None]:
# Example of how the tokenizer splits a sentence into tokens.
tokz.tokenize("Good day everyone! Let's go")

Observations:
1. Tokens begin with an underscore
2. Unique words are divided into parts

In [None]:
def tok_func(x):
    """Tokenize the ``input`` field of ``x`` with the notebook's tokenizer ``tokz``.

    ``x`` is whatever ``Dataset.map`` passes in (a row, or a batch of rows when
    ``batched=True``); the tokenizer's output is returned unchanged.
    """
    text = x['input']
    return tokz(text)

In [None]:
# Apply tok_func to the whole dataset with map, processing rows in batches.
tok_ds = ds.map(tok_func, batched =True)

This adds a new column, `input_ids`, to our dataset. Let's look at the input and the `input_ids` of the first row

In [None]:
# Take the first tokenized row and show its text next to its token ids.
row = tok_ds[0]
row['input'], row['input_ids']

We have to rename the `score` column because Transformers expects the target column to be named `labels`

In [None]:
# Rename the target column from `score` to `labels`.
tok_ds = tok_ds.rename_column('score', 'labels')

In [None]:
def corr(x, y):
    """Return the Pearson correlation coefficient between ``x`` and ``y``.

    Both inputs are flattened first, so a column vector of shape (n, 1) is
    treated the same as a flat vector of length n. Without flattening,
    ``np.corrcoef`` fails when one argument is (n, 1) and the other is (n,).
    """
    return np.corrcoef(np.ravel(x), np.ravel(y))[0][1]

## Creating validation sets

In [None]:
# Summary statistics of the competition test frame.
test_df.describe()

In [None]:
# Hold out 25% of the tokenized training rows for validation (fixed seed so the
# split is repeatable). The held-out part is stored under the 'test' key.
dds = tok_ds.train_test_split(test_size=0.25, seed=42)
dds

In [None]:
# Build the model input for the competition test set with exactly the same
# template as the training rows ('TEXT1: ... ;TEXT2: ... ;ANC1: ...'), so the
# text seen at inference matches what the model was trained on.
test_df['input'] = 'TEXT1: ' + test_df.context + ' ;TEXT2: ' + test_df.target + ' ;ANC1: ' + test_df.anchor
# train_test_split named the validation split 'test', so the real test set is
# called eval_ds here to avoid confusion.
eval_ds = Dataset.from_pandas(test_df).map(tok_func, batched=True)

In [None]:
# NOTE(review): `corr` is also defined in an earlier cell; this later definition
# is the one in effect when `corr_d` runs.
def corr(x, y):
    """Return the Pearson correlation coefficient between ``x`` and ``y``.

    Both inputs are flattened first, so a column vector of shape (n, 1) is
    treated the same as a flat vector of length n. Without flattening,
    ``np.corrcoef`` fails when one argument is (n, 1) and the other is (n,).
    """
    return np.corrcoef(np.ravel(x), np.ravel(y))[0][1]

In [None]:
def corr_d(eval_pred):
    """Metric callback: unpack ``eval_pred`` into ``corr`` and report the result.

    Returns a one-entry dict keyed ``'pearson'``, the format passed to the
    Trainer via ``compute_metrics``.
    """
    pearson = corr(*eval_pred)
    return {'pearson': pearson}

## Training the model
We'll be needing this to train our model

In [None]:
from transformers import TrainingArguments, Trainer

In [None]:
bs = 63  # per-device training batch size (evaluation uses twice this)
epochs = 7  # number of training epochs

In [None]:
lr = 4e-6  # learning rate passed to TrainingArguments (used with warmup and a cosine schedule)

In [None]:
# Training configuration: checkpoints go to 'outputs', evaluation runs once per
# epoch, and external experiment reporting is switched off.
# NOTE(review): newer transformers releases rename `evaluation_strategy` to
# `eval_strategy` — confirm against the installed version.
args = TrainingArguments(
    'outputs',
    learning_rate=lr,
    warmup_ratio=0.1,
    lr_scheduler_type='cosine',
    fp16=True,
    evaluation_strategy='epoch',
    per_device_train_batch_size=bs,
    per_device_eval_batch_size=bs * 2,
    num_train_epochs=epochs,
    weight_decay=0.01,
    report_to='none',
)

In [None]:
# Load the checkpoint with a single output (num_labels=1) and wire it to the
# train/validation splits, tokenizer, and Pearson metric.
model = AutoModelForSequenceClassification.from_pretrained(model_nm, num_labels=1)
trainer = Trainer(
    model,
    args,
    train_dataset=dds['train'],
    eval_dataset=dds['test'],
    tokenizer=tokz,
    compute_metrics=corr_d,
)

In [None]:
# Fine-tune the model; the trailing semicolon suppresses the cell's output repr.
trainer.train();

In [None]:
# Run the trained model on the competition test set and keep the raw
# predictions as a float array.
prediction_output = trainer.predict(eval_ds)
preds = prediction_output.predictions.astype(float)
preds

Clipping predictions to the valid range: values below 0 become 0 and values above 1 become 1

In [None]:
# Clamp predictions into [0, 1]; no rows are dropped.
preds = np.clip(preds, 0, 1)

In [None]:
# Display the clipped predictions.
preds

In [None]:
import datasets
# `preds` may be a column vector of shape (n, 1) (one column for the single
# regression output). Flatten it so each CSV row holds a plain number rather
# than a one-element list.
submission = datasets.Dataset.from_dict({
    'id': eval_ds['id'],
    'score': np.ravel(preds),
})
# Write the submission file in the id,score format.
submission.to_csv('submission.csv', index=False)