In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Load the two competition splits from the local data directory.
# The training split carries a `score` column; the test split does not.
train_df = pd.read_csv(filepath_or_buffer="data/train.csv")
test_df = pd.read_csv(filepath_or_buffer="data/test.csv")

In [3]:
# Preview the first five training rows.
train_df.head(n=5)

Unnamed: 0,id,anchor,target,context,score
0,37d61fd2272659b1,abatement,abatement of pollution,A47,0.5
1,7b9652b17b68b7a4,abatement,act of abating,A47,0.75
2,36d72442aefd8232,abatement,active catalyst,A47,0.25
3,5296b0c19e1ce60e,abatement,eliminating process,A47,0.5
4,54c1e3b9184cb5b6,abatement,forest region,A47,0.0


In [4]:
# Preview the first five test rows.
test_df.head(n=5)

Unnamed: 0,id,anchor,target,context
0,4112d61851461f60,opc drum,inorganic photoconductor drum,G02
1,09e418c93a776564,adjust gas flow,altering gas flow,F23
2,36baf228038e314b,lower trunnion,lower locating,B60
3,1f37ead645e7f0c8,cap component,upper portion,D06
4,71a5b6ad068d531f,neural stimulation,artificial neural network,H04


In [5]:
# Summarise only the object-dtype (text) columns of the training split:
# row count, number of distinct values, most frequent value and its frequency.
train_df.describe(include=["object"])

Unnamed: 0,id,anchor,target,context
count,36473,36473,36473,36473
unique,36473,733,29340,106
top,9450e2bc3bf8cfcf,component composite coating,composition,H01
freq,1,152,24,2186


In [6]:
# Same object-column summary for the test split, for comparison with train.
test_df.describe(include=["object"])

Unnamed: 0,id,anchor,target,context
count,36,36,36,36
unique,36,34,36,29
top,b8ae62ea5e1d8bdb,el display,carrier polarity,G02
freq,1,2,1,3


Context is an important factor in this dataset: two phrases might seem unrelated at first, but turn out to be related once the context is taken into account. Each context is identified by a specific code (for example `A47` or `H01`). The titles describing these codes have been collected and made available [here](https://www.kaggle.com/datasets/xhlulu/cpc-codes).

In [7]:
# Load the lookup table that pairs each context code with its title,
# then preview the first five rows.
context_df = pd.read_csv(filepath_or_buffer="data/titles.csv")
context_df.head(n=5)

Unnamed: 0,code,title,section,class,subclass,group,main_group
0,A,HUMAN NECESSITIES,A,,,,
1,A01,AGRICULTURE; FORESTRY; ANIMAL HUSBANDRY; HUNTI...,A,1.0,,,
2,A01B,SOIL WORKING IN AGRICULTURE OR FORESTRY; PARTS...,A,1.0,B,,
3,A01B1/00,Hand tools (edge trimmers for lawns A01G3/06 ...,A,1.0,B,1.0,0.0
4,A01B1/02,Spades; Shovels {(hand-operated dredgers E02F3...,A,1.0,B,1.0,2.0


In [11]:
# Build a code -> title lookup Series, then attach the lower-cased title of
# each row's context code to the training frame as `context_desc`.
# Codes with no entry in the lookup come out as NaN.
code_to_title = context_df.set_index('code')['title']
context_titles = train_df['context'].map(code_to_title)
train_df['context_desc'] = context_titles.str.lower()

In [23]:
import torch
from transformers import pipeline

# Sentiment-analysis pipeline; the same checkpoint name supplies both the
# model weights and the tokenizer.
path = 'distilbert-base-uncased-finetuned-sst-2-english'
classify = pipeline(task="sentiment-analysis", model=path, tokenizer=path)

Downloading:   0%|          | 0.00/629 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/255M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

In [36]:
# Quick smoke test of the sentiment pipeline on a single word.
sample_text = "Alas"
classify(sample_text)

[{'label': 'NEGATIVE', 'score': 0.994047999382019}]

In [37]:
## TEXT GENERATION | Model Name: 'gpt2-medium'
## Language: English | Model Architecture: GPT-2
## ⚙️Decoding Tutorial⚙️: https://huggingface.co/blog/how-to-generate

from transformers import pipeline, set_seed

path = 'gpt2-medium'

# Text-generation pipeline; the same checkpoint name supplies both the model
# weights and the tokenizer.
generate = pipeline('text-generation', model=path, tokenizer=path)

#⚙️Decoding Parameters⚙️
max_length = 50            # upper bound on output length in tokens, prompt included
num_return_sequences = 1   # number of independent continuations to return
clean_up = False           # keep the tokenizer's raw spacing in the decoded text
temperature = 0.7          # <1 sharpens the next-token distribution
top_p = 0.95               # nucleus sampling: smallest token set with cumulative prob >= top_p
top_k = 60                 # restrict sampling to the k most likely tokens

text = 'There was an idea'

# Sampling is stochastic: seed the RNGs immediately before generating so that
# a fresh-kernel "Run All" reproduces the continuation shown below.
SEED = 42
set_seed(SEED)

# temperature / top_p / top_k only take effect when sampling is enabled, so
# request it explicitly instead of relying on the checkpoint's configuration.
generate(text,
         max_length=max_length,
         num_return_sequences=num_return_sequences,
         clean_up_tokenization_spaces=clean_up,
         do_sample=True,
         temperature=temperature,
         top_p=top_p,
         top_k=top_k)

Downloading:   0%|          | 0.00/718 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/0.99M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/446k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.29M [00:00<?, ?B/s]

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


[{'generated_text': 'There was an idea in the back of my head that it was something that was impossible, but the moment I made it happen, I knew it was a good idea. I thought it would be really funny and amazing to tell a story about someone who'}]

In [46]:
# Longer continuation (up to 100 tokens) for a story-style prompt. All other
# decoding settings are the globals defined in the text-generation setup cell.
prompt = "The little village is so peaceful to live, but one day"
decoding_kwargs = {
    "max_length": 100,
    "num_return_sequences": num_return_sequences,
    "clean_up_tokenization_spaces": clean_up,
    "temperature": temperature,
    "top_p": top_p,
    "top_k": top_k,
}
generate(prompt, **decoding_kwargs)

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


[{'generated_text': 'The little village is so peaceful to live, but one day the town is attacked by orcs, and the only way to defend against them is to fight the orcs themselves. Now you can join the battle, and you can make your own way to the orc village, but if you don\'t you will end up being hunted down by the orc army."\n\n"I have no choice but to join the army. But I have no idea how the army works."\n\nThe sound of the sword'}]