In [1]:
import pandas as pd
from termcolor import colored
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
from wordcloud import WordCloud, STOPWORDS
import numpy as np
import bq_helper
from bq_helper import BigQueryHelper
import string
import pandas as pd
from nltk.stem.snowball import SnowballStemmer

import warnings
# Silence every warning category for the whole notebook.
# NOTE(review): this also hides deprecation warnings from pandas and other libraries.
warnings.filterwarnings("ignore")

# Random seed; passed as random_state when sampling rows later in the notebook.
seed = 1996

In [2]:
# Load the train and test CSV files from the us-patent-phrase-to-phrase-matching input directory.
train_df = pd.read_csv("../input/us-patent-phrase-to-phrase-matching/train.csv")
test_df = pd.read_csv("../input/us-patent-phrase-to-phrase-matching/test.csv")

In [3]:
# Print the (rows, columns) shape tuple of each frame, highlighted in yellow.
print(f"Number of observations in TRAIN: {colored(train_df.shape, 'yellow')}")
print(f"Number of observations in TEST: {colored(test_df.shape, 'yellow')}")

Number of observations in TRAIN: [33m(36473, 5)[0m
Number of observations in TEST: [33m(36, 4)[0m


In [4]:
# Distinct values present in the score column.
train_df['score'].unique()

array([0.5 , 0.75, 0.25, 0.  , 1.  ])

In [5]:
# How many rows carry each score value.
train_df.score.value_counts()

0.50    12300
0.25    11519
0.00     7471
0.75     4029
1.00     1154
Name: score, dtype: int64

In [6]:
# Preview the first five training rows.
train_df.head()

Unnamed: 0,id,anchor,target,context,score
0,37d61fd2272659b1,abatement,abatement of pollution,A47,0.5
1,7b9652b17b68b7a4,abatement,act of abating,A47,0.75
2,36d72442aefd8232,abatement,active catalyst,A47,0.25
3,5296b0c19e1ce60e,abatement,eliminating process,A47,0.5
4,54c1e3b9184cb5b6,abatement,forest region,A47,0.0


# 1. Data cleaning

In [7]:
# Use English stemmer.
# Module-level Snowball stemmer instance; stemming() calls its .stem() on every word.
stemmer = SnowballStemmer("english")

def remove_punctuations(text):
    """Return ``text`` with every character in ``string.punctuation`` deleted.

    Punctuation is dropped outright (not replaced by a space), so
    ``"hand-operated"`` becomes ``"handoperated"``.
    """
    # A translation table mapping each punctuation character to None removes
    # all of them in a single pass over the string.
    drop_table = str.maketrans('', '', string.punctuation)
    return text.translate(drop_table)

def remove_stopwords(text, stopwords=None):
    """Drop stopwords from ``text`` and return the remaining words joined by single spaces.

    Parameters
    ----------
    text : str
        Input phrase; it is split on whitespace.
    stopwords : collection of str, optional
        Words to remove. Defaults to the ``STOPWORDS`` set imported from
        ``wordcloud`` at the top of the notebook. Matching is exact
        (case-sensitive), so callers should lowercase ``text`` first if needed.

    Returns
    -------
    str
        The surviving words separated by one space each, with no leading or
        trailing whitespace (an empty string if nothing survives).
    """
    if stopwords is None:
        stopwords = STOPWORDS
    # ' '.join avoids the stray leading space that manual concatenation produced.
    return ' '.join(word for word in text.split() if word not in stopwords)

def stemming(text, stem=None):
    """Stem every whitespace-separated word of ``text`` and re-join with single spaces.

    Parameters
    ----------
    text : str
        Input phrase; it is split on whitespace.
    stem : callable, optional
        Function mapping one word to its stem. Defaults to the ``stem`` method
        of the module-level ``stemmer`` object.

    Returns
    -------
    str
        The stemmed words separated by one space each, with no leading or
        trailing whitespace (an empty string for empty input).
    """
    if stem is None:
        stem = stemmer.stem
    # ' '.join avoids the stray leading space that manual concatenation produced,
    # which otherwise ends up inside the cleaned columns and the saved CSV.
    return ' '.join(stem(word) for word in text.split())

* Train data

In [8]:
# Build cleaned versions of both phrase columns ('target' -> 'target1',
# 'anchor' -> 'anchor1'): strip punctuation, drop stopwords, then stem.
for raw_col in ('target', 'anchor'):
    train_df[f'{raw_col}1'] = (
        train_df[raw_col]
        .apply(remove_punctuations)
        .apply(remove_stopwords)
        .apply(stemming)
    )

train_df

Unnamed: 0,id,anchor,target,context,score,target1,anchor1
0,37d61fd2272659b1,abatement,abatement of pollution,A47,0.50,abat pollut,abat
1,7b9652b17b68b7a4,abatement,act of abating,A47,0.75,act abat,abat
2,36d72442aefd8232,abatement,active catalyst,A47,0.25,activ catalyst,abat
3,5296b0c19e1ce60e,abatement,eliminating process,A47,0.50,elimin process,abat
4,54c1e3b9184cb5b6,abatement,forest region,A47,0.00,forest region,abat
...,...,...,...,...,...,...,...
36468,8e1386cbefd7f245,wood article,wooden article,B44,1.00,wooden articl,wood articl
36469,42d9e032d1cd3242,wood article,wooden box,B44,0.50,wooden box,wood articl
36470,208654ccb9e14fa3,wood article,wooden handle,B44,0.50,wooden handl,wood articl
36471,756ec035e694722b,wood article,wooden material,B44,0.75,wooden materi,wood articl


* CPC data

In [9]:
# Load the CPC code titles table from the cpc-codes input directory.
titles = pd.read_csv('../input/cpc-codes/titles.csv')

In [10]:
# Show full cell contents when displaying frames (titles can be very long).
pd.set_option('display.max_colwidth', None)

# Cleaned title: punctuation removed, lowercased, stopwords dropped, stemmed.
titles['title2'] = (
    titles['title']
    .apply(remove_punctuations)
    .str.lower()
    .apply(remove_stopwords)
    .apply(stemming)
)

titles.head()

Unnamed: 0,code,title,section,class,subclass,group,main_group,title2
0,A,HUMAN NECESSITIES,A,,,,,human necess
1,A01,AGRICULTURE; FORESTRY; ANIMAL HUSBANDRY; HUNTING; TRAPPING; FISHING,A,1.0,,,,agricultur forestri anim husbandri hunt trap fish
2,A01B,"SOIL WORKING IN AGRICULTURE OR FORESTRY; PARTS, DETAILS, OR ACCESSORIES OF AGRICULTURAL MACHINES OR IMPLEMENTS, IN GENERAL (making or covering furrows or holes for sowing, planting, or manuring A01C5/00; soil working for engineering purposes E01, E02, E21; {measuring areas for agricultural purposes G01B})",A,1.0,B,,,soil work agricultur forestri part detail accessori agricultur machin implement general make cover furrow hole sow plant manur a01c500 soil work engin purpos e01 e02 e21 measur area agricultur purpos g01b
3,A01B1/00,Hand tools (edge trimmers for lawns A01G3/06 {; machines for working soil A01B35/00; making hand tools B21D}),A,1.0,B,1.0,0.0,hand tool edg trimmer lawn a01g306 machin work soil a01b3500 make hand tool b21d
4,A01B1/02,Spades; Shovels {(hand-operated dredgers E02F3/02)},A,1.0,B,1.0,2.0,spade shovel handoper dredger e02f302


* Merge training and CPC data

In [11]:
# Attach the CPC title information to each training row by matching the row's
# 'context' against the titles' 'code' (default inner join).
train_df1 = pd.merge(train_df, titles, left_on='context', right_on='code')

# Keep only the cleaned text columns plus the CPC identifiers.
unused_columns = ['subclass', 'group', 'main_group', 'id', 'title', 'target', 'anchor', 'context']
train_df2 = train_df1.drop(columns=unused_columns)
# Cast 'class' to integer for the trimmed frame.
train_df2['class'] = train_df2['class'].astype(int)
train_df2

Unnamed: 0,score,target1,anchor1,code,section,class,title2
0,0.50,abat pollut,abat,A47,A,47,furnitur domest articl applianc coffe mill spice mill suction cleaner general
1,0.75,act abat,abat,A47,A,47,furnitur domest articl applianc coffe mill spice mill suction cleaner general
2,0.25,activ catalyst,abat,A47,A,47,furnitur domest articl applianc coffe mill spice mill suction cleaner general
3,0.50,elimin process,abat,A47,A,47,furnitur domest articl applianc coffe mill spice mill suction cleaner general
4,0.00,forest region,abat,A47,A,47,furnitur domest articl applianc coffe mill spice mill suction cleaner general
...,...,...,...,...,...,...,...
36468,0.00,undulatori swimmer,undul,B31,B,31,make articl paper cardboard materi work manner analog paper work paper cardboard materi work manner analog paper
36469,0.00,voltag fluctuat,undul,B31,B,31,make articl paper cardboard materi work manner analog paper work paper cardboard materi work manner analog paper
36470,0.75,transfer web,web transfer,B31,B,31,make articl paper cardboard materi work manner analog paper work paper cardboard materi work manner analog paper
36471,0.25,transfer web,web transfer,B31,B,31,make articl paper cardboard materi work manner analog paper work paper cardboard materi work manner analog paper


# 2. Look into each category

* Extract each category

In [12]:
# From here on, train_df refers to the merged and trimmed frame.
train_df = train_df2

# One sub-frame per score value.
_score = train_df['score']
train_00 = train_df[_score == 0.0]
train_25 = train_df[_score == 0.25]
train_50 = train_df[_score == 0.50]
train_75 = train_df[_score == 0.75]
train_100 = train_df[_score == 1.0]

# Row count of each sub-frame, in ascending score order.
print(*(len(part) for part in (train_00, train_25, train_50, train_75, train_100)))

7471 11519 12300 4029 1154


* Augment 0.75 cases (swap anchor and target to create extra examples)

In [13]:
# Requested number of 0.75-score rows to sample for augmentation.
Num_switch = 4000
# Cap the sample size at the number of available rows: sampling without
# replacement raises ValueError when more rows are requested than exist.
temp = train_75.sample(min(Num_switch, len(train_75)), random_state = seed)
temp.head()

Unnamed: 0,score,target1,anchor1,code,section,class,title2
15657,0.75,time seri graph,board id,G01,G,1,measur test
21940,0.75,work height,oper height,B61,B,61,railway
29208,0.75,collat apparatus,collat,G11,G,11,inform storag
20987,0.75,light thick,length light,G03,G,3,photographi cinematographi analog techniqu use wave optic wave electrographi holographi
9413,0.75,unexpect posit,abnorm posit,D03,D,3,weav


In [14]:
# Exchange the two phrase columns so that each sampled row's anchor becomes
# its target and vice versa. Both right-hand sides are read from the frame
# before assign() builds the new one, so the swap is consistent.
temp = temp.assign(anchor1=temp['target1'], target1=temp['anchor1'])

temp.head()

Unnamed: 0,score,target1,anchor1,code,section,class,title2
15657,0.75,board id,time seri graph,G01,G,1,measur test
21940,0.75,oper height,work height,B61,B,61,railway
29208,0.75,collat,collat apparatus,G11,G,11,inform storag
20987,0.75,length light,light thick,G03,G,3,photographi cinematographi analog techniqu use wave optic wave electrographi holographi
9413,0.75,abnorm posit,unexpect posit,D03,D,3,weav


* Merge data (append the swapped 0.75 rows to the training set)

In [15]:
# Frequency of each cleaned anchor phrase in the merged training frame.
train_df['anchor1'].value_counts()

 compon composit coat              152
 sheet suppli roller               150
 sourc voltag                      140
 perfluoroalkyl group              136
 el display                        135
                                  ... 
 shannon                             2
 dri coat composition1               2
 plug nozzl                          2
 conduct conduct materi              1
 peripher nervous system stimul      1
Name: anchor1, Length: 733, dtype: int64

In [16]:
# Append the swapped 0.75 rows to the training frame.
# pd.concat is used instead of DataFrame.append, which was deprecated in
# pandas 1.4 and removed in pandas 2.0. Row labels are kept as they are
# (no ignore_index), exactly as append did, so the labels of the sampled rows
# repeat labels already present in train_df.
train = pd.concat([train_df, temp])
train.score.value_counts()  # not rendered: only the cell's last expression is displayed
train['anchor1'].value_counts()

 compon composit coat            152
 sheet suppli roller             150
 sourc voltag                    141
 perfluoroalkyl group            136
 el display                      135
                                ... 
 photograph light sensit film      1
 pulp cook                         1
 content analysi test              1
 inner support                     1
 sphygmomanomet devic              1
Name: anchor1, Length: 4056, dtype: int64

* Form input

In [17]:
# Human-readable name for each CPC section letter.
sections = dict(
    A="Human Necessities",
    B="Operations and Transport",
    C="Chemistry and Metallurgy",
    D="Textiles",
    E="Fixed Constructions",
    F="Mechanical Engineering",
    G="Physics",
    H="Electricity",
    Y="Emerging Cross-Sectional Technologies",
)

In [18]:
# Display the combined frame (original rows followed by the swapped 0.75 rows).
train

Unnamed: 0,score,target1,anchor1,code,section,class,title2
0,0.50,abat pollut,abat,A47,A,47,furnitur domest articl applianc coffe mill spice mill suction cleaner general
1,0.75,act abat,abat,A47,A,47,furnitur domest articl applianc coffe mill spice mill suction cleaner general
2,0.25,activ catalyst,abat,A47,A,47,furnitur domest articl applianc coffe mill spice mill suction cleaner general
3,0.50,elimin process,abat,A47,A,47,furnitur domest articl applianc coffe mill spice mill suction cleaner general
4,0.00,forest region,abat,A47,A,47,furnitur domest articl applianc coffe mill spice mill suction cleaner general
...,...,...,...,...,...,...,...
29131,0.75,fire cartridg,fire cartridg case,C06,C,6,explos match
6397,0.75,determin analysi,calcul analysi,H04,H,4,electr communic techniqu
34935,0.75,grip layer,grip surfac,D05,D,5,sew embroid tuft
32483,0.75,sphygmomanomet,sphygmomanomet devic,G05,G,5,control regul


In [19]:
# Lowercased section name looked up from the section letter.
train['topic'] = train['section'].map(sections).str.lower()

# Model input text: anchor, section letter, class number, section name and
# cleaned CPC title, joined left to right with single spaces.
input_parts = [
    train['anchor1'],
    train['section'].str.lower(),
    train['class'].astype(str),
    train['topic'],
    train['title2'].str.lower(),
]
combined = input_parts[0]
for part in input_parts[1:]:
    combined = combined + ' ' + part
train['input'] = combined
train

Unnamed: 0,score,target1,anchor1,code,section,class,title2,topic,input
0,0.50,abat pollut,abat,A47,A,47,furnitur domest articl applianc coffe mill spice mill suction cleaner general,human necessities,abat a 47 human necessities furnitur domest articl applianc coffe mill spice mill suction cleaner general
1,0.75,act abat,abat,A47,A,47,furnitur domest articl applianc coffe mill spice mill suction cleaner general,human necessities,abat a 47 human necessities furnitur domest articl applianc coffe mill spice mill suction cleaner general
2,0.25,activ catalyst,abat,A47,A,47,furnitur domest articl applianc coffe mill spice mill suction cleaner general,human necessities,abat a 47 human necessities furnitur domest articl applianc coffe mill spice mill suction cleaner general
3,0.50,elimin process,abat,A47,A,47,furnitur domest articl applianc coffe mill spice mill suction cleaner general,human necessities,abat a 47 human necessities furnitur domest articl applianc coffe mill spice mill suction cleaner general
4,0.00,forest region,abat,A47,A,47,furnitur domest articl applianc coffe mill spice mill suction cleaner general,human necessities,abat a 47 human necessities furnitur domest articl applianc coffe mill spice mill suction cleaner general
...,...,...,...,...,...,...,...,...,...
29131,0.75,fire cartridg,fire cartridg case,C06,C,6,explos match,chemistry and metallurgy,fire cartridg case c 6 chemistry and metallurgy explos match
6397,0.75,determin analysi,calcul analysi,H04,H,4,electr communic techniqu,electricity,calcul analysi h 4 electricity electr communic techniqu
34935,0.75,grip layer,grip surfac,D05,D,5,sew embroid tuft,textiles,grip surfac d 5 textiles sew embroid tuft
32483,0.75,sphygmomanomet,sphygmomanomet devic,G05,G,5,control regul,physics,sphygmomanomet devic g 5 physics control regul


# Save training data

In [20]:
# Keep only the model input, the cleaned target phrase (renamed to 'target')
# and the score.
train_data = (
    train.loc[:, ['input', 'target1', 'score']]
    .rename(columns={'target1': "target"})
)
train_data.head()

Unnamed: 0,input,target,score
0,abat a 47 human necessities furnitur domest articl applianc coffe mill spice mill suction cleaner general,abat pollut,0.5
1,abat a 47 human necessities furnitur domest articl applianc coffe mill spice mill suction cleaner general,act abat,0.75
2,abat a 47 human necessities furnitur domest articl applianc coffe mill spice mill suction cleaner general,activ catalyst,0.25
3,abat a 47 human necessities furnitur domest articl applianc coffe mill spice mill suction cleaner general,elimin process,0.5
4,abat a 47 human necessities furnitur domest articl applianc coffe mill spice mill suction cleaner general,forest region,0.0


In [21]:
# Write the prepared training table to the working directory, without the index column.
train_data.to_csv('uspppm_train.csv', index=False)