<a href="https://colab.research.google.com/github/GabeAspir/Patent-Prior-Art-Finder/blob/main/4_SimilarityTechniques/Getting_into_metrics_Ephraim.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Text methodologies
* Importing new data
* Tokenizing



In [21]:
import pandas
from google.colab import files
from google.colab import drive
from sklearn.metrics.pairwise import pairwise_distances
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import jaccard_score
from numpy import dot
from numpy.linalg import norm
import io
import re
import typing

# I added '' to the stopwords to avoid the case where a short first word turns into an empty list item
stop_words= {'','i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', 'too', 'very', 's', 't', 'can', 'will', 'just', 'don', "don't", 'should', "should've", 'now', 'd', 'll', 'm', 'o', 're', 've', 'y', 'ain', 'aren', "aren't", 'couldn', "couldn't", 'didn', "didn't", 'doesn', "doesn't", 'hadn', "hadn't", 'hasn', "hasn't", 'haven', "haven't", 'isn', "isn't", 'ma', 'mightn', "mightn't", 'mustn', "mustn't", 'needn', "needn't", 'shan', "shan't", 'shouldn', "shouldn't", 'wasn', "wasn't", 'weren', "weren't", 'won', "won't", 'wouldn', "wouldn't"}

In [3]:
uploaded = files.upload()

for fn in uploaded.keys():
  print('User uploaded file "{name}" with length {length} bytes'.format(
      name=fn, length=len(uploaded[fn])))
pt = pandas.read_csv(io.BytesIO(uploaded['tenmatch.csv']))

Saving tenmatch.csv to tenmatch (1).csv
User uploaded file "tenmatch.csv" with length 492072 bytes


#Tokenization

In [4]:
def tokenize(input):
  out = input.lower()
  out= re.sub(r'\b\w{1,2}\b','',out) #remove anything not a word of length 2+
  out= re.sub(r"[0-9]+","_NUM_",out) #substitute _NUM_ for any block of consecutive number chars
  words= re.split('\W+', out) #Might need to change to pandas split at some point
                            # Note capital W is "Not word"= [a-zA-Z0-9_]
  words= list(filter(lambda s: s not in stop_words,words)) # why list not set?  ¯\_(ツ)_/¯
  return words

In [5]:
def pTokenize(input): #Now bec pandas series are slightly different, and apparantly not itterating over the lists as strings is better (see: https://stackoverflow.com/questions/16476924/how-to-iterate-over-rows-in-a-dataframe-in-pandas) 
  out = input.str.lower()
  out= out.str.replace(r"[0-9]+","_NUM_") #substitute _NUM_ for any block of consecutive number chars
  out= out.str.replace(r'\b\w{1,2}\b',"") #remove anything not a word of length 2+
  out= out.str.split('\W+') # Note capital W is "Not word"= [a-zA-Z0-9_]
  r= [s for s in r if not s in stop_words]
  return out

In [6]:
def bow(series,corpus =None): # Takes a tokenized series
  if corpus is None:
    corpus = getCorpus(series)
  counts=[]
  for r in series:
    count={}
    for w in corpus:
      count[w]=r.count(w)
    counts.append(count)
  return counts
def getCorpus(series):
  corpus=set()
  for r in series:
    corpus.update(r)
  #print("corpus: "+str(corpus) )
  return corpus

It seems like for Jaccard it will be way simpler to use a binary BOW, so lets set that up.

In [8]:
def bbow(input,corpus =None): #Binary bag of words 
  if corpus is None:
    corpus = getCorpus(input)
  output=[]
  for r in input:
    out= [s in r for s in corpus]
    output.append(out)
  return output 

In [9]:
def func_Jaccard(thingOne,thingTwo): # Binary Jaccard index of 2 lists
  #output=[one is two for one,two in zip(thingOne,thingTwo)]
  output=0
  for one,two in zip(thingOne,thingTwo):
    if one is two:
      output +=1
  output= output / len(thingOne)
  return output
textA= [True, True, False]
textB= [False, True, True]
print(func_Jaccard(textA,textB))
textC= [True,True,True,True]
textD=[False,False,False,False]
print(func_Jaccard(textD,textC))

0.3333333333333333
0.0


In [10]:
def bJac(input_series,index_series): # Binary Jaccard index for a Series against itself
  fintable=pandas.DataFrame(index_series)
  for r,name in zip(input_series,index_series):
    mysim=[]
    for rr in input_series:
      mysim.append(func_Jaccard(r,rr))
    fintable[name]=mysim
  return fintable

In [11]:
def prep(frame):
  frame['tokenized']=frame['Publication_Number']
  for index, row in pt.iterrows():
    frame['tokenized'][index]= tokenize(frame['Abstract'][index])
  frame['Bagged']= bow(frame['tokenized'])
  temp_sim_Table=  bJac(bbow(frame['tokenized']),frame['Publication_Number'])
  return temp_sim_Table

Now Lets try that again with a random sample

In [None]:
uploaded = files.upload()

for fn in uploaded.keys():
  print('User uploaded file "{name}" with length {length} bytes'.format(
      name=fn, length=len(uploaded[fn])))

In [12]:
sim_Table = prep(pt)

# SciKit
ok now we'll actually have to use sciKit

In [13]:
def bCos(input_series,index_series): # Binary cosine index for a Series against itself
  fintable=pandas.DataFrame(index_series)
  for a,name in zip(input_series,index_series):
    mysim=[]
    for b in input_series:
      sim=dot(a, b)/(norm(a)*norm(b))
      mysim.append(sim)
      #mysim.append(cosine_similarity(r)) #<< Once again, didn't get input to work
    fintable[name]=mysim
  return fintable

In [15]:
temp_sim_Table=  bCos(bbow(pt['tokenized']),pt['Publication_Number'])

In [16]:
temp_sim_Table

Unnamed: 0,Publication_Number,US-9428454-B2,US-9437520-B2,US-9447236-B2,US-9449736-B2,US-2008199533-A1,US-PP25006-P3,US-2001032345-A1,US-2003195014-A1,US-2003009852-A1,US-2005155182-A1
0,US-9428454-B2,0.022222,0.034199,0.025565,0.0,0.017568,0.0,0.019087,0.024183,0.024183,0.0
1,US-9437520-B2,0.034199,0.052632,0.039344,0.051299,0.027037,0.0,0.029374,0.037216,0.037216,0.043355
2,US-9447236-B2,0.025565,0.039344,0.029412,0.038348,0.020211,0.0,0.021958,0.027821,0.027821,0.03241
3,US-9449736-B2,0.0,0.051299,0.038348,0.05,0.0,0.0,0.0,0.036274,0.0,0.042258
4,US-2008199533-A1,0.017568,0.027037,0.020211,0.0,0.013889,0.023113,0.015089,0.019118,0.0,0.0
5,US-PP25006-P3,0.0,0.0,0.0,0.0,0.023113,0.038462,0.02511,0.0,0.0,0.0
6,US-2001032345-A1,0.019087,0.029374,0.021958,0.0,0.015089,0.02511,0.016393,0.02077,0.02077,0.0
7,US-2003195014-A1,0.024183,0.037216,0.027821,0.036274,0.019118,0.0,0.02077,0.026316,0.026316,0.030657
8,US-2003009852-A1,0.024183,0.037216,0.027821,0.0,0.0,0.0,0.02077,0.026316,0.026316,0.030657
9,US-2005155182-A1,0.0,0.043355,0.03241,0.042258,0.0,0.0,0.0,0.030657,0.030657,0.035714


#SciKit Cosine (Actually Working)

In [17]:
scicos= cosine_similarity(bbow(pt['tokenized']))

In [19]:
costb= pandas.DataFrame(scicos)
costb

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,1.0,0.034199,0.076696,0.0,0.052705,0.0,0.019087,0.024183,0.024183,0.0
1,0.034199,1.0,0.039344,0.051299,0.027037,0.0,0.058747,0.074432,0.074432,0.130066
2,0.076696,0.039344,1.0,0.115045,0.060634,0.0,0.087833,0.027821,0.027821,0.03241
3,0.0,0.051299,0.115045,1.0,0.0,0.0,0.0,0.036274,0.0,0.042258
4,0.052705,0.027037,0.060634,0.0,1.0,0.069338,0.105625,0.038236,0.0,0.0
5,0.0,0.0,0.0,0.0,0.069338,1.0,0.07533,0.0,0.0,0.0
6,0.019087,0.058747,0.087833,0.0,0.105625,0.07533,1.0,0.02077,0.041541,0.0
7,0.024183,0.074432,0.027821,0.036274,0.038236,0.0,0.02077,1.0,0.052632,0.153285
8,0.024183,0.074432,0.027821,0.0,0.0,0.0,0.041541,0.052632,1.0,0.030657
9,0.0,0.130066,0.03241,0.042258,0.0,0.0,0.0,0.153285,0.030657,1.0


# SciKit Jaccard (In-progress)

In [29]:
myBag= bbow(pt['tokenized']) # Better late than never
jacsci = pandas.DataFrame(jaccard_score(myBag,myBag, average=None))

In [30]:
jacsci

Unnamed: 0,0
0,1.0
1,1.0
2,1.0
3,1.0
4,1.0
...,...
330,1.0
331,1.0
332,1.0
333,1.0
