In [20]:
import scipy.spatial.distance
import pandas as pd
import numpy as np
import re
import os
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from gensim.models import Word2Vec
from gensim import models
from gensim.corpora import Dictionary
from timeit import default_timer as timer

import nltk.downloader
nltk.download('stopwords')


class _DevFilesPatentPriorArtFinder:
    def __init__(self, dirPath, publicationNumberColumnString='Publication_Number', comparisonColumnString='Abstract', cit_col= "Citations"):
        startTime = timer()
        self.corpus = []
        self.number_of_patents_with_word = {}
        self.plain_dataframe = None
        self.dataframe = None
        self.word_count_matrix = None
        self.model_words = None
        self.model_citations = None
        self.tfidf_gensim = models.TfidfModel()
        self.dictionary = Dictionary()
        self.dirPath= dirPath
        if dirPath is None:
            raise IOError('The passed file path was empty')
        self.id_col = publicationNumberColumnString
        self.txt_col = comparisonColumnString
        self.cit_col = cit_col

        # Create the folders for metadata files, and will pass should an error thown when the directory exists from a previous object
        try:
            os.mkdir(dirPath+"\meta")
            os.mkdir(dirPath+"\w2v")
            os.mkdir(dirPath+"\other")
        except:
            print("Didn't make directories")
            pass
        print("Initialization complete T="+str(timer()))

    def train(self):
        # Iterates over the files in the directory twice.
        # Once to save the tokenization column to the file, and adds the file to the model's training
        # 2nd time to append the w2v encodings generated from the fully trained model to the files.
        print("Training has begun")
        first=True
        for entry in os.scandir(self.dirPath):
            if entry.is_file():    # To avoid entering the directories
                print("tokenizing "+ str((entry)))
                self._makeModel(entry,first)
        print("Tokenization Completed T="+str(timer()))

        for entry in os.scandir(self.dirPath):
            if entry.is_file():
                print("getting embedding of "+str(entry)+" T="+str(timer()))
                self._makeEmbeddings(entry)
        print("Embeddings completed"+str(timer()))
        self.dictionary.save_as_text(self.dirPath + "\other\\dict.txt")

    def is_gz_file(self, filepath):
        with open(filepath, 'rb') as test_f:
            return test_f.read(2) == b'\x1f\x8b'

    # Private methods for train to call
    def _makeModel(self,file, first):
        try:
            dataframe= pd.io.json.read_json(file,compression="gzip")
        except:
            #print('here before dataframe')
            dataframe= pd.io.json.read_json(file,compression="gzip",lines=True)
            #dataframe= pd.DataFrame.from_records(file)
            #print(dataframe)
            #print('here after dataframe')
        dataframe['Tokens'] = dataframe[self.txt_col].apply(self._tokenizeText)
        dataframe['TokenizedCitations'] = dataframe['Citations'].apply(self._tokenizeCitation)
        # words = 0
        # for index,doc in dataframe.iterrows():
        #     words += len(doc["Tokens"])
        # print(str(file)+" has "+ str(words) +" word tokens")
        self.dictionary.add_documents(dataframe['Tokens'])

        print("Writing "+str(file))
        dataframe.to_json(self.get(file,"meta"), orient='records', indent=4)

        if first:
            first= False
            model_words = Word2Vec(dataframe['Tokens'])
            self.model_words = model_words
            model_citations = Word2Vec(dataframe['TokenizedCitations'], min_count=1)
            self.model_citations = model_citations
        else:
            self.model_words.build_vocab(dataframe["Tokens"], update=True)
            self.model_words.train(dataframe["Tokens"], total_examples=self.model_citations.corpus_count,
                                   epochs=self.model_words.epochs)
            self.model_words.build_vocab(dataframe["TokenizedCitations"], update=True)
            self.model_citations.train(dataframe['TokenizedCitations'],
                                       total_examples=self.model_citations.corpus_count,
                                       epochs=self.model_citations.epochs)

    def _tokenizeCitation(self, string):
        no_commas = string.replace(',',' ')
        tokenized = word_tokenize(no_commas)
        finished = []
        for token in tokenized:
            str = self._removeSuffix(token)
            finished.append(str)
        return list(set(finished))

    def _removeSuffix(self, string):
        tokens = string.split('-')
        # Should always have at least a prefix and the patent, this will take away the suffix or keep it the same
        return str(tokens[0] + '-' + tokens[1])

    # Will add column to dataframe called 'Tokens'
    def _tokenizeText(self, string):
        #prepares the string for tokenization, to lowercase, then removes punctutation, then changes numbers to _NUM_
        string = string.lower()
        string = re.sub(r"\d+\.?\d*", " _NUM_ ", string)
        string = re.sub(r'[^\w\s]', '',string)
        stop_words = set(stopwords.words("english"))
        tokenized = word_tokenize(string)
        return [word for word in tokenized if not word.lower() in stop_words]


    def _makeEmbeddings(self, file):
        try:
            dataframe= pd.io.json.read_json(self.get(file,"meta"), orient = 'records', lines=True)
        except:
            dataframe= pd.io.json.read_json(self.get(file,"meta"), orient = 'records')

        corpus = [self.dictionary.doc2bow(line) for line in dataframe['Tokens']]
        self.tfidf_gensim = models.TfidfModel(corpus)
        dataframe["TF-IDF"] = [self.tfidf_gensim[corpus[x]] for x in range(0, len(corpus))]
        print(dataframe["TF-IDF"])
        dataframe.to_json(self.get(file,"meta"), orient = 'records', indent=4)
        vecs =[]
        for (tokenList, citationList, tfidfList) in zip(dataframe['Tokens'], dataframe['TokenizedCitations'], dataframe["TF-IDF"]):
            sum_words = np.empty(50)
            sum_citations = np.empty(50)
            sum_tfidf = np.empty(50)
            tfidfDict = dict(tfidfList)
            # Create a sum of the words in a given document to create a doc vector
            # Maintain 2 such vectors: 1 plain, and another where each word vector is multiplied by the word's tfidf weight
            for word in tokenList:
                index = self.dictionary.token2id.get(word)
                tfidfValue = tfidfDict.get(index)
                try:
                    sum_words= np.add(sum_words ,self.model_words.wv[word])
                    sum_tfidf= np.add(sum_tfidf,np.multiply(tfidfValue,self.model_words[word]))
                except: # In case the model does not have a given word, ignore it.
                    pass

            for citation in citationList:
                try:
                    np.add(sum_citations,self.model_citations.wv[citation])
                except:
                    pass
            sum = np.concatenate((sum_words,sum_citations))
            vecs.append(sum)
        vec_frame =  pd.DataFrame(dataframe[self.id_col])
        vec_frame['Word2Vec'] = vecs
        vec_frame.to_json(self.get(file,"w2v"), orient = 'records', indent=4)


    @staticmethod
    def get(entry, folder):
        head, tail = os.path.split(entry.path)
        return head + "\\"+folder+"\\" + tail

    # Comparing new patent based on TF-IDF/Cosine Similarity
    # dataframe must have TF-IDF column
    def compareNewPatent(self, newPatentSeries, dirPath, threshold):
        newPatentSeries['Tokens'] = self._tokenizeText(string=newPatentSeries['Abstract'])
        newPatentSeries['TokenizedCitations']= self._tokenizeCitation(string=newPatentSeries['Citations'])
        sum_words = np.empty(50)
        sum_citations = np.empty(50)
        sum_tfidf = np.empty(50)

        txtFile = dirPath + "\other\dict.txt"
        dct = Dictionary.load_from_text(txtFile)
        tfidf_model = models.TfidfModel(dictionary=dct)
        tfidf_vector = tfidf_model[dct.doc2bow(newPatentSeries['Tokens'])]
        tfidfDict = dict(tfidf_vector)

        for word in newPatentSeries['Tokens']:
            index = dct.token2id.get(word)
            tfidfValue = tfidfDict.get(index)
            try:
                np.add(sum_words,self.model_words.wv[word])
                #sum_tfidf += [val * tfidfValue for val in self.model_words.wv[word]]
            except:
                pass
        for citation in newPatentSeries['TokenizedCitations']:
            try:
                np.add(sum_citations,self.model_citations.wv[citation])
            except:
                pass
        sum = np.concatenate((sum_words,sum_citations))
        newPatentSeries['Word2Vec']= sum

        matches = []
        for file in os.scandir(dirPath+"\w2v"):
            if file.is_file(): # To avoid entering the emb directory
                print("reading "+str(file))
                try:
                    dataframe = pd.io.json.read_json(file, orient='records', lines=True)
                except:
                    dataframe = pd.io.json.read_json(file, orient='records')
                for index,doc in dataframe.iterrows():
                    # print(doc['Word2Vec'])
                    # print(newPatentSeries['Word2Vec'])
                    similarity = 1 - scipy.spatial.distance.cosine(newPatentSeries['Word2Vec'], doc['Word2Vec'])
                    if similarity >= threshold:
                        matches.append((similarity, doc))
        print(str(len(matches))+" Matches found")
        return sorted(matches, key=lambda similarity: similarity[0], reverse=True)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\mocka\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [23]:
zpath = r'C:\Users\mocka\PycharmProjects\Patent-Prior-Art-Finder\Patent Queries\sampleZipSet'
#metapath = r"C:\Users\mocka\PycharmProjects\Patent-Prior-Art-Finder\Patent Queries\MetaDataDrive"
patpath = r"C:\Users\mocka\PycharmProjects\Patent-Prior-Art-Finder\Patent Queries\Identical4280631.json"
#twopath= r"C:\Users\mocka\PycharmProjects\Patent-Prior-Art-Finder\Patent Queries\Data Science Fixed Abstracts"
#zpath = r'C:\Users\zacha\OneDrive\Documents\Computer Science\Patent-Prior-Art-Finder\Patent Queries\sampleZipSet'
myPaf = _DevFilesPatentPriorArtFinder(zpath, publicationNumberColumnString='publication_number', comparisonColumnString="abstract_en", cit_col= "Citations")
newPat= pd.io.json.read_json(patpath, orient='records')
#myPaf.train()
out= (myPaf.compareNewPatent(newPatentSeries=newPat.iloc[0], dirPath=zpath, threshold=.9))
print(out)
print("///////////////////////////////////////////////////////////////////////////////////////////")


Didn't make directories
Initialization complete T=3155.4109511
reading <DirEntry 'bq-results-20210716-122855-k3ohqdlyn8nc (1).json.gz'>


  dist = 1.0 - uv / np.sqrt(uu * vv)


reading <DirEntry 'results-20210716-123228.json.gz'>
11000 Matches found
[(1, publication_number                                         US-4058776-A
Word2Vec              [6.365987374e-314, 2.121995791e-314, 2.1219957...
Name: 0, dtype: object), (1, publication_number                                     US-2004137729-A1
Word2Vec              [0.0, 1.038939254e-311, 0.0, 0.0, 0.0, 0.0, 0....
Name: 1, dtype: object), (1, publication_number                                         US-4280631-A
Word2Vec              [0.0, 1.038939255e-311, 0.0, 0.0, 0.0, 0.0, 0....
Name: 2, dtype: object), (1, publication_number                                        US-6854070-B2
Word2Vec              [0.0, 1.038939257e-311, 0.0, 0.0, 0.0, 0.0, 0....
Name: 3, dtype: object), (1, publication_number                                     US-2015010312-A1
Word2Vec              [6.365987374e-314, 2.121995791e-314, 2.1219957...
Name: 4, dtype: object), (1, publication_number                                       

In [33]:
print(myPaf.dictionary)

Dictionary(0 unique tokens: [])


In [None]:
file= patpath
try:
    dataframe= pd.io.json.read_json(file, orient = 'records', lines=True)
except:
    dataframe= pd.io.json.read_json(file, orient = 'records')
series= dataframe.iloc[0]

In [13]:
print(series["Abstract"])

The invention relates to an improvement in a child resistant liquid seal closure and container combination. The container neck has one or more locking projections which engage one or more locking lugs on the interior surface of the closure to prevent rotational removal of the closure from the container. To remove the closure, the cap skirt must be squeezed and distorted to disengage the lugs radially outwardly from the locking projections on the container neck. A snap cap liner is retained within the closure, adjacent the inside surface of the top panel. As the closure is rotated onto the container, this liner snaps into place onto the top of the container neck, and forms a primary liquid seal. Although the final orientation of the outer closure and bottle is effected by the relative position of the locking lugs and projecting cam, this orientation does not affect the primary seal formed by the internal snap cap liner.


In [31]:
series['Tokens']= myPaf._tokenizeText(series['Abstract'])
print("TOKENS")
print(series['Tokens'])
series["TF-IDF"] = [myPaf.tfidf_gensim[myPaf.corpus[x]] for x in range(0, len(myPaf.corpus))]
print("TfIdf")
print(series['TF-IDF'])

sum_words = np.empty(50)
sum_tfidf = np.empty(50)
tfidfDict = dict(series["TF-IDF"])
# Create a sum of the words in a given document to create a doc vector
# Maintain 2 such vectors: 1 plain, and another where each word vector is multiplied by the word's tfidf weight
for word in series['Tokens']:
    index = myPaf.dictionary.token2id.get(word)
    print(word + "\t Index: "+str(index))
    tfidfValue = tfidfDict.get(index)
    print("\t\t tfidf val: "+str(tfidfValue))

    try:
        sum_words= np.add(sum_words ,myPaf.model_words.wv[word])
        sum_tfidf= np.add(sum_tfidf,np.multiply(tfidfValue,myPaf.model_words[word]))
    except: # In case the model does not have a given word, ignore it.
        pass

TOKENS
['invention', 'relates', 'improvement', 'child', 'resistant', 'liquid', 'seal', 'closure', 'container', 'combination', 'container', 'neck', 'one', 'locking', 'projections', 'engage', 'one', 'locking', 'lugs', 'interior', 'surface', 'closure', 'prevent', 'rotational', 'removal', 'closure', 'container', 'remove', 'closure', 'cap', 'skirt', 'must', 'squeezed', 'distorted', 'disengage', 'lugs', 'radially', 'outwardly', 'locking', 'projections', 'container', 'neck', 'snap', 'cap', 'liner', 'retained', 'within', 'closure', 'adjacent', 'inside', 'surface', 'top', 'panel', 'closure', 'rotated', 'onto', 'container', 'liner', 'snaps', 'place', 'onto', 'top', 'container', 'neck', 'forms', 'primary', 'liquid', 'seal', 'although', 'final', 'orientation', 'outer', 'closure', 'bottle', 'effected', 'relative', 'position', 'locking', 'lugs', 'projecting', 'cam', 'orientation', 'affect', 'primary', 'seal', 'formed', 'internal', 'snap', 'cap', 'liner']
TfIdf
[]
invention	 Index: None
		 tfidf val: