In [1]:
import json 
import pandas as pd 
from pandas.io.json import json_normalize
import nltk
nltk.download('stopwords')
nltk.download('wordnet')
import datetime
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.neighbors import KNeighborsClassifier
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
import string
import re
import numpy as np
from collections import Counter
from sklearn.model_selection import train_test_split 

[nltk_data] Downloading package stopwords to /Users/ankit/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/ankit/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
# Load the raw USPTO export; lines=True reads the file as one JSON object per line.
# NOTE(review): the path is relative to the kernel's working directory.
df=pd.read_json('Downloads/uspto.json',lines=True)

In [3]:
# Expand every non-null entry of the `object` column into its own set of columns.
objectdata=df['object'].dropna().apply(pd.Series)

## Labelling each record according to its status

In [4]:
# Distinct status strings, in order of first appearance in the data.
status=objectdata['status'].unique()

In [5]:
# Display the distinct status values found in the data.
status

array(['Publications -- Issue Fee Payment Verified',
       'Non Final Action Mailed', 'Awaiting TC Resp., Issue Fee Not Paid',
       'Final Rejection Mailed',
       'Docketed New Case - Ready for Examination', 'Patented Case',
       'Sent to Classification contractor',
       'Abandoned  --  Failure to Respond to an Office Action',
       'Advisory Action Mailed',
       "Examiner's Answer to Appeal Brief Mailed",
       'Response to Non-Final Office Action Entered and Forwarded to Examiner',
       'Patent Expired Due to NonPayment of Maintenance Fees Under 37 CFR 1.362',
       'Appeal Brief (or Supplemental Brief) Entered and Forwarded to Examiner',
       'Application Dispatched from Preexam, Not Yet Docketed',
       'RO PROCESSING COMPLETED-PLACED IN STORAGE',
       'PCT - International Search Report Mailed to IB',
       'Provisional Application Expired',
       'Application Undergoing Preexam Processing',
       'Expressly Abandoned  --  During Examination'], dtype=object)

In [6]:
# Statuses treated as "approved" (encoded as result 0 further down).
# Spelled out literally rather than picked by position from `status`: the order
# of unique() follows the order of rows in the data, so positional indices
# silently select different statuses when the input file changes.
approved=[
    'Publications -- Issue Fee Payment Verified',
    'Awaiting TC Resp., Issue Fee Not Paid',
    'Patented Case',
]
approved

['Publications -- Issue Fee Payment Verified',
 'Awaiting TC Resp., Issue Fee Not Paid',
 'Patented Case']

In [7]:
# Statuses treated as "disapproved" (encoded as result 1 further down).
# Spelled out literally rather than picked by position from `status`: the order
# of unique() follows the order of rows in the data, so positional indices
# silently select different statuses when the input file changes.
# The double spaces around "--" are part of the source values.
disapproval=[
    'Non Final Action Mailed',
    'Final Rejection Mailed',
    'Abandoned  --  Failure to Respond to an Office Action',
    'Advisory Action Mailed',
    'Response to Non-Final Office Action Entered and Forwarded to Examiner',
    'Patent Expired Due to NonPayment of Maintenance Fees Under 37 CFR 1.362',
    'Provisional Application Expired',
    'Expressly Abandoned  --  During Examination',
]
disapproval

['Non Final Action Mailed',
 'Final Rejection Mailed',
 'Abandoned  --  Failure to Respond to an Office Action',
 'Advisory Action Mailed',
 'Response to Non-Final Office Action Entered and Forwarded to Examiner',
 'Patent Expired Due to NonPayment of Maintenance Fees Under 37 CFR 1.362',
 'Provisional Application Expired',
 'Expressly Abandoned  --  During Examination']

In [8]:
# Statuses treated as "still in process".
# Spelled out literally rather than picked by position from `status`: the order
# of unique() follows the order of rows in the data, so positional indices
# silently select different statuses when the input file changes.
inprocess=[
    'Docketed New Case - Ready for Examination',
    'Sent to Classification contractor',
    "Examiner's Answer to Appeal Brief Mailed",
    'Appeal Brief (or Supplemental Brief) Entered and Forwarded to Examiner',
    'Application Dispatched from Preexam, Not Yet Docketed',
    'RO PROCESSING COMPLETED-PLACED IN STORAGE',
    'PCT - International Search Report Mailed to IB',
    'Application Undergoing Preexam Processing',
]
inprocess

['Docketed New Case - Ready for Examination',
 'Sent to Classification contractor',
 "Examiner's Answer to Appeal Brief Mailed",
 'Appeal Brief (or Supplemental Brief) Entered and Forwarded to Examiner',
 'Application Dispatched from Preexam, Not Yet Docketed',
 'RO PROCESSING COMPLETED-PLACED IN STORAGE',
 'PCT - International Search Report Mailed to IB',
 'Application Undergoing Preexam Processing']

In [9]:
# Create the label column, one empty placeholder per row; it is filled in below.
objectdata['result']=[None]*len(objectdata['status'])

In [10]:
# Encode each status as a numeric class:
#   0 = status listed in `approved`
#   1 = status listed in `disapproval`
#   2 = anything else
# Done as one vectorised assignment. The former row loop wrote through chained
# indexing (objectdata['result'][i] = ...), which pandas may apply to a temporary
# copy, and it looked rows up by label i, which fails once dropna() has left
# gaps in the index.
objectdata['result']=np.select(
    [objectdata['status'].isin(approved), objectdata['status'].isin(disapproval)],
    [0, 1],
    default=2,
)

## Data Cleaning

In [11]:
# Shared resources used by clean():
stop = set(stopwords.words('english'))   # English stop words from the NLTK corpus
exclude = set(string.punctuation)        # ASCII punctuation characters to strip
lemma = WordNetLemmatizer()              # WordNet lemmatizer applied to each token

In [12]:
def get_top_n_words(corpus, n=None,reverse=True):
    """Return vocabulary terms of ``corpus`` ranked by total occurrence count.

    Parameters
    ----------
    corpus : iterable of str
        Documents to count terms in.
    n : int or None
        How many terms to keep from the head of the ranking; ``None`` keeps all.
    reverse : bool
        ``True`` ranks the most frequent terms first, ``False`` the least frequent.

    Returns
    -------
    list of str
        The selected terms, without their counts.
    """
    counter = CountVectorizer()
    term_counts = counter.fit(corpus).transform(corpus)
    # Column-wise sum gives one total per vocabulary entry (a 1 x V matrix).
    totals = term_counts.sum(axis=0)
    ranked = sorted(
        [(term, totals[0, column]) for term, column in counter.vocabulary_.items()],
        key=lambda pair: pair[1],
        reverse=reverse,
    )
    return [term for term, _ in ranked[:n]]

In [13]:
def clean(doc):
    """Normalise one text document for vectorisation.

    Steps, in order: lowercase and drop tokens found in ``stop``; remove every
    character found in ``exclude``; lemmatize each remaining token with
    ``lemma``; delete digit runs; delete words of one to four word characters
    together with any non-word characters immediately before them.

    Parameters
    ----------
    doc : str
        Raw text.

    Returns
    -------
    str
        The cleaned text.
    """
    kept_tokens = [token for token in doc.lower().split() if token not in stop]
    text = " ".join(kept_tokens)
    text = ''.join(filter(lambda ch: ch not in exclude, text))
    text = " ".join(map(lemma.lemmatize, text.split()))
    text = re.sub(r"\d+","",text)
    # Short words carry little signal here, so everything up to 4 characters goes.
    return re.sub(r'\W*\b\w{1,4}\b',"",text)

In [14]:
# Overwrite the summaries with their cleaned form (re-running this cell cleans the already-cleaned text again).
objectdata['summary']=objectdata['summary'].apply(clean)

In [15]:
# Overwrite the titles with their cleaned form (re-running this cell cleans the already-cleaned text again).
objectdata['title']=objectdata['title'].apply(clean)

In [16]:
# The 30 most frequent terms across summaries and titles; they are removed later
# because they appear almost everywhere in this corpus.
# A space is inserted between the two fields so the last summary word and the
# first title word are not fused into a single bogus token before counting.
common_words = get_top_n_words(objectdata['summary']+" "+objectdata['title'],30)

In [17]:
# Display the terms that will be stripped from every title and summary.
common_words

['lidar',
 'system',
 'laser',
 'signal',
 'light',
 'method',
 'point',
 'target',
 'optical',
 'object',
 'source',
 'pulse',
 'image',
 'sensor',
 'using',
 'includes',
 'measurement',
 'first',
 'second',
 'detection',
 'device',
 'surface',
 'système',
 'position',
 'detector',
 'invention',
 'array',
 'information',
 'configured',
 'reflected']

In [18]:
def removewords(doc):
    """Return ``doc`` with every whitespace-separated token found in ``common_words`` removed.

    The surviving tokens are re-joined with single spaces.
    """
    survivors = filter(lambda token: token not in common_words, doc.split())
    return " ".join(survivors)

In [19]:
# Strip the corpus-wide common words from both text columns (overwrites them in place).
objectdata['summary']=objectdata['summary'].apply(removewords)
objectdata['title']=objectdata['title'].apply(removewords)

## Sorting by Date & Time

In [20]:
# Replace empty filing-date strings with NaN so they are skipped by the parser below.
# Uses np.nan (the np.NaN alias no longer exists in NumPy 2.x) and only measures
# strings, so values that are already missing (float NaN) pass through untouched
# instead of raising on len().
objectdata['filingDate']=objectdata['filingDate'].apply(
    lambda value: np.nan if (isinstance(value, str) and len(value)<1) else value
)

In [21]:
# Parse string dates of the form "<weekday> <month> <day> <H:M:S> <utc offset> <year>"
# into timezone-aware datetimes; non-string values (e.g. NaN) are left as they are.
objectdata['filingDate']=objectdata['filingDate'].apply(lambda x: datetime.datetime.strptime(x, "%a %b %d %H:%M:%S %z %Y") if isinstance(x,str) else x) 

In [22]:
# Order the records chronologically by filing date (ascending, the sort_values default).
objectdata=objectdata.sort_values(by='filingDate')

In [23]:
# Renumber rows 0..n-1 in sorted order; predict() looks rows up by these labels
# using row positions of the TF-IDF matrices, so the two must line up.
objectdata=objectdata.reset_index(drop=True)

In [28]:
# Preview the first ten rows of objectdata.
objectdata.head(10)

Unnamed: 0,id,title,summary,status,filingDate,publicationNumber,publicationDate,objectType,result
0,6248629,removal background backscattering subtraction ...,,Patent Expired Due to NonPayment of Maintenanc...,1981-03-27 13:00:00+00:00,,,patent,1
1,7146093,control photomultiplier detecting differential...,,Patented Case,1988-01-20 13:00:00+00:00,,,patent,0
2,7216341,imaging,,Patented Case,1988-07-07 11:00:00+00:00,,,patent,0
3,7256778,imaging nonvisible,,Patent Expired Due to NonPayment of Maintenanc...,1988-10-12 11:00:00+00:00,,,patent,1
4,7348746,,,Expressly Abandoned -- During Examination,1989-05-08 11:00:00+00:00,,,patent,1
5,7364860,incorporating multiple camera obtaining plural...,,Patent Expired Due to NonPayment of Maintenanc...,1989-06-12 11:00:00+00:00,,,patent,1
6,7416652,arrangement measuring atmospheric turbidity,,Patented Case,1989-10-03 11:00:00+00:00,,,patent,0
7,7420247,imaging,,Patented Case,1989-10-12 11:00:00+00:00,,,patent,0
8,7499068,scanning,,Patented Case,1990-03-23 13:00:00+00:00,,,patent,0
9,7539230,midinfrared hydrocarbon,,Patented Case,1990-06-18 11:00:00+00:00,,,patent,0


## Creating TF-IDF Sparse Matrices

In [24]:
# Fit ONE vectorizer on both text fields so that `summary`, `title` and every
# later vectorizer.transform(...) result share the same vocabulary (column space).
# Calling fit_transform twice on the same object re-fits it: the vectorizer would
# end up holding only the title vocabulary while `summary` stayed encoded against
# the discarded summary vocabulary, so new summaries could not be compared with it.
vectorizer = TfidfVectorizer(sublinear_tf=True, norm='l2', encoding='latin-1', ngram_range=(1, 2), stop_words='english')
vectorizer.fit(pd.concat([objectdata['summary'], objectdata['title']], ignore_index=True))
summary = vectorizer.transform(objectdata['summary'])
title=vectorizer.transform(objectdata['title'])

## Cosine Similarity of a document with all other documents

In [25]:
def cosine_similarity(x,y):
    """Return the matrix product of ``y`` with the transpose of ``x``.

    Entry (i, j) of the result is the dot product of row i of ``y`` with row j
    of ``x``; this equals the cosine similarity when the rows of both inputs
    are L2-normalised.
    """
    x_transposed = x.transpose()
    return y.dot(x_transposed)

## Predicting status from similarity to historical patent data

In [26]:
def predict(doc, threshold=0.5):
    """Classify a new application by its similarity to the historical records.

    Parameters
    ----------
    doc : pandas.DataFrame
        Frame with ``title`` and ``summary`` text columns; only the first row is
        scored. Both columns are overwritten with their cleaned text.
    threshold : float
        A historical record counts as similar when its similarity score is
        strictly greater than this value.

    Returns
    -------
    int
        ``0`` when no historical title or summary is similar,
        ``1`` when at least one similar record has ``result == 0``,
        ``2`` when similar records exist but none has ``result == 0``.

    Notes
    -----
    Assumes ``vectorizer`` produces vectors in the same column space as both the
    ``title`` and ``summary`` matrices, and that row ``i`` of those matrices
    corresponds to label ``i`` of ``objectdata``.
    """
    # Same preprocessing as was applied to the historical corpus.
    for column in ('title', 'summary'):
        doc[column]=doc[column].apply(clean).apply(removewords)

    doctitle=vectorizer.transform(doc['title'])
    docsum=vectorizer.transform(doc['summary'])

    # One score per historical record, flattened from the 1 x n result.
    title_scores=cosine_similarity(title,doctitle[0]).toarray().ravel()
    summary_scores=cosine_similarity(summary,docsum[0]).toarray().ravel()

    matches=np.concatenate([
        np.flatnonzero(title_scores>threshold),
        np.flatnonzero(summary_scores>threshold),
    ])
    if matches.size==0:
        return 0
    if (objectdata.loc[matches,'result']==0).any():
        return 1
    return 2

## API

In [27]:
# Flask application object that the route decorators below register against.
from flask import Flask,request
  
app = Flask(__name__) 
  
@app.route('/patentpredict', methods=["POST"]) 
def predict_patent(): 
    """Handle POST /patentpredict.

    Expects a JSON body of the form ``{"object": {"title": ..., "summary": ...}}``
    and returns the text label for the code produced by ``predict``:
    ``0`` -> "Approved", ``1`` -> "Not approved", anything else -> "In review".
    Responds with HTTP 400 when the body is not JSON or the fields are missing.
    """
    # silent=True yields None for a missing or malformed JSON body instead of raising.
    req = request.get_json(silent=True)
    payload = req.get('object') if isinstance(req, dict) else None
    if not isinstance(payload, dict) or 'title' not in payload or 'summary' not in payload:
        return "Request body must be JSON with an 'object' containing 'title' and 'summary'", 400

    doc=pd.DataFrame(payload,index=[0])
    res=predict(doc)
    labels = {0: "Approved", 1: "Not approved"}
    return labels.get(res, "In review")

@app.route('/patentrecord',methods=["POST"])
def record_patent():
    """Handle POST /patentrecord.

    Expects a JSON body of the form ``{"object": {..., "status": ...}}``, labels
    the record (0 for a status in ``approved``, 1 for one in ``disapproval``,
    2 otherwise) and appends it to the module-level ``objectdata`` frame.
    Responds with HTTP 400 when the body is not JSON or has no status.
    """
    global objectdata

    # silent=True yields None for a missing or malformed JSON body instead of raising.
    req = request.get_json(silent=True)
    payload = req.get('object') if isinstance(req, dict) else None
    if not isinstance(payload, dict) or 'status' not in payload:
        return "Request body must be JSON with an 'object' containing 'status'", 400

    doc=pd.DataFrame(payload,index=[0])
    # Compare the scalar status, not the whole column: `Series in list` raises
    # "truth value of a Series is ambiguous".
    status_value=doc.loc[0,'status']
    if status_value in approved:
        doc['result']=0
    elif status_value in disapproval:
        doc['result']=1
    else:
        doc['result']=2

    # concat returns a new frame, so it must be bound back to the global name;
    # ignore_index keeps the existing 0..n-1 labels and gives the new row label n.
    # NOTE(review): the TF-IDF matrices are not rebuilt here, and the stored text
    # is not cleaned, so the new record does not take part in similarity lookups.
    objectdata=pd.concat([objectdata,doc],ignore_index=True)
    return "Submitted"
# Start Flask's built-in server when this code runs as __main__.
# app.run() does not return until the server is stopped.
if __name__ == '__main__': 
    app.run()

 * Serving Flask app "__main__" (lazy loading)
 * Environment: production
   Use a production WSGI server instead.
 * Debug mode: off


 * Running on http://127.0.0.1:5000/ (Press CTRL+C to quit)
