<a href="https://colab.research.google.com/github/DGuilherme/BasicDT/blob/master/CH3_PatentRS.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Challenge 3 Patent Recommender System


In [None]:
# Import section
%matplotlib inline

import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import random

# solve issue of gensim version
!pip install --upgrade gensim

Collecting gensim
[?25l  Downloading https://files.pythonhosted.org/packages/44/52/f1417772965652d4ca6f901515debcd9d6c5430969e8c02ee7737e6de61c/gensim-4.0.1-cp37-cp37m-manylinux1_x86_64.whl (23.9MB)
[K     |████████████████████████████████| 23.9MB 169kB/s 
Installing collected packages: gensim
  Found existing installation: gensim 3.6.0
    Uninstalling gensim-3.6.0:
      Successfully uninstalled gensim-3.6.0
Successfully installed gensim-4.0.1


# Import Dataset

*Dataset composition*   

| Feature        | Description           |
| -------------- | --------------------- |
| ID             | The patent ID         |
| Title          | Patent Title          |
| Abstract       | Patent Abstract       |
| Classification | [Patent Classification](https://www.uspto.gov/web/patents/classification/selectnumwithtitle.htm) |

## How to export from database


```
db.10000.find(
  {'classes.FSC': {$exists: true},title: {$exists: true},'abstract': {$exists: true}},
  {'abstract': 1,title: 1,'classes.FSC': 1}
)
```


```
[{$sample: {
  size: 10000
}}, {$project: { _id: {$toString: "$_id"}, abstract : 1, title : 1, "classes.FSC" :1}}, {$match: {"classes" :{"$exists":true},title:{"$exists":true},abstract:{"$exists":true}}}]
```


In [None]:
# Raw-GitHub location of the JSON patent export; loaded with pd.read_json in the preprocessing cell.
url = 'https://raw.githubusercontent.com/DGuilherme/Challenge3/main/Dataset/10000_classified_patents.json'


# Preprocessing 


In [None]:
from sklearn.model_selection import train_test_split

# Fixed seed so the train/test partition (and every result derived from it) is
# identical on each Restart & Run All.
SPLIT_SEED = 42

raw_train_data = (
    pd.read_json(url)
    .rename(columns={'_id': 'ID', 'abstract': 'Resumo', 'title': 'Titulo'})
    .dropna()
    # keep=False discards every member of a duplicate group, not just the repeats.
    .drop_duplicates(subset="Resumo", keep=False)
    .drop_duplicates(subset="Titulo", keep=False)
    # Renumber rows 0..n-1: the filters above leave gaps in the index, and later
    # cells look rows up in train_classes_data using index values taken from
    # train_data. With a gap-free index, labels and positions coincide.
    .reset_index(drop=True)
)

# ID -> classification lookup table, and the text columns used for modelling.
train_classes_data = raw_train_data[['ID', 'classes']]
train_data_unsplit = raw_train_data[['ID', 'Titulo', 'Resumo']]

# Split dataset (80 % train / 20 % test)
train_data, test_data = train_test_split(
    train_data_unsplit, test_size=0.2, random_state=SPLIT_SEED
)

2021-04-20 22:52:52,470 : INFO : NumExpr defaulting to 2 threads.


# Create the Vocabulary

In [None]:
# Patent IDs in the order tagData processed them: tagData appends one ID per
# document it yields. For the first frame tagged, the ID of the document
# tagged n therefore sits at position n - 1.
modelIndexToDataframeIndex = []

import gensim

def tagData(dataframe):
  """Lazily turn each row's abstract into a gensim TaggedDocument.

  For every row of `dataframe`, in order, the row's 'ID' is appended to the
  module-level `modelIndexToDataframeIndex` list and the row's 'Resumo' text
  is tokenised with `gensim.utils.simple_preprocess`.

  Tags are consecutive integers that start at 1 (not 0) and restart on every
  call, which is why callers subtract 1 before indexing the ID list.

  Yields:
    gensim.models.doc2vec.TaggedDocument(tokens, [tag]), one per row.
  """
  id_and_abstract = zip(dataframe['ID'], dataframe['Resumo'])
  for tag, (patent_id, abstract) in enumerate(id_and_abstract, start=1):
    modelIndexToDataframeIndex.append(patent_id)
    tokens = gensim.utils.simple_preprocess(abstract)
    yield gensim.models.doc2vec.TaggedDocument(tokens, [tag])

# Order matters: train_data must be tagged first so that its IDs occupy the
# front of modelIndexToDataframeIndex, which later cells index with `tag - 1`.
vocabulary = list(tagData(train_data))
# Tagging the test split appends its IDs after the training ones.
vocabulary_test = list(tagData(test_data))



# User Question


# Create gensim Doc2Vec model

In [None]:
# Instantiate an empty Doc2Vec model: 100-dimensional vectors, a minimum word
# count of 2, and 100 training epochs.
model = gensim.models.doc2vec.Doc2Vec(
    vector_size=100,
    min_count=2,
    epochs=100,
)

# Build the model's vocabulary from the tagged training documents.
model.build_vocab(vocabulary)

2021-04-20 22:53:07,629 : INFO : Doc2Vec lifecycle event {'params': 'Doc2Vec(dm/m,d100,n5,w5,mc2,s0.001,t3)', 'datetime': '2021-04-20T22:53:07.629046', 'gensim': '4.0.1', 'python': '3.7.10 (default, Feb 20 2021, 21:17:23) \n[GCC 7.5.0]', 'platform': 'Linux-4.19.112+-x86_64-with-Ubuntu-18.04-bionic', 'event': 'created'}
2021-04-20 22:53:07,632 : INFO : collecting all words and their counts
2021-04-20 22:53:07,635 : INFO : PROGRESS: at example #0, processed 0 words (0/s), 0 word types, 0 tags
2021-04-20 22:53:07,810 : INFO : collected 19326 word types and 5849 unique tags from a corpus of 5848 examples and 655404 words
2021-04-20 22:53:07,812 : INFO : Creating a fresh vocabulary
2021-04-20 22:53:07,884 : INFO : Doc2Vec lifecycle event {'msg': 'effective_min_count=2 retains 12827 unique words (66.37172720687157%% of original 19326, drops 6499)', 'datetime': '2021-04-20T22:53:07.884114', 'gensim': '4.0.1', 'python': '3.7.10 (default, Feb 20 2021, 21:17:23) \n[GCC 7.5.0]', 'platform': 'Linu

# Model Train


In [None]:
# Train on the tagged training documents, reusing the corpus size and epoch
# count already stored on the model.
model.train(
    vocabulary,
    total_examples=model.corpus_count,
    epochs=model.epochs,
)

2021-04-20 22:53:10,144 : INFO : Doc2Vec lifecycle event {'msg': 'training model with 3 workers on 12827 vocabulary and 100 features, using sg=0 hs=0 sample=0.001 negative=5 window=5', 'datetime': '2021-04-20T22:53:10.143718', 'gensim': '4.0.1', 'python': '3.7.10 (default, Feb 20 2021, 21:17:23) \n[GCC 7.5.0]', 'platform': 'Linux-4.19.112+-x86_64-with-Ubuntu-18.04-bionic', 'event': 'train'}
2021-04-20 22:53:11,172 : INFO : EPOCH 1 - PROGRESS: at 65.03% examples, 313999 words/s, in_qsize 5, out_qsize 0
2021-04-20 22:53:11,651 : INFO : worker thread finished; awaiting finish of 2 more threads
2021-04-20 22:53:11,661 : INFO : worker thread finished; awaiting finish of 1 more threads
2021-04-20 22:53:11,666 : INFO : worker thread finished; awaiting finish of 0 more threads
2021-04-20 22:53:11,668 : INFO : EPOCH - 1 : training on 655404 raw words (492411 effective words) took 1.5s, 324811 effective words/s
2021-04-20 22:53:12,688 : INFO : EPOCH 2 - PROGRESS: at 66.57% examples, 323803 words

# Model Test
 

In [None]:
# Pick a random document from the training corpus; the next cell infers a
# vector for its abstract and looks up the closest other training document.
sample = train_data.sample()
sample_row = sample.iloc[0]
print("ID: "+ sample_row['ID'])
# Labels now match the columns they print (they were swapped before).
print("Titulo: "+ sample_row['Titulo'])
print("Resumo: "+ sample_row['Resumo'])
value = str(sample_row['ID'])
# sample.index holds index *labels*, so the lookup must be label-based (.loc).
# Positional .iloc with a label fetches a different patent's classes whenever
# the index has gaps (it does after dropna/drop_duplicates).
# .get tolerates records that carry no 'FSC' key.
fscList = train_classes_data.loc[sample.index[0], 'classes'].get('FSC', [])
print("FSC: "+ str(fscList))




ID: 570e1777eb1ec9929baeffdb
Resumo: Apparatus for storing and moving a cassette
Titulo: A cassette stocker includes a plurality of cassette storage shelves positioned adjacent a cleanroom wall above a cassette docking station, and a cassette mover to carry a cassette between the shelves and the docking station. An interstation transfer apparatus includes an overhead support beam and a transfer arm adapted to carry a cassette between processing stations.
FSC: ['375', '331', '329']


In [None]:
query_id = sample.iloc[0]['ID']
inferred_vector = model.infer_vector(gensim.utils.simple_preprocess(sample.iloc[0]['Resumo']))
sims = model.dv.most_similar([inferred_vector], topn=len(model.dv))

# Take the highest-ranked entry that is a real training document other than
# the query itself:
#  * the query was drawn from train_data, so it is usually (but not always)
#    ranked first; blindly taking sims[1] is wrong when it is not;
#  * tagData numbers documents from 1, so tag 0 is a vector no document was
#    trained on, and `0 - 1` would wrap around to the last ID in the mapping.
best_tag, best_similarity = next(
    ((tag, score) for tag, score in sims
     if tag >= 1 and modelIndexToDataframeIndex[tag - 1] != query_id)
)
best_match_row = train_data[train_data['ID'] == modelIndexToDataframeIndex[best_tag - 1]]
print("Similarity: "+ str(best_similarity))
print("ID: "+ best_match_row.iloc[0]['ID'])
# Labels now match the columns they print (they were swapped before).
print("Titulo: "+ best_match_row.iloc[0]['Titulo'])
print("Resumo: "+ best_match_row.iloc[0]['Resumo'])
value = str(best_match_row.iloc[0]['ID'])
# best_match_row.index holds index *labels*; look the classes up by label
# (.loc) rather than by position (.iloc), which returns another patent's row
# whenever the index has gaps.
fscList = train_classes_data.loc[best_match_row.index[0], 'classes'].get('FSC', [])
print("FSC: "+ str(fscList))

Similarity: 0.5206594467163086
ID: 570641eaeb1ec9cd7cacdb5f
Resumo: Carriage for sorting-machines in particular, with independently actioned       tiltable plate
Titulo: A carriage for a transport and sorting machine having tracks and a carriage-dragging device therealong has a carriage support. The carriage support is dragged along the tracks by the carriage-dragging device, but a tiltable object-carrying plate on the carriage support is tilted by an electric motor on the carriage support and movable therewith.
FSC: ['162', '435']


In [None]:
def get_model_best_match(abstract, topn=None):
  """Rank the model's document vectors by similarity to a free-text abstract.

  Args:
    abstract: raw text; tokenised with gensim.utils.simple_preprocess before
      a vector is inferred for it.
    topn: number of matches to request. None (the default) asks for every
      document vector in the model, which is the original behaviour; pass a
      small number when only the first few matches are needed.

  Returns:
    The result of model.dv.most_similar: a sequence of (tag, similarity)
    pairs, which callers treat as ordered best match first.
  """
  inferred_vector = model.infer_vector(gensim.utils.simple_preprocess(abstract))
  if topn is None:
    topn = len(model.dv)
  return model.dv.most_similar([inferred_vector], topn=topn)

def get_FSC_classes_row(ID):
  """Return the 'classes' value of the first row in train_classes_data whose 'ID' equals `ID`.

  Raises:
    IndexError: if no row has that ID.
  """
  has_id = train_classes_data['ID'] == ID
  return train_classes_data.loc[has_id, 'classes'].iloc[0]
  
def validate_parse_fsc(row_fsc):
  """Return the FSC codes of a `classes` record as a list.

  Args:
    row_fsc: the `classes` value of one patent, normally a dict such as
      {'FSC': ['359', '372']} or {'FSC': '84'}.

  Returns:
    * the stored list when 'FSC' is already a list (same object, not a copy);
    * a one-element list when 'FSC' is a single value;
    * [] when the record is not a dict (e.g. None or NaN for a missing cell),
      has no 'FSC' key, or stores None under it.
  """
  # A missing cell arrives as None/NaN; `'FSC' in <that>` would raise TypeError.
  if not isinstance(row_fsc, dict):
    return []

  fsc = row_fsc.get('FSC')
  if fsc is None:
    return []
  if isinstance(fsc, list):
    return fsc
  return [fsc]

def evaluate_classes():
  """Print, for every test patent, the FSC codes of its closest training patent and its own.

  For each row of test_data the abstract ('Resumo') is ranked against the
  model with get_model_best_match; the best-ranked training document is mapped
  back to a patent ID through modelIndexToDataframeIndex, and both patents'
  FSC lists are printed. Nothing is returned.
  """
  for test_id, abstract in zip(test_data['ID'], test_data['Resumo']):
    ranked = get_model_best_match(abstract)
    # Tags are 1-based, so tag 0 is a vector no document was trained on. If it
    # were accepted, `0 - 1` would wrap to the last entry of the mapping and
    # silently report an unrelated patent, so take the best tag >= 1 instead.
    best_tag = next(tag for tag, _similarity in ranked if tag >= 1)
    match_ID = str(modelIndexToDataframeIndex[best_tag - 1])

    best_match_fsc_row = get_FSC_classes_row(match_ID)
    test_fsc_row = get_FSC_classes_row(test_id)

    fsc_list_best_match = validate_parse_fsc(best_match_fsc_row)
    fsc_test_data_row = validate_parse_fsc(test_fsc_row)

    print("Best Match FSC: " + str(fsc_list_best_match))
    print("Row FSC: " + str(fsc_test_data_row) + "\n")


# Run the comparison over the whole test split; prints two lines per test patent.
evaluate_classes()

Best Match FSC: ['373']
Row FSC: ['128', '312', 'D 6', '4']

Best Match FSC: ['333', '343', '455', '329']
Row FSC: ['328', '307', '327', '326', '365', '377']

Best Match FSC: ['424']
Row FSC: ['438']

Best Match FSC: ['426']
Row FSC: ['428', '162', '156']

Best Match FSC: ['165', '417', '361']
Row FSC: ['175']

Best Match FSC: ['162', '209']
Row FSC: ['55', '209']

Best Match FSC: ['437', '148']
Row FSC: ['118', '156', '269', '165', '204']

Best Match FSC: ['165', '126', '62']
Row FSC: ['432', '34', '118', '134', '117']

Best Match FSC: ['71']
Row FSC: ['524', '252']

Best Match FSC: ['355', '356']
Row FSC: ['250', '362', '422', '118', '430']

Best Match FSC: ['204', '60', '123', '324', '422']
Row FSC: ['417', '92']

Best Match FSC: ['399', '219']
Row FSC: ['350', '359']

Best Match FSC: ['361']
Row FSC: ['501', '252']

Best Match FSC: ['400', '101']
Row FSC: ['101', '74']

Best Match FSC: ['310']
Row FSC: ['330']

Best Match FSC: ['514']
Row FSC: ['546', '544', '548', '558', '514', '4

KeyboardInterrupt: ignored

In [None]:
# Inspect the ID -> classes lookup table built in the preprocessing cell.
train_classes_data

Unnamed: 0,ID,classes
0,57065f86eb1ec950df04567a,"{'FSC': ['359', '372']}"
1,5702662eeb1ec9c195055491,{'FSC': '84'}
2,57026579eb1ec9b0a9bd00fe,"{'FSC': ['72', '29', '51', '408', '409']}"
3,570260bbeb1ec9244e6bdfe1,"{'FSC': ['210', '429']}"
4,57026060eb1ec91ddf3047ef,"{'FSC': ['144', '254', '173', '81']}"
...,...,...
7360,570261d1eb1ec93bc90e06ea,{'FSC': '424'}
7361,57066c52eb1ec98afebf7a43,{'FSC': '424'}
7362,57064db8eb1ec901ec1acac0,"{'FSC': ['53', '141']}"
7363,570261d1eb1ec93bc90de880,"{'FSC': ['248', '285']}"
