<a href="https://colab.research.google.com/github/DGuilherme/Challenge3/blob/main/CH3_PatentRS.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Challenge 3 Patent Reconmmender System


In [14]:
# Import section
%matplotlib inline

import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import random

# solve issue of gensim version
!pip install --upgrade gensim

Requirement already up-to-date: gensim in /usr/local/lib/python3.7/dist-packages (4.0.1)


# Import Dataset

*Dataset composition*   

| Feature        | Description           |
| -------------- | --------------------- |
| ID             | The patent ID         |
| Title          | Patent Title          |
| Abstract       | Patent Abstract       |
| Classification | [Patent Classification](https://www.uspto.gov/web/patents/classification/selectnumwithtitle.htm) |

## How to export from database


```
db.10000.find(
  {'classes.FSC': {$exists: true},title: {$exists: true},'abstract': {$exists: true}},
  {'abstract': 1,title: 1,'classes.FSC': 1}
)
```


```
[{$sample: {
  size: 10000
}}, {$project: { _id: {$toString: "$_id"}, abstract : 1, title : 1, "classes.FSC" :1}}, {$match: {"classes" :{"$exists":true},title:{"$exists":true},abstract:{"$exists":true}}}]
```


In [15]:
url = 'https://raw.githubusercontent.com/DGuilherme/Challenge3/main/Dataset/10000_classified_patents.json'


# Preprocessing 


In [16]:
from sklearn.model_selection import train_test_split

raw_train_data = pd.read_json(url)
raw_train_data = raw_train_data.rename(columns={'_id': 'ID', 'abstract': 'Resumo','title': 'Titulo'})
raw_train_data = raw_train_data.dropna()
raw_train_data = raw_train_data.drop_duplicates(subset ="Resumo",keep = False)
raw_train_data = raw_train_data.drop_duplicates(subset ="Titulo",keep = False)
train_classes_data = raw_train_data[['ID','classes']]
train_data_unsplit = raw_train_data[['ID','Titulo','Resumo']]

# Split dataset
train_data, test_data = train_test_split(train_data_unsplit, test_size=0.2)

In [17]:
train_data_unsplit.describe()

Unnamed: 0,ID,Titulo,Resumo
count,7311,7311,7311
unique,7311,7311,7311
top,57066c4eeb1ec98afebe694d,Metalation and functionalization of polymers a...,A powder lacquer binder composition which comp...
freq,1,1,1


In [18]:
raw_train_data.describe()

Unnamed: 0,Resumo,Titulo,classes,ID
count,7311,7311,7311,7311
unique,7311,7311,4252,7311
top,A powder lacquer binder composition which comp...,Metalation and functionalization of polymers a...,{'FSC': '430'},57066c4eeb1ec98afebe694d
freq,1,1,55,1


In [19]:
train_classes_data.describe()
print(train_classes_data)

                            ID                                    classes
0     57065f86eb1ec950df04567a                    {'FSC': ['359', '372']}
1     5702662eeb1ec9c195055491                              {'FSC': '84'}
2     57026579eb1ec9b0a9bd00fe  {'FSC': ['72', '29', '51', '408', '409']}
3     570260bbeb1ec9244e6bdfe1                    {'FSC': ['210', '429']}
4     57026060eb1ec91ddf3047ef       {'FSC': ['144', '254', '173', '81']}
...                        ...                                        ...
7360  570261d1eb1ec93bc90e06ea                             {'FSC': '424'}
7361  57066c52eb1ec98afebf7a43                             {'FSC': '424'}
7362  57064db8eb1ec901ec1acac0                     {'FSC': ['53', '141']}
7363  570261d1eb1ec93bc90de880                    {'FSC': ['248', '285']}
7364  57066c52eb1ec98afebf58ad                              {'FSC': '62'}

[7311 rows x 2 columns]


# Create the Vocabulary

In [20]:
modelIndexToDataframeIndex = []

import gensim

def tagData(dataframe):
  number = 0
  for index,row in dataframe.iterrows():
    number = number + 1
    modelIndexToDataframeIndex.append(row['ID'])
    resumotokens = gensim.utils.simple_preprocess(row['Resumo'])

    yield gensim.models.doc2vec.TaggedDocument(resumotokens, [number])

vocabulary = list(tagData(train_data))
vocabulary_test = list(tagData(test_data))



# User Question


# Create gensim Doc2Vec model

In [21]:
# instanciate
model = gensim.models.doc2vec.Doc2Vec(vector_size=100, min_count=2, epochs=100) # Create inital empty model

# build
model.build_vocab(vocabulary) # Add data to the model

2021-04-24 15:46:07,212 : INFO : Doc2Vec lifecycle event {'params': 'Doc2Vec(dm/m,d100,n5,w5,mc2,s0.001,t3)', 'datetime': '2021-04-24T15:46:07.212375', 'gensim': '4.0.1', 'python': '3.7.10 (default, Feb 20 2021, 21:17:23) \n[GCC 7.5.0]', 'platform': 'Linux-4.19.112+-x86_64-with-Ubuntu-18.04-bionic', 'event': 'created'}
2021-04-24 15:46:07,214 : INFO : collecting all words and their counts
2021-04-24 15:46:07,216 : INFO : PROGRESS: at example #0, processed 0 words (0/s), 0 word types, 0 tags
2021-04-24 15:46:07,374 : INFO : collected 19135 word types and 5849 unique tags from a corpus of 5848 examples and 654411 words
2021-04-24 15:46:07,375 : INFO : Creating a fresh vocabulary
2021-04-24 15:46:07,440 : INFO : Doc2Vec lifecycle event {'msg': 'effective_min_count=2 retains 12676 unique words (66.24510060099294%% of original 19135, drops 6459)', 'datetime': '2021-04-24T15:46:07.440196', 'gensim': '4.0.1', 'python': '3.7.10 (default, Feb 20 2021, 21:17:23) \n[GCC 7.5.0]', 'platform': 'Linu

# Model Train


In [22]:
model.train(vocabulary, total_examples=model.corpus_count, epochs=model.epochs)

2021-04-24 15:46:07,722 : INFO : Doc2Vec lifecycle event {'msg': 'training model with 3 workers on 12676 vocabulary and 100 features, using sg=0 hs=0 sample=0.001 negative=5 window=5', 'datetime': '2021-04-24T15:46:07.722601', 'gensim': '4.0.1', 'python': '3.7.10 (default, Feb 20 2021, 21:17:23) \n[GCC 7.5.0]', 'platform': 'Linux-4.19.112+-x86_64-with-Ubuntu-18.04-bionic', 'event': 'train'}
2021-04-24 15:46:08,738 : INFO : EPOCH 1 - PROGRESS: at 60.38% examples, 296501 words/s, in_qsize 5, out_qsize 0
2021-04-24 15:46:09,316 : INFO : worker thread finished; awaiting finish of 2 more threads
2021-04-24 15:46:09,337 : INFO : worker thread finished; awaiting finish of 1 more threads
2021-04-24 15:46:09,353 : INFO : worker thread finished; awaiting finish of 0 more threads
2021-04-24 15:46:09,356 : INFO : EPOCH - 1 : training on 654411 raw words (491185 effective words) took 1.6s, 302707 effective words/s
2021-04-24 15:46:10,400 : INFO : EPOCH 2 - PROGRESS: at 65.01% examples, 308814 words

# Model Test
 

In [23]:
# Pick a random document from the test corpus and infer a vector from the model
sample = train_data.sample()
print("ID: "+ sample.iloc[0]['ID'])
print("Resumo: "+ sample.iloc[0]['Titulo'])
print("Titulo: "+ sample.iloc[0]['Resumo'])
value = str(sample.iloc[0]['ID'])
fscList = train_classes_data.iloc[sample.index]['classes'].iloc[0]['FSC']
print("FSC: "+ str(fscList))




ID: 5706821beb1ec9ebf6b44742
Resumo: Aluminum offset coil, and method for its production
Titulo: An aluminum offset coil, having a surface zone of a predominantly recrystallized globular, fine grain structure, and a core zone of a greatly work-hardened structure, for use in offset printing plates, and a process for its preparation.
FSC: ['74', '364', '192', '123']


In [24]:
inferred_vector = model.infer_vector(gensim.utils.simple_preprocess(sample.iloc[0]['Resumo']))
sims = model.dv.most_similar([inferred_vector], topn=len(model.dv))
best_match_row = train_data[train_data['ID'] == modelIndexToDataframeIndex[sims[1][0]-1]]
print("Similarity: "+ str(sims[1][1]))
print("ID: "+ best_match_row.iloc[0]['ID'])
print("Resumo: "+ best_match_row.iloc[0]['Titulo'])
print("Titulo: "+ best_match_row.iloc[0]['Resumo'])
value = str(best_match_row.iloc[0]['ID'])
fscList = train_classes_data.iloc[best_match_row.index]['classes'].iloc[0]['FSC']
print("FSC: "+ str(fscList))

Similarity: 0.4567660987377167
ID: 570641eaeb1ec9cd7cacc6f0
Resumo: Hydrotreating catalysts prepared from hydrogels
Titulo: This invention relates to a process for preparing highly active hydrotreating catalysts prepared by incorporating cobalt and an element selected from the group consisting of molybdenum, tungsten and mixtures thereof, into a phosphated alumina hydrogel support. The final calcined catalysts have surface areas greater than about 300 m.sup.2 /g, flat plate crush strengths greater than about 18 lbs and more than about 80% of their pore volume in pores having diameters less than about 70 .ANG..
FSC: ['347', '399']


In [25]:
def get_model_best_match(abstract):
  inferred_vector = model.infer_vector(gensim.utils.simple_preprocess(abstract))
  return model.dv.most_similar([inferred_vector], topn=len(model.dv))

def get_FSC_classes_row(ID):
  best_match_classes = train_classes_data[train_classes_data['ID'] == ID]
  return best_match_classes['classes'].iloc[0]
  
def validate_parse_fsc(row_fsc):
  fscList = []
  if('FSC' in row_fsc):
    fscList = row_fsc['FSC']
    
    if(not isinstance(fscList,list)):
      fscList = [fscList]
    
  return fscList

def evaluate_classes():
  for index,row in test_data.iterrows():
    test_Data_sample = get_model_best_match(row['Resumo'])
    match_ID = str(modelIndexToDataframeIndex[test_Data_sample[0][0]-1])

    best_match_data = train_data[train_data['ID'] == match_ID]
    
    best_match_fsc_row = get_FSC_classes_row(match_ID)
    test_fsc_row = get_FSC_classes_row(row['ID'])
  
    fsc_list_best_match = validate_parse_fsc(best_match_fsc_row)
    fsc_test_data_row = validate_parse_fsc(test_fsc_row)

    print("Best Match FSC: " + str(fsc_list_best_match))
    print("Row FSC: " + str(fsc_test_data_row) + "\n")


evaluate_classes()

Best Match FSC: ['399', '271']
Row FSC: ['250', '318', '356', '156']

Best Match FSC: ['134', '49', '160']
Row FSC: ['119', '229']

Best Match FSC: ['280', '297']
Row FSC: ['280', '403']

Best Match FSC: ['235', '271']
Row FSC: ['428']

Best Match FSC: ['424']
Row FSC: ['55', '134', '210', '252', '427']

Best Match FSC: ['8', '428', '503']
Row FSC: ['96', '428']

Best Match FSC: ['568']
Row FSC: ['528']

Best Match FSC: ['260']
Row FSC: ['576', '514']

Best Match FSC: ['362', '40', '52', '431', '206']
Row FSC: ['220', '206']

Best Match FSC: ['556']
Row FSC: ['72', '29', '451']

Best Match FSC: ['423']
Row FSC: ['442']

Best Match FSC: ['252', '8', '510']
Row FSC: ['260']

Best Match FSC: ['530']
Row FSC: ['530']

Best Match FSC: ['544', '504']
Row FSC: ['296', '307', '318']

Best Match FSC: ['246', '73']
Row FSC: ['441', '114']

Best Match FSC: ['544', '8']
Row FSC: ['534']

Best Match FSC: ['180', '317', '123']
Row FSC: ['244', '62', '165', '123', '60']

Best Match FSC: ['375', '455'

## Calculate Precision and Recall

Precision = (n of recommended items that are relevant) / (n of recommended items )

Recall = (n of recommended items that are relevant) / (total n of relevant items)

In [33]:
train_classes_data

# For an item to be considered relevant needs to have at least 50% of classes similar to the input's classes 
def getNumberOfRelevantItems(recommendedClasses, inputClasses):
  
  if pd.Series(recommendedClasses).isin(inputClasses).any():
    print("Contem!")
    print(recommendedClasses)
    print(inputClasses)
    print("\n")
  

def evaluate_classes():
  for index,row in test_data.iterrows():
    test_Data_sample = get_model_best_match(row['Resumo'])
    match_ID = str(modelIndexToDataframeIndex[test_Data_sample[0][0]-1])

    best_match_data = train_data[train_data['ID'] == match_ID]
    
    best_match_fsc_row = get_FSC_classes_row(match_ID)
    test_fsc_row = get_FSC_classes_row(row['ID'])
  
    fsc_list_best_match = validate_parse_fsc(best_match_fsc_row)
    fsc_test_data_row = validate_parse_fsc(test_fsc_row)

    #print("Best Match FSC: " + str(fsc_list_best_match))
    #print("Row FSC: " + str(fsc_test_data_row) + "\n")

    recommended_classes_array = np.array(fsc_list_best_match)
    input_classes_array = np.array(fsc_test_data_row)
    getNumberOfRelevantItems(recommended_classes_array,input_classes_array)

evaluate_classes()


def getPrecision():


  return 


def getRecall():

  return 


Contem!
['8' '428' '503']
['96' '428']


Contem!
['362' '40' '52' '431' '206']
['220' '206']


Contem!
['530']
['530']


Contem!
['180' '317' '123']
['244' '62' '165' '123' '60']


Contem!
['424']
['424' '514']


Contem!
['364' '371']
['364' '318' '425' '264' '29']


Contem!
['514']
['424' '514']


Contem!
['540' '544' '546' '549' '514']
['548' '514']


Contem!
['Plt']
['Plt']


Contem!
['375' '371' '370' '455' '324']
['455' '334']


Contem!
['250' '257']
['422' '73' '250' '324']


Contem!
['206' '220']
['206' '116' '246' '40']


Contem!
['74' '192']
['192']


Contem!
['430' '369' '359' '428']
['430' '358']


Contem!
['365' '371']
['365']


Contem!
['315']
['315' '327' '363']


Contem!
['435' '530']
['435' '536']


Contem!
['424' '514']
['514' '435' '536' '935']


Contem!
['29' '174' '437' '264' '257']
['257' '437' '365' '438']


Contem!
['439' '385']
['439']


Contem!
['428' '347']
['428' '525']


Contem!
['430' '525' '526']
['96' '204' '427' '526' '264']


Contem!
['455' '340' '434']

KeyboardInterrupt: ignored

In [27]:
train_classes_data

Unnamed: 0,ID,classes
0,57065f86eb1ec950df04567a,"{'FSC': ['359', '372']}"
1,5702662eeb1ec9c195055491,{'FSC': '84'}
2,57026579eb1ec9b0a9bd00fe,"{'FSC': ['72', '29', '51', '408', '409']}"
3,570260bbeb1ec9244e6bdfe1,"{'FSC': ['210', '429']}"
4,57026060eb1ec91ddf3047ef,"{'FSC': ['144', '254', '173', '81']}"
...,...,...
7360,570261d1eb1ec93bc90e06ea,{'FSC': '424'}
7361,57066c52eb1ec98afebf7a43,{'FSC': '424'}
7362,57064db8eb1ec901ec1acac0,"{'FSC': ['53', '141']}"
7363,570261d1eb1ec93bc90de880,"{'FSC': ['248', '285']}"
