In [2]:
import pandas as pd
import numpy as np
import pickle
import gensim
import nltk
import json
import re

from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from nltk.corpus import stopwords
from nltk.stem import RSLPStemmer

import warnings
warnings.simplefilter("ignore", DeprecationWarning)

#### Cleaning functions

In [35]:
def clean_up(s):
    words = re.findall('[^\d\W]+', str(s))
    words = [w.lower() for w in words if not w.startswith('http://') and len(w)>2]
    return ' '.join(words)
 
def stopwords(s):
    stop = pd.read_csv('stop.txt', sep=' ')['stopwords']
    return [word for word in s if word not in stop]

# tagging
def tagging(l):
    # importing trainned tagger (for portuguese)
    f = open('tagger.pickle', 'rb')
    tagger = pickle.load(f)
    f.close()
    tagged = tagger.tag(l)
    return [w for w,t in tagged if t == 'NOUN' or t == 'ADJ']

def set_up(s):
    s = clean_up(s)
    s = stopwords(s)
    return tagging(s)

#### Doc2Vec training function

In [4]:
# to bypass an issue with the 64bit python version, i had to add the following function
def hash32(value):
    return hash(value) & 0xffffffff


# the funciont will take the word Series to train the model doc by doc
def doc2vec_trainer(ws, model_doc, fname, c=0):
    for i in range(len(ws)):
        model_doc.train([TaggedDocument(ws[i], [c])], total_examples=model_doc.corpus_count,
                         epochs=50, report_delay=99999)
        model_doc.alpha -= 0.0002 # adjusting the learning rate
        model_doc.save(fname)
        c+=1
        print('Learned {} from {}!'.format(c, len(ws)))
    print('Success! Model updated.')
    return model_doc

#### Taking only meaningfull words
Only stemming is not enough, will use the pos_tags trained to choose only meaningfull words, giving a better search result.

In [None]:
# importing and pre processing data
with open('labelled.json') as json_data:
    data = json.load(json_data)
    
df = pd.DataFrame.from_dict(data)

In [48]:
allwords = df['main'].apply(clean_up).str.split()

[['acordo',
  'com',
  'despacho',
  'proferido',
  'nos',
  'autos',
  'presente',
  'caso',
  'que',
  'decisão',
  'condenação',
  'por',
  'litigância',
  'integra',
  'decisão',
  'que',
  'põe',
  'termo',
  'processo',
  'não',
  'está',
  'causa',
  'uma',
  'decisão',
  'interlocutória',
  'mas',
  'antes',
  'uma',
  'parte',
  'extensão',
  'decisão',
  'mérito',
  'assim',
  'ainda',
  'que',
  'não',
  'ignore',
  'existirem',
  'esse',
  'respeito',
  'divergências',
  'jurisprudência',
  'stj',
  'considerou',
  'ser',
  'decisão',
  'recorrível',
  'nos',
  'termos',
  'gerais',
  'art',
  'cpc',
  'incindindo',
  'decisão',
  'condenação',
  'por',
  'litigância',
  'sobre',
  'representantes',
  'legais',
  'autora',
  'actos',
  'notificação',
  'própria',
  'sociedade',
  'autora',
  'não',
  'podem',
  'valer',
  'como',
  'notificação',
  'pessoal',
  'dos',
  'seus',
  'representantes',
  'legais',
  'nem',
  'tampouco',
  'conhecimento',
  'pessoal',
  'que',
  

In [6]:
with open('tag_stems.json') as json_data:
    data_stem = json.load(json_data)
    
stemmed = {k:' '.join(v) for k,v in data_stem.items()}
df_stem = pd.DataFrame.from_dict(stemmed, orient='index')
words_stem = df_stem[0].apply(lambda x: x.split())

#### Training Doc2Vec

In [49]:
# training

model_doc = Doc2Vec([TaggedDocument(allwords[0], [0])], vector_size=300, window=5, min_count=1, dbow_words=1,
                                                          workers=3, hashfxn=hash32, alpha=0.001, min_alpha=0.00025)

In [50]:
fname = "d2v_trainned_allwords.model"
model_doc.save(fname)

In [51]:
doc2vec_trainer(allwords, model_doc, fname)
#doc2vec_trainer(words_stem, words_stem, model_doc, fname)

Learned 1 from 9848!
Learned 2 from 9848!
Learned 3 from 9848!
Learned 4 from 9848!
Learned 5 from 9848!
Learned 6 from 9848!
Learned 7 from 9848!
Learned 8 from 9848!
Learned 9 from 9848!
Learned 10 from 9848!
Learned 11 from 9848!
Learned 12 from 9848!
Learned 13 from 9848!
Learned 14 from 9848!
Learned 15 from 9848!
Learned 16 from 9848!
Learned 17 from 9848!
Learned 18 from 9848!
Learned 19 from 9848!
Learned 20 from 9848!
Learned 21 from 9848!
Learned 22 from 9848!
Learned 23 from 9848!
Learned 24 from 9848!
Learned 25 from 9848!
Learned 26 from 9848!
Learned 27 from 9848!
Learned 28 from 9848!
Learned 29 from 9848!
Learned 30 from 9848!
Learned 31 from 9848!
Learned 32 from 9848!
Learned 33 from 9848!
Learned 34 from 9848!
Learned 35 from 9848!
Learned 36 from 9848!
Learned 37 from 9848!
Learned 38 from 9848!
Learned 39 from 9848!
Learned 40 from 9848!
Learned 41 from 9848!
Learned 42 from 9848!
Learned 43 from 9848!
Learned 44 from 9848!
Learned 45 from 9848!
Learned 46 from 984

Learned 362 from 9848!
Learned 363 from 9848!
Learned 364 from 9848!
Learned 365 from 9848!
Learned 366 from 9848!
Learned 367 from 9848!
Learned 368 from 9848!
Learned 369 from 9848!
Learned 370 from 9848!
Learned 371 from 9848!
Learned 372 from 9848!
Learned 373 from 9848!
Learned 374 from 9848!
Learned 375 from 9848!
Learned 376 from 9848!
Learned 377 from 9848!
Learned 378 from 9848!
Learned 379 from 9848!
Learned 380 from 9848!
Learned 381 from 9848!
Learned 382 from 9848!
Learned 383 from 9848!
Learned 384 from 9848!
Learned 385 from 9848!
Learned 386 from 9848!
Learned 387 from 9848!
Learned 388 from 9848!
Learned 389 from 9848!
Learned 390 from 9848!
Learned 391 from 9848!
Learned 392 from 9848!
Learned 393 from 9848!
Learned 394 from 9848!
Learned 395 from 9848!
Learned 396 from 9848!
Learned 397 from 9848!
Learned 398 from 9848!
Learned 399 from 9848!
Learned 400 from 9848!
Learned 401 from 9848!
Learned 402 from 9848!
Learned 403 from 9848!
Learned 404 from 9848!
Learned 405

Learned 720 from 9848!
Learned 721 from 9848!
Learned 722 from 9848!
Learned 723 from 9848!
Learned 724 from 9848!
Learned 725 from 9848!
Learned 726 from 9848!
Learned 727 from 9848!
Learned 728 from 9848!
Learned 729 from 9848!
Learned 730 from 9848!
Learned 731 from 9848!
Learned 732 from 9848!
Learned 733 from 9848!
Learned 734 from 9848!
Learned 735 from 9848!
Learned 736 from 9848!
Learned 737 from 9848!
Learned 738 from 9848!
Learned 739 from 9848!
Learned 740 from 9848!
Learned 741 from 9848!
Learned 742 from 9848!
Learned 743 from 9848!
Learned 744 from 9848!
Learned 745 from 9848!
Learned 746 from 9848!
Learned 747 from 9848!
Learned 748 from 9848!
Learned 749 from 9848!
Learned 750 from 9848!
Learned 751 from 9848!
Learned 752 from 9848!
Learned 753 from 9848!
Learned 754 from 9848!
Learned 755 from 9848!
Learned 756 from 9848!
Learned 757 from 9848!
Learned 758 from 9848!
Learned 759 from 9848!
Learned 760 from 9848!
Learned 761 from 9848!
Learned 762 from 9848!
Learned 763

Learned 1074 from 9848!
Learned 1075 from 9848!
Learned 1076 from 9848!
Learned 1077 from 9848!
Learned 1078 from 9848!
Learned 1079 from 9848!
Learned 1080 from 9848!
Learned 1081 from 9848!
Learned 1082 from 9848!
Learned 1083 from 9848!
Learned 1084 from 9848!
Learned 1085 from 9848!
Learned 1086 from 9848!
Learned 1087 from 9848!
Learned 1088 from 9848!
Learned 1089 from 9848!
Learned 1090 from 9848!
Learned 1091 from 9848!
Learned 1092 from 9848!
Learned 1093 from 9848!
Learned 1094 from 9848!
Learned 1095 from 9848!
Learned 1096 from 9848!
Learned 1097 from 9848!
Learned 1098 from 9848!
Learned 1099 from 9848!
Learned 1100 from 9848!
Learned 1101 from 9848!
Learned 1102 from 9848!
Learned 1103 from 9848!
Learned 1104 from 9848!
Learned 1105 from 9848!
Learned 1106 from 9848!
Learned 1107 from 9848!
Learned 1108 from 9848!
Learned 1109 from 9848!
Learned 1110 from 9848!
Learned 1111 from 9848!
Learned 1112 from 9848!
Learned 1113 from 9848!
Learned 1114 from 9848!
Learned 1115 fro

Learned 1417 from 9848!
Learned 1418 from 9848!
Learned 1419 from 9848!
Learned 1420 from 9848!
Learned 1421 from 9848!
Learned 1422 from 9848!
Learned 1423 from 9848!
Learned 1424 from 9848!
Learned 1425 from 9848!
Learned 1426 from 9848!
Learned 1427 from 9848!
Learned 1428 from 9848!
Learned 1429 from 9848!
Learned 1430 from 9848!
Learned 1431 from 9848!
Learned 1432 from 9848!
Learned 1433 from 9848!
Learned 1434 from 9848!
Learned 1435 from 9848!
Learned 1436 from 9848!
Learned 1437 from 9848!
Learned 1438 from 9848!
Learned 1439 from 9848!
Learned 1440 from 9848!
Learned 1441 from 9848!
Learned 1442 from 9848!
Learned 1443 from 9848!
Learned 1444 from 9848!
Learned 1445 from 9848!
Learned 1446 from 9848!
Learned 1447 from 9848!
Learned 1448 from 9848!
Learned 1449 from 9848!
Learned 1450 from 9848!
Learned 1451 from 9848!
Learned 1452 from 9848!
Learned 1453 from 9848!
Learned 1454 from 9848!
Learned 1455 from 9848!
Learned 1456 from 9848!
Learned 1457 from 9848!
Learned 1458 fro

Learned 1760 from 9848!
Learned 1761 from 9848!
Learned 1762 from 9848!
Learned 1763 from 9848!
Learned 1764 from 9848!
Learned 1765 from 9848!
Learned 1766 from 9848!
Learned 1767 from 9848!
Learned 1768 from 9848!
Learned 1769 from 9848!
Learned 1770 from 9848!
Learned 1771 from 9848!
Learned 1772 from 9848!
Learned 1773 from 9848!
Learned 1774 from 9848!
Learned 1775 from 9848!
Learned 1776 from 9848!
Learned 1777 from 9848!
Learned 1778 from 9848!
Learned 1779 from 9848!
Learned 1780 from 9848!
Learned 1781 from 9848!
Learned 1782 from 9848!
Learned 1783 from 9848!
Learned 1784 from 9848!
Learned 1785 from 9848!
Learned 1786 from 9848!
Learned 1787 from 9848!
Learned 1788 from 9848!
Learned 1789 from 9848!
Learned 1790 from 9848!
Learned 1791 from 9848!
Learned 1792 from 9848!
Learned 1793 from 9848!
Learned 1794 from 9848!
Learned 1795 from 9848!
Learned 1796 from 9848!
Learned 1797 from 9848!
Learned 1798 from 9848!
Learned 1799 from 9848!
Learned 1800 from 9848!
Learned 1801 fro

Learned 2103 from 9848!
Learned 2104 from 9848!
Learned 2105 from 9848!
Learned 2106 from 9848!
Learned 2107 from 9848!
Learned 2108 from 9848!
Learned 2109 from 9848!
Learned 2110 from 9848!
Learned 2111 from 9848!
Learned 2112 from 9848!
Learned 2113 from 9848!
Learned 2114 from 9848!
Learned 2115 from 9848!
Learned 2116 from 9848!
Learned 2117 from 9848!
Learned 2118 from 9848!
Learned 2119 from 9848!
Learned 2120 from 9848!
Learned 2121 from 9848!
Learned 2122 from 9848!
Learned 2123 from 9848!
Learned 2124 from 9848!
Learned 2125 from 9848!
Learned 2126 from 9848!
Learned 2127 from 9848!
Learned 2128 from 9848!
Learned 2129 from 9848!
Learned 2130 from 9848!
Learned 2131 from 9848!
Learned 2132 from 9848!
Learned 2133 from 9848!
Learned 2134 from 9848!
Learned 2135 from 9848!
Learned 2136 from 9848!
Learned 2137 from 9848!
Learned 2138 from 9848!
Learned 2139 from 9848!
Learned 2140 from 9848!
Learned 2141 from 9848!
Learned 2142 from 9848!
Learned 2143 from 9848!
Learned 2144 fro

Learned 2445 from 9848!
Learned 2446 from 9848!
Learned 2447 from 9848!
Learned 2448 from 9848!
Learned 2449 from 9848!
Learned 2450 from 9848!
Learned 2451 from 9848!
Learned 2452 from 9848!
Learned 2453 from 9848!
Learned 2454 from 9848!
Learned 2455 from 9848!
Learned 2456 from 9848!
Learned 2457 from 9848!
Learned 2458 from 9848!
Learned 2459 from 9848!
Learned 2460 from 9848!
Learned 2461 from 9848!
Learned 2462 from 9848!
Learned 2463 from 9848!
Learned 2464 from 9848!
Learned 2465 from 9848!
Learned 2466 from 9848!
Learned 2467 from 9848!
Learned 2468 from 9848!
Learned 2469 from 9848!
Learned 2470 from 9848!
Learned 2471 from 9848!
Learned 2472 from 9848!
Learned 2473 from 9848!
Learned 2474 from 9848!
Learned 2475 from 9848!
Learned 2476 from 9848!
Learned 2477 from 9848!
Learned 2478 from 9848!
Learned 2479 from 9848!
Learned 2480 from 9848!
Learned 2481 from 9848!
Learned 2482 from 9848!
Learned 2483 from 9848!
Learned 2484 from 9848!
Learned 2485 from 9848!
Learned 2486 fro

Learned 2788 from 9848!
Learned 2789 from 9848!
Learned 2790 from 9848!
Learned 2791 from 9848!
Learned 2792 from 9848!
Learned 2793 from 9848!
Learned 2794 from 9848!
Learned 2795 from 9848!
Learned 2796 from 9848!
Learned 2797 from 9848!
Learned 2798 from 9848!
Learned 2799 from 9848!
Learned 2800 from 9848!
Learned 2801 from 9848!
Learned 2802 from 9848!
Learned 2803 from 9848!
Learned 2804 from 9848!
Learned 2805 from 9848!
Learned 2806 from 9848!
Learned 2807 from 9848!
Learned 2808 from 9848!
Learned 2809 from 9848!
Learned 2810 from 9848!
Learned 2811 from 9848!
Learned 2812 from 9848!
Learned 2813 from 9848!
Learned 2814 from 9848!
Learned 2815 from 9848!
Learned 2816 from 9848!
Learned 2817 from 9848!
Learned 2818 from 9848!
Learned 2819 from 9848!
Learned 2820 from 9848!
Learned 2821 from 9848!
Learned 2822 from 9848!
Learned 2823 from 9848!
Learned 2824 from 9848!
Learned 2825 from 9848!
Learned 2826 from 9848!
Learned 2827 from 9848!
Learned 2828 from 9848!
Learned 2829 fro

Learned 3130 from 9848!
Learned 3131 from 9848!
Learned 3132 from 9848!
Learned 3133 from 9848!
Learned 3134 from 9848!
Learned 3135 from 9848!
Learned 3136 from 9848!
Learned 3137 from 9848!
Learned 3138 from 9848!
Learned 3139 from 9848!
Learned 3140 from 9848!
Learned 3141 from 9848!
Learned 3142 from 9848!
Learned 3143 from 9848!
Learned 3144 from 9848!
Learned 3145 from 9848!
Learned 3146 from 9848!
Learned 3147 from 9848!
Learned 3148 from 9848!
Learned 3149 from 9848!
Learned 3150 from 9848!
Learned 3151 from 9848!
Learned 3152 from 9848!
Learned 3153 from 9848!
Learned 3154 from 9848!
Learned 3155 from 9848!
Learned 3156 from 9848!
Learned 3157 from 9848!
Learned 3158 from 9848!
Learned 3159 from 9848!
Learned 3160 from 9848!
Learned 3161 from 9848!
Learned 3162 from 9848!
Learned 3163 from 9848!
Learned 3164 from 9848!
Learned 3165 from 9848!
Learned 3166 from 9848!
Learned 3167 from 9848!
Learned 3168 from 9848!
Learned 3169 from 9848!
Learned 3170 from 9848!
Learned 3171 fro

PermissionError: [Errno 13] Permission denied: 'd2v_trainned_allwords.model'

In [41]:
model_doc = Doc2Vec.load("d2v_trainned_allwords.model")

In [44]:
model_doc.most_similar(positive=['direito'])
#### Search Engine

KeyError: "word 'direito' not in vocabulary"

In [30]:
keywords = input('Search: ')

Search: acidente de trânsito


In [31]:
key1 = keywords.strip().lower().split()
key2 = [RSLPStemmer().stem(word.lower()) for word in key1]
#print('Choose words in appropriate format.')
key2

['acid', 'de', 'trânsit']

In [32]:
model_doc.most_similar(positive=[key2[0]])

KeyError: "word 'acid' not in vocabulary"

In [None]:
tokens = 'direito'
new_vector = model_doc.infer_vector(tokens.split() ,alpha=0.001 ,steps = 5)
tagsim = model_doc.docvecs.most_similar([new_vector])[0]

docsim = words[tagsim['acordo']]

print("Document : ", data[docsim.original_number], "\n")

In [105]:
words

0       [acordo, despacho, autos, presente, caso, deci...
1       [presente, reclamação, conferência, recurso, a...
2       [estabelecimento, hospital, tratamento, assist...
3       [formação, excepcional, fundamento, art, cpc, ...
4       [artigo, ccivil, valor, bens, data, abertura, ...
                              ...                        
9843    [recurso, excepcional, stj, questão, direito, ...
9844    [senhorio, dever, inquilino, gozo, prédio, fin...
9845    [matéria, facto, stj, fundamentos, parte, fina...
9846    [sede, saneador, sentença, juiz, termos, art, ...
9847    [relevância, exclusão, culpa, rés, convicção, ...
Name: main, Length: 9848, dtype: object

In [None]:
new_vector = model.infer_vector(tokens.split() ,alpha=0.001 ,steps = 5)
sims = model.docvecs.most_similar([new_vector], topn=model.docvecs.count)

print("Most : " , data[alldocs[sims[0][0]].original_number], "\n") 
print("Median : " , data[alldocs[sims[17000][0]].original_number], "\n")
print("Least : " , data[alldocs[sims[-1][0]].original_number])