In [83]:
#import modules
import os.path
import numpy as np
import matplotlib.pyplot as plt
from gensim import corpora
from gensim.models import LsiModel
from gensim.models import TfidfModel
from gensim.models import Word2Vec
from gensim.models import Doc2Vec
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from nltk.stem.rslp import RSLPStemmer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from unidecode import unidecode

In [2]:
import nltk
nltk.download('rslp')

[nltk_data] Downloading package rslp to
[nltk_data]     C:\Users\bruno\AppData\Roaming\nltk_data...
[nltk_data]   Package rslp is already up-to-date!


True

In [3]:
def load_data(path, file_name):
    documents_list = []
    with open(os.path.join(path, file_name) ,"r", encoding='utf8') as f:
        for line in f.readlines():
            text = line.strip()
            if len(text) > 0:
                documents_list.append(text)
    
    return documents_list

In [4]:
def preprocess_data(doc_set):
    tokenizer = RegexpTokenizer(r'\w+')
    stop = set(stopwords.words('portuguese'))
    stemmer = RSLPStemmer()
    
    texts = []
    for i in doc_set:
        raw = unidecode(i).lower()
        tokens = tokenizer.tokenize(raw)
        stopped_tokens = [i for i in tokens if not i in stop]
        stemmed_tokens = [stemmer.stem(i) for i in stopped_tokens]
        texts.append(stemmed_tokens)
    
    return texts

In [5]:
def prepare_corpus(doc_clean):
    dictionary = corpora.Dictionary(doc_clean)
    corpus = [dictionary.doc2bow(doc) for doc in doc_clean]

    model = TfidfModel(corpus)

    doc_term_matrix = [model[doc] for doc in corpus]

    return dictionary, doc_term_matrix

In [6]:
def create_gensim_lsa_model(doc_clean, number_of_topics):
    dictionary, doc_term_matrix = prepare_corpus(doc_clean)
    lsamodel = LsiModel(doc_term_matrix, num_topics=number_of_topics, id2word=dictionary)

    return lsamodel

In [9]:
def get_topics(model, prep_text):
    topics = model[prep_text]
    total_topics = []
    for topic in topics:
        if len(topic) > 0:
            total_topics.append([t[1] for t in topic])
    
    return np.array(total_topics)

In [7]:
gt_doc = load_data("C:/Users/bruno/Documents/Projetos/TCC/ground-truth/1-facil", "correio da lavoura_1484_agosto de 1945-1.txt")
ocr_doc = load_data("C:/Users/bruno/Documents/Projetos/TCC/workspace/correio-da-lavoura-ocr/output/correio da lavoura_1484_agosto de 1945/page0001-1", "processed.txt")
clean_text = preprocess_data(gt_doc + ocr_doc)
model = create_gensim_lsa_model(clean_text, number_of_topics=30)

In [8]:
gt_clean = preprocess_data(gt_doc)
_, gt_prep = prepare_corpus(gt_clean)
gt_topics = model[gt_prep]

ocr_clean = preprocess_data(ocr_doc)
_, ocr_prep = prepare_corpus(ocr_clean)
ocr_topics = model[ocr_prep]

In [10]:
gt_topics = get_topics(model, gt_prep)
ocr_topics = get_topics(model, ocr_prep)
np.mean(cosine_similarity(gt_topics, ocr_topics))

0.05207200169035882

In [77]:
gt_doc = load_data("C:/Users/bruno/Documents/Projetos/TCC/ground-truth/1-facil", "correio da lavoura_1484_agosto de 1945-1.txt")
ocr_doc = load_data("C:/Users/bruno/Documents/Projetos/TCC/workspace/correio-da-lavoura-ocr/output/correio da lavoura_1484_agosto de 1945/page0001-2", "base.txt")
clean_text = preprocess_data(gt_doc + ocr_doc)
model = Word2Vec(clean_text)

In [78]:
gt_clean = preprocess_data(gt_doc)
gt_flat = [item for sublist in gt_clean for item in sublist]
gt_vector = np.array([model.wv[word] for word in gt_flat if word in model.wv])
gt_vector.shape

(306, 100)

In [79]:
ocr_clean = preprocess_data(ocr_doc)
ocr_flat = [item for sublist in ocr_clean for item in sublist]
ocr_vector = np.array([model.wv[word] for word in ocr_flat if word in model.wv])
ocr_vector.shape

(213, 100)

In [80]:
def get_vector(model, s):
    return np.sum(np.array([model[i] for i in s if i in model]), axis=0)

In [81]:
gt_vector = get_vector(model.wv, gt_flat)
ocr_vector = get_vector(model.wv, ocr_flat)
gt_vector.shape, ocr_vector.shape

((100,), (100,))

In [82]:
np.mean(cosine_similarity(gt_vector.reshape(1,-1), ocr_vector.reshape(1,-1)))

0.55814326

In [None]:
# treinar Word2Vec em todos os ground-truth

In [86]:
from nltk.corpus import mac_morpho
nltk.download('mac_morpho')
len(mac_morpho.paras())

[nltk_data] Downloading package mac_morpho to
[nltk_data]     C:\Users\bruno\AppData\Roaming\nltk_data...
[nltk_data]   Package mac_morpho is already up-to-date!


51397