In [2]:
import os
import pandas as pd
from evaluate_quality import char_accuracy

In [3]:
documents = {
    'output/correio da lavoura_1484_agosto de 1945/page0001-1': 'C:/Users/bruno/Documents/Projetos/TCC/ground-truth/1-facil/correio da lavoura_1484_agosto de 1945-1.txt',
    'output/correio da lavoura_1484_agosto de 1945/page0001-2': 'C:/Users/bruno/Documents/Projetos/TCC/ground-truth/1-facil/correio da lavoura_1484_agosto de 1945-2.txt',
    'output/correio da lavoura_1484_agosto de 1945/page0001-3': 'C:/Users/bruno/Documents/Projetos/TCC/ground-truth/1-facil/correio da lavoura_1484_agosto de 1945-3.txt',
    'output/correio da lavoura_10_maio_1917/page0001-1': 'C:/Users/bruno/Documents/Projetos/TCC/ground-truth/2-media/correio da lavoura_10_maio_1917-1.txt',
    'output/correio da lavoura_10_maio_1917/page0001-2': 'C:/Users/bruno/Documents/Projetos/TCC/ground-truth/2-media/correio da lavoura_10_maio_1917-2.txt',
    'output/correio da lavoura_459_dezembro_1925/page0001-1': 'C:/Users/bruno/Documents/Projetos/TCC/ground-truth/2-media/correio da lavoura_459_dezembro_1925-1.txt',
    'output/correio da lavoura_10_maio_1917/page0001-3': 'C:/Users/bruno/Documents/Projetos/TCC/ground-truth/3-dificil/correio da lavoura_10_maio_1917-3.txt',
    'output/correio da lavoura_52_marco_1918/page0001-1': 'C:/Users/bruno/Documents/Projetos/TCC/ground-truth/3-dificil/correio da lavoura_52_marco_1918-1.txt',
    'output/correio da lavoura_52_marco_1918/page0001-2': 'C:/Users/bruno/Documents/Projetos/TCC/ground-truth/3-dificil/correio da lavoura_52_marco_1918-2.txt'
}

df = pd.DataFrame(columns=['path', 'base', 'proc'])
df.head()

Unnamed: 0,path,base,proc


## Char Accuracy

In [3]:
def read_and_eval(ocr_path, gt_path, is_base):
    file = 'base.txt' if is_base else 'proc.txt'
    ocr_path = os.path.join(path, file)
    with open(ocr_path, 'r', encoding='utf8') as f:
        ocr = f.read()
    with open(gt_path, 'r', encoding='utf8') as f:
        gt = f.read()
    
    return char_accuracy(gt, ocr)

In [4]:
for path in documents:
    print(path, 'base')
    base = read_and_eval(path, documents[path], True)
    print(path, 'proc')
    proc = read_and_eval(path, documents[path], False)
    print({ 'path': path[26:], 'base': base, 'proc': proc })
    df = df.append({ 'path': path[26:], 'base': base, 'proc': proc }, ignore_index=True)

df

output/correio da lavoura_1484_agosto de 1945/page0001-1 base
output/correio da lavoura_1484_agosto de 1945/page0001-1 proc
{'path': '1484_agosto de 1945/page0001-1', 'base': 0.7431175766504006, 'proc': 0.7312402172912255}
output/correio da lavoura_1484_agosto de 1945/page0001-2 base
output/correio da lavoura_1484_agosto de 1945/page0001-2 proc
{'path': '1484_agosto de 1945/page0001-2', 'base': 0.5419865642994242, 'proc': 0.6261996161228407}
output/correio da lavoura_1484_agosto de 1945/page0001-3 base
output/correio da lavoura_1484_agosto de 1945/page0001-3 proc
{'path': '1484_agosto de 1945/page0001-3', 'base': 0.26311801477552565, 'proc': 0.714150407274105}
output/correio da lavoura_10_maio_1917/page0001-1 base
output/correio da lavoura_10_maio_1917/page0001-1 proc
{'path': '10_maio_1917/page0001-1', 'base': 0, 'proc': 0.6405349036927984}
output/correio da lavoura_10_maio_1917/page0001-2 base
output/correio da lavoura_10_maio_1917/page0001-2 proc
{'path': '10_maio_1917/page0001-2', 

Unnamed: 0,path,base,proc
0,1484_agosto de 1945/page0001-1,0.743118,0.73124
1,1484_agosto de 1945/page0001-2,0.541987,0.6262
2,1484_agosto de 1945/page0001-3,0.263118,0.71415
3,10_maio_1917/page0001-1,0.0,0.640535
4,10_maio_1917/page0001-2,0.0,0.30656
5,459_dezembro_1925/page0001-1,0.607186,0.622016
6,10_maio_1917/page0001-3,0.597605,0.275117
7,52_marco_1918/page0001-1,0.0,0.58381
8,52_marco_1918/page0001-2,0.753716,0.509479


## Semantic Similarity

In [4]:
import numpy as np
from gensim.models import Word2Vec
from gensim.models import Doc2Vec
from gensim.models.doc2vec import TaggedDocument
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from nltk.corpus import mac_morpho
from nltk.stem.rslp import RSLPStemmer
from sklearn.metrics.pairwise import cosine_similarity
from unidecode import unidecode

In [5]:
def load_data(path):
    documents_list = []
    with open(path ,"r", encoding='utf8') as f:
        for line in f.readlines():
            text = line.strip()
            if len(text) > 0:
                documents_list.append(text)
    
    return documents_list

In [6]:
def preprocess_data(doc_set):
    tokenizer = RegexpTokenizer(r'\w+')
    stop = set(stopwords.words('portuguese'))
    stemmer = RSLPStemmer()
    
    texts = []
    for i in doc_set:
        raw = unidecode(i).lower()
        tokens = tokenizer.tokenize(raw)
        stopped_tokens = [i for i in tokens if not i in stop]
        stemmed_tokens = [stemmer.stem(i) for i in stopped_tokens]
        texts.append(stemmed_tokens)
    
    return texts

In [7]:
corpus = []
for ocr_path in documents:
    gt_path = documents[ocr_path]
    doc = load_data(gt_path)
    corpus.extend(preprocess_data(doc))

model = Word2Vec(corpus)

In [8]:
def load_preprocess_and_flatten(path):
    data = load_data(path)
    clean = preprocess_data(data)
    return [item for sublist in clean for item in sublist]

In [9]:
def get_vector(model, s):
    return np.mean(np.array([model[i] for i in s if i in model]), axis=0)

In [10]:
df_sim = pd.DataFrame(columns=['path', 'base', 'proc'])
for ocr_path in documents:
    gt_path = documents[ocr_path]
    gt_flat = load_preprocess_and_flatten(gt_path)

    base_path = os.path.join(ocr_path, 'base.txt')
    base_flat = load_preprocess_and_flatten(base_path)

    proc_path = os.path.join(ocr_path, 'proc.txt')
    proc_flat = load_preprocess_and_flatten(proc_path)
    
    gt_vector = get_vector(model.wv, gt_flat)
    base_vector = get_vector(model.wv, base_flat)
    proc_vector = get_vector(model.wv, proc_flat)

    base_similarity = np.mean(cosine_similarity(gt_vector.reshape(1,-1), base_vector.reshape(1,-1))) if base_vector.reshape(1,-1).shape[1] == 100 else np.nan
    proc_similarity = np.mean(cosine_similarity(gt_vector.reshape(1,-1), proc_vector.reshape(1,-1))) if proc_vector.reshape(1,-1).shape[1] == 100 else np.nan
    df_sim = df_sim.append({ 'path': ocr_path[26:], 'base': base_similarity, 'proc': proc_similarity }, ignore_index=True)

df_sim

  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)


Unnamed: 0,path,base,proc
0,1484_agosto de 1945/page0001-1,0.999543,0.999644
1,1484_agosto de 1945/page0001-2,0.998447,0.9987
2,1484_agosto de 1945/page0001-3,0.998253,0.998988
3,10_maio_1917/page0001-1,,0.998723
4,10_maio_1917/page0001-2,,0.998858
5,459_dezembro_1925/page0001-1,0.999168,0.999184
6,10_maio_1917/page0001-3,0.997555,0.987752
7,52_marco_1918/page0001-1,,0.998301
8,52_marco_1918/page0001-2,0.99898,0.999403


In [11]:
corpus = []
for ocr_path in documents:
    gt_path = documents[ocr_path]
    doc = load_data(gt_path)
    clean = preprocess_data(doc)
    clean_flat = [item for sublist in clean for item in sublist]
    corpus.append(TaggedDocument(clean_flat, [ocr_path]))

In [12]:
paras = mac_morpho.paras()
mac_morpho_corpus = []
for i in range(len(paras)):
    mac_morpho_corpus.append([item for sublist in paras[i] for item in sublist])
# mac_morpho_corpus[41]

['Augusto',
 'Ribeiro',
 'Garcia',
 'é',
 'jornalista',
 ',',
 'advogado',
 'agrarista',
 'e',
 'membro',
 'de',
 'o',
 'Instituto',
 'Paulista',
 'de',
 'Direito',
 'Agrário']

In [13]:
for i, doc in enumerate(mac_morpho_corpus):
    clean = preprocess_data(doc)
    clean_flat = [item for sublist in clean for item in sublist]
    corpus.append(TaggedDocument(clean_flat, [f'mac_morpho{i}']))

In [14]:
d2v = Doc2Vec(vector_size=100, min_count=2, epochs=10)
d2v.build_vocab(corpus)
d2v.train(corpus, total_examples=d2v.corpus_count, epochs=d2v.epochs)

In [15]:
df_sim = pd.DataFrame(columns=['path', 'base', 'proc'])
for ocr_path in documents:
    gt_path = documents[ocr_path]
    gt_flat = load_preprocess_and_flatten(gt_path)

    base_path = os.path.join(ocr_path, 'base.txt')
    base_flat = load_preprocess_and_flatten(base_path)

    proc_path = os.path.join(ocr_path, 'proc.txt')
    proc_flat = load_preprocess_and_flatten(proc_path)
    
    # gt_vector = d2v.infer_vector(gt_flat)
    # base_vector = d2v.infer_vector(base_flat)
    # proc_vector = d2v.infer_vector(proc_flat)

    base_similarity = d2v.similarity_unseen_docs(gt_flat, base_flat)
    proc_similarity = d2v.similarity_unseen_docs(gt_flat, proc_flat)
    # base_similarity = np.mean(cosine_similarity(gt_vector.reshape(1,-1), base_vector.reshape(1,-1))) if base_vector.reshape(1,-1).shape[1] == 100 else np.nan
    # proc_similarity = np.mean(cosine_similarity(gt_vector.reshape(1,-1), proc_vector.reshape(1,-1))) if proc_vector.reshape(1,-1).shape[1] == 100 else np.nan
    df_sim = df_sim.append({ 'path': ocr_path[26:], 'base': base_similarity, 'proc': proc_similarity }, ignore_index=True)

df_sim

Unnamed: 0,path,base,proc
0,1484_agosto de 1945/page0001-1,0.964087,0.951811
1,1484_agosto de 1945/page0001-2,0.943627,0.940309
2,1484_agosto de 1945/page0001-3,0.867083,0.941844
3,10_maio_1917/page0001-1,0.113475,0.938927
4,10_maio_1917/page0001-2,0.1104,0.962401
5,459_dezembro_1925/page0001-1,0.927177,0.874901
6,10_maio_1917/page0001-3,0.959448,0.858907
7,52_marco_1918/page0001-1,-0.061419,0.889602
8,52_marco_1918/page0001-2,0.953332,0.954203
