Skip to content

Commit

Permalink
* The format of hOCR changed somehow such that instead of word IDs b…
Browse files Browse the repository at this point in the history
…eing "word_[wordID]" they are now "word_1_[wordID]". I changed the BeautifulSoup queries to reflect this change in the word ID convention.
  • Loading branch information
Bryan Tarpley committed Oct 27, 2016
1 parent a854e76 commit aa3884f
Show file tree
Hide file tree
Showing 2 changed files with 21 additions and 9 deletions.
2 changes: 1 addition & 1 deletion lib/MultiColumnSkew/parseOCR.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@ def parseHOCR(fileName):
splitList = val["title"].split(';')
x1,y1,x2,y2 = splitList[0].split('bbox ')[1].split(' ')
w_conf = float(splitList[1].split('x_wconf ')[1])/100
word_id = int(val["id"].split("word_")[1])
word_id = int(val["id"].split("word_1_")[1])
#print val
lenStr=[unicode(s).encode("utf-8") for s in val.contents];
if len(lenStr)!=0:
Expand Down
28 changes: 20 additions & 8 deletions lib/denoise/deNoise_Post.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,8 @@
from math import pi, log
from scipy import fft, ifft
from scipy.optimize import curve_fit
import os;
import os
import traceback
#from memory_profiler import profile

#@profile
Expand Down Expand Up @@ -850,7 +851,7 @@ def nnSim(normDat,IW,LW,b1,b2):
splitList = val["title"].split(';')
x1,y1,x2,y2 = splitList[0].split('bbox ')[1].split(' ')
w_conf = float(splitList[1].split('x_wconf ')[1])/100
word_id = int(val["id"].split("word_")[1])
word_id = int(val["id"].split("word_1_")[1])
wordInfoNon_Scaled[word_id-1,:]= np.array([word_id, int(x1),int(y1),int(x2),int(y2),abs(int(y2) - int(y1)), abs(int(x2) - int(x1)),w_conf])
x1 = (int(x1)/pageWidth)
y1 = (1 - (int(y1)/pageHeight))
Expand Down Expand Up @@ -1256,7 +1257,18 @@ def nnSim(normDat,IW,LW,b1,b2):
confVal[indactualIndexToConsider[finalFilterTemp==1]] = confValTemp[finalFilterTemp==1]
confVal[indactualIndexToConsider[finalFilterTemp==-1]] = confValTemp[finalFilterTemp==-1]
if np.size(predictedLabelMachineLearning)>0:
predictedLabelMachineLearning = np.append(predictedLabelMachineLearning,indactualIndexToConsider[finalFilterTemp==1],1)
try:
predictedLabelMachineLearning = np.append(predictedLabelMachineLearning,indactualIndexToConsider[finalFilterTemp==1],1)
except:
print('Error on line 1261:')
print(traceback.format_exc())
print('Value of predictedLabelMachineLearning:')
print(predictedLabelMachineLearning)
print('Value of indactualIndexToConsider:')
print(indactualIndexToConsider)
print('Value of finalFilterTemp:')
print(finalFilterTemp)

else:
predictedLabelMachineLearning = indactualIndexToConsider[finalFilterTemp==1]

Expand All @@ -1275,11 +1287,11 @@ def nnSim(normDat,IW,LW,b1,b2):
confValTemp = confVal[word_id];
if MLFilter[word_id]==1:
confValTemp = 1.0 - confVal[word_id];
temp = soup.find("span",class_="ocrx_word",id="word_%d"%(wordInfo[word_id,0]))
temp = soup.find("span",class_="ocrx_word",id="word_1_%d"%(wordInfo[word_id,0]))
temp1 = temp['title'].split(';');
temp['title'] = "%s;%s; pred %d; noiseConf %.4f"%(temp1[0],temp1[1 ],MLFilter[word_id],confValTemp)
if MLFilter[word_id]==0:
temp1 = soup1.find("span",class_="ocrx_word",id="word_%d"%(wordInfo[word_id,0]))
temp1 = soup1.find("span",class_="ocrx_word",id="word_1_%d"%(wordInfo[word_id,0]))
temp1.extract()

# insert noisemeasure
Expand All @@ -1296,7 +1308,7 @@ def nnSim(normDat,IW,LW,b1,b2):
# toDel = np.ix_(MLFilter==0)[0]
# soup1 = bs4.BeautifulSoup(open(fileName1))
# for word_id in range(0,np.size(toDel)):
# temp = soup1.find("span",class_="ocrx_word",id="word_%d"%(wordInfo[toDel[word_id],0]))
# temp = soup1.find("span",class_="ocrx_word",id="word_1_%d"%(wordInfo[toDel[word_id],0]))
# temp.extract()
# insert noisemeasure
tempNoiseM = soup1["title"];
Expand All @@ -1309,7 +1321,7 @@ def nnSim(normDat,IW,LW,b1,b2):
else:
#make two hOCR files
for word_id in range(0,np.size(wordInfo[:,0])):
temp = soup.find("span",class_="ocrx_word",id="word_%d"%(wordInfo[word_id,0]))
temp = soup.find("span",class_="ocrx_word",id="word_1_%d"%(wordInfo[word_id,0]))
temp1 = temp['title'].split(';');
temp['title'] = "%s;%s; pred %d; noiseConf %.4f"%(temp1[0],temp1[1 ],-1,-1)

Expand Down Expand Up @@ -1340,7 +1352,7 @@ def nnSim(normDat,IW,LW,b1,b2):
else:
#make two hOCR files
for word_id in range(0,np.size(wordInfo[:,0])):
temp = soup.find("span",class_="ocrx_word",id="word_%d"%(wordInfo[word_id,0]))
temp = soup.find("span",class_="ocrx_word",id="word_1_%d"%(wordInfo[word_id,0]))
temp1 = temp['title'].split(';');
temp['title'] = "%s;%s; pred %d; noiseConf %.4f"%(temp1[0],temp1[1 ],-1,-1)

Expand Down

0 comments on commit aa3884f

Please sign in to comment.