diff --git a/unlvtests/README.md b/unlvtests/README.md
index 98ef8c2588..4522ab5bcb 100644
--- a/unlvtests/README.md
+++ b/unlvtests/README.md
@@ -34,11 +34,15 @@ tar xzvf ~/isri-downloads/doe3.3B.tar.gz
 tar xzvf ~/isri-downloads/mag.3B.tar.gz
 tar xzvf ~/isri-downloads/news.3B.tar.gz
 tar xzvf ~/isri-downloads/spn.3B.tar.gz
+mkdir -p stopwords
+cd stopwords
+wget -O spa.stopwords.txt https://raw.githubusercontent.com/stopwords-iso/stopwords-es/master/stopwords-es.txt
 ```
+Edit ~/ISRI-OCRtk/stopwords/spa.stopwords.txt:
+wordacc expects a space-delimited stopwords file, not a line-delimited one.
 
 Edit *~/ISRI-OCRtk/spn.3B/pages* delete the line containing the following imagename as it crashes tesseract.
-
 7733_005.3B.tif
 
 ### Step 3: Download the modified ISRI toolkit, make and install the tools :
 
@@ -52,10 +56,10 @@ sudo make install
 
 ### Step 4: cd back to your main tesseract-ocr dir and Build tesseract.
 
-### Step 5: run unlvtests/runalltests.sh with the root ISRI data dir, testname, tessdata-dir and language:
+### Step 5: run unlvtests/runalltests.sh with the root ISRI data dir, testname and tessdata-dir:
 ```
-unlvtests/runalltests.sh ~/ISRI-OCRtk 4_fast_eng ../tessdata_fast eng
+unlvtests/runalltests.sh ~/ISRI-OCRtk 4_fast_eng ../tessdata_fast
 ```
 and go to the gym, have lunch etc. It takes a while to run.
 
@@ -66,5 +70,23 @@ report and comparison with the 1995 results.
 
 ### Step 7: run the test for Spanish.
 ```
-unlvtests/runalltests.sh ~/ISRI-OCRtk 4_fast_spa ../tessdata_fast spa
+unlvtests/runalltests_spa.sh ~/ISRI-OCRtk 4_fast_spa ../tessdata_fast
 ```
+
+#### Notes from Nick White regarding wordacc
+
+If you just want to remove all lines which have 100% recognition,
+you can add an awk command like this:
+
+ocrevalutf8 wordacc ground.txt ocr.txt | awk '$3 != 100 {print $0}' \
+  > results.txt
+
+or if you've already got a results file you want to change, you can do this:
+
+awk '$3 != 100 {print $0}' results.txt > newresults.txt
+
+If you only want the last sections where things are broken down by
+word, you can add a sed command, like this:
+
+ocrevalutf8 wordacc ground.txt ocr.txt | sed '/^ Count Missed %Right $/,$ !d' \
+  | awk '$3 != 100 {print $0}' > results.txt
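
The README addition above asks the reader to hand-edit `~/ISRI-OCRtk/stopwords/spa.stopwords.txt` because `wordacc` expects a space-delimited stopwords file rather than the line-delimited list that stopwords-iso ships. A minimal sketch of that conversion (a suggestion, not part of the patch itself; it assumes the wget download shown in the README step above) is:

```
# Collapse the line-delimited stopword list into the space-delimited form
# that wordacc expects, overwriting the downloaded file in place.
cd ~/ISRI-OCRtk/stopwords
tr '\n' ' ' < spa.stopwords.txt > spa.stopwords.tmp && mv spa.stopwords.tmp spa.stopwords.txt
```
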
diff --git a/unlvtests/counttestset.sh b/unlvtests/counttestset.sh
index 560c73f7cd..be380b371b 100755
--- a/unlvtests/counttestset.sh
+++ b/unlvtests/counttestset.sh
@@ -15,9 +15,9 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-if [ $# -ne 1 ]
+if [ $# -lt 1 ] || [ $# -gt 2 ]
 then
-  echo "Usage:$0 pagesfile"
+  echo "Usage:$0 pagesfile [langcode]"
   exit 1
 fi
 if [ ! -d src/api ]
@@ -27,6 +27,7 @@ then
   exit 1
 fi
 pages=$1
+langcode=${2:-eng}
 
 imdir=${pages%/pages}
 setname=${imdir##*/}
@@ -45,15 +46,22 @@ do
   fi
   #echo "$srcdir/$page.tif"
   # Count character errors.
-  ocrevalutf8 accuracy "$srcdir/$page.txt" "$resdir/$page.unlv" > "$resdir/$page.acc"
+  iconv -f ISO8859-1 -t UTF-8 "$resdir/$page.unlv" >"$resdir/$page.text"
+  iconv -f ISO8859-1 -t UTF-8 "$srcdir/$page.txt" >"$srcdir/$page.text"
+  ocrevalutf8 accuracy "$srcdir/$page.text" "$resdir/$page.text" > "$resdir/$page.acc"
   accfiles="$accfiles $resdir/$page.acc"
   # Count word errors.
-  ocrevalutf8 wordacc "$srcdir/$page.txt" "$resdir/$page.unlv" > "$resdir/$page.wa"
+  # langcode should be either eng or spa
+  if [ "$langcode" = "eng" ]
+  then
+    ocrevalutf8 wordacc "$srcdir/$page.text" "$resdir/$page.text" > "$resdir/$page.wa"
+  else
+    cp ~/ISRI-OCRtk/stopwords/spa.stopwords.txt "$resdir/spa.stopwords"
+    ocrevalutf8 wordacc -S"$resdir/spa.stopwords" "$srcdir/$page.text" "$resdir/$page.text" > "$resdir/$page.wa"
+  fi
   wafiles="$wafiles $resdir/$page.wa"
 
 done <"$pages"
-#echo "$accfiles"
-#echo "$wafiles"
-
 accsum $accfiles >"unlvtests/results/$setname.characc"
 wordaccsum $wafiles >"unlvtests/results/$setname.wordacc"
+
diff --git a/unlvtests/reports/1995.spn.3B.sum b/unlvtests/reports/1995.spn.3B.sum
deleted file mode 100644
index 35060967f8..0000000000
--- a/unlvtests/reports/1995.spn.3B.sum
+++ /dev/null
@@ -1 +0,0 @@
-1995 spn.3B 100 95.00% 0.00% 100 95.00% 0.00% 100 95.00% 0.00% WAS NOT TESTED
diff --git a/unlvtests/runalltests.sh b/unlvtests/runalltests.sh
index 18ef3929ff..5cdf5e85fa 100755
--- a/unlvtests/runalltests.sh
+++ b/unlvtests/runalltests.sh
@@ -1,6 +1,6 @@
 #!/bin/bash
 # File:        runalltests.sh
-# Description: Script to run a set of UNLV test sets.
+# Description: Script to run a set of UNLV test sets for English.
 # Author:      Ray Smith
 # Created:     Thu Jun 14 08:21:01 PDT 2007
 #
@@ -15,9 +15,9 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-if [ $# -ne 4 ]
+if [ $# -ne 3 ]
 then
-  echo "Usage:$0 unlv-data-dir version-id tessdata-dir lang "
+  echo "Usage:$0 unlv-data-dir version-id tessdata-dir"
   exit 1
 fi
 if [ ! -d src/api ]
@@ -31,7 +31,6 @@ then
   exit 1
 fi
 tessdata=$3
-lang=$4
 
 #deltapc new old calculates the %change from old to new
 deltapc() {
@@ -62,19 +61,8 @@ then
 fi
 rdir=unlvtests/reports
 
-if [ "$lang" = "eng" ]
-then
-  testsets="bus.3B doe3.3B mag.3B news.3B"
-  #testsets="bus.3B"
-else
-  if [ "$lang" = "spa" ]
-  then
-    testsets="spn.3B"
-  else
-    echo "Language has to be eng or spa"
-    exit 1
-  fi
-fi
+testsets="bus.3B doe3.3B mag.3B news.3B"
+#testsets="bus.3B"
 
 totalerrs=0
 totalwerrs=0
@@ -87,7 +75,7 @@ do
   if [ -r "$imdir/$set/pages" ]
   then
     # Run tesseract on all the pages.
-    $bindir/runtestset.sh "$imdir/$set/pages" "$tessdata" "$lang"
+    $bindir/runtestset.sh "$imdir/$set/pages" "$tessdata" "eng"
     # Count the errors on all the pages.
     $bindir/counttestset.sh "$imdir/$set/pages"
     # Get the old character word and nonstop word errors.
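
With the optional langcode argument, counttestset.sh can also be exercised by hand from the tesseract-ocr root once runtestset.sh has produced the .unlv output for a set. A sketch of the two invocations, assuming the ~/ISRI-OCRtk layout from the README, might look like:

```
# English set: wordacc runs without a stopword list (langcode defaults to eng).
unlvtests/counttestset.sh ~/ISRI-OCRtk/bus.3B/pages eng
# Spanish set: wordacc is passed the space-delimited stopword list via -S.
unlvtests/counttestset.sh ~/ISRI-OCRtk/spn.3B/pages spa
```
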
diff --git a/unlvtests/runalltests_spa.sh b/unlvtests/runalltests_spa.sh
new file mode 100755
index 0000000000..a6e218bbc5
--- /dev/null
+++ b/unlvtests/runalltests_spa.sh
@@ -0,0 +1,108 @@
+#!/bin/bash
+##############################################################################
+# File:         runalltests_spa.sh
+# Description:  Script to run a set of UNLV test sets for Spanish,
+#               based on runalltests.sh by Ray Smith.
+# Author:       Shree Devi Kumar
+# Created:      June 09, 2018
+#
+# (C) Copyright 2007, Google Inc.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# http://www.apache.org/licenses/LICENSE-2.0
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+##############################################################################
+if [ $# -ne 3 ]
+then
+  echo "Usage:$0 unlv-data-dir version-id tessdata-dir"
+  exit 1
+fi
+if [ ! -d src/api ]
+then
+  echo "Run $0 from the tesseract-ocr root directory!"
+  exit 1
+fi
+if [ ! -r src/api/tesseract ] && [ ! -r tesseract.exe ]
+then
+  echo "Please build tesseract before running $0"
+  exit 1
+fi
+tessdata=$3
+
+# timesum computes the total CPU time from a times file
+timesum() {
+awk ' BEGIN {
+total = 0.0;
+}
+{
+  total += $2;
+}
+END {
+  printf("%.2f\n", total);
+}' "$1"
+}
+
+imdir="$1"
+vid="$2"
+bindir=${0%/*}
+if [ "$bindir" = "$0" ]
+then
+  bindir="./"
+fi
+rdir=unlvtests/reports
+
+testsets="spn.3B"
+
+totalerrs=0
+totalwerrs=0
+totalnswerrs=0
+for set in $testsets
+do
+  if [ -r "$imdir/$set/pages" ]
+  then
+    # Run tesseract on all the pages.
+    $bindir/runtestset.sh "$imdir/$set/pages" "$tessdata" "spa"
+    # Count the errors on all the pages.
+    $bindir/counttestset.sh "$imdir/$set/pages" "spa"
+    # Get the new character, word and nonstop word errors and accuracy.
+    cherrs=$(head -4 "unlvtests/results/$set.characc" |tail -1 |cut -c1-9 |
+      tr -d '[:blank:]')
+    chacc=$(head -5 "unlvtests/results/$set.characc" |tail -1 |cut -c1-9 |
+      tr -d '[:blank:]')
+    wderrs=$(head -4 "unlvtests/results/$set.wordacc" |tail -1 |cut -c1-9 |
+      tr -d '[:blank:]')
+    wdacc=$(head -5 "unlvtests/results/$set.wordacc" |tail -1 |cut -c1-9 |
+      tr -d '[:blank:]')
+    nswderrs=$(grep Total "unlvtests/results/$set.wordacc" |head -2 |tail -1 |
+      cut -c10-17 |tr -d '[:blank:]')
+    nswdacc=$(grep Total "unlvtests/results/$set.wordacc" |head -2 |tail -1 |
+      cut -c19-26 |tr -d '[:blank:]')
+
+    sumfile=$rdir/$vid.$set.sum
+    if [ -r "unlvtests/results/$set.times" ]
+    then
+      total_time=$(timesum "unlvtests/results/$set.times")
+      if [ -r "unlvtests/results/prev/$set.times" ]
+      then
+        paste "unlvtests/results/prev/$set.times" "unlvtests/results/$set.times" |
+          awk '{ printf("%s %.2f\n", $1, $4-$2); }' |sort -k2n >"unlvtests/results/$set.timedelta"
+      fi
+    else
+      total_time='0.0'
+    fi
+    echo "RELEASE TestSet CharErrors Accuracy WordErrors Accuracy\
+ NonStopWordErrors Accuracy TimeTaken" >"$sumfile"
+    echo "$vid $set $cherrs $chacc $wderrs $wdacc\
+ $nswderrs $nswdacc ${total_time}s" >>"$sumfile"
+  fi
+done
+
+cat "$rdir/$vid".*.sum >"$rdir/$vid".summary
+
+mv "$rdir/$vid".*.sum unlvtests/results/
+cat "$rdir/$vid".summary
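
After runalltests_spa.sh finishes, the per-set accuracy files written by counttestset.sh and the summary written by the script itself can be checked directly. A quick verification along these lines (paths follow the README's Step 7 and the script above) should confirm that the Spanish run produced results:

```
# Run the Spanish test from the tesseract-ocr root, as in Step 7 of the README.
unlvtests/runalltests_spa.sh ~/ISRI-OCRtk 4_fast_spa ../tessdata_fast
# Character and word accuracy for the spn.3B set, written by counttestset.sh:
cat unlvtests/results/spn.3B.characc
cat unlvtests/results/spn.3B.wordacc
# Combined summary (errors, accuracy, run time) left in unlvtests/reports:
cat unlvtests/reports/4_fast_spa.summary
```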