Skip to content

Commit

Permalink
update k2pdfopt to version 1.6.3
Browse files Browse the repository at this point in the history
  • Loading branch information
chrox committed Dec 21, 2012
1 parent 2e18531 commit f31e2a8
Show file tree
Hide file tree
Showing 21 changed files with 1,375 additions and 223 deletions.
91 changes: 85 additions & 6 deletions k2pdfoptlib/bmpregion.c
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,22 @@ int bmpregion_row_black_count(BMPREGION *region,int r0)
}


/*
** Count the dark pixels in a single column of the region.
**
** Walks column c0 of region->bmp8 from row region->r1 through row
** region->r2 (inclusive) and returns how many bytes in that column are
** strictly less than region->bgcolor.  Returns 0 if r2 < r1.
**
** c0 is used directly as a byte offset into each row and is not range
** checked here; the caller must pass a valid column.
*/
int bmpregion_col_black_count(BMPREGION *region,int c0)

{
    int stride,rows_left,ndark;
    unsigned char *pix;

    /* Distance in bytes from one row to the same column in the next row */
    stride=bmp_bytewidth(region->bmp8);
    /* Start at column c0 of the region's top row */
    pix=bmp_rowptr_from_top(region->bmp8,region->r1)+c0;
    ndark=0;
    for (rows_left=region->r2-region->r1+1;rows_left>0;rows_left--)
        {
        if (*pix<region->bgcolor)
            ndark++;
        pix+=stride;
        }
    return(ndark);
}


#if (defined(WILLUSDEBUGX) || defined(WILLUSDEBUG))
void bmpregion_write(BMPREGION *region,char *filename)

Expand Down Expand Up @@ -115,21 +131,84 @@ void bmpregion_row_histogram(BMPREGION *region)
/*
** Return 0 if the number of dark pixels in the region exceeds the threshold
** derived from gt_in (with a zero threshold, any dark pixel). NZ otherwise.
*/
int bmpregion_is_clear(BMPREGION *region,int *row_black_count,double gt_in)
int bmpregion_is_clear(BMPREGION *region,int *row_black_count,int *col_black_count,
int *col_pix_count,int rpc,double gt_in)

{
int r,c,nc,pt;
int nr,nc,r,c,pt,mindim;

pt=(int)(gt_in*region->dpi*(region->c2-region->c1+1)+.5);
if (pt<0)
pt=0;
/*
** Fast way to count dark pixels, but requires big array
*/
if (col_pix_count!=NULL && rpc>0)
{
int i;
if (region->r1>0)
for (c=0,i=region->c1;i<=region->c2;i++)
{
c += col_pix_count[i*rpc+region->r2] - col_pix_count[i*rpc+(region->r1-1)];
if (c>pt)
return(0);
}
else
for (c=0,i=region->c1;i<=region->c2;i++)
{
c += col_pix_count[i*rpc+region->r2];
if (c>pt)
return(0);
}
return(pt<=0 ? 1 : 1+(int)10*c/pt);
}

/*
** row_black_count[] doesn't necessarily match up to this particular region's columns.
** So if row_black_count[] == 0, the row is clear; otherwise it has to be counted,
** because the columns are a subset.
*/
/* nr=region->r2-region->r1+1; */
nr=region->r2-region->r1+1;
nc=region->c2-region->c1+1;
pt=(int)(gt_in*region->dpi*nc+.5);
if (pt<0)
pt=0;
mindim = nr>nc ? nc : nr;
if (mindim > 5)
{
int i,bcc,brc;

/*
** Determine most efficient way to see if the shaft is clear
*/
for (bcc=0,i=region->c1;i<=region->c2;i++)
if (col_black_count[i]==0)
bcc++;
for (brc=0,i=region->r1;i<=region->r2;i++)
if (row_black_count[i]==0)
brc++;

/*
** Count dark pixels by columns
*/
if (bcc*(region->r2-region->r1+1) > 2*brc*(region->c2-region->c1+1))
{
int col;

for (c=0,col=region->c1;col<=region->c2;col++)
{
if (col<0 || col>=region->bmp8->width)
continue;
if (col_black_count[col]==0)
continue;
c+=bmpregion_col_black_count(region,col);
if (c>pt)
return(0);
}
return(pt<=0 ? 1 : 1+(int)10*c/pt);
}
}

/*
** Count dark pixels by rows
*/
for (c=0,r=region->r1;r<=region->r2;r++)
{
if (r<0 || r>=region->bmp8->height)
Expand Down
12 changes: 11 additions & 1 deletion k2pdfoptlib/k2ocr.c
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,9 @@ void k2ocr_init(K2PDFOPT_SETTINGS *k2settings)
{
#endif
aprintf(TTEXT_BOLD);
k2settings->ocrtess_status=ocrtess_init(NULL,NULL,3,stdout);
k2settings->ocrtess_status=ocrtess_init(NULL,
k2settings->dst_ocr_lang[0]=='\0'?NULL:k2settings->dst_ocr_lang,
3,stdout);
aprintf(TTEXT_NORMAL);
if (k2settings->ocrtess_status)
aprintf(TTEXT_WARN "Could not find Tesseract data" TTEXT_NORMAL " (env var = TESSDATA_PREFIX).\nUsing GOCR v0.49.\n\n");
Expand Down Expand Up @@ -237,7 +239,15 @@ fflush(stdout);
#endif
#if (WILLUSDEBUGX & 32)
if (wordbuf[0]!='\0')
{
char filename[256];
FILE *f;
sprintf(filename,"word%04d.txt",counter);
f=fopen(filename,"wb");
fprintf(f,"%s\n",wordbuf);
fclose(f);
printf("%s\n",wordbuf);
}
else
printf("(OCR failed)\n");
counter++;
Expand Down
10 changes: 10 additions & 0 deletions k2pdfoptlib/k2parsecmd.c
Original file line number Diff line number Diff line change
Expand Up @@ -418,6 +418,16 @@ int parse_cmd_args(K2PDFOPT_SETTINGS *k2settings,STRBUF *env,STRBUF *cmdline,
k2settings->mark_corners=(cl->cmdarg[3]=='-') ? 0 : 1;
continue;
}
if (!stricmp(cl->cmdarg,"-ocrlang") || !stricmp(cl->cmdarg,"-l"))
{
if (cmdlineinput_next(cl)==NULL)
break;
#ifdef HAVE_TESSERACT_LIB
strncpy(k2settings->dst_ocr_lang,cl->cmdarg,15);
k2settings->dst_ocr_lang[15]='\0';
#endif
continue;
}
if (!stricmp(cl->cmdarg,"-ocrvis"))
{
if (cmdlineinput_next(cl)==NULL)
Expand Down
11 changes: 8 additions & 3 deletions k2pdfoptlib/k2pdfopt.h
Original file line number Diff line number Diff line change
Expand Up @@ -29,9 +29,10 @@
** 16 = hyphens
** 32 = OCR
** 64 = crop boxes
** 128 = column divider
**
*/
// #define WILLUSDEBUGX 64
// #define WILLUSDEBUGX 32
// #define WILLUSDEBUG

#include <stdio.h>
Expand All @@ -40,7 +41,6 @@
#include <ctype.h>
#include <stdarg.h>
#include <math.h>
//#include <constant.h>
#include <willus.h>

/* Uncomment below if compiling for Kindle PDF Viewer */
Expand Down Expand Up @@ -148,6 +148,9 @@ typedef struct
/* OCR */
#ifdef HAVE_OCR_LIB
int dst_ocr;
#ifdef HAVE_TESSERACT_LIB
char dst_ocr_lang[16];
#endif
int dst_ocr_visibility_flags;
double ocr_max_height_inches;
OCRWORDS dst_ocrwords;
Expand Down Expand Up @@ -407,11 +410,13 @@ int get_ttyrows(void);

/* bmpregion.c */
int bmpregion_row_black_count(BMPREGION *region,int r0);
int bmpregion_col_black_count(BMPREGION *region,int c0);
#if (defined(WILLUSDEBUGX) || defined(WILLUSDEBUG))
void bmpregion_write(BMPREGION *region,char *filename);
#endif
void bmpregion_row_histogram(BMPREGION *region);
int bmpregion_is_clear(BMPREGION *region,int *row_is_clear,double gt_in);
int bmpregion_is_clear(BMPREGION *region,int *row_black_count,int *col_black_count,
int *pixel_count_array,int rpc,double gt_in);
void bmpregion_trim_to_crop_margins(BMPREGION *region,K2PDFOPT_SETTINGS *k2settings);
int bmpregion_column_height_and_gap_test(BMPREGION *column,BMPREGION *region,
K2PDFOPT_SETTINGS *k2settings,
Expand Down

0 comments on commit f31e2a8

Please sign in to comment.