In [None]:
# The objective of this notebook is to extract features from information source 2,
# i.e. the technical drawing pdf files located in the local directory
# devis_path='C:/..../devis', and to store those features as npz files
# in the directory pdf_path='C:/..../pdf data'.

In [None]:
# Import necessary libraries
import pandas as pd
pd.set_option('display.max_columns', 150)
pd.set_option('display.max_rows', 350)
import numpy as np
import sys
import glob
import os
import scipy.stats as stats
from shutil import copyfile
from pdf2image import convert_from_path
from pdf2image.exceptions import PDFInfoNotInstalledError,PDFPageCountError,PDFSyntaxError
from PIL import Image, ImageDraw
import cv2
import warnings
warnings.filterwarnings("ignore", 'This pattern has match groups')

In [None]:
# Path of the directory where the extracted data (images, logs, npz files) is written
pdf_path='C:/..../pdf data'

# Persist this variable with %store so that other notebooks can restore it
%store pdf_path

# Restore the variables stored by the notebooks where they were generated
# NOTE(review): cpst_path is restored but not referenced in the visible cells — confirm it is still needed
%store -r devis_path
%store -r cpst_path

# List the variables active in the notebook
%whos

In [None]:
# To run this notebook, the credentials must have been set up in PowerShell (PS) before opening the notebook
#!$env:GOOGLE_APPLICATION_CREDENTIALS='....\my-project-vision-servaccount.json'

In [None]:
# Function to check the API authentication and list the available buckets in Google Cloud Storage

def implicit():
    """Check the Google Cloud authentication by listing the storage buckets.

    The storage client is built without explicit credentials, so the client
    library looks them up in the environment. The bucket list is printed.
    """
    from google.cloud import storage

    # No credentials are passed: the library resolves them from the environment.
    client = storage.Client()

    # Listing the buckets is an authenticated API request.
    print(list(client.list_buckets()))

implicit()

In [None]:
# Function to detect text in images with Google vision

def detect_text(path):
    """Run Google Vision text detection on a local image file.

    Parameters
    ----------
    path : path of the image file, read in binary mode.

    Returns
    -------
    (text_descriptions, text_bounds) : two lists with one entry per text
        annotation of the response: the annotation description, and its
        bounding polygon serialized as '(x1,y1),(x2,y2),...'.

    Raises
    ------
    Exception : when the API response carries an error message.
    """
    from google.cloud import vision
    import io

    annotator = vision.ImageAnnotatorClient()

    with io.open(path, 'rb') as image_file:
        payload = image_file.read()

    response = annotator.text_detection(image=vision.Image(content=payload))
    annotations = response.text_annotations

    text_descriptions = [annotation.description for annotation in annotations]

    # One '(x,y)' pair per vertex, comma-joined for each annotation
    text_bounds = [
        ','.join('({},{})'.format(vertex.x, vertex.y)
                 for vertex in annotation.bounding_poly.vertices)
        for annotation in annotations
    ]

    if response.error.message:
        raise Exception(
            '{}\nFor more info on error messages, check: '
            'https://cloud.google.com/apis/design/errors'.format(
                response.error.message))

    return text_descriptions, text_bounds

In [None]:
# Context manager for redirecting stdout/stderr to files

class redirect_output(object):
    """Context manager redirecting sys.stdout / sys.stderr to files.

    Parameters
    ----------
    stdout : path of the file receiving sys.stdout; an empty value leaves
        sys.stdout untouched.
    stderr : path of the file receiving sys.stderr; an empty value leaves
        sys.stderr untouched. When equal to ``stdout`` both streams share
        the same file object.

    Files are opened in 'w' mode on entry. On exit the original streams are
    restored and every file opened by this manager is closed, so the logged
    output is flushed to disk and no file handle is leaked.
    """

    def __init__(self, stdout='', stderr=''):
        self.stdout = stdout
        self.stderr = stderr
        # File objects opened by __enter__ that __exit__ has to close
        self._opened_files = []

    def _open_log(self, path):
        """Open ``path`` for writing and remember the handle for later closing."""
        handle = open(path, 'w')
        self._opened_files.append(handle)
        return handle

    def _restore(self):
        """Put the original streams back and close the files opened on entry."""
        sys.stdout = self.sys_stdout
        sys.stderr = self.sys_stderr
        while self._opened_files:
            self._opened_files.pop().close()

    def __enter__(self):
        self.sys_stdout = sys.stdout
        self.sys_stderr = sys.stderr
        self._opened_files = []

        try:
            if self.stdout:
                sys.stdout = self._open_log(self.stdout)
            if self.stderr:
                if self.stderr == self.stdout:
                    sys.stderr = sys.stdout
                else:
                    sys.stderr = self._open_log(self.stderr)
        except Exception:
            # Do not leave the process half-redirected if a file cannot be opened
            self._restore()
            raise
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self._restore()
        # Falsy return value: exceptions raised inside the block still propagate
        return False

In [None]:
# Function to perform image preprocessing
# It is not yet fully confirmed whether this is needed when using Google Vision text_detection:
# in the tests performed so far, we did not notice a real improvement in detection.
# We keep it nevertheless

def image_preprocessing(filepath_in):
    """Binarize an image and save the result next to it with a '_pre' suffix.

    The image is loaded, converted to grayscale, blurred with a 5x5 Gaussian
    kernel and thresholded with Otsu's method. The result is written in the
    same directory as the input, as '<root>_pre<ext>'.

    Parameters
    ----------
    filepath_in : path of the image file to preprocess.

    Returns
    -------
    filepath_out : normalized path of the preprocessed image.

    Raises
    ------
    IOError : when the input image cannot be read or the output cannot be
        written.
    """
    # Split the input path into directory, root name and extension
    dirname=os.path.dirname(filepath_in)
    file_root_name,file_ext_name=os.path.splitext(os.path.basename(filepath_in))

    # Load image; cv2.imread returns None instead of raising when the file cannot be read
    image = cv2.imread(filepath_in)
    if image is None:
        raise IOError('Could not read image file: {}'.format(filepath_in))

    # Grayscale
    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)

    # Blur
    blur = cv2.GaussianBlur(gray, (5,5), 5)

    # Otsu's threshold; the inverted binary mask is inverted back by the 255 - ... subtraction
    thresh = 255 - cv2.threshold(blur, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)[1]

    # Write the thresholded image with the same extension as the input
    filepath_out = os.path.normpath(os.path.join(dirname,file_root_name+'_pre'+file_ext_name))
    if not cv2.imwrite(filepath_out, thresh):
        raise IOError('Could not write image file: {}'.format(filepath_out))

    return filepath_out

In [None]:
# Function to convert pdf to image

# Resolution used when rendering the pdf pages
dpi=600
#size=(None,1600)

def convert_pdf_to_image(filepath_in, path_out):
    """Render a pdf file to grayscale JPEG image(s).

    Each rendered page is saved twice under the name
    '<pdf root name>_page<i>from<n>.jpg': once in ``path_out`` and once in
    the directory of the source pdf.

    Parameters
    ----------
    filepath_in : path of the pdf file.
    path_out : directory receiving the images (also passed to pdf2image as
        its output folder).

    Returns
    -------
    outputs : list of (image path in path_out, image file name) tuples, one
        per rendered page.
    im0_stats : scipy.stats.describe result computed on the pixels of the
        first rendered page.
    """
    outputs=[]

    # Generate an image out of the pdf file
    # NOTE(review): single_file=True appears to restrict pdf2image to the first page, so the
    # loop below would run only once — confirm whether multi-page processing is intended.
    ims = convert_from_path(filepath_in, grayscale=True, output_folder=path_out,single_file=True,dpi=dpi,fmt='jpeg',
                            jpegopt={'quality': 100,'progressive': True,'optimize': True})

    # The images are named after the pdf (root name without directory and extension)
    file=os.path.basename(os.path.splitext(filepath_in)[0])
    print('file:',file)

    # Storing the image grayscale profile (first page)
    first_page=np.asarray(ims[0])
    print('Shape of the image from pdf first page: ',np.shape(first_page))
    im0_stats=stats.describe(first_page.reshape(-1), bias=False, nan_policy='omit')
    print('image statistics: ',im0_stats)

    # Directory of the source pdf, where a copy of each image is stored
    source_dir=os.path.dirname(filepath_in)

    # Save each rendered page as JPEG
    for i, im in enumerate(ims):

        image_name='{}_page{}from{}.jpg'.format(file,i+1,len(ims))

        # os.path.join instead of string concatenation: works whether or not path_out ends
        # with a separator, and does not hard-code the Windows '\\' separator
        filepath_out=os.path.join(path_out,image_name)
        filepath_copy=os.path.join(source_dir,image_name)

        # Saving image in the pdf folder
        im.save(filepath_out, format='JPEG')

        # Additional saving of image close to the original pdf file
        print(filepath_copy)
        im.save(filepath_copy, format='JPEG')

        # Register the path and the name of the new image
        outputs.append((filepath_out,image_name))

    return outputs, im0_stats

In [None]:
# Geometric tolerancing according to:
# https://en.wikipedia.org/wiki/Geometric_dimensioning_and_tolerancing
# complemented by:
# https://docs.julialang.org/en/v1/manual/unicode-input/
# https://en.wikipedia.org/wiki/Mathematical_operators_and_symbols_in_Unicode#Characters_in_other_blocks

from unicodedata import *
import itertools

def get_symbols(group):
    """Print the entries of a pattern group in a readable form.

    A single character encoded on more than 2 bytes is printed as a
    (character, Unicode name) tuple. Every other entry (multi-character
    patterns such as 'ISO.?2768' or 'H11', and short characters) is printed
    unchanged. Multi-character entries used to reach ord(), which raised
    TypeError and aborted the cell.
    """
    import unicodedata

    def describe(item):
        # Only a single character can be looked up in the Unicode database
        if len(item) == 1 and len(item.encode()) > 2:
            return (item, unicodedata.name(item, 'UNNAMED CHARACTER'))
        return item

    print([describe(item) for item in group],'\n')

groups = []

# Geometric tolerancing reference chart (per ASME Y14.5 M-1982)
# Each list holds the alternative characters/patterns accepted for one characteristic.
form_straightness = ['\u23E4']
form_flatness = ['\u23E5']
form_circularity = ['\u25CB', '\u25EF', '\u26AA', '\u2B55', '\u039F']
form_cylindricity = ['\u232D']
profile_profileofaline = ['\u2312']
profile_profileofasurface = ['\u2313']
orientation_perpendicularity = ['\u27C2', '\u22A5']
orientation_angularity = ['\u2220']
orientation_parallelism = ['\u2225', '//']
orientation_symmetry = ['\u232F']
location_position = ['\u2316', '\u2295', '\u2A01']
location_concentricity = ['\u25CE', '\u229A', '\u29BE']
runout_circularrunout = ['\u2197']
runout_totalrunout = ['\u2330']

# Symbols used in a "feature control frame" to specify a feature's description,
# tolerance, modifier and datum references
freestate = ['\u24BB']
least_mat_cond = ['\u24C1']
max_mat_cond = ['\u24C2']
projected_tol_zone = ['\u24C5']
regardless_feature_size = ['\u24C8']
tangent_plane = ['\u24C9']
unequal_bilateral = ['\u24CA']
# Not covered: statistical tolerance (ST), continuous feature (CF)

# General tolerance according to
# https://fr.wikipedia.org/wiki/Tol%C3%A9rances_g%C3%A9n%C3%A9rales
general_tolerance = ['ISO.?2768']
general_tolerance_class = ['[fmcv][HKL]']

# Holes shaft adjustments
holes = ['H' + grade for grade in ('6', '7', '8', '9', '11')]
def build_shafts():
    """Return every shaft fit designation: each deviation letter followed by each grade.

    The result is ordered by letter first, then by grade ('c5', 'c6', ..., 'z14').
    """
    letters = ('c','d','e','f','g','h','js','k','m','p','s','u','x','z')
    grades = ('5','6','7','8','9','11','14')
    return [letter + grade for letter in letters for grade in grades]
shafts = build_shafts()

# Signs
# Regular expression fragments are written as raw strings so that sequences such as \+ or \d
# are not treated as (invalid) Python string escapes; the resulting values are unchanged.
signs=['\U000000B1',r'\+','-','\U000000AF','\U00000304','\U00000305','\U00000332','\U00000336','\U00002013','\U00002070',
       '\U0000207A','\U0000207B','\U0000207C','\U00002080','\U0000208A','\U0000208B','\U00002212','\U00002213',
       '\U00002795','\U00002796','\U00002A71','\U00002A72']

# Datums and datum references
# A datum is matched as 1 to 3 capital letters surrounded by whitespace
datum=[r'^\s[A-Z]{1,3}\s$']
boxes=['\U00002B1C','\U000020DE','\U000025A1','\U000025FB']
#datum feature symbol
#datum feature triangle
#datum references

# Surface finish
surface=['^Ra$','^Rz$','^N$']
# Roughness grades N1..N12 (the list previously contained '^N6$' twice and no '^N7$')
# followed by the Ra values
surface_roughness=['^N1$', '^N2$', '^N3$', '^N4$', '^N5$', '^N6$', '^N7$', '^N8$', '^N9$', '^N10$', '^N11$', '^N12$',
                   '^Ra.?50$','^Ra.?25$','^Ra.?12.5$','^Ra.?6.3$','^Ra.?3.2$','^Ra.?1.6$','^Ra.?0.8$','^Ra.?0.4$','^Ra.?0.2$','^Ra.?0.1$']


# Geometrical features
diameter=['\U000000D8','\U000000F8','\U00000278','\U00002205','\U00002300']
degrees=['°','\U000000B0','\U000000BA','\U0000030A','\U000025E6','\U000026AC']
radius=[r'R\s?\d[.]?\d?']
unit_inch=[r'\d"']

#Filtering numbers
# float_pattern has three capture groups; the first one holds the mantissa of the number
float_pattern=r'[-+]?(\d+([.,]\d*)?|[.,]\d+)([eE][-+]?\d+)?'
date=[r'\d+[/-]\d+([/-]\d+)?']
invalid_float=[r'\d+\.\d+\.\d+',r'[a-zA-Z]+\d*.?\d?[a-zA-Z]+',r'^0\d']

# Every pattern group paired with its name; keeping both in one table keeps
# `groups` and `groups_names` aligned index by index.
_named_groups = [
    ('form_straightness', form_straightness),
    ('form_flatness', form_flatness),
    ('form_circularity', form_circularity),
    ('form_cylindricity', form_cylindricity),
    ('profile_profileofaline', profile_profileofaline),
    ('profile_profileofasurface', profile_profileofasurface),
    ('orientation_perpendicularity', orientation_perpendicularity),
    ('orientation_angularity', orientation_angularity),
    ('orientation_parallelism', orientation_parallelism),
    ('orientation_symmetry', orientation_symmetry),
    ('location_position', location_position),
    ('location_concentricity', location_concentricity),
    ('runout_circularrunout', runout_circularrunout),
    ('runout_totalrunout', runout_totalrunout),
    ('freestate', freestate),
    ('least_mat_cond', least_mat_cond),
    ('max_mat_cond', max_mat_cond),
    ('projected_tol_zone', projected_tol_zone),
    ('regardless_feature_size', regardless_feature_size),
    ('tangent_plane', tangent_plane),
    ('unequal_bilateral', unequal_bilateral),
    ('general_tolerance', general_tolerance),
    ('general_tolerance_class', general_tolerance_class),
    ('holes', holes),
    ('shafts', shafts),
    ('signs', signs),
    ('datum', datum),
    ('boxes', boxes),
    ('surface', surface),
    ('surface_roughness', surface_roughness),
    ('diameter', diameter),
    ('degrees', degrees),
    ('radius', radius),
    ('unit_inch', unit_inch),
    ('date', date),
    ('invalid_float', invalid_float),
]

# Ordered list of the pattern groups searched in the OCR output
groups = [patterns for _, patterns in _named_groups]

# Names of the groups, in the same order as `groups`
groups_names = [label for label, _ in _named_groups]


# Display the symbols
for group in groups:
    get_symbols(group)

In [None]:
# Identify the list of pdf files to be processed

# Recursively collect every pdf below devis_path.
# glob returns the matches in arbitrary order: sorting makes the batch numbering
# (and therefore the names of the generated log/npz files) reproducible between runs.
pdf_files = sorted(glob.glob(devis_path+'/**/*.pdf', recursive=True))
pdf_files[:15]

In [None]:
# Function to split the list of pdf files into batches

def get_batch(pdf_files,size):
    """Yield the items of pdf_files as consecutive lists of at most `size` elements.

    The original order is kept (no shuffling); the last batch may be shorter.
    """
    total = len(pdf_files)
    for start in range(0, total, size):
        stop = min(start + size, total)
        yield [pdf_files[position] for position in range(start, stop)]

In [None]:
# Cleanup temporary *.pgm files 

def clean_pgm():
    """Delete every entry of the pdf_path directory whose name ends with '.pgm'."""
    for entry in os.listdir(pdf_path):
        if not entry.endswith('.pgm'):
            continue
        os.remove(os.path.join(pdf_path, entry))

In [None]:
# Starting iteration on the list of pdf files to be processed

# Working by batches of files
# Number of pdf files processed (and saved) together
batch_size=1
# Total number of batches, rounded up.
# The np.int alias was removed from NumPy (AttributeError on recent versions);
# the builtin int is its exact equivalent.
batches=int(np.ceil(len(pdf_files)/batch_size))
print('batch size: ',batch_size)
print('batches: ',batches)
# Counter of the batches that raised an error and were skipped
skipped_batch=0

# Main processing loop. For every batch of pdf files:
#   - stdout is redirected to pdf_path/output_<batch>_from_<batches>.txt,
#   - every pdf is rendered to image(s), preprocessed and sent to Google Vision OCR,
#   - numbers and pattern flags are extracted from the detected texts,
#   - one row per pdf page is appended to df_features, which is saved as
#     pdf_path/pdf_data_<batch>_from_<batches>.npz.
# A batch that raises an error is skipped and counted in skipped_batch.
for batch,pdf_files_batch in enumerate(get_batch(pdf_files, batch_size)):
    
    print('\n\nWorking on batch {} from {} ...'.format(batch+1,batches))
    print('Files in batch: ',pdf_files_batch)
    
    with redirect_output(pdf_path+'/output_{}_from_{}.txt'.format(batch+1,batches)):

        # Initialize the dataframe containing variables extracted from the pdf files of the batch
        df_features=pd.DataFrame()

        try:

            for pdf_file in pdf_files_batch:

                # Initialize a dictionary of variables to be extracted
                print('\n\nInitializing dictionnary of variables ...')
                variables_dict={}

                # Define the input pdf location and path for images files to be generated
                print(pdf_file)
                filepath_in=os.path.abspath(pdf_file)
                print('filepath_in: ',filepath_in)
                path_out=pdf_path+'/'

                # Convert pdf to image
                print('Converting pdf to image ...')

                outputs,im0_stats=convert_pdf_to_image(filepath_in,path_out)


                print('Lenght of the output (i.e. number of pages in pdf) :', len(outputs))


                for o,output in enumerate(outputs):

                    print('Processing pdf page {} from {}'.format(o+1,len(outputs)))

                    filepath_out=output[0]

                    print(filepath_out)

                    # Add preprocessing to the image            
                    filepath_out=image_preprocessing(filepath_out)
                    print('Preprocessing the jpg image ...')

                    # Detect text in the image with google vision
                    print('Proceeding to OCR with google vision on filepath_out :',filepath_out)
                    texts, bounds = detect_text(filepath_out)

                    # Save image with bounding box
                    print('Adding bounding boxes on jpg image ...')
                    im=Image.open(filepath_out)
                    draw = ImageDraw.Draw(im)
                    for bound in bounds:
                        # NOTE(review): eval() parses the '(x,y),(x,y),...' string built by detect_text
                        # from the API vertices; ast.literal_eval would be a safer parser.
                        draw.polygon(list(eval(bound)), outline='black')
                    print('Saving jpg image with bounding boxes at loc1:',filepath_out)
                    im.save(filepath_out)

                    # Store copy of the file with labels in the original folder structure for convenience
                    print('Saving jpg image with bounding boxes at loc2:',os.path.join(os.path.dirname(filepath_in),os.path.basename(filepath_out)))
                    im.save(os.path.join(os.path.dirname(filepath_in),os.path.basename(filepath_out)))

                    # Store text data in a dataframe; the first annotation is dropped by the [1:] slice
                    print('Storing text data in dataframe ...')
                    s_texts=pd.DataFrame(texts, columns=['original'])[1:]

                    # Add a column to indicate presence of digits
                    print('Adding columns indicating presence of digit in each sequence ...')
                    s_texts['has_digit'] = np.where(s_texts['original'].str.contains(r'\d'),1,0)

                    # Add columns corresponding to the capture groups with float pattern
                    # (one column per match number: 0, 1, 2, ...)
                    print('Adding columns with floats detected in each sequence ...')
                    s_texts_digits=s_texts['original'].str.extractall(float_pattern).iloc[:,0].unstack(level=-1)
                    s_texts_digits_cols_indexes=list(np.arange(len(s_texts_digits.columns)))
                    s_texts=s_texts.join(s_texts_digits, how='left')

                    # Add columns corresponding to the capture of all special pattern previously defined
                    # Each column is named after the first pattern of its group
                    print('Adding columns with patterns detected in each sequence ...')
                    for i,group in enumerate(groups):
                        s_texts['{}'.format(group[0])] = np.where(s_texts['original'].str.contains('|'.join(group)),1,0)#'{}'.format()groups_names[i]

                    # Give a readable name to the columns named after a regular expression
                    s_texts.rename(columns = {'ISO.?2768':'iso2768',
                                              '[fmcv][HKL]':'tol_class',
                                              'c5':'shafts',
                                              'H6':'holes',
                                              r'^\s[A-Z]{1,3}\s$':'ref',
                                              '^Ra$':'Ra',
                                              '^N1$':'N',
                                              r'R\s?\d[.]?\d?':'radius',
                                              r'\d"':'unit_inch',
                                              r'\d+[/-]\d+([/-]\d+)?':'date',
                                              r'\d+\.\d+\.\d+':'invalid_float'}, inplace=True)

                    # Check specific filters
                    #s_texts[s_texts.loc[:,'invalid_float']==0]

                    # Create a filter for the significative numbers to be extracted
                    print('Filtering specific floats ...')
                    filter_number=(s_texts['has_digit']==1)\
                                    &(s_texts['date']==0)\
                                    &(s_texts['invalid_float']==0)

                    # Put the extracted numbers in a series for further analysis
                    s_texts_numbers = pd.to_numeric(s_texts[filter_number][s_texts_digits_cols_indexes].stack(),errors='coerce').dropna().to_frame()

                    # Create a filter to remove the points with a non-valid absolute value (taken at 600mm) and a zero value
                    filter_outliers_invalid=(s_texts_numbers<600)&(s_texts_numbers!=0)
                    s_texts_numbers=s_texts_numbers[filter_outliers_invalid].dropna().copy()
                    print(s_texts_numbers)

                    # Create a filter to remove the points above a certain distance to average on the positive side
                    # We select the mean of the distribution without the datapoints between 0 and 1 which correspond to the tolerancing information
                    filter_outliers_zscore=(s_texts_numbers-s_texts_numbers[s_texts_numbers>=1].mean())/s_texts_numbers[s_texts_numbers>=1].std()<4
                    s_texts_numbers_filtered_in=s_texts_numbers[filter_outliers_zscore].dropna()
                    s_texts_numbers_filtered_out=s_texts_numbers[~filter_outliers_zscore].dropna()
                    print(s_texts_numbers_filtered_in)
                    print(s_texts_numbers_filtered_out)

                    # Create a filter to highlight the lists of points with only integers; this means the drawing contains no float and is probably invalid
                    # Such as an assembly drawing with only integers to designate subcomponents
                    filter_outliers_allintegers=(s_texts_numbers_filtered_in-s_texts_numbers_filtered_in.apply(np.floor)).sum()

                    # Save the digits extracted as txt file for crosschecks
                    np.savetxt(filepath_out[:-4]+'.txt', s_texts_numbers_filtered_in, fmt='%.3f')

                    # Create a dataframe with the extracted floats
                    magnitudes=s_texts_numbers_filtered_in.sum().to_frame().T.drop([0], errors='ignore',axis=1)

                    # Split the extracted floats into 2 classes corresponding to the tolerance band and the main dimensions
                    tolerance_band_L=[i for i in s_texts_numbers_filtered_in[0] if i<1]
                    print(tolerance_band_L)
                    dimension_band_L=[i for i in s_texts_numbers_filtered_in[0] if i>=1]
                    print(dimension_band_L)
                    # The columns are first created with a placeholder and cast to object
                    # so that a whole list can be stored in a single cell
                    magnitudes['tolerance_band']=1
                    magnitudes['tolerance_band']=magnitudes['tolerance_band'].astype(object)
                    magnitudes.at[0, 'tolerance_band'] = tolerance_band_L
                    magnitudes['dimension_band']=1
                    magnitudes['dimension_band']=magnitudes['dimension_band'].astype(object)
                    magnitudes.at[0, 'dimension_band'] = dimension_band_L

                    # Store in the dataframe the flag corresponding to the warning for only integers detected
                    magnitudes['band_integers_only_warn']=np.where(filter_outliers_allintegers[0]==0,1,0)

                    # Store in the dataframe the flag corresponding to the outliers detected in the floats list, number of them and list
                    magnitudes['band_outliers_detected']=len(s_texts_numbers_filtered_out)
                    magnitudes['band_outliers']=1
                    magnitudes['band_outliers']=magnitudes['band_outliers'].astype(object)
                    magnitudes.at[0,'band_outliers']=[i for i in s_texts_numbers_filtered_out[0]]

                    # Display the full dataframe of the information concerning the extracted floats
                    print(magnitudes)

                    # Create the dataframe of features available in the drawing
                    print('Create the dataframe of features available in the drawing ...')
                    # Drop every extracted-float column before casting the flags to int. The previous
                    # hard-coded list [0..7] left columns behind when a text held more than 8 numbers,
                    # which made astype(int) fail and the whole batch be skipped.
                    s_texts_features=s_texts.drop(s_texts_digits_cols_indexes, errors='ignore',axis=1).set_index('original').astype(int)
                    features=s_texts_features.sum(axis=0).to_frame().T

                    # Assemble final dataframe
                    print('Concatenate final dataframe with all features ...')
                    df_feature=pd.concat([magnitudes,features], axis=1)

                    # Register the pages informations
                    df_feature['pdf_page']=o+1
                    df_feature['pdf_pages']=len(outputs)                    

                    # Add computed statistics on original image pixels to the main dataframe
                    df_feature['im0_stats']=1
                    df_feature['im0_stats']=df_feature['im0_stats'].astype(object)
                    df_feature.at[0, 'im0_stats'] = list(im0_stats)

                    # Register the information on file processed in the dataframe
                    path_list=os.path.normpath(pdf_file).split(os.path.sep)
                    print(path_list)
                    for i in range(len(path_list)):
                        df_feature['path_{}'.format(i)]=path_list[i]
                    df_feature['file_name']=os.path.basename(pdf_file)
                    df_feature['pdf_file']=pdf_file

                    df_features=pd.concat([df_features, df_feature], axis=0).reset_index(drop=True)

            # Save dataframe as .npz file
            print('Save as npz file ...')
            np.savez(pdf_path+'/pdf_data_{}_from_{}.npz'.format(batch+1, batches), features=df_features, names=df_features.columns)
            print('npz file for this batch generated')

        except Exception as err:
            # Exception (not a bare except) so that KeyboardInterrupt/SystemExit still stop the run;
            # the reason is written to the batch log instead of being silently discarded.
            print('Error in batch: current batch skipped.')
            print('Reason: {!r}'.format(err))
            skipped_batch+=1
        clean_pgm()
print('Done')
print('Number of batch skipped: ',skipped_batch)

In [None]:
# Display the features dataframe left by the loop above; it is re-initialized
# for every batch, so it only holds the rows of the last processed batch
df_features

#### Alternative text detect with local image and language hint
```python
def detect_document(path):
    """Print every word found by Google Vision document text detection.

    The local image at `path` is sent with the 'en-t-i0-handwrit' language
    hint. Raises Exception when the response carries an error message.
    """
    from google.cloud import vision
    import io

    annotator = vision.ImageAnnotatorClient()

    with io.open(path, 'rb') as image_file:
        payload = image_file.read()

    response = annotator.document_text_detection(
        image=vision.Image(content=payload),
        image_context={'language_hints': ['en-t-i0-handwrit']})

    # Flatten the pages > blocks > paragraphs > words hierarchy
    detected_words = (
        word
        for page in response.full_text_annotation.pages
        for block in page.blocks
        for paragraph in block.paragraphs
        for word in paragraph.words
    )

    # A word is the concatenation of the text of its symbols
    for word in detected_words:
        print(''.join(symbol.text for symbol in word.symbols))

    if response.error.message:
        raise Exception(
            '{}\nFor more info on error messages, check: '
            'https://cloud.google.com/apis/design/errors'.format(
                response.error.message))


path_pdf='C:\\....\\img.jpg'
detect_document(path_pdf)
```

#### Alternative text detect with local image without language hint
```python
def detect_document(path):
    """Print every word found by Google Vision document text detection.

    The local image at `path` is sent without any language hint.
    Raises Exception when the response carries an error message.
    """
    from google.cloud import vision
    import io

    annotator = vision.ImageAnnotatorClient()

    with io.open(path, 'rb') as image_file:
        payload = image_file.read()

    # No image_context: the language is left to the API
    response = annotator.document_text_detection(image=vision.Image(content=payload))

    # Flatten the pages > blocks > paragraphs > words hierarchy
    detected_words = (
        word
        for page in response.full_text_annotation.pages
        for block in page.blocks
        for paragraph in block.paragraphs
        for word in paragraph.words
    )

    # A word is the concatenation of the text of its symbols
    for word in detected_words:
        print(''.join(symbol.text for symbol in word.symbols))

    if response.error.message:
        raise Exception(
            '{}\nFor more info on error messages, check: '
            'https://cloud.google.com/apis/design/errors'.format(
                response.error.message))


path_pdf='C:\\....\\img.jpg'
detect_document(path_pdf)
```

#### Alternative text detect with pdf on gs
```python
def async_detect_document(gcs_source_uri, gcs_destination_uri):
    """OCR with PDF/TIFF as source files on GCS.

    Submits an asynchronous DOCUMENT_TEXT_DETECTION request for the file at
    `gcs_source_uri`, waits for completion, lists the objects found under
    `gcs_destination_uri` and prints the full text of the first page read
    from the first JSON output file.

    Parameters
    ----------
    gcs_source_uri : 'gs://bucket/path' URI of the source pdf.
    gcs_destination_uri : 'gs://bucket/prefix' URI receiving the JSON output;
        the bucket root ('gs://bucket/') is accepted.

    Raises
    ------
    ValueError : when `gcs_destination_uri` is not a gs:// URI.
    RuntimeError : when no JSON output file is found under the destination.
    """
    import json
    import re
    from google.cloud import vision
    from google.cloud import storage

    # Supported mime_types are: 'application/pdf' and 'image/tiff'
    mime_type = 'application/pdf'

    # How many pages should be grouped into each json output file.
    batch_size = 1

    client = vision.ImageAnnotatorClient()

    feature = vision.Feature(type_=vision.Feature.Type.DOCUMENT_TEXT_DETECTION)

    gcs_source = vision.GcsSource(uri=gcs_source_uri)
    input_config = vision.InputConfig(gcs_source=gcs_source, mime_type=mime_type)

    gcs_destination = vision.GcsDestination(uri=gcs_destination_uri)
    output_config = vision.OutputConfig(gcs_destination=gcs_destination, batch_size=batch_size)

    async_request = vision.AsyncAnnotateFileRequest(features=[feature], input_config=input_config,output_config=output_config)

    operation = client.async_batch_annotate_files(requests=[async_request])

    print('Waiting for the operation to finish.')
    operation.result(timeout=420)

    # Once the request has completed and the output has been
    # written to GCS, we can list all the output files.
    storage_client = storage.Client()

    # The prefix is optional so that a bucket-root destination such as
    # 'gs://bucket/' is parsed too (it yields an empty prefix).
    match = re.match(r'gs://([^/]+)/?(.*)', gcs_destination_uri)
    if match is None:
        raise ValueError('Not a gs:// destination URI: {}'.format(gcs_destination_uri))
    bucket_name = match.group(1)
    prefix = match.group(2)

    bucket = storage_client.get_bucket(bucket_name)

    # List objects with the given prefix.
    blob_list = list(bucket.list_blobs(prefix=prefix))
    print('Output files:')
    for blob in blob_list:
        print(blob.name)

    # Process the first JSON output file from GCS.
    # The listing may also contain other objects (e.g. the source pdf when the
    # destination is the bucket root), so the output is selected by extension
    # rather than by a fixed position in the listing.
    # Since batch_size=1, each output file holds the response for one page.
    json_blobs = [blob for blob in blob_list if blob.name.endswith('.json')]
    if not json_blobs:
        raise RuntimeError('No JSON output file found under {}'.format(gcs_destination_uri))
    output = json_blobs[0]

    json_string = output.download_as_string()
    response = json.loads(json_string)

    # The actual response for the first page of the input file.
    first_page_response = response['responses'][0]
    annotation = first_page_response['fullTextAnnotation']

    # Here we print the full text from the first page.
    # The response contains more information:
    # annotation/pages/blocks/paragraphs/words/symbols
    # including confidence scores and bounding boxes
    print('Full text:\n')
    print(annotation['text'])

    
gcs_source_uri='gs://exemple_pdf/doc.pdf'
gcs_destination_uri='gs://exemple_pdf/'
async_detect_document(gcs_source_uri, gcs_destination_uri)
```