In [None]:
"""
This code sample shows Prebuilt Layout operations with the Azure Form Recognizer client library. 
The async versions of the samples require Python 3.6 or later.

To learn more, please visit the documentation - Quickstart: Form Recognizer Python client library SDKs
https://learn.microsoft.com/azure/applied-ai-services/form-recognizer/quickstarts/get-started-v3-sdk-rest-api?view=doc-intel-3.1.0&pivots=programming-language-python
"""

from azure.core.credentials import AzureKeyCredential
from azure.ai.formrecognizer import DocumentAnalysisClient
import os
from azure.core.credentials import AzureKeyCredential
import dotenv


dotenv.load_dotenv()
# Read the Azure Form Recognizer endpoint and API key from environment
# variables (dotenv.load_dotenv() is called first so a .env file can supply them):
try:
    endpoint = os.environ["AZURE_COGNITIVE_ENDPOINT"]
    key = os.environ["AZURE_COGNITIVE_API_KEY"]
except KeyError:
    # Name the variables this cell actually reads so the hint is actionable.
    print("Missing environment variable 'AZURE_COGNITIVE_ENDPOINT' or 'AZURE_COGNITIVE_API_KEY'")
    print("Set them before running this sample.")
    exit()

"""
Remember to remove the key from your code when you're done, and never post it publicly. For production, use
secure methods to store and access your credentials. For more information, see 
https://docs.microsoft.com/en-us/azure/cognitive-services/cognitive-services-security?tabs=command-line%2Ccsharp#environment-variables-and-application-configuration
"""


# Document to analyze. The Azure sample layout PDF is kept here for reference:
# formUrl = "https://raw.githubusercontent.com/Azure-Samples/cognitive-services-REST-api-samples/master/curl/form-recognizer/sample-layout.pdf"
formUrl = "https://www-s3-live.kent.edu/s3fs-root/s3fs-public/file/Legal%20Glossary%20English%20Arabic%202020%20%282%29.pdf?VersionId=OMDkAUHJ0A2UeXeK3ykrZGITJNcFIxJg"

# Client authenticated with the endpoint and key read from the environment.
document_analysis_client = DocumentAnalysisClient(
    endpoint=endpoint,
    credential=AzureKeyCredential(key),
)

# Submit the URL to the "prebuilt-layout" model, then fetch the result from
# the returned poller.
poller = document_analysis_client.begin_analyze_document_from_url(
    "prebuilt-layout",
    formUrl,
)
result = poller.result()


In [None]:
# Convert the analysis result with its to_dict() method; the following cell
# reads `data` to build an EasyDict.
data = result.to_dict()

In [None]:
from easydict import EasyDict


def _field(record, sdk_name, rest_name):
    """Return ``record[sdk_name]`` if that key exists, else ``record[rest_name]``.

    The analysis payload can be keyed in snake_case (the dict produced by the
    SDK's ``to_dict()``) or in camelCase (REST-style JSON); accept either.

    Raises:
        KeyError: if neither key is present.
    """
    return record[sdk_name] if sdk_name in record else record[rest_name]


# REST-style JSON nests the payload under 'analyzeResult'; the dict returned by
# the SDK's to_dict() has no such wrapper, so fall back to the dict itself.
result = EasyDict(data.get('analyzeResult', data))

for style in result.styles:
    print(
        "Document contains {} content".format(
            "handwritten"
            if _field(style, 'is_handwritten', 'isHandwritten')
            else "no handwritten"
        )
    )

# for page in result.pages:
#     for line_idx, line in enumerate(page.lines):
#         print(
#          "...Line # {} has text content '{}'".format(
#         line_idx,
#         line.content.encode("utf-8")
#         )
#     )

#     for selection_mark in page.selection_marks:
#         print(
#          "...Selection mark is '{}' and has a confidence of {}".format(
#          selection_mark.state,
#          selection_mark.confidence
#          )
#     )

for table_idx, table in enumerate(result.tables):
    # Raw dump of the table record, followed by a readable summary.
    print(table)
    print(
        "Table # {} has {} rows and {} columns".format(
            table_idx,
            _field(table, 'row_count', 'rowCount'),
            _field(table, 'column_count', 'columnCount'),
        )
    )

    for cell in table.cells:
        print(
            "...table.cells[{}][{}] has content '{}'".format(
                _field(cell, 'row_index', 'rowIndex'),
                _field(cell, 'column_index', 'columnIndex'),
                cell.content,
            )
        )

print("----------------------------------------")



In [None]:
# Inspect the cells of `table`, i.e. whichever table the `for ... in result.tables`
# loop of the previous cell visited last (assumes that cell ran just before this one).
table.cells

In [None]:
import pandas as pd

# 'result' is the EasyDict built from the analysis output in an earlier cell.
all_tables = []  # one DataFrame per detected table, in the order they appear

for table in result.tables:
    # Keys are camelCase in REST-style JSON and snake_case in the dict produced
    # by the SDK's to_dict(); accept either spelling.
    row_count = table.get('rowCount', table.get('row_count'))
    column_count = table.get('columnCount', table.get('column_count'))

    # Start from an all-NaN grid so cells the service did not report stay missing.
    df = pd.DataFrame(index=range(row_count), columns=range(column_count))

    for cell in table.cells:
        # Place the content in the correct row and column in the DataFrame
        row = cell.get('rowIndex', cell.get('row_index'))
        column = cell.get('columnIndex', cell.get('column_index'))
        df.at[row, column] = cell.content

    print(df.shape)
    all_tables.append(df)  # Append the DataFrame of this table to the list

# Now 'all_tables' contains all the tables as DataFrames.
# 'df' still refers to the LAST table built by the loop (all_tables[0] is the first).
df

In [None]:
import numpy as np

# Stack the per-table DataFrames vertically into a single frame.
df_combined = pd.concat(all_tables, axis=0)

# Strip surrounding whitespace from every string cell; non-strings (e.g. NaN)
# pass through unchanged. Column-wise Series.map is used because
# DataFrame.applymap is deprecated since pandas 2.1.
df_combined = df_combined.apply(
    lambda column: column.map(
        lambda value: value.strip() if isinstance(value, str) else value
    )
)


# Function to handle text overflow in DataFrame
def correct_overflow(df):
    """Fold overflow text into the row above, modifying ``df`` in place.

    Rows are visited from the last one upwards; row 0 is never visited. For
    each visited row that is not entirely null, the first cell (left to right)
    that is neither NaN nor ``''`` is appended to the cell directly above it,
    separated by a single space, and is then cleared to ``''``. The row that
    received the text is skipped when the loop reaches it.

    If the cell above is itself blank (NaN or ``''``), the overflow text simply
    replaces it, so no literal ``"nan"`` or leading space ends up in the data.

    Args:
        df: DataFrame of cell text. It is accessed by position only (``iloc``),
            so index and column labels are irrelevant.

    Returns:
        None. ``df`` is mutated in place.
    """
    rows, cols = df.shape
    blacklisted_rows = set()  # rows that just received text; not processed themselves
    for i in range(rows - 1, 0, -1):  # Start from the last row and go upwards
        if i in blacklisted_rows:
            continue
        # True for any row with at least one non-null cell.
        # NOTE(review): this also matches fully populated rows, so a complete
        # row gets its first cell moved up too. If only partially empty rows
        # are meant to be treated as overflow, this condition needs tightening
        # -- confirm the intent before changing it.
        if df.iloc[i].isnull().sum() < cols:
            for j in range(cols):
                overflow = df.iloc[i, j]
                if pd.isna(overflow) or overflow == '':
                    continue
                # Found the first non-empty cell of this row.
                above = df.iloc[i - 1, j]
                if pd.isna(above) or above == '':
                    # Nothing to append to: avoid producing "nan text" / " text".
                    df.iloc[i - 1, j] = str(overflow)
                else:
                    df.iloc[i - 1, j] = str(above) + ' ' + str(overflow)
                df.iloc[i, j] = ''  # Clear the overflowed cell
                blacklisted_rows.add(i - 1)
                break


# Fold overflow text upwards; correct_overflow edits df_combined in place.
correct_overflow(df_combined)

# Treat empty strings as missing, then keep only rows without any missing cell.
df_combined = df_combined.replace('', np.nan).dropna(how='any', axis=0)

# Promote the first remaining row to the column header and drop it from the data.
df_combined.columns = df_combined.iloc[0]
df_combined = df_combined.iloc[1:]

# Inspection leftovers: evaluated but not displayed, since neither is the
# last expression of the cell.
df_combined.iloc[-1, -1]
df_combined.shape

# Write the cleaned glossary without the index column.
df_combined.to_csv('EN-AR Glossary of Legal Terms.csv', index=False)

In [None]:
# Spot check: first-column value of the row at position 4 after dropping rows that contain NaN.
df_combined.dropna().iloc[4, 0]

In [None]:
# Number of rows in `df`.
# NOTE(review): presumably the last per-table DataFrame left over from the
# table-building loop -- this depends on cell execution order; confirm.
df.shape[0]

In [None]:
# Value at row position 3, column position 2 of `df` (whichever frame `df` currently refers to).
df.iloc[3,2]

In [None]:
import pandas as pd

# Toy frame for trying out the overflow correction: rows 1 and 2 contain only
# the fragments '+ 2' and '+ 1', and the last row has an empty middle cell.
data = dict(
    Column1=['data3', '', '', 'data5'],
    Column2=['data2', '+ 2', '', ''],
    Column3=['data1', '', '+ 1', 'data4'],
)
df = pd.DataFrame(data)

# Show the untouched frame under a heading, followed by one blank line.
print("Original DataFrame:", df, "", sep="\n")

# Function to correct overflow issues in the DataFrame
def correct_overflow(df):
    """Draft overflow correction; redefined further down in this same cell.

    NOTE(review): both ``range`` calls below use a step of +1 with a stop of 0.
    For a frame with two or more rows the outer range is therefore empty and
    the body never runs. Because the same name is defined again later in this
    cell, the later definition is the one that gets called.
    """
    # Intended as a reverse walk starting at the second-to-last row, but the
    # step is +1 (see the note in the docstring).
    for i in range(df.shape[0] - 2, 0, 1): # row wise
        for col in range(df.shape[1] - 1, 0, 1):
            # As written, the condition requires BOTH this cell and the cell
            # below it to be non-NaN.

            if not pd.isna(df.iloc[i, col]) and not pd.isna(df.iloc[i + 1, col]):
                print("Overflow detected in row", i, "column", col)
                # The text from the row below is placed IN FRONT of the current
                # cell's text (below + current).
                df.iloc[i, col] = df.iloc[i + 1, col] + df.iloc[i, col]
                # Clearing the cell below is disabled:
                # df.iloc[i + 1, col] = None

# Function to handle text overflow in DataFrame
def correct_overflow(df):
    """Fold overflow text into the row above, modifying ``df`` in place.

    Rows are visited from the last one upwards; row 0 is never visited. For
    each visited row that is not entirely null, the first cell (left to right)
    that is neither NaN nor ``''`` is reported with ``print``, appended to the
    cell directly above it (separated by a single space) and then cleared to
    ``''``. The row that received the text is skipped when the loop reaches it.

    If the cell above is blank (NaN or ``''``), the overflow text replaces it
    instead of raising on ``NaN + str`` or leaving a leading space.

    Args:
        df: DataFrame of cell text. It is accessed by position only (``iloc``).

    Returns:
        None. ``df`` is mutated in place.
    """
    rows, cols = df.shape
    blacklisted_rows = set()  # rows that just received text; not processed themselves
    for i in range(rows - 1, 0, -1):  # Start from the last row and go upwards
        if i in blacklisted_rows:
            continue
        # True for any row with at least one non-null cell.
        # NOTE(review): this also matches fully populated rows -- confirm
        # whether only partially empty rows should count as overflow.
        if df.iloc[i].isnull().sum() < cols:
            for j in range(cols):
                overflow = df.iloc[i, j]
                if pd.isna(overflow) or overflow == '':
                    continue
                # Found the first non-empty cell of this row.
                print("detected in row", i, "column", j)
                above = df.iloc[i - 1, j]
                if pd.isna(above) or above == '':
                    df.iloc[i - 1, j] = str(overflow)
                else:
                    df.iloc[i - 1, j] = str(above) + ' ' + str(overflow)
                df.iloc[i, j] = ''  # Clear the overflowed cell
                blacklisted_rows.add(i - 1)
                break

# Run the overflow correction on the demo frame (it edits df in place).
correct_overflow(df)

# Show the corrected frame beneath a separator line.
print("----", df, sep="\n")


In [None]:
# Second toy frame: Column2 and Column3 each hold one cell whose text says it
# continues below / continues from above.
data = dict(
    Column1=['data1', '', '', 'data4'],
    Column2=['', 'data2 continues below', '', 'data4'],
    Column3=['data1', '', 'data3 continues from above', 'data4'],
)
df = pd.DataFrame(data)
df

In [None]:

# Call the overflow-correction function to fix the DataFrame in place.
# correct_overflow is the function this notebook defines for that purpose.
correct_overflow(df)
df

In [None]:
# Inspect the second entry (index 1) of the cell list of `table`, the loop
# variable left over from an earlier `for ... in result.tables` loop.
table.cells[1]