**Method 1: Making use of average line lengths (Thought process: Multi-column docs should have shorter average line lengths, because each line spans only one narrow column rather than the full page width, so the same page area holds more, shorter lines than a single-column layout)**

In [148]:
import PyPDF2
import pandas as pd
import os
# Directory that Method 1 scans for PDFs: the working directory at the time this cell runs
current_directory = os.getcwd()

In [149]:
def parse_pdf(pdf_file):
    """Extract the text of every page of a PDF and return it as one string.

    Args:
        pdf_file: Path of the PDF file to read.

    Returns:
        The text of all pages concatenated in page order (no separator is
        inserted between pages), or "" if the file could not be opened or
        parsed. In the failure case the error is printed, not raised.
    """
    try:
        with open(pdf_file, 'rb') as file:
            pdf_reader = PyPDF2.PdfReader(file)
            # Iterate the pages directly and join once instead of indexing
            # with range(len(...)) and growing a string with += per page.
            # `or ""` keeps the join safe if a page yields no text at all.
            return "".join(
                pdf_page.extract_text() or "" for pdf_page in pdf_reader.pages
            )
    except Exception as e:
        # Deliberately best-effort: report the problem and return "" so the
        # caller can skip this file instead of aborting the whole run.
        print(f"Error parsing {pdf_file}: {e}")
        return ""

In [150]:
def calculate_mean_line_length(text):
    """Return the mean length of the non-blank lines in *text*.

    The text is split on "\\n". Lines that are empty or whitespace-only are
    ignored; every other line contributes its full length, including any
    leading or trailing whitespace.

    Args:
        text: The text to measure.

    Returns:
        The mean line length as a float, or 0.0 when the text contains no
        non-blank lines (previously this raised ZeroDivisionError).
    """
    line_lengths = [len(line) for line in text.split("\n") if line.strip()]
    if not line_lengths:
        # Nothing to average: avoid dividing by zero.
        return 0.0
    return sum(line_lengths) / len(line_lengths)

In [151]:
def parse_pdfs():
    """Classify each PDF in ``current_directory`` as single- or multi-column.

    For every file whose name ends in ".pdf", the text is extracted with
    ``parse_pdf`` and its mean line length computed with
    ``calculate_mean_line_length``. A document whose mean line length is
    below the average of all documents' means is labelled "Multi-column";
    all others are labelled "Single-column".

    Returns:
        A DataFrame with the columns "Document", "Mean Line Length" and
        "Text format", one row per PDF that produced text. The frame is
        empty (same columns) when no PDF produced any text.
    """
    columns = ["Document", "Mean Line Length", "Text format"]
    mean_line_lengths = []
    results = []
    # os.listdir returns entries in arbitrary order; sort for reproducible rows.
    for filename in sorted(os.listdir(current_directory)):
        # Only PDF files are of interest
        if not filename.endswith(".pdf"):
            continue
        # Resolve against the scanned directory so the lookup does not depend
        # on the process still being in that directory.
        text = parse_pdf(os.path.join(current_directory, filename))
        # Skip files that failed to parse ("") or contain only whitespace;
        # they have no lines to measure.
        if not text.strip():
            continue
        mean_line_length = calculate_mean_line_length(text)
        mean_line_lengths.append(mean_line_length)
        results.append({"Document": os.path.basename(filename), "Mean Line Length": mean_line_length})

    # Without any usable document there is no average to compare against.
    if not results:
        return pd.DataFrame(columns=columns)

    # The classification threshold is the average of all mean line lengths
    avg_mean_line_length = sum(mean_line_lengths) / len(mean_line_lengths)

    # Below-average line length => multi-column, otherwise single-column
    for result in results:
        if result["Mean Line Length"] < avg_mean_line_length:
            result["Text format"] = "Multi-column"
        else:
            result["Text format"] = "Single-column"

    return pd.DataFrame(results, columns=columns)

In [152]:
# Run Method 1 over the PDFs in the working directory and show the per-document classification
results_df = parse_pdfs()

print(results_df)


          Document  Mean Line Length    Text format
0      Aspirin.pdf         58.095477  Single-column
1      Bisodac.pdf         47.509240   Multi-column
2  Bisovell PI.pdf         56.156182   Multi-column
3     CoPlavix.pdf         58.472274  Single-column
4     Entresto.pdf         68.657975  Single-column
5   Fluoxetine.pdf         53.832168   Multi-column
6    Lentronat.pdf         62.443966  Single-column
7      Lexapro.pdf         70.043706  Single-column
8   Methycobal.pdf         56.686131   Multi-column
9     Stesolid.pdf         35.921053   Multi-column


**Method 2: Using pdfplumber's table extraction (searched Google for PDF manipulation libraries and tried out pdfplumber's extraction functions)**

In [153]:
import os
import pdfplumber
import pandas as pd
# Directory that Method 2 scans for PDFs: the working directory at the time this cell runs
current_directory = os.getcwd()

In [154]:
# Initialise an empty list that collects one {"Document Name", "Text Format"} dict per PDF
results2 = []

In [155]:
# Method 2: classify each PDF in the working directory by probing its FIRST page only.
# A page counts as multi-column when pdfplumber's extract_table, using text-aligned
# vertical separators, returns a non-empty result.
for filename in os.listdir(current_directory):
    # Ignore everything that is not a PDF
    if not filename.endswith(".pdf"):
        continue
    with pdfplumber.open(filename) as pdf:
        p0 = pdf.pages[0]
        isMultiColumn = bool(
            p0.extract_table({"vertical_strategy": "text", "text_tolerance": 12})
        )
        # Progress output: every file name, plus an extra line for multi-column hits
        print(f"{filename}")
        if isMultiColumn:
            print(f"{filename} is Multi-column")
            text_format = "Multi-column"
        else:
            text_format = "Single-column"
        results2.append({"Document Name": filename, "Text Format": text_format})

Aspirin.pdf
Bisodac.pdf
Bisodac.pdf is Multi-column
Bisovell PI.pdf
CoPlavix.pdf
Entresto.pdf
Fluoxetine.pdf
Fluoxetine.pdf is Multi-column
Lentronat.pdf
Lentronat.pdf is Multi-column
Lexapro.pdf
Lexapro.pdf is Multi-column
Methycobal.pdf
Methycobal.pdf is Multi-column
Stesolid.pdf
Stesolid.pdf is Multi-column


In [156]:
# Convert the list of per-document result dicts ("Document Name", "Text Format") to a DataFrame and display it
results2_df = pd.DataFrame(results2)
print(results2_df)

     Document Name    Text Format
0      Aspirin.pdf  Single-column
1      Bisodac.pdf   Multi-column
2  Bisovell PI.pdf  Single-column
3     CoPlavix.pdf  Single-column
4     Entresto.pdf  Single-column
5   Fluoxetine.pdf   Multi-column
6    Lentronat.pdf   Multi-column
7      Lexapro.pdf   Multi-column
8   Methycobal.pdf   Multi-column
9     Stesolid.pdf   Multi-column


**Compare manual annotation to tested methods**

In [158]:
# Hand-annotated ground truth: one {"Document Name", "Text Format"} dict per PDF.
# NOTE: these labels are lowercase ("single-column" / "multi-column") while both
# methods emit capitalised labels ("Single-column" / "Multi-column"); normalise the
# case before comparing the columns programmatically.
manual_labels = [
    {"Document Name": doc_name, "Text Format": label}
    for doc_name, label in (
        ("Aspirin.pdf", "single-column"),
        ("Bisodac.pdf", "multi-column"),
        ("Bisovell PI.pdf", "single-column"),
        ("CoPlavix.pdf", "single-column"),
        ("Entresto.pdf", "single-column"),
        ("Fluoxetine.pdf", "multi-column"),
        ("Lentronat.pdf", "multi-column"),
        ("Lexapro.pdf", "single-column"),
        ("Methycobal.pdf", "multi-column"),
        ("Stesolid.pdf", "multi-column"),
    )
]
manual_label_df = pd.DataFrame(manual_labels)

# Show Method 1, Method 2 and the manual labels one after another for visual comparison
print(results_df)
print("\n")
print(results2_df)
print("\n")
print(manual_label_df)

          Document  Mean Line Length    Text format
0      Aspirin.pdf         58.095477  Single-column
1      Bisodac.pdf         47.509240   Multi-column
2  Bisovell PI.pdf         56.156182   Multi-column
3     CoPlavix.pdf         58.472274  Single-column
4     Entresto.pdf         68.657975  Single-column
5   Fluoxetine.pdf         53.832168   Multi-column
6    Lentronat.pdf         62.443966  Single-column
7      Lexapro.pdf         70.043706  Single-column
8   Methycobal.pdf         56.686131   Multi-column
9     Stesolid.pdf         35.921053   Multi-column


     Document Name    Text Format
0      Aspirin.pdf  Single-column
1      Bisodac.pdf   Multi-column
2  Bisovell PI.pdf  Single-column
3     CoPlavix.pdf  Single-column
4     Entresto.pdf  Single-column
5   Fluoxetine.pdf   Multi-column
6    Lentronat.pdf   Multi-column
7      Lexapro.pdf   Multi-column
8   Methycobal.pdf   Multi-column
9     Stesolid.pdf   Multi-column


     Document Name    Text Format
0      Aspirin.p

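**Approximately 80% accuracy: compared with the manual labels, Method 1 classifies 8 of 10 documents correctly (80%; it misses Bisovell PI.pdf and Lentronat.pdf), while Method 2 classifies 9 of 10 correctly (90%; it misses Lexapro.pdf)**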