 This script takes the raw data (from pdfs), extract the text, and convert it into a dataframe that we will use for further work. The dataframe is called:
 
 - Intermediate Data.csv

In [1]:
#!pip install pdf2image nltk spacy easyocr




In [2]:
#!python -m spacy download en_core_web_sm


Collecting en-core-web-sm==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m28.3 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


In [3]:
import os
import pandas as pd
from pdf2image import convert_from_path
import easyocr
import numpy as np

# Function to extrac text from PDF
def extract_text_from_pdf(pdf_path):
    text = ''
    images = convert_from_path(pdf_path)
    for page_number, img in enumerate(images, 1):
        img_array = np.array(img)
        page_text = easyocr.Reader(lang_list=['en']).readtext(img_array)
        page_text = ' '.join([result[1] for result in page_text])
        text += f'\n\nPage {page_number}:\n\n{page_text}'
    return text

# Functions to process PDFs and return a Dataframe
def process_pdfs_to_dataframe(folder_path, languages):
    data = {'Document': [], 'Text': []}

    for filename in os.listdir(folder_path):
        if filename.endswith('.pdf'):
            pdf_path = os.path.join(folder_path, filename)

            try:
                pdf_text = extract_text_from_pdf(pdf_path)
                data['Document'].append(filename)
                data['Text'].append(pdf_text)
                print(f'Texto extraído de {filename}.')
            except Exception as e:
                print(f'Erro ao processar {filename}: {str(e)}')

    df = pd.DataFrame(data)
    return df



In [5]:

folder_path = '/Users/eryclisrodrigues/Documents/Eryclis - docs/Research/Projects/AI Governance - Topic Modeling/Data/Raw Data Sampling/National Strategy'
languages = ['en']
df = process_pdfs_to_dataframe(folder_path, languages)


Texto extraído de [Switzerland] Digital Switzerland Strategy 2023.pdf.
Texto extraído de [Pakistan] National AI Policy Consultation Draft V1.pdf.
Texto extraído de [Saudi Arabia 2020] - National Strategy for Data and AI.pdf.
Texto extraído de [Malaysia] AI Road Map.pdf.
Texto extraído de [Hong Kong] Ethical_AI_Framework.pdf.
Texto extraído de Rawanda_Artificial_Intelligence_Policy.pdf.
Texto extraído de AI for Africa Artificial Intelligence for Africa's Socio Economic Development.pdf.
Texto extraído de [Spain] National-Strategy-on-AI.pdf.
Texto extraído de Tunisia_Startup Act.pdf.
Texto extraído de [Korea] National Strategy for Artificial Intelligence_200323.pdf.
Texto extraído de [Cambodia] AI Landscape in Cambodia - Current Status and Future Trends.pdf.
Texto extraído de South Africa_Fourth Industrial Revolution.pdf.
Texto extraído de [Singapore] AI Governance Framework.pdf.
Texto extraído de Australia_AI_Action_Plan_2021.pdf.
Texto extraído de [Germany] National AI Strategy.pdf.
Tex

In [6]:
# Visualize the DataFrame
print(df.head())

                                            Document  \
0  [Switzerland] Digital Switzerland Strategy 202...   
1  [Pakistan] National AI Policy Consultation Dra...   
2  [Saudi Arabia 2020] - National Strategy for Da...   
3                         [Malaysia] AI Road Map.pdf   
4               [Hong Kong] Ethical_AI_Framework.pdf   

                                                Text  
0  \n\nPage 1:\n\nSchweizerische Eidgenossenschaf...  
1  \n\nPage 1:\n\nMinistry of Information Technol...  
2  \n\nPage 1:\n\nNSDAI aibgll auuljiwJl yclikJl ...  
3  \n\nPage 1:\n\nKEMENTERIAN SAINS  TEKNOLOGI DA...  
4  \n\nPage 1:\n\nOffice of the Government Chief ...  


In [7]:
df['Text'][0]

'\n\nPage 1:\n\nSchweizerische Eidgenossenschaft Bundeskanzlei BK Confederation suisse Chancellerie federale ChF Confederazione Svizzera Cancelleria federale CaF Confederaziun svizra Federal Chancellery FCh Swiss Confederation Digital Switzerland Strategy 2023\n\nPage 2:\n\nDigital Switzerland Strategy 2023 PURPOSE The Digital Switzerland Strategy sets the guidelines for Switzerland\'s digital transformation\'. It is binding for the Federal Administration? and serves as an orientation for all other actors involved in digitalisation: The aim is for the population as a whole to benefit from a sustainable and respon- sible digital transformation. This is being driven forward jointly by the authorities at all federal levels, as well as by actors from civil society, business, academia and politics. Under focus themes, the Federal Council identifies two to three priorities each year as a way of launching digital transformation themes This focus is complemented by the action plan, which pro- 

In [22]:

df_folder_path = '/Users/eryclisrodrigues/Documents/Eryclis - docs/Research/Projects/AI Governance - Topic Modeling/Data/Intermediate Data/Intermediate Data.csv'


In [23]:
df.to_csv(df_folder_path, index=False)