# 1. Preprocessing

## 1.1. Install Library

### !pip

In [None]:
# OCR dependencies: the Tesseract system package, its Python wrapper, and the
# Vietnamese language data.
# Install Tesseract package
!apt-get install -y tesseract-ocr
# Install pytesseract (imported further down as `pytesseract`)
!pip install -q pytesseract
# Install Tesseract data files for Vietnamese language
!apt-get install -y tesseract-ocr-vie

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
tesseract-ocr is already the newest version (4.1.1-2.1build1).
0 upgraded, 0 newly installed, 0 to remove and 49 not upgraded.
Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
tesseract-ocr-vie is already the newest version (1:4.00~git30-7274cfa-1.1).
0 upgraded, 0 newly installed, 0 to remove and 49 not upgraded.


In [None]:
# Install library for converting pdf to jpg
!pip install -q pdf2image
!apt-get install -y poppler-utils
!pip install -q pdfminer
!pip install -q pdfminer.six
!pip install -q PyPDF2
!pip install -q PyMuPDF

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
poppler-utils is already the newest version (22.02.0-2ubuntu0.5).
0 upgraded, 0 newly installed, 0 to remove and 49 not upgraded.


### import

In [None]:
from pdfminer.high_level import extract_pages
from pdfminer.layout import LTTextContainer, LTChar, LTLine, LAParams
import os
import numpy as np
import pandas as pd
import PyPDF2
import fitz

import pytesseract
import PIL.Image
import cv2

from pytesseract import Output

import re

In [None]:
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


## 1.2. Define path

In [None]:
folder_path = "/content/drive/MyDrive/ESG/data" # Change this to your folder's path

# Accumulates one {"Name", "Path"} record per PDF found under folder_path
data = []

for root, dirs, files in os.walk(folder_path):
    for file in files:
        if file.endswith(".pdf"):
            # Create the file path
            file_path = os.path.join(root, file)

            # Name of the directory that holds the file.
            # NOTE(review): assigned but not used anywhere in this cell.
            company_name = root.split("/")[-1]

            # Append the record to the list
            data.append({"Name": file, "Path": file_path})
            # print(file)

# One row per PDF file: file name and full path
df = pd.DataFrame(data)
df

Unnamed: 0,Name,Path
0,Ngân hàng TNHH MTV Public Bank Việt Nam - ...,/content/drive/MyDrive/ESG/data/Ngân hàng TN...
1,Ngân hàng TNHH MTV Public Bank Việt Nam - ...,/content/drive/MyDrive/ESG/data/Ngân hàng TN...
2,Ngân hàng TNHH MTV Public Bank Việt Nam - ...,/content/drive/MyDrive/ESG/data/Ngân hàng TN...
3,Ngân hàng TNHH MTV Public Bank Việt Nam - ...,/content/drive/MyDrive/ESG/data/Ngân hàng TN...
4,Ngân hàng TNHH MTV Public Bank Việt Nam - ...,/content/drive/MyDrive/ESG/data/Ngân hàng TN...
...,...,...
518,Ngân hàng TMCP Bưu điện Liên Việt - 20...,/content/drive/MyDrive/ESG/data/Ngân hàng TM...
519,Ngân hàng TMCP Bưu điện Liên Việt - 20...,/content/drive/MyDrive/ESG/data/Ngân hàng TM...
520,Ngân hàng TMCP Bưu điện Liên Việt - 20...,/content/drive/MyDrive/ESG/data/Ngân hàng TM...
521,Ngân hàng TMCP Bưu điện Liên Việt - 20...,/content/drive/MyDrive/ESG/data/Ngân hàng TM...


In [None]:
# Use the first run of four digits in each file name as the report year.
# The value is text at this point, and missing when the name has no such run.
df["Year"] = df["Name"].str.extract(r"(\d{4})")
df

Unnamed: 0,Name,Path,Year
0,Ngân hàng TNHH MTV Public Bank Việt Nam - ...,/content/drive/MyDrive/ESG/data/Ngân hàng TN...,2017
1,Ngân hàng TNHH MTV Public Bank Việt Nam - ...,/content/drive/MyDrive/ESG/data/Ngân hàng TN...,2018
2,Ngân hàng TNHH MTV Public Bank Việt Nam - ...,/content/drive/MyDrive/ESG/data/Ngân hàng TN...,2019
3,Ngân hàng TNHH MTV Public Bank Việt Nam - ...,/content/drive/MyDrive/ESG/data/Ngân hàng TN...,2020
4,Ngân hàng TNHH MTV Public Bank Việt Nam - ...,/content/drive/MyDrive/ESG/data/Ngân hàng TN...,2022
...,...,...,...
518,Ngân hàng TMCP Bưu điện Liên Việt - 20...,/content/drive/MyDrive/ESG/data/Ngân hàng TM...,2019
519,Ngân hàng TMCP Bưu điện Liên Việt - 20...,/content/drive/MyDrive/ESG/data/Ngân hàng TM...,2020
520,Ngân hàng TMCP Bưu điện Liên Việt - 20...,/content/drive/MyDrive/ESG/data/Ngân hàng TM...,2021
521,Ngân hàng TMCP Bưu điện Liên Việt - 20...,/content/drive/MyDrive/ESG/data/Ngân hàng TM...,2022


In [None]:
df["Year"].value_counts()

Unnamed: 0_level_0,count
Year,Unnamed: 1_level_1
2017,33
2020,32
2022,32
2023,32
2018,32
2019,31
2013,30
2021,30
2016,30
2011,29


In [None]:
# Convert the extracted year to int (this raises if any file name produced no
# year), then keep only reports dated after 2013 and renumber the rows.
df['Year'] = df['Year'].astype(int)
df = df[df['Year'] > 2013]
df = df.reset_index(drop = True)
df

Unnamed: 0,Name,Path,Year
0,Ngân hàng TNHH MTV Public Bank Việt Nam - ...,/content/drive/MyDrive/ESG/data/Ngân hàng TN...,2017
1,Ngân hàng TNHH MTV Public Bank Việt Nam - ...,/content/drive/MyDrive/ESG/data/Ngân hàng TN...,2018
2,Ngân hàng TNHH MTV Public Bank Việt Nam - ...,/content/drive/MyDrive/ESG/data/Ngân hàng TN...,2019
3,Ngân hàng TNHH MTV Public Bank Việt Nam - ...,/content/drive/MyDrive/ESG/data/Ngân hàng TN...,2020
4,Ngân hàng TNHH MTV Public Bank Việt Nam - ...,/content/drive/MyDrive/ESG/data/Ngân hàng TN...,2022
...,...,...,...
303,Ngân hàng TMCP Bưu điện Liên Việt - 20...,/content/drive/MyDrive/ESG/data/Ngân hàng TM...,2019
304,Ngân hàng TMCP Bưu điện Liên Việt - 20...,/content/drive/MyDrive/ESG/data/Ngân hàng TM...,2020
305,Ngân hàng TMCP Bưu điện Liên Việt - 20...,/content/drive/MyDrive/ESG/data/Ngân hàng TM...,2021
306,Ngân hàng TMCP Bưu điện Liên Việt - 20...,/content/drive/MyDrive/ESG/data/Ngân hàng TM...,2022


In [None]:
# Company = the part of the file name in front of the trailing " - <4 digits>.pdf";
# file names that do not end with that pattern get a missing value.
df["Company"] = df["Name"].str.extract(r"^(.*?)\s*-\s*\d{4}\.pdf$")
df

Unnamed: 0,Name,Path,Year,Company
0,Ngân hàng TNHH MTV Public Bank Việt Nam - ...,/content/drive/MyDrive/ESG/data/Ngân hàng TN...,2017,Ngân hàng TNHH MTV Public Bank Việt Nam
1,Ngân hàng TNHH MTV Public Bank Việt Nam - ...,/content/drive/MyDrive/ESG/data/Ngân hàng TN...,2018,Ngân hàng TNHH MTV Public Bank Việt Nam
2,Ngân hàng TNHH MTV Public Bank Việt Nam - ...,/content/drive/MyDrive/ESG/data/Ngân hàng TN...,2019,Ngân hàng TNHH MTV Public Bank Việt Nam
3,Ngân hàng TNHH MTV Public Bank Việt Nam - ...,/content/drive/MyDrive/ESG/data/Ngân hàng TN...,2020,Ngân hàng TNHH MTV Public Bank Việt Nam
4,Ngân hàng TNHH MTV Public Bank Việt Nam - ...,/content/drive/MyDrive/ESG/data/Ngân hàng TN...,2022,Ngân hàng TNHH MTV Public Bank Việt Nam
...,...,...,...,...
303,Ngân hàng TMCP Bưu điện Liên Việt - 20...,/content/drive/MyDrive/ESG/data/Ngân hàng TM...,2019,Ngân hàng TMCP Bưu điện Liên Việt
304,Ngân hàng TMCP Bưu điện Liên Việt - 20...,/content/drive/MyDrive/ESG/data/Ngân hàng TM...,2020,Ngân hàng TMCP Bưu điện Liên Việt
305,Ngân hàng TMCP Bưu điện Liên Việt - 20...,/content/drive/MyDrive/ESG/data/Ngân hàng TM...,2021,Ngân hàng TMCP Bưu điện Liên Việt
306,Ngân hàng TMCP Bưu điện Liên Việt - 20...,/content/drive/MyDrive/ESG/data/Ngân hàng TM...,2022,Ngân hàng TMCP Bưu điện Liên Việt


In [None]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 308 entries, 0 to 307
Data columns (total 4 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   Name     308 non-null    object
 1   Path     308 non-null    object
 2   Year     308 non-null    int64 
 3   Company  308 non-null    object
dtypes: int64(1), object(3)
memory usage: 9.8+ KB


In [None]:
# Save the inventory. No sheet name is given, so pandas writes to its default
# sheet "Sheet1", which section 1.2.1 reads back by that name.
df.to_excel('/content/drive/MyDrive/ESG/recheck_2.xlsx', index = False)

In [None]:
company_years = df.groupby('Company')['Year'].nunique().reset_index()
company_years

Unnamed: 0,Company,Year
0,NGÂN HÀNG NÔNG NGHIỆP VÀ PHÁT TRIỂN N...,10
1,Ngân hàng Chính sách xã hội Việt Nam,9
2,Ngân hàng Hợp tác xã Việt Nam,10
3,Ngân hàng TMCP An Bình,10
4,Ngân hàng TMCP Á Châu,10
5,Ngân hàng TMCP Bắc Á,10
6,Ngân hàng TMCP Bản Việt,10
7,Ngân hàng TMCP Bảo Việt,7
8,Ngân hàng TMCP Bưu điện Liên Việt,9
9,Ngân hàng TMCP Công thương Việt Nam,9


In [None]:
# Keep only the companies that have exactly 10 distinct report years
# (`company_years['Year']` holds the per-company count of distinct years).
filtered_company_years = company_years[company_years['Year'] == 10].reset_index(drop=True)

# Display or save the filtered DataFrame
filtered_company_years

Unnamed: 0,Company,Year
0,NGÂN HÀNG NÔNG NGHIỆP VÀ PHÁT TRIỂN N...,10
1,Ngân hàng Hợp tác xã Việt Nam,10
2,Ngân hàng TMCP An Bình,10
3,Ngân hàng TMCP Á Châu,10
4,Ngân hàng TMCP Bắc Á,10
5,Ngân hàng TMCP Bản Việt,10
6,Ngân hàng TMCP Hàng Hải,10
7,Ngân hàng TMCP Kiên Long,10
8,Ngân hàng TMCP Kỹ Thương,10
9,Ngân hàng TMCP Nam Á,10


In [None]:
# Add the filtered company list as a second sheet of the inventory workbook.
# In append mode pandas raises ValueError when the sheet already exists, which
# made this cell fail when re-run on its own; 'replace' makes it repeatable.
with pd.ExcelWriter('/content/drive/MyDrive/ESG/recheck_2.xlsx', engine='openpyxl', mode='a',
                    if_sheet_exists='replace') as writer:
    filtered_company_years.to_excel(writer, sheet_name='company name', index=False)

## 1.2.1. Build the file list for the filtered companies

In [None]:
import pandas as pd

# Path to the Excel workbook written in section 1.2
excel_path = '/content/drive/MyDrive/ESG/recheck_2.xlsx'  # Change the path if needed

# Read both sheets: the full file inventory and the filtered company list
sheet1 = pd.read_excel(excel_path, sheet_name='Sheet1')
sheet2 = pd.read_excel(excel_path, sheet_name='company name')

# Make sure the 'Year' column is numeric in both sheets
sheet1['Year'] = pd.to_numeric(sheet1['Year'], errors='coerce')
sheet2['Year'] = pd.to_numeric(sheet2['Year'], errors='coerce')

# Drop the rows whose 'Year' could not be parsed
sheet1 = sheet1.dropna(subset=['Year'])
sheet2 = sheet2.dropna(subset=['Year'])

# Collects one filtered DataFrame per company
filtered_results = []

# For every company listed in sheet2, select its rows from sheet1
for idx, row in sheet2.iterrows():
    company = row['Company']
    num_years = int(row['Year'])  # In sheet2, 'Year' is the number of years to keep

    # Inventory rows that belong to the current company
    company_data = sheet1[sheet1['Company'] == company]

    if not company_data.empty:
        # Most recent year available for this company
        max_year = company_data['Year'].max()

        # First year of the window that ends at max_year and spans num_years years
        start_year = max_year - num_years + 1

        # Keep the rows whose year lies between start_year and max_year (inclusive)
        filtered_company_data = company_data[(company_data['Year'] >= start_year) & (company_data['Year'] <= max_year)]

        # Add the selected rows to the results
        filtered_results.append(filtered_company_data)
    else:
        print(f"No data found for company: {company}")

# Combine all per-company results into a single DataFrame
if filtered_results:
    final_df = pd.concat(filtered_results, ignore_index=True)

    # Add two constant columns: sector ('Lĩnh vực') and 'Type of PDF'.
    # NOTE(review): every file is labelled 'Digitally Created PDF' here without
    # being inspected - confirm this default is intended.
    final_df['Lĩnh vực'] = 'Ngân hàng'
    final_df['Type of PDF'] = 'Digitally Created PDF'

    # Sort by company (ascending) and year (most recent first)
    final_df = final_df.sort_values(by=['Company', 'Year'], ascending=[True, False])

    # Where the resulting Excel file is saved
    output_path = '/content/drive/MyDrive/ESG/filtered_results.xlsx'  # Change the path if needed

    # Write the result to a new Excel file
    final_df.to_excel(output_path, index=False)

    print(f"Filtered results with new columns saved to {output_path}")
else:
    print("No matching data found based on the criteria.")


Filtered results with new columns saved to /content/drive/MyDrive/ESG/filtered_results.xlsx


## 1.3. Detect types of pdf

### 1.3.1. Run this part

In [None]:
list_pdf_path = '/content/drive/MyDrive/ESG/filtered_results.xlsx'
list_pdf = pd.read_excel(list_pdf_path, sheet_name = 'Sheet1')
list_pdf

Unnamed: 0,Name,Path,Year,Company,Lĩnh vực,Type of PDF
0,NGÂN HÀNG NÔNG NGHIỆP VÀ PHÁT TRIỂN N...,/content/drive/MyDrive/ESG/data/Ngân hàng NN...,2023,NGÂN HÀNG NÔNG NGHIỆP VÀ PHÁT TRIỂN N...,Ngân hàng,Digitally Created PDF
1,NGÂN HÀNG NÔNG NGHIỆP VÀ PHÁT TRIỂN N...,/content/drive/MyDrive/ESG/data/Ngân hàng NN...,2022,NGÂN HÀNG NÔNG NGHIỆP VÀ PHÁT TRIỂN N...,Ngân hàng,Digitally Created PDF
2,NGÂN HÀNG NÔNG NGHIỆP VÀ PHÁT TRIỂN N...,/content/drive/MyDrive/ESG/data/Ngân hàng NN...,2021,NGÂN HÀNG NÔNG NGHIỆP VÀ PHÁT TRIỂN N...,Ngân hàng,Digitally Created PDF
3,NGÂN HÀNG NÔNG NGHIỆP VÀ PHÁT TRIỂN N...,/content/drive/MyDrive/ESG/data/Ngân hàng NN...,2020,NGÂN HÀNG NÔNG NGHIỆP VÀ PHÁT TRIỂN N...,Ngân hàng,Digitally Created PDF
4,NGÂN HÀNG NÔNG NGHIỆP VÀ PHÁT TRIỂN N...,/content/drive/MyDrive/ESG/data/Ngân hàng NN...,2019,NGÂN HÀNG NÔNG NGHIỆP VÀ PHÁT TRIỂN N...,Ngân hàng,Digitally Created PDF
...,...,...,...,...,...,...
225,Ngân hàng TMCP Đông Nam Á - 2018.pdf,/content/drive/MyDrive/ESG/data/Ngân hàng TM...,2018,Ngân hàng TMCP Đông Nam Á,Ngân hàng,Digitally Created PDF
226,Ngân hàng TMCP Đông Nam Á - 2017.pdf,/content/drive/MyDrive/ESG/data/Ngân hàng TM...,2017,Ngân hàng TMCP Đông Nam Á,Ngân hàng,Digitally Created PDF
227,Ngân hàng TMCP Đông Nam Á - 2016.pdf,/content/drive/MyDrive/ESG/data/Ngân hàng TM...,2016,Ngân hàng TMCP Đông Nam Á,Ngân hàng,Digitally Created PDF
228,Ngân hàng TMCP Đông Nam Á - 2015.pdf,/content/drive/MyDrive/ESG/data/Ngân hàng TM...,2015,Ngân hàng TMCP Đông Nam Á,Ngân hàng,Digitally Created PDF


In [None]:
path = "/content/drive/MyDrive/ESG/data/"

### Do not run: PDF classification

In [None]:
def read_pdf_content(file_path):
    """Return the text of all pages of a PDF, concatenated into one string.

    Args:
        file_path: Path of the PDF file to read.

    Returns:
        The concatenated page text, or "" when the file cannot be opened or
        parsed (the error is printed instead of being raised).
    """
    try:
        with open(file_path, 'rb') as pdf_file:
            pdf_reader = PyPDF2.PdfReader(pdf_file)
            # `or ""` guards against a page whose extract_text() yields None:
            # concatenating that would raise, and the broad except below would
            # then throw away the text of the whole document.
            return "".join(page.extract_text() or "" for page in pdf_reader.pages)
    except Exception as e:
        print(f"Error reading PDF file '{file_path}': {str(e)}")
        return ""

In [None]:
'''
    Analyze the PDF content:
        *Image-only PDF: If the extracted text content is empty => image-only
        *Digitally created PDF: If the extracted text content is not empty & the PDF file does not contain scanned images or picture => digitally created pdf
        *Searchable PDF: If the extracted text content is not empty & the PDF file contains searchable text => searchable pdf
'''
def is_image_only_pdf(file_path):
    """Return True when `read_pdf_content` finds no non-whitespace text in the PDF."""
    extracted = read_pdf_content(file_path)
    return not extracted.strip()

def is_digitally_created_pdf(file_path):
    """Return True when `read_pdf_content` finds some non-whitespace text in the PDF.

    Any exception raised along the way is printed and reported as False.
    """
    try:
        return bool(read_pdf_content(file_path).strip())
    except Exception as e:
        print(f"Error reading PDF file '{file_path}': {str(e)}")
        return False

# def is_searchable_pdf(file_path):
#     content = read_pdf_content(file_path)
#     return len(content.strip()) > 0

In [None]:
import fitz  # PyMuPDF

def read_pdf_content_2(path):
    """Collect the characters pdfminer lays out in a PDF into a single string.

    Walks the layout objects yielded by `extract_pages`:
    page -> text container -> text line -> character.

    Args:
        path: Path of the PDF file to read.

    Returns:
        The text of every `LTChar` found, in traversal order ("" when the
        document has none). Objects that are not `LTChar` are skipped.
    """
    pieces = []

    for page_layout in extract_pages(path):
        for element in page_layout:
            if not isinstance(element, LTTextContainer):
                continue
            for text_line in element:
                if not isinstance(text_line, LTTextContainer):
                    continue
                for character in text_line:
                    if isinstance(character, LTChar):
                        # An LTChar is a layout object, not a str; adding it
                        # to a string raises TypeError, so take its text.
                        pieces.append(character.get_text())

    # Join once instead of growing a string inside the innermost loop
    return "".join(pieces)

def is_image_only_pdf_2(file_path):
    """Return True when `read_pdf_content_2` finds no non-whitespace text in the PDF."""
    extracted = read_pdf_content_2(file_path)
    return not extracted.strip()

def is_digitally_created_pdf_2(file_path):
    """Return True when `read_pdf_content_2` finds some non-whitespace text in the PDF.

    Any exception raised while reading is printed and reported as False.
    """
    try:
        return bool(read_pdf_content_2(file_path).strip())
    except Exception as e:
        print(f"Error reading PDF file '{file_path}': {str(e)}")
        return False

In [None]:
# code lưu


### Do not run: same as the classification above, just an alternative implementation

In [None]:
import pandas as pd
import os
import PyPDF2  # Đảm bảo rằng bạn đã cài đặt PyPDF2

# Read the 'company name' sheet of the inventory workbook
list_pdf_path = '/content/drive/MyDrive/ESG/recheck_2.xlsx'
list_pdf = pd.read_excel(list_pdf_path, sheet_name='company name')

# Directory that contains the PDF files
path = "/content/drive/MyDrive/ESG/data/"

# Read the text content of a PDF file
def read_pdf_content(file_path):
    """Return the concatenated text of all pages of the PDF at `file_path`.

    Pages for which no text is extracted contribute nothing. If the file
    cannot be opened or parsed, the error is printed and "" is returned.
    """
    try:
        with open(file_path, 'rb') as pdf_file:
            reader = PyPDF2.PdfReader(pdf_file)
            page_texts = [
                reader.pages[page_index].extract_text()
                for page_index in range(len(reader.pages))
            ]
        # Skip pages that produced no text (None or empty string)
        return "".join(text for text in page_texts if text)
    except Exception as e:
        print(f"Error reading PDF file '{file_path}': {str(e)}")
        return ""

# Classify a PDF by whether any text can be extracted from it
def classify_pdf(file_path):
    """Return "Image-only PDF" when `read_pdf_content` yields no non-whitespace
    text for the file, otherwise "Digitally-created PDF"."""
    has_text = len(read_pdf_content(file_path).strip()) != 0
    return "Digitally-created PDF" if has_text else "Image-only PDF"

# Classify the listed files and collect one record per file that exists
results = []

# Go through every row of the list read from Excel
# NOTE(review): the path built below is "<path><Company>.pdf", while the file
# paths shown elsewhere in this notebook look like
# "<data>/<folder>/<Company> - <year>.pdf" - it looks like the existence check
# may never succeed, leaving `results` empty. Confirm before relying on this cell.
for index, row in list_pdf.iterrows():
    # Take the company (or PDF) name from the Excel row
    pdf_file_name = row['Company']  # Requires the column to be named 'Company'
    file_path = os.path.join(path, f"{pdf_file_name}.pdf")

    if os.path.exists(file_path):  # Only handle PDF files that exist
        pdf_type = classify_pdf(file_path)  # Classify the PDF
        pdf_info = {
            'file_path': file_path,
            'pdf_type': pdf_type,  # PDF type (image-only or digitally created)
        }
        results.append(pdf_info)

# Turn the records into a DataFrame
df_results = pd.DataFrame(results)

# Save the result to an Excel file
output_path = '/content/drive/MyDrive/ESG/processed_pdf_results.xlsx'
df_results.to_excel(output_path, index=False)

print(f"Results saved to {output_path}")


Results saved to /content/drive/MyDrive/ESG/processed_pdf_results.xlsx


### 1.3.2. Continue

In [None]:
df_ = '/content/drive/MyDrive/ESG/filtered_results.xlsx'
df_ = pd.read_excel(df_, sheet_name = 'Sheet1')
df_

Unnamed: 0,Name,Path,Year,Company,Lĩnh vực,Type of PDF
0,NGÂN HÀNG NÔNG NGHIỆP VÀ PHÁT TRIỂN N...,/content/drive/MyDrive/ESG/data/Ngân hàng NN...,2023,NGÂN HÀNG NÔNG NGHIỆP VÀ PHÁT TRIỂN N...,Ngân hàng,Digitally Created PDF
1,NGÂN HÀNG NÔNG NGHIỆP VÀ PHÁT TRIỂN N...,/content/drive/MyDrive/ESG/data/Ngân hàng NN...,2022,NGÂN HÀNG NÔNG NGHIỆP VÀ PHÁT TRIỂN N...,Ngân hàng,Digitally Created PDF
2,NGÂN HÀNG NÔNG NGHIỆP VÀ PHÁT TRIỂN N...,/content/drive/MyDrive/ESG/data/Ngân hàng NN...,2021,NGÂN HÀNG NÔNG NGHIỆP VÀ PHÁT TRIỂN N...,Ngân hàng,Digitally Created PDF
3,NGÂN HÀNG NÔNG NGHIỆP VÀ PHÁT TRIỂN N...,/content/drive/MyDrive/ESG/data/Ngân hàng NN...,2020,NGÂN HÀNG NÔNG NGHIỆP VÀ PHÁT TRIỂN N...,Ngân hàng,Digitally Created PDF
4,NGÂN HÀNG NÔNG NGHIỆP VÀ PHÁT TRIỂN N...,/content/drive/MyDrive/ESG/data/Ngân hàng NN...,2019,NGÂN HÀNG NÔNG NGHIỆP VÀ PHÁT TRIỂN N...,Ngân hàng,Digitally Created PDF
...,...,...,...,...,...,...
225,Ngân hàng TMCP Đông Nam Á - 2018.pdf,/content/drive/MyDrive/ESG/data/Ngân hàng TM...,2018,Ngân hàng TMCP Đông Nam Á,Ngân hàng,Digitally Created PDF
226,Ngân hàng TMCP Đông Nam Á - 2017.pdf,/content/drive/MyDrive/ESG/data/Ngân hàng TM...,2017,Ngân hàng TMCP Đông Nam Á,Ngân hàng,Digitally Created PDF
227,Ngân hàng TMCP Đông Nam Á - 2016.pdf,/content/drive/MyDrive/ESG/data/Ngân hàng TM...,2016,Ngân hàng TMCP Đông Nam Á,Ngân hàng,Digitally Created PDF
228,Ngân hàng TMCP Đông Nam Á - 2015.pdf,/content/drive/MyDrive/ESG/data/Ngân hàng TM...,2015,Ngân hàng TMCP Đông Nam Á,Ngân hàng,Digitally Created PDF


In [None]:
df = df_
df_

Unnamed: 0,Name,Path,Year,Company,Lĩnh vực,Type of PDF
0,NGÂN HÀNG NÔNG NGHIỆP VÀ PHÁT TRIỂN N...,/content/drive/MyDrive/ESG/data/Ngân hàng NN...,2023,NGÂN HÀNG NÔNG NGHIỆP VÀ PHÁT TRIỂN N...,Ngân hàng,Digitally Created PDF
1,NGÂN HÀNG NÔNG NGHIỆP VÀ PHÁT TRIỂN N...,/content/drive/MyDrive/ESG/data/Ngân hàng NN...,2022,NGÂN HÀNG NÔNG NGHIỆP VÀ PHÁT TRIỂN N...,Ngân hàng,Digitally Created PDF
2,NGÂN HÀNG NÔNG NGHIỆP VÀ PHÁT TRIỂN N...,/content/drive/MyDrive/ESG/data/Ngân hàng NN...,2021,NGÂN HÀNG NÔNG NGHIỆP VÀ PHÁT TRIỂN N...,Ngân hàng,Digitally Created PDF
3,NGÂN HÀNG NÔNG NGHIỆP VÀ PHÁT TRIỂN N...,/content/drive/MyDrive/ESG/data/Ngân hàng NN...,2020,NGÂN HÀNG NÔNG NGHIỆP VÀ PHÁT TRIỂN N...,Ngân hàng,Digitally Created PDF
4,NGÂN HÀNG NÔNG NGHIỆP VÀ PHÁT TRIỂN N...,/content/drive/MyDrive/ESG/data/Ngân hàng NN...,2019,NGÂN HÀNG NÔNG NGHIỆP VÀ PHÁT TRIỂN N...,Ngân hàng,Digitally Created PDF
...,...,...,...,...,...,...
225,Ngân hàng TMCP Đông Nam Á - 2018.pdf,/content/drive/MyDrive/ESG/data/Ngân hàng TM...,2018,Ngân hàng TMCP Đông Nam Á,Ngân hàng,Digitally Created PDF
226,Ngân hàng TMCP Đông Nam Á - 2017.pdf,/content/drive/MyDrive/ESG/data/Ngân hàng TM...,2017,Ngân hàng TMCP Đông Nam Á,Ngân hàng,Digitally Created PDF
227,Ngân hàng TMCP Đông Nam Á - 2016.pdf,/content/drive/MyDrive/ESG/data/Ngân hàng TM...,2016,Ngân hàng TMCP Đông Nam Á,Ngân hàng,Digitally Created PDF
228,Ngân hàng TMCP Đông Nam Á - 2015.pdf,/content/drive/MyDrive/ESG/data/Ngân hàng TM...,2015,Ngân hàng TMCP Đông Nam Á,Ngân hàng,Digitally Created PDF


In [None]:
'''
    Re-check the rows currently labelled 'Image-Only PDF' with the pdfminer-based
    detectors and update their 'Type of PDF' label in df_.
'''
for i in range(len(df_)):
    if df_.loc[i, 'Type of PDF'] == 'Image-Only PDF':
        pdf_path = df_.loc[i, 'Path']

        if is_digitally_created_pdf_2(pdf_path):
            pdf_type = "Digitally Created PDF"
        elif is_image_only_pdf_2(pdf_path):
            pdf_type = "Image-Only PDF"
        else:
            pdf_type = "Unknown"

        print(i)
        # A single .loc assignment writes into df_ itself. The chained form
        # df_['Type of PDF'][i] = ... assigns through an intermediate Series,
        # which pandas warns about and which may not update df_ at all.
        df_.loc[i, 'Type of PDF'] = pdf_type

df_

Unnamed: 0,Name,Path,Year,Company,Lĩnh vực,Type of PDF
0,NGÂN HÀNG NÔNG NGHIỆP VÀ PHÁT TRIỂN N...,/content/drive/MyDrive/ESG/data/Ngân hàng NN...,2023,NGÂN HÀNG NÔNG NGHIỆP VÀ PHÁT TRIỂN N...,Ngân hàng,Digitally Created PDF
1,NGÂN HÀNG NÔNG NGHIỆP VÀ PHÁT TRIỂN N...,/content/drive/MyDrive/ESG/data/Ngân hàng NN...,2022,NGÂN HÀNG NÔNG NGHIỆP VÀ PHÁT TRIỂN N...,Ngân hàng,Digitally Created PDF
2,NGÂN HÀNG NÔNG NGHIỆP VÀ PHÁT TRIỂN N...,/content/drive/MyDrive/ESG/data/Ngân hàng NN...,2021,NGÂN HÀNG NÔNG NGHIỆP VÀ PHÁT TRIỂN N...,Ngân hàng,Digitally Created PDF
3,NGÂN HÀNG NÔNG NGHIỆP VÀ PHÁT TRIỂN N...,/content/drive/MyDrive/ESG/data/Ngân hàng NN...,2020,NGÂN HÀNG NÔNG NGHIỆP VÀ PHÁT TRIỂN N...,Ngân hàng,Digitally Created PDF
4,NGÂN HÀNG NÔNG NGHIỆP VÀ PHÁT TRIỂN N...,/content/drive/MyDrive/ESG/data/Ngân hàng NN...,2019,NGÂN HÀNG NÔNG NGHIỆP VÀ PHÁT TRIỂN N...,Ngân hàng,Digitally Created PDF
...,...,...,...,...,...,...
225,Ngân hàng TMCP Đông Nam Á - 2018.pdf,/content/drive/MyDrive/ESG/data/Ngân hàng TM...,2018,Ngân hàng TMCP Đông Nam Á,Ngân hàng,Digitally Created PDF
226,Ngân hàng TMCP Đông Nam Á - 2017.pdf,/content/drive/MyDrive/ESG/data/Ngân hàng TM...,2017,Ngân hàng TMCP Đông Nam Á,Ngân hàng,Digitally Created PDF
227,Ngân hàng TMCP Đông Nam Á - 2016.pdf,/content/drive/MyDrive/ESG/data/Ngân hàng TM...,2016,Ngân hàng TMCP Đông Nam Á,Ngân hàng,Digitally Created PDF
228,Ngân hàng TMCP Đông Nam Á - 2015.pdf,/content/drive/MyDrive/ESG/data/Ngân hàng TM...,2015,Ngân hàng TMCP Đông Nam Á,Ngân hàng,Digitally Created PDF


In [None]:
df.to_excel("/content/drive/MyDrive/ESG/Pdf Types_FINAL.xlsx", index=False)

## 1.4. Load list of pdf

In [None]:
df_esg = pd.read_excel("/content/drive/MyDrive/ESG/Pdf Types_FINAL.xlsx")
df_esg

Unnamed: 0,Name,Path,Year,Company,Lĩnh vực,Type of PDF
0,NGÂN HÀNG NÔNG NGHIỆP VÀ PHÁT TRIỂN N...,/content/drive/MyDrive/ESG/data/Ngân hàng NN...,2023,NGÂN HÀNG NÔNG NGHIỆP VÀ PHÁT TRIỂN N...,Ngân hàng,Digitally Created PDF
1,NGÂN HÀNG NÔNG NGHIỆP VÀ PHÁT TRIỂN N...,/content/drive/MyDrive/ESG/data/Ngân hàng NN...,2022,NGÂN HÀNG NÔNG NGHIỆP VÀ PHÁT TRIỂN N...,Ngân hàng,Digitally Created PDF
2,NGÂN HÀNG NÔNG NGHIỆP VÀ PHÁT TRIỂN N...,/content/drive/MyDrive/ESG/data/Ngân hàng NN...,2021,NGÂN HÀNG NÔNG NGHIỆP VÀ PHÁT TRIỂN N...,Ngân hàng,Digitally Created PDF
3,NGÂN HÀNG NÔNG NGHIỆP VÀ PHÁT TRIỂN N...,/content/drive/MyDrive/ESG/data/Ngân hàng NN...,2020,NGÂN HÀNG NÔNG NGHIỆP VÀ PHÁT TRIỂN N...,Ngân hàng,Digitally Created PDF
4,NGÂN HÀNG NÔNG NGHIỆP VÀ PHÁT TRIỂN N...,/content/drive/MyDrive/ESG/data/Ngân hàng NN...,2019,NGÂN HÀNG NÔNG NGHIỆP VÀ PHÁT TRIỂN N...,Ngân hàng,Digitally Created PDF
...,...,...,...,...,...,...
225,Ngân hàng TMCP Đông Nam Á - 2018.pdf,/content/drive/MyDrive/ESG/data/Ngân hàng TM...,2018,Ngân hàng TMCP Đông Nam Á,Ngân hàng,Digitally Created PDF
226,Ngân hàng TMCP Đông Nam Á - 2017.pdf,/content/drive/MyDrive/ESG/data/Ngân hàng TM...,2017,Ngân hàng TMCP Đông Nam Á,Ngân hàng,Digitally Created PDF
227,Ngân hàng TMCP Đông Nam Á - 2016.pdf,/content/drive/MyDrive/ESG/data/Ngân hàng TM...,2016,Ngân hàng TMCP Đông Nam Á,Ngân hàng,Digitally Created PDF
228,Ngân hàng TMCP Đông Nam Á - 2015.pdf,/content/drive/MyDrive/ESG/data/Ngân hàng TM...,2015,Ngân hàng TMCP Đông Nam Á,Ngân hàng,Digitally Created PDF


In [None]:
# Select the banking ('Ngân hàng') reports, then split them by PDF type;
# each subset gets a fresh 0..n-1 index.
service = df_esg[df_esg['Lĩnh vực'] == 'Ngân hàng']
pdf_kind = service['Type of PDF']

service_digital = service[pdf_kind == 'Digitally Created PDF'].reset_index(drop = True)
service_image = service[pdf_kind == 'Image-Only PDF'].reset_index(drop = True)

## 1.5. Extract text from Digitally Created Pdf


In [None]:
import fitz  # PyMuPDF
import re
import pandas as pd

def extract_text_to_df(path):
    """Extract the text of a PDF into a DataFrame with one row per page.

    Args:
        path: Path of the PDF file. The company name is taken from the name of
            the directory that contains the file, and the year from the first
            standalone four-digit number found anywhere in the path.

    Returns:
        DataFrame with columns ['company_name', 'year', 'size', 'text'].
        'year' is a string, or None when the path has no four-digit number.
        'size' is the rounded mean number of characters per line of the page
        text. It is only a placeholder, NOT a font size.
        TODO: replace it with a real font-size computation.
    """
    Extract_Data = []

    # Open the PDF with PyMuPDF; the context manager closes the document even
    # when reading a page fails (it was previously left open).
    with fitz.open(path) as doc:
        for page_num in range(len(doc)):
            page = doc.load_page(page_num)
            text = page.get_text("text")

            # str.split always returns at least one item, so the mean below is
            # defined for every page (a page without text gets size 0).
            line_lengths = [len(text_line) for text_line in text.split('\n')]
            average_line_length = round(sum(line_lengths) / len(line_lengths))
            Extract_Data.append([average_line_length, text])

    # Extract the year using regular expression
    year_match = re.search(r'\b(\d{4})\b', path)
    year = year_match.group(1) if year_match else None

    # Extract the name by splitting the path
    name = path.split('/')[-2]

    # Build the DataFrame and put the identifying columns first
    text_df = pd.DataFrame(Extract_Data, columns = ['size', 'text'])
    text_df['year'] = year
    text_df['company_name'] = name
    text_df = text_df[['company_name', 'year', 'size', 'text']]

    return text_df


In [None]:
def filter_size(df):
    """Keep the rows of `df` whose 'size' is among the most frequent values.

    Sizes are ranked by frequency (most frequent first) and a size is selected
    while the cumulative share of rows up to and including it is at most 80%.
    When no size qualifies, the single most frequent size is used; when `df`
    has no rows at all, the fallback size is 9.

    Args:
        df: DataFrame with a 'size' column.

    Returns:
        The rows of `df` whose 'size' is one of the selected values.
    """
    counts = df['size'].value_counts()
    cumulative_share = counts.cumsum() / counts.sum()

    # Sizes whose cumulative share of rows does not exceed 80%
    keep_sizes = cumulative_share[cumulative_share <= 0.8].index.tolist()

    if len(keep_sizes) == 0:
        # Nothing qualified: fall back to the default (9) or the most common size
        keep_sizes = [9] if counts.empty else [counts.idxmax()]

    return df[df['size'].isin(keep_sizes)]


In [None]:
def filter_sentence(df):
    """Keep the rows whose 'text' looks like prose and remove periods inside numbers.

    A row is kept when its text
      * contains at least one period,
      * does not consist solely of digits, whitespace and periods,
      * does not start with a digit, and
      * is not just a parenthesised number such as "(1.2)".
    In the kept rows every period that sits between two digits is removed
    ("1.234.567" -> "1234567"), so that splitting the text on '.' later does
    not cut numbers apart.

    Args:
        df: DataFrame with a string column 'text'. It is not modified.

    Returns:
        A new DataFrame with the kept rows, cleaned 'text' and a fresh index.
    """
    text = df['text']
    keep = (
        text.str.contains(r'\.', na=False)
        & ~text.str.match(r'^[\d\s\n.]+$', na=False)
        & ~text.str.match(r'^\d+\.*', na=False)
        # Non-capturing group: same matches, without pandas' match-group warning
        & ~text.str.contains(r'^\(\d+(?:\.\d+)*\)$', na=False)
    )

    # regex=True is required: recent pandas treats the pattern as a literal
    # string by default, which made the original replacement a silent no-op.
    # The lookarounds remove every in-number period, not only the first one.
    # `assign` works on a new frame instead of writing into a filtered slice.
    cleaned = df[keep].assign(
        text=lambda kept: kept['text'].str.replace(r'(?<=\d)\.(?=\d)', '', regex=True)
    )
    return cleaned.reset_index(drop=True)


In [None]:
def extract_final(path):
    """Turn one PDF into a DataFrame of period-terminated text fragments.

    Runs `extract_text_to_df`, `filter_size` and `filter_sentence` on the file,
    joins the remaining page texts, splits them on '.', and returns the unique
    non-empty fragments.

    Args:
        path: Path of the PDF file. The name of its parent directory becomes
            'name'; the first standalone four-digit number in the path becomes
            'year' (None when there is none).

    Returns:
        DataFrame with columns ['name', 'year', 'text', 'label']; 'label' is
        an empty string for every row.
    """
    pages = filter_sentence(filter_size(extract_text_to_df(path)))

    # One long string with newlines/tabs flattened to spaces, cut at every period
    flattened = ' '.join(pages['text']).replace('\n', ' ').replace('\t', ' ')
    sentences = [
        fragment.strip() + '.'
        for fragment in flattened.split('.')
        if fragment.strip()
    ]

    year_match = re.search(r'\b(\d{4})\b', path)

    sentence_df = pd.DataFrame({
        'name': path.split('/')[-2],
        'year': year_match.group(1) if year_match else None,
        'text': sentences,
        'label': ''
    })

    return sentence_df.drop_duplicates().reset_index(drop = True)


In [None]:
import warnings
warnings.filterwarnings("ignore")

In [None]:
# Collect one result DataFrame per PDF and concatenate them once at the end.
# Calling pd.concat inside the loop copies all rows gathered so far on every
# iteration, which gets slower and slower as the number of files grows.
# If the loop is interrupted, the frames processed so far stay in this list.
extracted_frames = []

# Go through every path in the 'Path' column
for path in service_digital['Path']:
    # Extract the text fragments of this PDF
    result_df = extract_final(path)

    extracted_frames.append(result_df)
    print(f"Processed {path}")

# pd.concat raises on an empty list, so fall back to an empty DataFrame
service_digital_text = (
    pd.concat(extracted_frames, ignore_index=True) if extracted_frames else pd.DataFrame()
)


Processed /content/drive/MyDrive/ESG/data/Ngân hàng NN&PT Nông thôn Việt Nam/NGÂN HÀNG NÔNG NGHIỆP VÀ PHÁT TRIỂN NÔNG THÔN VIỆT NAM - 2023.pdf
Processed /content/drive/MyDrive/ESG/data/Ngân hàng NN&PT Nông thôn Việt Nam/NGÂN HÀNG NÔNG NGHIỆP VÀ PHÁT TRIỂN NÔNG THÔN VIỆT NAM - 2022.pdf
Processed /content/drive/MyDrive/ESG/data/Ngân hàng NN&PT Nông thôn Việt Nam/NGÂN HÀNG NÔNG NGHIỆP VÀ PHÁT TRIỂN NÔNG THÔN VIỆT NAM - 2021.pdf
Processed /content/drive/MyDrive/ESG/data/Ngân hàng NN&PT Nông thôn Việt Nam/NGÂN HÀNG NÔNG NGHIỆP VÀ PHÁT TRIỂN NÔNG THÔN VIỆT NAM - 2020.pdf
Processed /content/drive/MyDrive/ESG/data/Ngân hàng NN&PT Nông thôn Việt Nam/NGÂN HÀNG NÔNG NGHIỆP VÀ PHÁT TRIỂN NÔNG THÔN VIỆT NAM - 2019.pdf
Processed /content/drive/MyDrive/ESG/data/Ngân hàng NN&PT Nông thôn Việt Nam/NGÂN HÀNG NÔNG NGHIỆP VÀ PHÁT TRIỂN NÔNG THÔN VIỆT NAM - 2018.pdf
Processed /content/drive/MyD

In [None]:
def split_and_strip(text):
    """Return the part of `text` before its first period, without surrounding whitespace.

    When `text` contains no period the whole stripped text is returned. The
    previous version returned None in that case, which made `process_path`
    yield None for every ordinary file name.
    """
    return text.split('.', 1)[0].strip()


def process_path(path):
    """Derive a PDF name from a path or file name of the form "<name> - <year>.pdf".

    Only the file name (the last path component) is examined, so directory
    names never end up in the result.

    Args:
        path: File name or full path of a PDF.

    Returns:
        The text in front of " - <4 digits>.pdf", cut at its first period and
        stripped; None when the file name does not end with that pattern.
    """
    file_name = os.path.basename(path)
    match = re.search(r'^(.*?)\s*-\s*\d{4}\.pdf$', file_name)
    if match is None:
        return None
    return split_and_strip(match.group(1))

df_['PDF_Name'] = df_['Path'].apply(process_path)
df_

Unnamed: 0,Name,Path,Year,Company,Lĩnh vực,Type of PDF,PDF_Name
0,NGÂN HÀNG NÔNG NGHIỆP VÀ PHÁT TRIỂN N...,/content/drive/MyDrive/ESG/data/Ngân hàng NN...,2023,NGÂN HÀNG NÔNG NGHIỆP VÀ PHÁT TRIỂN N...,Ngân hàng,Digitally Created PDF,
1,NGÂN HÀNG NÔNG NGHIỆP VÀ PHÁT TRIỂN N...,/content/drive/MyDrive/ESG/data/Ngân hàng NN...,2022,NGÂN HÀNG NÔNG NGHIỆP VÀ PHÁT TRIỂN N...,Ngân hàng,Digitally Created PDF,
2,NGÂN HÀNG NÔNG NGHIỆP VÀ PHÁT TRIỂN N...,/content/drive/MyDrive/ESG/data/Ngân hàng NN...,2021,NGÂN HÀNG NÔNG NGHIỆP VÀ PHÁT TRIỂN N...,Ngân hàng,Digitally Created PDF,
3,NGÂN HÀNG NÔNG NGHIỆP VÀ PHÁT TRIỂN N...,/content/drive/MyDrive/ESG/data/Ngân hàng NN...,2020,NGÂN HÀNG NÔNG NGHIỆP VÀ PHÁT TRIỂN N...,Ngân hàng,Digitally Created PDF,
4,NGÂN HÀNG NÔNG NGHIỆP VÀ PHÁT TRIỂN N...,/content/drive/MyDrive/ESG/data/Ngân hàng NN...,2019,NGÂN HÀNG NÔNG NGHIỆP VÀ PHÁT TRIỂN N...,Ngân hàng,Digitally Created PDF,
...,...,...,...,...,...,...,...
225,Ngân hàng TMCP Đông Nam Á - 2018.pdf,/content/drive/MyDrive/ESG/data/Ngân hàng TM...,2018,Ngân hàng TMCP Đông Nam Á,Ngân hàng,Digitally Created PDF,
226,Ngân hàng TMCP Đông Nam Á - 2017.pdf,/content/drive/MyDrive/ESG/data/Ngân hàng TM...,2017,Ngân hàng TMCP Đông Nam Á,Ngân hàng,Digitally Created PDF,
227,Ngân hàng TMCP Đông Nam Á - 2016.pdf,/content/drive/MyDrive/ESG/data/Ngân hàng TM...,2016,Ngân hàng TMCP Đông Nam Á,Ngân hàng,Digitally Created PDF,
228,Ngân hàng TMCP Đông Nam Á - 2015.pdf,/content/drive/MyDrive/ESG/data/Ngân hàng TM...,2015,Ngân hàng TMCP Đông Nam Á,Ngân hàng,Digitally Created PDF,


In [None]:
service_digital_text.to_csv('/content/drive/MyDrive/ESG/service_digital_text.csv', escapechar='\\')

In [None]:
service_digital_text

Unnamed: 0,name,year,text,label
0,Ngân hàng NN&PT Nông thôn Việt Nam,2023,DẤU ẤN TIÊU BIỂU 1.,
1,Ngân hàng NN&PT Nông thôn Việt Nam,2023,Kỷ niệm 35 năm thành lập Agribank Agribank vi...,
2,Ngân hàng NN&PT Nông thôn Việt Nam,2023,"Đây là sự kiện quan trọng, có ý nghĩa to lớn đ...",
3,Ngân hàng NN&PT Nông thôn Việt Nam,2023,2.,
4,Ngân hàng NN&PT Nông thôn Việt Nam,2023,Tiên phong thể hiện trách nhiệm của Ngân hàng ...,
...,...,...,...,...
11895,Ngân hàng TMCP Đông Nam Á,2016,* Taipei Fubon Commercial Bank Co.,
11896,Ngân hàng TMCP Đông Nam Á,2016,", Ltd* Taiwan Cooperative Bank Co.",
11897,Ngân hàng TMCP Đông Nam Á,2016,* Taiwan Shin Kong Commercial Bank Co.,
11898,Ngân hàng TMCP Đông Nam Á,2016,"Wells Fargo Bank, N.",
