In [None]:
from flask import Flask, request, jsonify
import pandas as pd
import os
import threading
import pdfplumber

# Directory (relative to the working directory) that receives uploaded files.
UPLOAD_FOLDER = './uploads'

# Make sure the upload directory exists before any request tries to save into it.
if not os.path.exists(UPLOAD_FOLDER):
    os.makedirs(UPLOAD_FOLDER)

# The Flask application; handlers read the upload location from its config.
app = Flask(__name__)
app.config.update(UPLOAD_FOLDER=UPLOAD_FOLDER)

# File upload endpoint
@app.route('/upload', methods=['POST'])
def upload_file():
    """Store an uploaded CSV or PDF file and return a short summary of it.

    The file is expected in the multipart field named ``file``.

    Returns:
        A ``(response, status)`` tuple:
        * 200 with ``analysis_result`` (column names and row count) for a CSV;
        * 200 with ``preview`` (at most the first 500 characters of extracted
          text) for a PDF;
        * 400 when the field is missing, no file was selected, or the
          extension is not ``.csv`` / ``.pdf``;
        * 500 when the stored file cannot be parsed.
    """
    preview_length = 500  # number of characters returned as the PDF preview

    if 'file' not in request.files:
        return jsonify({'message': 'No file part in the request'}), 400

    file = request.files['file']

    if file.filename == '':
        return jsonify({'message': 'No file selected for uploading'}), 400

    # The filename is chosen by the client. Keep only its last path component
    # (treating both "/" and "\" as separators) so that a name such as
    # "../../app.csv" cannot be written outside the upload folder.
    safe_name = os.path.basename(file.filename.replace('\\', '/'))

    if not safe_name or not allowed_file(safe_name):
        return jsonify({'message': 'Invalid file type. Please upload a CSV or PDF file.'}), 400

    file_path = os.path.join(app.config['UPLOAD_FOLDER'], safe_name)
    file.save(file_path)

    lowered_name = safe_name.lower()

    if lowered_name.endswith('.csv'):
        try:
            df = pd.read_csv(file_path)
            # Basic analysis of the data: column names and number of rows.
            analysis_result = {
                'columns': df.columns.tolist(),
                'row_count': len(df),
            }
        except Exception as e:
            return jsonify({'message': f'Error processing file: {str(e)}'}), 500

        return jsonify({'message': 'File uploaded successfully!', 'analysis_result': analysis_result}), 200

    if lowered_name.endswith('.pdf'):
        try:
            # Read the PDF text with pdfplumber.
            collected = []
            collected_length = 0
            with pdfplumber.open(file_path) as pdf:
                for page in pdf.pages:
                    # extract_text() may yield None for a page without a text
                    # layer; treat that as an empty string instead of crashing.
                    page_text = page.extract_text() or ""
                    collected.append(page_text)
                    collected_length += len(page_text)
                    # Only a short preview is returned, so stop once enough
                    # text has been gathered instead of parsing every page.
                    if collected_length >= preview_length:
                        break

            preview_text = "".join(collected)[:preview_length]
            return jsonify({'message': 'PDF file uploaded successfully!', 'preview': preview_text}), 200
        except Exception as e:
            return jsonify({'message': f'Error processing PDF file: {str(e)}'}), 500

    # Defensive fallback: only reachable if allowed_file() accepts an
    # extension that has no handler above.
    return jsonify({'message': 'Invalid file type.'}), 400


def allowed_file(filename):
    """Return True when *filename* ends in a ``csv`` or ``pdf`` extension.

    The extension is the text after the last dot and is compared
    case-insensitively; a name without any dot is rejected.
    """
    _stem, dot, extension = filename.rpartition('.')
    if not dot:
        return False
    return extension.lower() in ('csv', 'pdf')

# Start the Flask development server on its own thread so the notebook cell
# returns immediately instead of blocking on app.run().
if __name__ == '__main__':
    threading.Thread(
        target=app.run,
        kwargs={'host': '127.0.0.1', 'port': 5000, 'debug': False},
    ).start()


 * Serving Flask app '__main__'
 * Debug mode: off


 * Running on http://127.0.0.1:5000
Press CTRL+C to quit


In [4]:
import socket

def check_port_in_use(port):
    """Return True if a TCP connection to 127.0.0.1:*port* succeeds.

    ``connect_ex`` reports failure through its return code instead of
    raising, and a return code of 0 means something accepted the connection.
    """
    probe = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    try:
        status = probe.connect_ex(('127.0.0.1', port))
    finally:
        probe.close()
    return status == 0

# Report whether the Flask server's port already has a listener.
port = 5000
print(f"Port {port} is in use." if check_port_in_use(port) else f"Port {port} is available.")


Port 5000 is in use.


In [None]:
import configparser
import os
import fitz  # PyMuPDF
from PIL import Image

# Load settings from config.ini in the current working directory.
# ConfigParser.read() silently ignores a missing file, in which case the
# DEFAULT section is empty and the fallbacks below apply.
config = configparser.ConfigParser()
current_path = os.getcwd()
config.read(os.path.join(current_path, 'config.ini'))

# Input/output locations, falling back to ./input and ./output when the key
# is absent. The settings are read once, with fallbacks: re-reading them with
# plain indexing would raise KeyError for a missing key and defeat the defaults.
input_path = config['DEFAULT'].get('Input-Path', os.path.join(current_path, 'input'))
output_path = config['DEFAULT'].get('Output-Path', os.path.join(current_path, 'output'))
# Verbose progress messages; off unless the config enables it.
debug = config['DEFAULT'].getboolean('Debug', fallback=False)

# Create the input directory if it is missing so the later listing cannot fail.
if not os.path.exists(input_path):
    os.makedirs(input_path)
    print(f"Created input directory: {input_path}")

# Create the output directory if it is missing.
if not os.path.exists(output_path):
    os.makedirs(output_path)
    print(f"Created output directory: {output_path}")

# Extract the images embedded in a PDF
def extract_images_from_pdf(pdf_path, output_folder):
    """Save every image embedded in the PDF at *pdf_path* into *output_folder*.

    Files are named ``image_<page>_<index>.<ext>`` (both numbers 1-based), so
    names are unique within one document. Failures are reported with
    ``print`` rather than raised: if the PDF cannot be opened the function
    returns without creating *output_folder*; if one image cannot be
    extracted it is skipped and processing continues.

    Reads the module-level ``debug`` flag to decide whether to print extra
    progress messages.

    Args:
        pdf_path: Path of the PDF file to read.
        output_folder: Directory to write the images to; created if missing.

    Returns:
        None.
    """
    try:
        pdf_document = fitz.open(pdf_path)
    except Exception as e:
        print(f"Failed to open PDF file {pdf_path}: {e}")
        return

    # From here on the document is open: always close it, even if a page
    # fails unexpectedly, so the file handle is not leaked across many PDFs.
    try:
        if not os.path.exists(output_folder):
            os.makedirs(output_folder)
            if debug:
                print(f"Created folder: {output_folder}")

        for page_num in range(len(pdf_document)):
            page = pdf_document.load_page(page_num)
            image_list = page.get_images(full=True)

            if not image_list:
                if debug:
                    print(f"No images found on page {page_num + 1}.")
                continue

            for image_index, img in enumerate(image_list):
                # The first item of each entry is the image's xref number,
                # which extract_image() takes as its lookup key.
                xref = img[0]
                try:
                    base_image = pdf_document.extract_image(xref)
                    image_bytes = base_image["image"]
                    image_ext = base_image["ext"]
                    # page_num and image_index together keep file names unique.
                    image_name = f"image_{page_num + 1}_{image_index + 1}.{image_ext}"
                    image_path = os.path.join(output_folder, image_name)

                    with open(image_path, "wb") as image_file:
                        image_file.write(image_bytes)
                    print(f"Saved image: {image_path}")

                except Exception as e:
                    print(f"Failed to extract image {image_index + 1} on page {page_num + 1}: {e}")
    finally:
        pdf_document.close()

# Render each PDF page to a PNG image
def render_pdf_page_as_image(pdf_path, output_folder, zoom=2):
    """Render every page of the PDF at *pdf_path* to ``page_<n>.png``.

    Failures are reported with ``print`` rather than raised: if the PDF
    cannot be opened the function returns without creating *output_folder*;
    if one page cannot be rendered it is skipped and processing continues.

    Reads the module-level ``debug`` flag to decide whether to print extra
    progress messages.

    Args:
        pdf_path: Path of the PDF file to read.
        output_folder: Directory to write the PNG files to; created if missing.
        zoom: Scale factor applied to both axes when rendering.

    Returns:
        None.
    """
    try:
        pdf_document = fitz.open(pdf_path)
    except Exception as e:
        print(f"Failed to open PDF file {pdf_path}: {e}")
        return

    # From here on the document is open: always close it, even if rendering
    # fails unexpectedly, so the file handle is not leaked across many PDFs.
    try:
        if not os.path.exists(output_folder):
            os.makedirs(output_folder)
            if debug:
                print(f"Created folder: {output_folder}")

        # The scaling matrix is the same for every page, so build it once.
        mat = fitz.Matrix(zoom, zoom)

        for page_num in range(len(pdf_document)):
            page = pdf_document.load_page(page_num)
            try:
                pix = page.get_pixmap(matrix=mat)
                image_name = f"page_{page_num + 1}.png"
                image_path = os.path.join(output_folder, image_name)
                pix.save(image_path)
                print(f"Rendered page {page_num + 1} as image: {image_path}")
            except Exception as e:
                print(f"Failed to render page {page_num + 1}: {e}")
    finally:
        pdf_document.close()

# Collect the PDF files located directly inside input_path
# (the extension is matched case-insensitively).
pdf_files = list(filter(lambda name: name.lower().endswith('.pdf'), os.listdir(input_path)))

if pdf_files:
    print(f"Found {len(pdf_files)} PDF file(s) in the input directory.")
else:
    print("No PDF files found in the input directory.")

# Run image extraction and page rendering for each PDF in turn.
for pdf_file in pdf_files:
    # Every PDF gets its own sub-folder of output_path, named after the file
    # without its extension.
    pdf_base_name = os.path.splitext(pdf_file)[0]
    pdf_path = os.path.join(input_path, pdf_file)
    rendered_pages_folder = os.path.join(output_path, pdf_base_name, "rendered_pages")
    images_output_folder = os.path.join(output_path, pdf_base_name, "extracted_images")

    print(f"\nProcessing PDF: {pdf_file}")

    # Step 1: embedded images.
    print("Extracting images...")
    extract_images_from_pdf(pdf_path, images_output_folder)

    # Step 2: full-page renders.
    print("Rendering pages as images...")
    render_pdf_page_as_image(pdf_path, rendered_pages_folder, zoom=2)

    print(f"Completed processing {pdf_file}\n")

print("All PDF files have been processed.")



In [None]:
import os
import base64
import json
import csv

from dotenv import load_dotenv
import openai
from tqdm import tqdm

# Assuming you have already set up your OpenAI client
from openai import OpenAI

# Load variables from a local .env file (if present) into the environment.
load_dotenv()

# SECURITY: the API key must never be written into the notebook. It is read
# from the OPENAI_API_KEY environment variable instead. Any key that was
# previously committed in this cell should be treated as leaked and revoked.
api_key = os.getenv("OPENAI_API_KEY")
if not api_key:
    raise RuntimeError(
        "OPENAI_API_KEY is not set; export it or add it to a .env file."
    )

client = OpenAI(
    api_key=api_key,
    # Endpoint of the API gateway; can be overridden through OPENAI_BASE_URL.
    base_url=os.getenv("OPENAI_BASE_URL", "https://www.gptapi.us/v1"),
)

# Model used for extraction; its name is also used as the output directory.
model_name = "gpt-4o"

# Create the output directory if it doesn't exist
os.makedirs(model_name, exist_ok=True)
output_path = f"{model_name}/processed_data.csv"

# Directory holding the rendered page images. Built with os.path.join so the
# path works on every platform, not only with Windows backslashes.
extracted_images_dir = os.path.join(
    '.', 'output', 'ST-Engineering-Sustainability-Report2020', 'rendered_pages'
)

# Function to encode images
def encode_image(image_path):
    """Return the content of the file at *image_path* as Base64 text."""
    with open(image_path, "rb") as handle:
        raw_bytes = handle.read()
    encoded = base64.b64encode(raw_bytes)
    return encoded.decode('utf-8')

# Prompts sent along with every page image.
# system_prompt sets the assistant's role. text_content is the user
# instruction: it names the JSON fields expected in the reply (three index
# descriptors plus one value per year 2019-2023) and ends with one example
# object. The text is sent to the model verbatim, so edit it with care.
system_prompt = 'You are an expert in data extraction, specializing in extracting tabular data from images.'
text_content = '''
Extract meaningful data, especially numbers and related textual information, 
from the image and convert it into a structured 
JSON format with the following fields: "Index class", "Index prefix", "Index name", 
"2019", "2020", "2021", "2022", and "2023". 
If any field does not exist, set its value to null. 
Please retain all data in its original format, especially numeric values, and avoid any translation or conversion to other languages. 
Skip extraction if there is no meaningful data in the image.
**Note:** There may be multiple sets of data in this format on a single page. Please extract all relevant information; otherwise, penalties may apply.
For example, the JSON structure should look like this: 
JSON structure
"""
{
    "Index class": "Environmental", 
    "Index prefix": "CO2 emissions at Scope 1 production facilities (t CO2 eq)", 
    "Index name": "Thermal generating plants", 
    "2019": "7123465",
    "2020": "9123465",
    "2021": "10423465",
    "2022": "10523465",
    "2023": "10623465"
}
"""
'''



# CSV columns that follow "Image Name"; they are also the JSON keys the
# prompt asks the model to return for each record.
FIELD_NAMES = ["Index class", "Index prefix", "Index name", "2019", "2020", "2021", "2022", "2023"]


def _iter_records(payload):
    """Yield each record dict contained in a decoded model response.

    Handles a single record, a list of records, and an object that wraps one
    or more lists of records under arbitrary keys (a shape that can occur
    because the request asks for a JSON *object* while a page may hold
    several records). List items that are not dicts are ignored.
    """
    if isinstance(payload, list):
        for item in payload:
            if isinstance(item, dict):
                yield item
    elif isinstance(payload, dict):
        nested_lists = [value for value in payload.values() if isinstance(value, list)]
        is_wrapper = nested_lists and not any(key in payload for key in FIELD_NAMES)
        if is_wrapper:
            for group in nested_lists:
                yield from _iter_records(group)
        else:
            yield payload


# Send every rendered page to the model and write the returned records to CSV.
with open(output_path, mode='w', newline='', encoding='utf-8') as file:
    writer = csv.writer(file)
    # Header row
    writer.writerow(["Image Name"] + FIELD_NAMES)

    if os.path.exists(extracted_images_dir):
        # Pages are named page_1.png .. page_N.png. range() excludes its upper
        # bound, so the bound must be N + 1 or the last page is never processed.
        page_count = len(os.listdir(extracted_images_dir))
        for i in range(1, page_count + 1):
            file_name = f'page_{i}.png'
            image_path = os.path.join(extracted_images_dir, file_name)
            if not os.path.isfile(image_path):
                # Nothing to send for this index. Skip the whole iteration so
                # the previous page's response is never reused for it.
                continue

            base64_image = encode_image(image_path)

            messages = [
                {"role": "system", "content": system_prompt},
                {
                    "role": "user",
                    "content": [
                        {"type": "text", "text": text_content},
                        {
                            "type": "image_url",
                            # The rendered pages are PNG files, so label the
                            # data URL accordingly.
                            "image_url": {"url": f"data:image/png;base64,{base64_image}"},
                        },
                    ],
                },
            ]

            try:
                resp = client.chat.completions.create(
                    model=model_name,
                    response_format={"type": "json_object"},
                    messages=messages,
                    temperature=0.0,
                )
            except Exception as e:
                # One failed request (network, rate limit, ...) should not
                # discard the rows already collected for other pages.
                print(f"API request failed for {file_name}: {e}")
                continue

            extracted_data = resp.choices[0].message.content
            print(f"Extracted data for {file_name}: {extracted_data}")

            if not extracted_data:
                # Empty reply: nothing is written for this page.
                print(f"Extracted data is empty for {file_name}")
                continue

            try:
                data = json.loads(extracted_data)
            except json.JSONDecodeError as e:
                print(f"JSON decode error for {file_name}: {e}")
                continue

            if not isinstance(data, (list, dict)):
                print(f"Unexpected JSON structure for {file_name}")
                continue

            for record in _iter_records(data):
                # Missing keys become empty cells; JSON null is also written
                # as an empty cell by csv.writer.
                writer.writerow([file_name] + [record.get(field, '') for field in FIELD_NAMES])
    else:
        print(f"Image directory not found: {extracted_images_dir}")

print(f"Data saved to {output_path}")

In [None]:
import os
import base64
import json
import csv
from dotenv import load_dotenv
import openai
from tqdm import tqdm

# Load environment variables from a local .env file, if there is one.
load_dotenv()

# Hand the API key from the environment to the openai module.
openai.api_key = os.environ.get("OPENAI_API_KEY")

# Model to query; its name is also used as the output directory.
model_name = "gpt-4"
output_path = f"{model_name}/processed_data.csv"

# Create the output directory.
os.makedirs(model_name, exist_ok=True)

# Directory containing the page images to process.
extracted_images_dir = r'./output/ST-Engineering-Sustainability-Report2020/rendered_pages'

# Helper that Base64-encodes an image file
def encode_image(image_path):
    """Read the file at *image_path* and return its bytes as Base64 text."""
    image_file = open(image_path, "rb")
    try:
        payload = image_file.read()
    finally:
        image_file.close()
    return str(base64.b64encode(payload), 'utf-8')

# System prompt and user instruction sent with every image.
# system_prompt sets the assistant's role. text_content names the JSON
# fields expected in the reply (three index descriptors plus one value per
# year 2019-2023). The text is sent to the model verbatim.
system_prompt = 'You are an expert in data extraction, specializing in extracting tabular data from images.'
text_content = '''
Extract meaningful data, especially numbers and related textual information, 
from the image and convert it into a structured 
JSON format with the following fields: "Index class", "Index prefix", "Index name", 
"2019", "2020", "2021", "2022", and "2023". 
If any field does not exist, set its value to null. 
Please retain all data in its original format, especially numeric values, and avoid any translation or conversion to other languages. 
Skip extraction if there is no meaningful data in the image.
'''

# CSV columns that follow "Image Name"; they are also the JSON keys the
# prompt asks the model to return for each record.
CSV_FIELDS = ["Index class", "Index prefix", "Index name", "2019", "2020", "2021", "2022", "2023"]


def _natural_key(name):
    """Sort key that orders page_2.png before page_10.png."""
    digits = ''.join(ch for ch in name if ch.isdigit())
    return (int(digits) if digits else 0, name)


def _strip_code_fence(text):
    """Return *text* without a surrounding Markdown ``` fence, if it has one.

    No JSON response format is requested here, so the reply may be wrapped
    in a ```json ... ``` block that json.loads() would reject.
    """
    stripped = text.strip()
    if stripped.startswith("```") and stripped.endswith("```"):
        inner = stripped[3:-3]
        # Drop an optional language tag such as "json" on the opening line.
        first_line, newline, rest = inner.partition("\n")
        if newline and first_line.strip().isalpha():
            inner = rest
        return inner.strip()
    return stripped


# Send every image to the model and write the returned records to the CSV file.
with open(output_path, mode='w', newline='', encoding='utf-8') as file:
    writer = csv.writer(file)
    # Header row
    writer.writerow(["Image Name"] + CSV_FIELDS)

    if os.path.exists(extracted_images_dir):
        # os.listdir() returns names in arbitrary order; sort them so the CSV
        # follows page order. The extension check ignores case.
        image_files = sorted(
            (f for f in os.listdir(extracted_images_dir) if f.lower().endswith(('.png', '.jpg'))),
            key=_natural_key,
        )
        if not image_files:
            print(f"No image files found in the directory: {extracted_images_dir}")
        else:
            for file_name in tqdm(image_files, desc="Processing images"):
                image_path = os.path.join(extracted_images_dir, file_name)
                if not os.path.isfile(image_path):
                    continue

                base64_image = encode_image(image_path)
                mime_type = 'image/png' if file_name.lower().endswith('.png') else 'image/jpeg'

                # The image has to travel as an image_url content part. Sent
                # as a plain text message, the Base64 string is just a very
                # long run of characters the model cannot view as a picture.
                # NOTE(review): the configured model must accept image input
                # - confirm that model_name does.
                messages = [
                    {"role": "system", "content": system_prompt},
                    {
                        "role": "user",
                        "content": [
                            {"type": "text", "text": text_content},
                            {
                                "type": "image_url",
                                "image_url": {"url": f"data:{mime_type};base64,{base64_image}"},
                            },
                        ],
                    },
                ]

                try:
                    # Chat Completions call in the openai>=1.0 style, matching
                    # the `from openai import OpenAI` usage elsewhere in this
                    # notebook; the module-level client uses openai.api_key.
                    resp = openai.chat.completions.create(
                        model=model_name,
                        messages=messages,
                        temperature=0.0,
                    )
                    extracted_data = resp.choices[0].message.content
                    print(f"Extracted data for {file_name}: {extracted_data}")

                    if not extracted_data:
                        continue

                    try:
                        data = json.loads(_strip_code_fence(extracted_data))
                    except json.JSONDecodeError as e:
                        print(f"JSON decode error for {file_name}: {e}")
                        continue

                    if isinstance(data, dict):
                        # A single record: treat it like a one-item list.
                        data = [data]

                    if isinstance(data, list):
                        for row in data:
                            if not isinstance(row, dict):
                                # Skip stray scalars rather than crash on .get().
                                continue
                            writer.writerow([file_name] + [row.get(field, '') for field in CSV_FIELDS])
                    else:
                        # Neither an object nor an array.
                        print(f"Unexpected JSON structure for {file_name}")
                except Exception as e:
                    # Keep going so one failed image does not lose the rest.
                    print(f"An error occurred while processing {file_name}: {e}")
    else:
        print(f"Image directory not found: {extracted_images_dir}")

print(f"Data saved to {output_path}")
