In [2]:
import pdfplumber
import pandas as pd
import numpy as np
import os
import textEx
from datetime import datetime
import pandas as pd

In [7]:
def readAllZheLangPDF(dir:str) -> pd.DataFrame:
    """Collect the quotation rows from every PDF file in a directory.

    Each ``.pdf`` file in ``dir`` is opened with pdfplumber and read page by
    page. A text line is kept when it starts with a digit, or when it consists
    only of digits once surrounding whitespace is stripped; every other line
    is dropped.

    Args:
        dir: Directory to scan (not recursive). Files whose name does not end
            with ``.pdf`` are ignored.

    Returns:
        A DataFrame with two columns: ``name`` (the kept line, unmodified) and
        ``date`` (the file name with ``.pdf`` and ``哲朗現貨`` removed). The
        frame is empty, but still has both columns, when nothing was found.
    """
    column_names = ['name', 'date']
    frames = []
    # Sorted so the row order does not depend on the file-system listing order.
    for filename in sorted(os.listdir(dir)):
        if not filename.endswith('.pdf'):
            continue
        file_path = os.path.join(dir, filename)
        filtered_lines = []
        with pdfplumber.open(file_path) as pdf:
            for page in pdf.pages:
                # Handle each page's text on its own: accumulating the text and
                # re-splitting it per page would append earlier pages' rows
                # again and glue the last line of one page to the first line of
                # the next. `or ""` keeps a page that yields no text from
                # breaking the split.
                page_text = page.extract_text() or ""
                for line in page_text.split('\n'):
                    if line.strip().isdigit() or (line.strip() and line[0].isdigit()):
                        filtered_lines.append(line)
        df = pd.DataFrame(filtered_lines, columns=['name'])
        df['date'] = filename.replace(".pdf","").replace("哲朗現貨","")
        frames.append(df)

    if not frames:
        return pd.DataFrame(columns=column_names)
    # One concat at the end instead of growing the frame file by file.
    return pd.concat(frames, ignore_index=True)

In [8]:
# Build the folder path with os.path.join so the load also works where the
# path separator is not a backslash; on Windows the resulting string is the
# same as the previous hard-coded one.
df = readAllZheLangPDF(os.path.join("quotation", "哲朗"))

In [11]:
import re

In [14]:
def convert_to_standard_date(date_str, default_year=2024):
    """Normalize a date written in one of several layouts to ``YYYY-MM-DD``.

    The layouts are tried in this order: ``%m月%d日`` (no year),
    ``%d-%m-%Y``, ``%Y-%m-%d`` and ``%Y年%m月%d日``. The first one that
    parses wins.

    Args:
        date_str: The date text. It is converted with ``str()`` and stripped
            of surrounding whitespace before parsing.
        default_year: Year assumed for layouts that carry no year.

    Returns:
        The date formatted as ``%Y-%m-%d``, or the string
        ``"Invalid date format"`` when no layout matches.
    """
    date_formats = ["%m月%d日", "%d-%m-%Y", "%Y-%m-%d", "%Y年%m月%d日"]
    standard_format = "%Y-%m-%d"
    text = str(date_str).strip()
    year_prefix = f"{int(default_year):04d}年"

    for date_format in date_formats:
        try:
            if "%Y" in date_format:
                date_obj = datetime.strptime(text, date_format)
            else:
                # The layout has no year: parse it together with the default
                # year. Parsing first and replacing the year afterwards
                # validates the day against strptime's own default year
                # (1900), which rejects February 29 even when the default
                # year is a leap year.
                date_obj = datetime.strptime(year_prefix + text, "%Y年" + date_format)
        except ValueError:
            continue
        return date_obj.strftime(standard_format)

    return "Invalid date format"
def _specToList(spec :list) -> list:

    # Sort 'spec' in alphabetical order
    spec.sort()

    # Initialize variables with empty strings
    spec1 = spec2 = spec3 = spec4 = spec5 = spec6 = ""

    # Assign values from 'spec' to 'spec1' to 'spec6' based on the length of 'spec'
    if len(spec) > 0:
        spec1 = spec[0]
    if len(spec) > 1:
        spec2 = spec[1]
    if len(spec) > 2:
        spec3 = spec[2]
    if len(spec) > 3:
        spec4 = spec[3]
    if len(spec) > 4:
        spec5 = spec[4]
    if len(spec) > 5:
        spec6 = spec[5]

    return [spec1,spec2,spec3,spec4,spec5,spec6]
def extract_price(text):
    """Return the last decimal number found in ``text``, as a string.

    Only numbers written as digits, a dot, then digits are considered.
    ``None`` is returned when ``text`` contains no such number.
    """
    decimal_numbers = re.findall(r'\d+\.\d+', text)
    return decimal_numbers[-1] if decimal_numbers else None
def getZheLangQuote(df: pd.DataFrame) -> pd.DataFrame:
    """Derive the quote table from the raw quotation lines.

    Every derived field is computed from the ``name`` text (cast to ``str``)
    by a ``textEx`` helper or by a helper defined in this notebook;
    ``effectiveDate`` comes from the ``date`` column. Rows are then exploded
    on ``warehouse``, so a row whose warehouse value is a list becomes one row
    per entry, and the original index labels repeat for those rows.

    Args:
        df: Frame with at least the columns ``name`` and ``date``. It is not
            modified; the work happens on a copy.

    Returns:
        A new DataFrame holding exactly these columns, in this order:
        productName, productTag, supplier, category, packing, origin, brand,
        effectiveDate, spec1..spec6, price, weightUnit, warehouse, notes.
        An empty input yields an empty frame with the same columns.
    """
    spec_columns = ['spec1', 'spec2', 'spec3', 'spec4', 'spec5', 'spec6']
    required_columns = [
        'productName', 'productTag', 'supplier', 'category', 'packing',
        'origin', 'brand', 'effectiveDate', 'spec1', 'spec2', 'spec3',
        'spec4', 'spec5', 'spec6', 'price', 'weightUnit', 'warehouse', 'notes'
    ]

    # Work on a copy so the caller's frame does not silently gain columns.
    df = df.copy()
    names = df['name'].astype(str)

    df['specs'] = names.apply(textEx.getSpec)
    df['brand'] = names.apply(textEx.getBrand)
    # Product name = the raw text with the detected brand removed. Rows
    # without a string brand keep the raw text unchanged.
    df['productName'] = [
        name.replace(brand, '') if isinstance(brand, str) else name
        for name, brand in zip(names, df['brand'])
    ]
    df['productTag'] = names.apply(textEx.getProduct)
    df['supplier'] = '哲朗'
    df['category'] = names.apply(textEx.getCategory)
    df['packing'] = names.apply(textEx.getPacking)
    df['origin'] = names.apply(textEx.getCountry)
    df['effectiveDate'] = df['date'].astype(str).apply(convert_to_standard_date)

    # Fill the six spec columns from plain lists. Expanding through
    # apply(pd.Series) returns an empty Series for an empty frame, which
    # cannot be assigned to six columns.
    padded_specs = [_specToList(spec) for spec in df['specs']]
    for position, column in enumerate(spec_columns):
        df[column] = [padded[position] for padded in padded_specs]

    df['price'] = names.apply(extract_price)
    df['weightUnit'] = names.apply(textEx.getWeightUnit)
    df['warehouse'] = names.apply(textEx.getWarehoue)
    df['notes'] = None

    df = df.explode('warehouse')
    # NOTE(review): the str cast turns missing warehouse values into the
    # literal text "None"/"nan" - confirm that is what downstream expects.
    df['warehouse'] = df['warehouse'].astype(str)

    return df[required_columns]

In [15]:
# Turn the raw PDF lines loaded above into the structured quote table.
dfout = getZheLangQuote(df=df)

In [17]:
# Write the quote table to a CSV file in the current working directory.
dfout.to_csv(path_or_buf="哲朗.csv")