In [None]:
import re
import random
import pdfplumber
import pandas as pd
from datetime import datetime, timedelta
from collections import Counter
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
import sys

# Pick a single random date from the last 7 days (today included)
def random_date():
    """Return one randomly chosen recent date as a ``YYYY-MM-DD`` string.

    The date is today minus a uniformly drawn whole number of days in the
    range 0..6, so the result is always within the past week.
    """
    days_back = random.randint(0, 6)
    chosen_day = datetime.today() - timedelta(days=days_back)
    return chosen_day.strftime("%Y-%m-%d")

# Load data from multiple CSV files for training
def load_training_data(files):
    """Read each CSV in *files* and stack their labelled rows into one frame.

    Only the 'Item Name' and 'Category' columns are kept from every file, in
    that order. The combined frame gets a fresh 0..n-1 index.
    """
    wanted_columns = ['Item Name', 'Category']
    frames = []
    for source in files:
        frames.append(pd.read_csv(source)[wanted_columns])
    return pd.concat(frames, ignore_index=True)

# Extract store names and suburbs from PDF
def extract_stores_from_pdf(pdf_path):
    """Collect ``(store_name, location)`` pairs from the tables in a PDF.

    For every page, the table returned by ``page.extract_table()`` is read
    with its first row skipped. Empty cells are dropped from each remaining
    row, and rows left with fewer than three cells are ignored. The first
    surviving cell becomes the store name; the last six surviving cells
    (or all of them, if there are fewer) are joined with spaces to form the
    location.
    """
    stores = []
    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            table = page.extract_table()
            if not table:
                # Nothing extracted for this page.
                continue
            for raw_row in table[1:]:
                cells = [cell for cell in raw_row if cell]
                if len(cells) >= 3:
                    stores.append((cells[0], " ".join(cells[-6:])))
    return stores

# Train model on Woolworths and Coles data
df_train = load_training_data(['woolworths_cleaned.csv', 'Coles.csv'])
# Hold out 20% of the labelled items; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    df_train['Item Name'], df_train['Category'], test_size=0.2, random_state=42
)

# TF-IDF features over the item-name text feeding a logistic-regression classifier.
model = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('clf', LogisticRegression(max_iter=500))
])

model.fit(X_train, y_train)
# The score is computed on the held-out split, so it is reported as test
# accuracy rather than training accuracy.
print("Test Accuracy:", accuracy_score(y_test, model.predict(X_test)))

# Function to clean and process IGA dataset
def _extract_brands(item_names):
    """Infer brand prefixes from a list of item-name strings.

    Repeatedly takes the first unprocessed name, gathers every remaining name
    that starts with its leading word, and extends the brand one word at a
    time for as long as all of those names agree on the next word. Names
    covered by the resulting brand are then removed from further processing.

    Returns the brands as a list, in the order they were discovered.
    """
    remaining = list(item_names)
    brands = []

    while remaining:
        words = remaining[0].split()
        if not words:
            # Blank name: nothing to infer, but it must still be consumed.
            remaining = remaining[1:]
            continue

        # A lone non-numeric letter is too short to act as a key, so the seed
        # also takes the second word when there is one. The seed is joined
        # with a space so it remains a prefix of the name it was built from.
        single_letter = len(words[0]) == 1 and not words[0].isnumeric()
        seed_len = 2 if single_letter and len(words) > 1 else 1
        seed = " ".join(words[:seed_len])
        matching = [item for item in remaining if item.startswith(seed)]

        if len(matching) <= 1:
            brands.append(seed)
            remaining = remaining[1:]
            continue

        token_lists = [item.split() for item in matching]
        brand_words = words[:seed_len]
        longest = max(len(tokens) for tokens in token_lists)
        for pos in range(seed_len, longest):
            counts = Counter(
                tokens[pos] for tokens in token_lists if len(tokens) > pos
            )
            word, count = counts.most_common(1)[0]
            if count != len(matching):
                # Not every matching name shares this word: the brand ends here.
                break
            brand_words.append(word)

        full_brand = " ".join(brand_words)
        brands.append(full_brand)
        # Always drop the head item as well, so the loop is guaranteed to make
        # progress even when irregular whitespace keeps it from matching.
        remaining = [
            item for item in remaining[1:] if not item.startswith(full_brand)
        ]

    return brands


def clean_and_restructure_data(file_path, stores_list):
    """Load the raw IGA product CSV and reshape it into the output schema.

    Steps: drop fully empty rows, rename the source columns, parse the price
    text into floats, derive ``discount``, ``quantity`` and ``unit_type``,
    attach randomly chosen store, gender and date values, infer a ``brand``
    from shared item-name prefixes, and predict ``category`` with the
    module-level ``model``.

    Args:
        file_path: CSV to read (decoded as ISO-8859-1).
        stores_list: sequence of ``(store_name, location)`` pairs to sample
            from, one per row.

    Returns:
        A DataFrame with the columns listed in ``column_order`` below.

    Raises:
        SystemExit: if any 'Product Code' value is missing.
    """
    df = pd.read_csv(file_path, encoding='ISO-8859-1').dropna(how='all')
    if df['Product Code'].isnull().any():
        sys.exit("Error: 'Product Code' column contains missing values.")

    df.rename(columns={
        'Product Code': 'item_id',
        'Item Name': 'item_name',
        'Best Unit Price': 'best_price',
        'Unit Price': 'unit_price',
        'Best Price': 'total_price',
        'Price Was': 'price_was'
    }, inplace=True)
    df.drop(columns=['Item Price', 'Special Text', 'Link'], inplace=True, errors='ignore')

    # Price columns arrive as text; keep the first decimal number / strip '$'.
    df['best_price'] = df['best_price'].str.extract(r'(\d+\.\d+)', expand=False).astype(float)
    df['price_was'] = df['price_was'].str.replace('$', '', regex=False).astype(float)
    # NOTE(review): best_price comes from 'Best Unit Price'; confirm that the
    # discount is meant to be measured against it rather than total_price.
    df['discount'] = df['price_was'] - df['best_price']
    df['unit_price_numeric'] = df['unit_price'].str.extract(r'(\d+\.\d+)', expand=False).astype(float)
    df['total_price'] = df['total_price'].str.replace('$', '', regex=False).astype(float)

    def calculate_quantity(row):
        """Return (quantity, unit_type) derived from total and unit price."""
        unit_price = row['unit_price_numeric']
        if pd.isnull(unit_price) or unit_price == 0:
            return None, None

        label = str(row['unit_price'])
        units = row['total_price'] / unit_price
        if 'each' in label:
            return round(units, 2), 'each'
        if 'per 100mL' in label:
            # Ten 100 mL units make one litre.
            return round(units / 10, 2), 'L'
        if 'per 100g' in label:
            # Ten 100 g units make one kilogram.
            return round(units / 10, 2), 'Kg'
        if ' kg' in label:
            return round(units, 2), 'Kg'
        if ' L' in label:
            return round(units, 2), 'L'
        return None, None

    df[['quantity', 'unit_type']] = df.apply(lambda row: pd.Series(calculate_quantity(row)), axis=1)
    df['unit_price'] = df['unit_price_numeric']
    df.drop(columns=['unit_price_numeric', 'price_was'], inplace=True)

    row_count = len(df)
    # Assign plain lists rather than a DataFrame: after dropna() the index may
    # have gaps, and an index-aligned assignment would leave NaN stores.
    chosen_stores = random.choices(stores_list, k=row_count)
    df['store_name'] = [store[0] for store in chosen_stores]
    df['location'] = [store[1] for store in chosen_stores]
    df['gender'] = random.choices(['Male', 'Female', 'Others'], k=row_count)
    # NOTE(review): payment_method is generated here but is not listed in
    # column_order, so it does not appear in the returned frame.
    df["payment_method"] = random.choices(
        ["Cash", "Credit Card", "Debit Card", "Gift Card", "Afterpay"], k=row_count
    )
    df['sub_category'] = ''
    df['transaction_id'] = ''
    df['customer_id'] = ''

    # Infer all brands first, then label every row once with the first
    # discovered brand that prefixes its name.
    brands = _extract_brands(df['item_name'].tolist())
    df['brand'] = df['item_name'].apply(
        lambda name: next((brand for brand in brands if name.startswith(brand)), None)
    )

    df['date'] = [random_date() for _ in range(row_count)]
    # One batched prediction instead of one model call per row.
    df['category'] = model.predict(df['item_name'].tolist())

    # Reorder columns
    column_order = [
        'item_id', 'item_name', 'brand', 'category',
        'best_price', 'unit_price', 'total_price', 'discount',
        'quantity', 'unit_type', 'store_name', 'location', 'gender', 'date',
        'sub_category', 'transaction_id', 'customer_id'
    ]
    return df[column_order]

# Process IGA data
# Read the store list from the PDF, clean the raw IGA CSV, and write the result.
stores_list = extract_stores_from_pdf("IGA-STORES.pdf")
df_iga = clean_and_restructure_data('IGA_1.csv', stores_list)
df_iga.to_csv('IGA_1_updated_new.csv', index=False)
# The message names the file that was actually written above.
print("Updated IGA dataset saved as 'IGA_1_updated_new.csv'.")


Training Accuracy: 0.8016764574530214


FileNotFoundError: [Errno 2] No such file or directory: 'IGA-STORES.pdf'