In [1]:
import os
import pandas as pd
import json

# Define base schemas for each type of data
# Column names a customer dataset is expected to expose. Only the keys are
# consulted during validation; every value is an empty-string placeholder.
BASE_CUSTOMER_SCHEMA = dict.fromkeys(
    (
        "Customer_ID",
        "Last_Used_Platform",
        "Is_Blocked",
        "Created_At",
        "Language",
        "Outstanding_Amount",
        "Loyalty_Points",
        "Number_of_employees",
    ),
    "",
)

# Field names an orders dataset is expected to expose. Only the keys are
# consulted during validation; every value is an empty-string placeholder.
BASE_ORDERS_SCHEMA = dict.fromkeys(
    (
        "Order_ID",
        "Order_Status",
        "Category_Name",
        # Add other fields as per your requirements
    ),
    "",
)

def load_csv(file_path):
    """Read the CSV at *file_path* into a pandas DataFrame.

    Any exception raised while reading is reported on stdout and swallowed,
    in which case ``None`` is returned instead of a DataFrame.
    """
    try:
        frame = pd.read_csv(file_path)
    except Exception as err:
        print(f"Error loading CSV file '{file_path}': {err}")
        return None
    return frame

def load_json(file_path):
    """Parse the JSON document at *file_path* and return the decoded object.

    The file is decoded as UTF-8 regardless of the platform's default text
    encoding, and a leading byte-order mark (if present) is skipped so that
    files saved "with BOM" still parse.

    Returns the decoded Python object, or ``None`` if the file could not be
    opened, decoded, or parsed; in that case the error is printed to stdout.
    """
    try:
        # "utf-8-sig" reads plain UTF-8 as well as UTF-8 with a BOM; relying
        # on the locale default would mis-decode non-ASCII text on systems
        # whose default encoding is not UTF-8.
        with open(file_path, 'r', encoding='utf-8-sig') as json_file:
            return json.load(json_file)
    except Exception as e:
        # Deliberately best-effort: callers treat None as "skip this file".
        print(f"Error loading JSON file '{file_path}': {e}")
        return None

def validate_schema(data, base_schema):
    """Return True when every key of *base_schema* is found in *data*.

    *base_schema* is iterated for its keys and each one is tested with the
    ``in`` operator against *data*; the first missing key ends the check.
    """
    for required_key in base_schema:
        if required_key not in data:
            return False
    return True

def _schema_for_file(file_name, default_schema):
    """Pick the base schema that matches *file_name*, else *default_schema*.

    NOTE(review): matching is by a case-insensitive keyword in the file name
    ("customer" / "order"); this assumes file names describe their content,
    as in 'Market 1 Orders.csv' -- confirm against the real naming scheme.
    """
    lowered = file_name.lower()
    if "customer" in lowered:
        return BASE_CUSTOMER_SCHEMA
    if "order" in lowered:
        return BASE_ORDERS_SCHEMA
    return default_schema


def load_data_from_folder(folder_path, num_files_to_process=3):
    """Load and schema-check the most recently modified files in a folder.

    Regular files directly inside *folder_path* are ordered by modification
    time, newest first, and the first *num_files_to_process* of them are
    examined. ``.csv`` files are loaded with ``load_csv`` and ``.json`` files
    with ``load_json`` (extension match is case-insensitive); anything else
    is reported as unsupported. The outcome of each check is printed.

    Args:
        folder_path: Directory to scan (not recursive).
        num_files_to_process: How many of the newest files to examine.

    Returns:
        None. Results are reported on stdout only.
    """
    # Sub-directories are skipped so they neither use up one of the
    # "most recent" slots nor get reported as an unsupported file format.
    candidates = []
    for entry in os.listdir(folder_path):
        entry_path = os.path.join(folder_path, entry)
        if os.path.isfile(entry_path):
            candidates.append((entry, os.path.getmtime(entry_path)))
    candidates.sort(key=lambda item: item[1], reverse=True)

    for file_name, _ in candidates[:num_files_to_process]:
        file_path = os.path.join(folder_path, file_name)
        lowered_name = file_name.lower()

        if lowered_name.endswith('.csv'):
            # Load CSV file
            csv_data = load_csv(file_path)
            if csv_data is None:
                continue  # load_csv already reported the error
            # The schema follows the file's content (taken from its name);
            # files with no recognised keyword keep the previous CSV default.
            schema = _schema_for_file(file_name, BASE_CUSTOMER_SCHEMA)
            if validate_schema(csv_data.columns.tolist(), schema.keys()):
                print(f"CSV file '{file_name}' schema validated successfully.")
                # Process or store the CSV data here
            else:
                print(f"Schema validation failed for CSV file '{file_name}'.")

        elif lowered_name.endswith('.json'):
            # Load JSON file
            json_data = load_json(file_path)
            if json_data is None:
                continue  # load_json already reported the error
            # Files with no recognised keyword keep the previous JSON default.
            schema = _schema_for_file(file_name, BASE_ORDERS_SCHEMA)
            is_record_list = (
                isinstance(json_data, list)
                and bool(json_data)
                and all(isinstance(record, dict) for record in json_data)
            )
            if is_record_list:
                # A list of objects is validated record by record; checking
                # the keys against the list itself could never succeed.
                is_valid = all(
                    validate_schema(record, schema) for record in json_data
                )
            else:
                is_valid = validate_schema(json_data, schema)
            if is_valid:
                print(f"JSON file '{file_name}' schema validated successfully.")
                # Process or store the JSON data here
            else:
                print(f"Schema validation failed for JSON file '{file_name}'.")

        else:
            print(f"Unsupported file format: '{file_name}'")

def main(folder_path=None):
    """Run the folder validation, using a default data directory if none given.

    Args:
        folder_path: Directory to scan. When omitted (or ``None``), the
            project's hard-coded data directory is used, which preserves the
            behaviour of calling ``main()`` with no arguments.

    Returns:
        None. If the directory does not exist an error is printed and the
        function returns without scanning anything.
    """
    if folder_path is None:
        folder_path = 'C:/Users/kobby/OneDrive/Documents/Data Analysis/trestle_project/data-pipeline-for-a-large-e-commerce-platform/data_dir'  # Specify your folder path here
    if not os.path.isdir(folder_path):
        print(f"Error: Folder '{folder_path}' not found.")
        return

    load_data_from_folder(folder_path)

if __name__ == "__main__":
    main()


Schema validation failed for CSV file 'Market 1 Orders.csv'.
Schema validation failed for CSV file 'Market 1 Deliveries.csv'.
Schema validation failed for JSON file 'Market 1 Customers.json'.


  df = pd.read_csv(file_path)
