In [1]:
from zipfile import ZipFile
import pandas as pd
import os
import ace_tools as tools


ModuleNotFoundError: No module named 'ace_tools'

### Add tables to compare

In [10]:
data_directory_raw = "Data/LargeZips/"
zip_file_raw = "transArchive_201701.zip"  # Single zip file

data_directory_clean = "Data/clean-files/"
zip_file_clean = "transArchive_201701_clean.csv"  # Single zip file


### Read in raw file


In [11]:
with ZipFile(data_directory_raw + zip_file_raw, 'r') as zf:
    # Get the list of files in the zip and assign the desired file to a variable
    file_name = zf.namelist()[0]  # Use variable to store the first file name (or choose the desired one)
    print(f"Reading file: {file_name}")
    
    # Open the file using the variable
    with zf.open(file_name) as f:
        df_raw = pd.read_csv(f, sep=',')

# Step 5: Print the shape of the DataFrame
print("Shape of the data:", df_raw.shape)

# Step 6: Get and print descriptive statistics
#print("Descriptive statistics:")
#print(df_raw.describe())

Reading file: transArchive_201701.csv


  df_raw = pd.read_csv(f, sep=',')


Shape of the data: (936740, 50)


### Read in Clean File

In [12]:
with open(data_directory_clean + zip_file_clean, 'r') as f:
    df_clean = pd.read_csv(f, sep=',')

# Step 5: Print the shape of the DataFrame
print("Shape of the data:", df_clean.shape)

# Step 6: Get and print descriptive statistics
#print("Descriptive statistics:")
#print(df_clean.describe())

Shape of the data: (936741, 50)


In [None]:
# Step 1: Check if the number of columns matches
if df_clean.shape[1] != df_raw.shape[1]:
    print(f"Column mismatch: Cleaned DataFrame has {df_clean.shape[1]} columns, raw DataFrame has {df_raw.shape[1]} columns")

# Step 2: Check if column names match
if list(df_clean.columns) != list(df_raw.columns):
    print(f"Column name mismatch: Cleaned DataFrame columns are {list(df_clean.columns)}, raw DataFrame columns are {list(df_raw.columns)}")

# Step 3: Check if column data types match
if (df_clean.dtypes != df_raw.dtypes).any():
    print(f"Data type mismatch:\nCleaned DataFrame types:\n{df_clean.dtypes}\nRaw DataFrame types:\n{df_raw.dtypes}")


In [14]:
# Function to collect comparison metrics from a DataFrame
def collect_stats(df, df_name):
    stats = {
        "Dataset": df_name,
        "Columns": [', '.join(df.columns)],  # Joining all column names as a string
        "Data Types": [df.dtypes.to_dict()],  # Store data types as a dictionary
        "Null Values": [df.isnull().sum().to_dict()],  # Store null counts as a dictionary
        "Range": [
            {
                col: f"{df[col].min()} to {df[col].max()}"
                if pd.api.types.is_numeric_dtype(df[col])
                else "N/A"
                for col in df.columns
            }
        ],  # Store range for numeric columns, "N/A" for non-numeric
        "Unique Values": [df.nunique().to_dict()],  # Store unique value counts
    }
    return pd.DataFrame(stats)  # Return as a DataFrame for easy comparison

# Collect stats for both DataFrames
clean_stats = collect_stats(df_clean, "Clean DataFrame")
raw_stats = collect_stats(df_raw, "Raw DataFrame")

# Concatenate both stats DataFrames for side-by-side comparison
comparison_table = pd.concat([clean_stats, raw_stats], ignore_index=True)

# Display the comparison table
tools.display_dataframe_to_user(name="DataFrame Comparison", dataframe=comparison_table)



--- Clean DataFrame ---

Column Names:
['datetime', 'register_no', 'emp_no', 'trans_no', 'upc', 'description', 'trans_type', 'trans_subtype', 'trans_status', 'department', 'quantity', 'Scale', 'cost', 'unitPrice', 'total', 'regPrice', 'altPrice', 'tax', 'taxexempt', 'foodstamp', 'wicable', 'discount', 'memDiscount', 'discountable', 'discounttype', 'voided', 'percentDiscount', 'ItemQtty', 'volDiscType', 'volume', 'VolSpecial', 'mixMatch', 'matched', 'memType', 'staff', 'numflag', 'itemstatus', 'tenderstatus', 'charflag', 'varflag', 'batchHeaderID', 'local', 'organic', 'display', 'receipt', 'card_no', 'store', 'branch', 'match_id', 'trans_id']

Data Types:
datetime            object
register_no          int64
emp_no               int64
trans_no             int64
upc                 object
description         object
trans_type          object
trans_subtype       object
trans_status        object
department           int64
quantity           float64
Scale                int64
cost        