In [10]:
from zipfile import ZipFile
import pandas as pd
import os
#import ace_tools as tools
from prettytable import PrettyTable

### Add tables to compare

In [15]:
# Directory and file name of the raw export (a zip archive).
data_directory_raw = "Data/LargeZips/"
zip_file_raw = "transArchive_201001_201003.zip"  # Single zip file

# Directory and file name of the cleaned export.
# NOTE: despite the "zip_file_" prefix, this names a plain CSV, not an archive.
data_directory_clean = "Data/clean-files/"
zip_file_clean = "transArchive_201001_201003_clean.csv"  # Single csv file


### Read in raw file


In [16]:
raw_zip_path = os.path.join(data_directory_raw, zip_file_raw)

with ZipFile(raw_zip_path, 'r') as zf:
    # Do not trust namelist()[0]: an archive may list directory entries or OS
    # metadata (e.g. "__MACOSX/...") ahead of the actual data file.
    data_members = [
        name for name in zf.namelist()
        if not name.endswith("/") and not name.startswith("__MACOSX/")
    ]
    csv_members = [name for name in data_members if name.lower().endswith(".csv")]

    # Prefer a real CSV member; fall back to the first regular file so archives
    # whose data file carries another extension still load.
    candidates = csv_members or data_members
    if not candidates:
        raise ValueError(f"No data file found inside {raw_zip_path}")

    file_name = candidates[0]
    print(f"Reading file: {file_name}")

    # pandas reads directly from the binary stream of the archive member
    with zf.open(file_name) as f:
        df_raw = pd.read_csv(f, sep=',')

# Quick sanity check: (rows, columns) of the raw frame
print("Shape of the data:", df_raw.shape)

# Optional: descriptive statistics (uncomment the two lines below to print them)
#print("Descriptive statistics:")
#print(df_raw.describe())

Reading file: transArchive_201001_201003.csv
Shape of the data: (2998330, 50)


### Read in clean file

In [17]:
# Hand the path straight to pandas instead of a text-mode file handle.
# open(..., 'r') decodes with the platform's locale encoding and applies
# newline translation before pandas sees the data, whereas the raw frame is
# parsed from a binary stream. Letting pandas open the file keeps the decoding
# of both frames consistent, so the comparison is not skewed by how each file
# was opened.
clean_csv_path = os.path.join(data_directory_clean, zip_file_clean)
df_clean = pd.read_csv(clean_csv_path, sep=',')

# Quick sanity check: (rows, columns) of the cleaned frame
print("Shape of the data:", df_clean.shape)

# Optional: descriptive statistics (uncomment the two lines below to print them)
#print("Descriptive statistics:")
#print(df_clean.describe())

Shape of the data: (2998330, 50)


In [None]:
# Step 1: Check if the number of columns matches
if df_clean.shape[1] != df_raw.shape[1]:
    print(f"Column mismatch: Cleaned DataFrame has {df_clean.shape[1]} columns, raw DataFrame has {df_raw.shape[1]} columns")

# Step 2: Check if column names match (order-sensitive)
if list(df_clean.columns) != list(df_raw.columns):
    print(f"Column name mismatch: Cleaned DataFrame columns are {list(df_clean.columns)}, raw DataFrame columns are {list(df_raw.columns)}")

# Step 3: Check if column data types match.
# Comparing the two dtypes Series directly raises ValueError as soon as their
# labels differ in content or order, i.e. exactly when steps 1 and 2 report a
# problem. Restricting both sides to the shared columns, in the same order,
# keeps this check usable in that situation.
shared_columns = [col for col in df_clean.columns if col in df_raw.columns]
clean_dtypes = df_clean.dtypes.loc[shared_columns]
raw_dtypes = df_raw.dtypes.loc[shared_columns]
dtype_differs = clean_dtypes != raw_dtypes

if dtype_differs.any():
    # Show only the columns whose dtypes disagree, side by side
    dtype_report = pd.DataFrame({
        "clean": clean_dtypes[dtype_differs],
        "raw": raw_dtypes[dtype_differs],
    })
    print(f"Data type mismatch on {int(dtype_differs.sum())} shared column(s):\n{dtype_report}")


In [22]:
# Grand total of missing cells in the raw frame: per-column counts, then summed
total_nulls = df_raw.isna().sum(axis=0).sum()

# Report the total
print(f"Total null values in df_raw: {total_nulls}")

Total null values in df_raw: 16472725


In [24]:
# Find the number of nulls in each column
nulls_by_column = df_raw.isnull().sum()

# Print the result
print("Null values by column:")
print(nulls_by_column)

Null values by column:
datetime                 0
register_no              0
emp_no                   0
trans_no                 0
upc                      0
description              0
trans_type               0
trans_subtype       389327
trans_status        673703
department               0
quantity                 0
Scale                    0
cost                     0
unitPrice                0
total                    0
regPrice                 0
altPrice                 0
tax                      0
taxexempt                0
foodstamp                0
wicable                  0
discount                 0
memDiscount              0
discountable             0
discounttype             0
voided                   0
percentDiscount    1094857
ItemQtty                 0
volDiscType              0
volume                   0
VolSpecial               0
mixMatch                 0
matched                  0
memType            2998330
staff                    0
numflag                  0
items

In [26]:
# Display the dtype pandas assigned to each column of the cleaned frame
df_clean.dtypes

datetime            object
register_no          int64
emp_no               int64
trans_no             int64
upc                 object
description         object
trans_type          object
trans_subtype       object
trans_status        object
department           int64
quantity           float64
Scale                int64
cost               float64
unitPrice          float64
total              float64
regPrice           float64
altPrice           float64
tax                  int64
taxexempt            int64
foodstamp            int64
wicable              int64
discount           float64
memDiscount        float64
discountable         int64
discounttype         int64
voided               int64
percentDiscount    float64
ItemQtty           float64
volDiscType          int64
volume               int64
VolSpecial         float64
mixMatch             int64
matched              int64
memType            float64
staff                int64
numflag              int64
itemstatus           int64
t

In [25]:
# Find the number of nulls in each column
nulls_by_column = df_clean.isnull().sum()

# Print the result
print("Null values by column:")
print(nulls_by_column)

Null values by column:
datetime                 0
register_no              0
emp_no                   0
trans_no                 0
upc                      0
description              0
trans_type               0
trans_subtype       389327
trans_status        673703
department               0
quantity                 0
Scale                    0
cost                     0
unitPrice                0
total                    0
regPrice                 0
altPrice                 0
tax                      0
taxexempt                0
foodstamp                0
wicable                  0
discount                 0
memDiscount              0
discountable             0
discounttype             0
voided                   0
percentDiscount    1094857
ItemQtty                 0
volDiscType              0
volume                   0
VolSpecial               0
mixMatch                 0
matched                  0
memType            2998330
staff                    0
numflag                  0
items

In [23]:
# Grand total of missing cells in the cleaned frame: per-column counts, then summed
total_nulls = df_clean.isna().sum(axis=0).sum()

# Report the total
print(f"Total null values in df_clean: {total_nulls}")

Total null values in df_clean: 16472725


In [21]:
# Function to collect column stats safely
def collect_column_stats(df):
    """Summarise every column of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are summarised.

    Returns
    -------
    dict
        Maps each column name (in ``df.columns`` order) to a dict with keys
        "Data Type" (dtype as a string), "Null Values" (count of nulls),
        "Unique Values" (``nunique()`` result) and "Range" ("<min> to <max>"
        for numeric columns, "N/A" otherwise).
    """
    def summarise(series):
        # min/max are only evaluated for numeric dtypes
        if pd.api.types.is_numeric_dtype(series):
            value_range = f"{series.min()} to {series.max()}"
        else:
            value_range = "N/A"

        return {
            "Data Type": str(series.dtype),
            "Null Values": series.isnull().sum(),
            "Unique Values": series.nunique(),
            "Range": value_range,
        }

    return {name: summarise(df[name]) for name in df.columns}

# Collect stats for both DataFrames
clean_stats = collect_column_stats(df_clean)
raw_stats = collect_column_stats(df_raw)

# Create a PrettyTable instance for display
table = PrettyTable()
table.field_names = ["Column", "Metric", "Clean DataFrame", "Raw DataFrame", "Mismatch"]

# Add rows to the table for each column and metric
all_columns = set(df_clean.columns).union(df_raw.columns)  # Handle mismatched columns

for col in all_columns:
    for metric in ["Data Type", "Null Values", "Unique Values", "Range"]:
        clean_value = clean_stats.get(col, {}).get(metric, "N/A")
        raw_value = raw_stats.get(col, {}).get(metric, "N/A")
        
        # Flag if there's a mismatch between the two DataFrames
        mismatch = "Yes" if clean_value != raw_value else "No"
        
        table.add_row([col, metric, clean_value, raw_value, mismatch])

# Print the table
print(table)


+-----------------+---------------+---------------------------+---------------------------+----------+
|      Column     |     Metric    |      Clean DataFrame      |       Raw DataFrame       | Mismatch |
+-----------------+---------------+---------------------------+---------------------------+----------+
|   tenderstatus  |   Data Type   |           int64           |           int64           |    No    |
|   tenderstatus  |  Null Values  |             0             |             0             |    No    |
|   tenderstatus  | Unique Values |             1             |             1             |    No    |
|   tenderstatus  |     Range     |           0 to 0          |           0 to 0          |    No    |
|   discountable  |   Data Type   |           int64           |           int64           |    No    |
|   discountable  |  Null Values  |             0             |             0             |    No    |
|   discountable  | Unique Values |             5             |          