
# ADA: Wedge Process

This repository builds on our Wedge Exploration exercise. This exercise will help you carry out the Wedge project
at an A level. 

You'll write code that carries out the following steps: 

1. Create an empty data frame called `wedge_summary` with the following columns: file_name, num_rows, num_cards, num_dates
1. Iterate over the zip files that hold the Wedge transaction files
2. Unzip each file one at a time (so this will be part of a `for` loop)
3. Use the CSV sniffer to determine the delimiter and whether or not there is a header row. 
4. Read, or attempt to read, the file into a Pandas dataframe, using the delimiter and handling headers correctly. 
5. For each file, store a row in `wedge_summary` that holds the values listed above. `num_cards` should be the unique card numbers in the file and `num_dates` should be the number of dates.  


import os

In [1]:
import os
import io
from zipfile import ZipFile
import pandas as pd
import csv

In [2]:
wedge_summary = pd.DataFrame(columns=['file_name', 'num_rows', 'num_cards', 'num_dates'])


In [3]:
for file in os.listdir("data/SmallZip"):
    print(file)

transArchive_201001_201003_small.zip
transArchive_201004_201006_small.zip
transArchive_201007_201009_small.zip
transArchive_201010_201012_small.zip
transArchive_201101_201103_small.zip
transArchive_201104_small.zip
transArchive_201105_small.zip
transArchive_201106_small.zip
transArchive_201107_201109_small.zip
transArchive_201110_201112_small.zip
transArchive_201201_201203_inactive_small.zip
transArchive_201201_201203_small.zip
transArchive_201204_201206_inactive_small.zip
transArchive_201204_201206_small.zip
transArchive_201207_201209_inactive_small.zip
transArchive_201207_201209_small.zip
transArchive_201210_201212_inactive_small.zip
transArchive_201210_201212_small.zip
transArchive_201301_201303_inactive_small.zip
transArchive_201301_201303_small.zip
transArchive_201304_201306_inactive_small.zip
transArchive_201304_201306_small.zip
transArchive_201307_201309_inactive_small.zip
transArchive_201307_201309_small.zip
transArchive_201310_201312_inactive_small.zip
transArchive_201310_2013

In [4]:
data_directory = "data/SmallZip/"
zip_files = os.listdir("data/SmallZip")

In [5]:
for file in zip_files :
    with ZipFile(data_directory + file,'r') as zf :  
        print(zf.namelist())

['transArchive_201001_201003_small.csv']
['transArchive_201004_201006_small.csv']
['transArchive_201007_201009_small.csv']
['transArchive_201010_201012_small.csv']
['transArchive_201101_201103_small.csv']
['transArchive_201104_small.csv']
['transArchive_201105_small.csv']
['transArchive_201106_small.csv']
['transArchive_201107_201109_small.csv']
['transArchive_201110_201112_small.csv']
['transArchive_201201_201203_inactive_small.csv']
['transArchive_201201_201203_small.csv']
['transArchive_201204_201206_inactive_small.csv']
['transArchive_201204_201206_small.csv']
['transArchive_201207_201209_inactive_small.csv']
['transArchive_201207_201209_small.csv']
['transArchive_201210_201212_inactive_small.csv']
['transArchive_201210_201212_small.csv']
['transArchive_201301_201303_inactive_small.csv']
['transArchive_201301_201303_small.csv']
['transArchive_201304_201306_inactive_small.csv']
['transArchive_201304_201306_small.csv']
['transArchive_201307_201309_inactive_small.csv']
['transArchive_

In [None]:
### How is this different compared to below. I didn't extract below and need to??

#with ZipFile(data_directory+file) as my_zip:
    #print(my_zip.namelist())
    #for zipped_file in my_zip.namelist():
        #x = my_zip.extract(zipped_file)

In [6]:
#current_zf = zip_files[0]
for current_zf in zip_files :
# Open the current zf
    with ZipFile(data_directory + current_zf,'r') as zf :
        zipped_files = zf.namelist()
        
        # Iteraate over each file inside the current zip file
        for file_name in zipped_files :
            # Open and wrap it to read as text
            input_file = io.TextIOWrapper(zf.open(file_name, 'r'), encoding="utf-8")

            for idx, line in enumerate(input_file) :
                print(line)
                if idx > 2 :
                    break
            input_file.close()
        print("\n")


"datetime","register_no","emp_no","trans_no","upc","description","trans_type","trans_subtype","trans_status","department","quantity","Scale","cost","unitPrice","total","regPrice","altPrice","tax","taxexempt","foodstamp","wicable","discount","memDiscount","discountable","discounttype","voided","percentDiscount","ItemQtty","volDiscType","volume","VolSpecial","mixMatch","matched","memType","staff","numflag","itemstatus","tenderstatus","charflag","varflag","batchHeaderID","local","organic","display","receipt","card_no","store","branch","match_id","trans_id"

"2010-01-01 09:04:09","5","17","2","0005385200400","Medium Salsa 16oz GMG","I"," "," ","1","1","0","2.6480","2.9900","2.9900","3.9900","0.0000","0","0","1","0","1.0000","0.0000","1","1","0","0.00000000","1","0","0","0.0000","0","0",,"0","2","0","0","0","0",NULL,"0",NULL,,"0","3","1","0","0","1"

"2010-01-01 09:04:12","5","17","2","0020631400000","ChickenBreastSkinlessBoneless","I"," "," ","13","1","0","4.3700","7.4100","7.4100","5.6900

## Sniffing out the Delimiter 

In [7]:
delimiters = dict() 

# Start by reading in all the files again.

#current_zf = zip_files[0]
for current_zf in zip_files :
# Open the current zf
    with ZipFile(data_directory + current_zf,'r') as zf :
        zipped_files = zf.namelist()
        
        # Iteraate over each file inside the current zip file
        for file_name in zipped_files :
            # Open and wrap it to read as text
            input_file = io.TextIOWrapper(zf.open(file_name, 'r'), encoding="utf-8")
            
            dialect = csv.Sniffer().sniff(sample=input_file.readline(),
                                      delimiters=[",",";","\t"])
            
            delimiters[file_name] = dialect.delimiter
            
            print(" ".join(["For",
                           file_name,
                           "the delimiter is",
                           dialect.delimiter
                           ]))

            input_file.close() # tidy up

For transArchive_201001_201003_small.csv the delimiter is ,
For transArchive_201004_201006_small.csv the delimiter is ,
For transArchive_201007_201009_small.csv the delimiter is ,
For transArchive_201010_201012_small.csv the delimiter is ,
For transArchive_201101_201103_small.csv the delimiter is ,
For transArchive_201104_small.csv the delimiter is ,
For transArchive_201105_small.csv the delimiter is ,
For transArchive_201106_small.csv the delimiter is ,
For transArchive_201107_201109_small.csv the delimiter is ,
For transArchive_201110_201112_small.csv the delimiter is ,
For transArchive_201201_201203_inactive_small.csv the delimiter is ;
For transArchive_201201_201203_small.csv the delimiter is ,
For transArchive_201204_201206_inactive_small.csv the delimiter is ;
For transArchive_201204_201206_small.csv the delimiter is ,
For transArchive_201207_201209_inactive_small.csv the delimiter is ;
For transArchive_201207_201209_small.csv the delimiter is ,
For transArchive_201210_201212_ina

## Checking for Headers 

In [8]:
# Create a function to look for headers

def is_header_row(first_row, second_row):
    """
    Function to determine if the first row is a header based on the types of data in the rows.
    Returns True if the first row looks like a header.
    """
    # Check if the first row contains non-numeric values and second row contains numeric values
    for first, second in zip(first_row, second_row):
        # If the first value is not a number but the second value is, it's likely a header
        if first.isdigit() == False and second.isdigit() == True:
            return True
    return False

In [9]:
headers = dict()
delimiters = dict() 

for current_zf in zip_files :
    with ZipFile(data_directory + current_zf,'r') as zf :
        zipped_files = zf.namelist()

        for file_name in zipped_files :
            # Open and wrap it to read as text
            input_file = io.TextIOWrapper(zf.open(file_name, 'r'), encoding="utf-8")
            
            dialect = csv.Sniffer().sniff(sample=input_file.readline(),
                                      delimiters=[",",";","\t"])
            
            delimiters[file_name] = dialect.delimiter
            
            print(" ".join(["For",
                           file_name,
                           "the delimiter is",
                           dialect.delimiter
                           ]))
            
            this_delimiter = delimiters[file_name]

            # Read the first two lines of the file
            first_line = input_file.readline().strip().split(this_delimiter)
            second_line = input_file.readline().strip().split(this_delimiter)
            
            # Check if the first line is a header row
            has_header = is_header_row(first_line, second_line)
            
            # Print if it has a header
            if has_header:
                print(f"File {file_name} has a header row.")
            else:
                print(f"File {file_name} does NOT have a header row.")
            
            # Append result to headers dictionary
            headers[file_name] = has_header

            input_file.close() # tidy up

For transArchive_201001_201003_small.csv the delimiter is ,
File transArchive_201001_201003_small.csv does NOT have a header row.
For transArchive_201004_201006_small.csv the delimiter is ,
File transArchive_201004_201006_small.csv does NOT have a header row.
For transArchive_201007_201009_small.csv the delimiter is ,
File transArchive_201007_201009_small.csv does NOT have a header row.
For transArchive_201010_201012_small.csv the delimiter is ,
File transArchive_201010_201012_small.csv does NOT have a header row.
For transArchive_201101_201103_small.csv the delimiter is ,
File transArchive_201101_201103_small.csv does NOT have a header row.
For transArchive_201104_small.csv the delimiter is ,
File transArchive_201104_small.csv does NOT have a header row.
For transArchive_201105_small.csv the delimiter is ,
File transArchive_201105_small.csv does NOT have a header row.
For transArchive_201106_small.csv the delimiter is ,
File transArchive_201106_small.csv has a header row.
For transArc

## Read through Zip files, find delimeters, find headers

In [10]:
headers = dict()
delimiters = dict() 

for current_zf in zip_files :
    with ZipFile(data_directory + current_zf,'r') as zf :
        zipped_files = zf.namelist()

        for file_name in zipped_files :
            # Open and wrap it to read as text
            input_file = io.TextIOWrapper(zf.open(file_name, 'r'), encoding="utf-8")
            
            dialect = csv.Sniffer().sniff(sample=input_file.readline(),
                                      delimiters=[",",";","\t"])
            
            delimiters[file_name] = dialect.delimiter
            
            print(" ".join(["For",
                           file_name,
                           "the delimiter is",
                           dialect.delimiter
                           ]))
            
            this_delimiter = delimiters[file_name]

            # Read the first two lines of the file
            first_line = input_file.readline().strip().split(this_delimiter)
            second_line = input_file.readline().strip().split(this_delimiter)
            
            # Check if the first line is a header row
            has_header = is_header_row(first_line, second_line)
            
            # Print if it has a header
            if has_header:
                print(f"File {file_name} has a header row.")
                header_option = 0  # The first line is a header
            else:
                print(f"File {file_name} does NOT have a header row.")
                header_option = None  # No header present

            # Append result to headers dictionary
            headers[file_name] = has_header

            #Read into a dataframe
            try:
                # Read into a DataFrame, assuming the first line might be a header
                df = pd.read_csv(input_file, delimiter=this_delimiter, header = 0)

                # Print the first few rows of the DataFrame to confirm
                print(f"DataFrame from {file_name}:")
                print(df.head())  # Show the first 5 rows of the DataFrame

            except Exception as e:
                # Handle the case where reading fails
                print(f"Error reading {file_name}: {e}")

            input_file.close() # tidy up


For transArchive_201001_201003_small.csv the delimiter is ,
File transArchive_201001_201003_small.csv does NOT have a header row.
DataFrame from transArchive_201001_201003_small.csv:
   2010-01-01 09:04:15  5  17  2  0002433551303  \
0  2010-01-01 09:04:18  5  17  2  0004850000139   
1  2010-01-01 09:04:21  5  17  2  0004850000139   
2  2010-01-01 09:04:23  5  17  2  0082704802560   
3  2010-01-01 09:04:28  5  17  2  0009396681120   
4  2010-01-01 09:04:34  5  17  2  0000000000051   

        Taco Seasoning 1.4oz Bear  I     .1  1  ...  NULL  0.16  NULL.1  \
0  Orange Juice/Some Pulp 64oz Tr  I         4  ...   NaN     0     NaN   
1  Orange Juice/Some Pulp 64oz Tr  I         4  ...   NaN     0     NaN   
2     Shredded Mexican Cheese 8oz  I         5  ...   NaN     0     NaN   
3           Eggs O.Large dozen OV  I         4  ...   NaN     0     NaN   
4                  BANANA Organic  I         2  ...   NaN     0     NaN   

   Unnamed: 43  0.17  3  1.5  0.18  0.19   4  
0          N

In [16]:
print(df)

   2017-01-01 08:36:52  51  94   9              0        Change  T   CA  \
0  2017-01-01 09:34:51  51  94  17       DISCOUNT      Discount  I  NaN   
1  2017-01-01 10:16:19  51  94  39  0020114500000  Gingerbread   I        
2  2017-01-01 08:35:55  51  94   8              0        Change  T   CA   
3  2017-01-01 09:24:18  51  94  14            TAX           Tax  A  NaN   
4  2017-01-01 10:09:00  51  94  29              0          Cash  T   CA   

  Unnamed: 8  0.1  ...  \N.4  0.19  \N.5  Unnamed: 43  0.20  19341  1  3 0.21  \
0        NaN    0  ...    \N     0    \N          NaN     0      3  1  3    0   
1               8  ...    \N     0     0          NaN     0  49032  1  3    0   
2        NaN    0  ...    \N     0    \N          NaN     0  20935  1  3    0   
3        NaN    0  ...    \N     0    \N          NaN     0  14140  1  3    0   
4        NaN    0  ...    \N     0    \N          NaN     0  16938  1  3    0   

   13  
0   8  
1   3  
2  11  
3  17  
4  11  

[5 rows x 50 