In [37]:
import csv

def read_sales_data(filename, file_encoder):
    try:
        data= []
        with open(filename, mode= "r", encoding= file_encoder, newline= "\n") as file:
            file_data= csv.reader(file, delimiter="|")
            next(file_data)   # skips header
            for row in file_data:
                if row and any(field.strip() for field in row):  # removes empty lines
                    data.append('|'.join(row))
        return data
    except UnicodeDecodeError:
        print(f"File cannot be decoded using {file_encoder}")
        return []
    except FileNotFoundError:
        print (f"File not found")
        return []
   

read_sales_data('data/sales_data.txt', 'utf-8')

['T018|2024-12-29|P107|USB Cable|8|173|C009|South',
 'T063|2024-12-07|P110|Laptop Charger|6|1,916|C022|East',
 'T075|2024-12-10|P106|Headphones|0|2826|C001|South',
 'T023|2024-12-09|P109|Wireless Mouse|9|523|C022|North',
 'T059|2024-12-29|P102|Mouse,Wireless|4|1056|C010|South',
 'T035|2024-12-08|P102|Mouse|4|431|C011|North',
 'T061|2024-12-10|P109|Wireless Mouse|2|775|C009|North',
 'T057|2024-12-15|P101|Laptop,Premium|10|81896|C004|North',
 'T034|2024-12-22|P107|USB Cable|6|324|C029|West',
 'T050|2024-12-02|P104|Monitor,LED|10|9997|C024|East',
 'T024|2024-12-25|P109|Wireless Mouse|5|1812|C011|North',
 'T004|2024-12-07|P109|Wireless Mouse|9|1359|C008|West',
 'T068|2024-12-02|P109|Wireless Mouse|6|1,692|C018|South',
 'T066|2024-12-06|P105|Webcam|8|4,259|C023|West',
 'T064|2024-12-16|P109|Wireless Mouse|5|604|C003|West',
 'T045|2024-12-26|P108|External Hard Drive|9|3802|C002|North',
 'T015|2024-12-30|P105|Webcam|9|2899|C022|East',
 'T055|2024-12-07|P105|Webcam,HD|6|2977|C009|West',
 'T072

In [39]:
a= read_sales_data('data/sales_data.txt', 'utf-8')

In [40]:
len(a)

80

In [49]:
a[:3]

['T018|2024-12-29|P107|USB Cable|8|173|C009|South',
 'T063|2024-12-07|P110|Laptop Charger|6|1,916|C022|East',
 'T075|2024-12-10|P106|Headphones|0|2826|C001|South']

In [50]:
# Task 1.2 - Parse and clean data

def parse_transactions(raw_lines):
    data= []
    for line in raw_lines:
        t_id, dt, p_id, p_name, qty, price, c_id, region= [f.strip() for f in line.split('|')]

        # Handles commas within product names by replacing commas with space....
        p_name_clean= p_name.replace(",", " ").strip()

        # Removes commas from numeric fields (eg. price = 45,000 or quantity = 10,000)
        qty_clean= qty.replace(",", " ").strip()
        price_clean= price.replace(",", " ").strip()

        try:
            qty= int(qty_clean)
            price= float(price_clean)
        except ValueError:
            continue
        
        data.append(
            {"TransactionID": t_id,
             "Date": dt,
             "ProductID": p_id,
             "ProductName": p_name_clean,
             "Quantity": qty,
             "UnitPrice": price,
             "CustomerID": c_id,
             "Region": region
             }
        )

    return data


In [51]:
parse_transactions(a)

[{'TransactionID': 'T018',
  'Date': '2024-12-29',
  'ProductID': 'P107',
  'ProductName': 'USB Cable',
  'Quantity': 8,
  'UnitPrice': 173.0,
  'CustomerID': 'C009',
  'Region': 'South'},
 {'TransactionID': 'T075',
  'Date': '2024-12-10',
  'ProductID': 'P106',
  'ProductName': 'Headphones',
  'Quantity': 0,
  'UnitPrice': 2826.0,
  'CustomerID': 'C001',
  'Region': 'South'},
 {'TransactionID': 'T023',
  'Date': '2024-12-09',
  'ProductID': 'P109',
  'ProductName': 'Wireless Mouse',
  'Quantity': 9,
  'UnitPrice': 523.0,
  'CustomerID': 'C022',
  'Region': 'North'},
 {'TransactionID': 'T059',
  'Date': '2024-12-29',
  'ProductID': 'P102',
  'ProductName': 'Mouse Wireless',
  'Quantity': 4,
  'UnitPrice': 1056.0,
  'CustomerID': 'C010',
  'Region': 'South'},
 {'TransactionID': 'T035',
  'Date': '2024-12-08',
  'ProductID': 'P102',
  'ProductName': 'Mouse',
  'Quantity': 4,
  'UnitPrice': 431.0,
  'CustomerID': 'C011',
  'Region': 'North'},
 {'TransactionID': 'T061',
  'Date': '2024-12-