In [33]:
import pandas as pd
import numpy as np
from datetime import datetime
import warnings
warnings.filterwarnings('ignore')


In [34]:
# Locate the outputs of the validation phase. All paths are relative to the
# notebook's working directory.
from pathlib import Path

print("Loading validated datasets...")
data_dir = Path('data')
validated_combined = data_dir.joinpath('validated_combined.csv')
validated_incremental = data_dir.joinpath('validated_incremental.csv')

def _ensure_validated_files():
    """Rebuild the validated CSV files from raw/incremental sources if missing.

    Returns immediately when both ``validated_combined`` and
    ``validated_incremental`` already exist. Otherwise the first existing
    candidate raw file and/or ``data/incremental_data.csv`` is loaded and BOTH
    validated files are (re)written:

    * ``validated_combined``: raw rows plus incremental rows with exact
      duplicate rows dropped (only the de-duplicated incremental rows when no
      raw file is found).
    * ``validated_incremental``: the incremental rows as loaded or derived.

    When no incremental source exists, the incremental rows are derived from
    the raw rows whose ``InvoiceDate`` is 2011-11-01 or later, and that slice
    is also saved to ``data/incremental_data.csv``.

    Raises:
        FileNotFoundError: neither a raw nor an incremental source exists.
        ValueError: the raw file found has an unsupported extension.
    """
    # If validated files exist, nothing to do
    if validated_combined.exists() and validated_incremental.exists():
        return
    print("One or more validated files missing — attempting to rebuild from raw/incremental sources")

    # Candidate raw locations, tried in order; the first one that exists wins.
    # NOTE(review): the first two entries are machine-specific absolute paths
    # and '.cv' looks like a typo for '.csv' -- confirm before removing them.
    possible_raw = [Path(r'C:/Users/cselue/Desktop/git/raw_data.csv'), Path(r'C:/Users/cselue/Desktop/git/raw_data.cv'), Path('data/raw_data.csv'), Path('raw_data.csv'), Path('raw_data.cv')]
    raw_path = next((p for p in possible_raw if p.exists()), None)
    incr_source = Path('data/incremental_data.csv')
    has_incremental = incr_source.exists()
    if raw_path is None and not has_incremental:
        raise FileNotFoundError('No raw or incremental sources found to rebuild validated data. Please place raw_data.csv or incremental_data.csv in expected locations.')

    # Both sources are read with the same options.
    read_opts = dict(encoding='unicode_escape', parse_dates=['InvoiceDate'], dayfirst=True)

    # Load raw if available
    raw_df = None
    if raw_path is not None:
        ext = raw_path.suffix.lower()
        if ext not in ('.csv', '.cv'):
            raise ValueError(f'Unsupported raw file extension: {ext}')
        raw_df = pd.read_csv(raw_path, **read_opts)

    # Load incremental if available else create from raw
    if has_incremental:
        incr_df = pd.read_csv(incr_source, **read_opts)
    else:
        # raw_df cannot be None here: the guard above raised if both sources
        # were missing.
        incr_df = raw_df[raw_df['InvoiceDate'] >= '2011-11-01']
        # Write to the real incremental path. Passing None as the target makes
        # to_csv return the CSV text instead of writing a file.
        incr_source.parent.mkdir(parents=True, exist_ok=True)
        incr_df.to_csv(incr_source, index=False)

    # Create validated combined by concatenating and dropping duplicates
    if raw_df is not None:
        combined = pd.concat([raw_df, incr_df], ignore_index=True).drop_duplicates()
    else:
        combined = incr_df.drop_duplicates()
    data_dir.mkdir(parents=True, exist_ok=True)
    combined.to_csv(validated_combined, index=False)
    incr_df.to_csv(validated_incremental, index=False)
    print('Rebuilt validated files: data/validated_combined.csv and data/validated_incremental.csv')

# Guarantee the validated CSVs are on disk, then read both with identical options.
_ensure_validated_files()

full_data, incremental_data = (
    pd.read_csv(csv_path, encoding='unicode_escape')
    for csv_path in (validated_combined, validated_incremental)
)

# Quick sanity check of what was loaded.
print(
    f"Full data shape: {full_data.shape}",
    f"Incremental data shape: {incremental_data.shape}",
    sep="\n",
)

Loading validated datasets...
Full data shape: (3, 7)
Incremental data shape: (2, 7)


In [35]:
# Peek at the first three rows and the inferred column dtypes before transforming.
print("Original Data Sample")
display(full_data.iloc[:3])
print("\nOriginal data types:", full_data.dtypes, sep="\n")


Original Data Sample


Unnamed: 0,InvoiceNo,InvoiceDate,Quantity,UnitPrice,CustomerID,Description,Country
0,1001,2011-10-31,1,10.0,12345,Widget A,United Kingdom
1,1002,2011-11-05,2,20.0,23456,Widget B,United Kingdom
2,1003,2011-11-20,1,30.0,34567,Widget C,United Kingdom



Original data types:
InvoiceNo        int64
InvoiceDate     object
Quantity         int64
UnitPrice      float64
CustomerID       int64
Description     object
Country         object
dtype: object


In [36]:
# Report the number of missing values per column.
# This cell only inspects the data; nothing is imputed or dropped here.
print("Handling Missing Values", "Before transformation - Missing values:", sep="\n")
print(full_data.isna().sum())


Handling Missing Values
Before transformation - Missing values:
InvoiceNo      0
InvoiceDate    0
Quantity       0
UnitPrice      0
CustomerID     0
Description    0
Country        0
dtype: int64


In [37]:
# Drop exact duplicate rows from the combined dataset and report how many went.
# Only full_data is de-duplicated in this cell; incremental_data is not touched.
print("Removing Duplicates")
initial_count = full_data.shape[0]
full_data = full_data.drop_duplicates()
final_count = full_data.shape[0]
print(f"Duplicates removed: {initial_count - final_count}")


Removing Duplicates
Duplicates removed: 0


In [38]:
# Record the dtypes of the columns about to be converted, for a before/after comparison.
print("Standardizing Data Types", "Before - Data types:", sep="\n")
print(full_data.dtypes.loc[['InvoiceDate', 'CustomerID', 'Quantity', 'UnitPrice']])

Standardizing Data Types
Before - Data types:
InvoiceDate     object
CustomerID       int64
Quantity         int64
UnitPrice      float64
dtype: object


In [39]:
# Apply the same type conversions to BOTH datasets so every later cell can
# treat them alike. Previously the numeric coercion ran on full_data only,
# although incremental_data's Quantity and UnitPrice are multiplied in the
# next cell.
for _frame in (full_data, incremental_data):
    # Convert InvoiceDate to datetime
    _frame['InvoiceDate'] = pd.to_datetime(_frame['InvoiceDate'])

    # Convert CustomerID to string so IDs are treated as labels, not numbers
    _frame['CustomerID'] = _frame['CustomerID'].astype(str)

    # Ensure numeric columns are proper type; unparseable values become NaN
    for _col in ('Quantity', 'UnitPrice'):
        _frame[_col] = pd.to_numeric(_frame[_col], errors='coerce')

print("\nAfter - Data types:")
print(full_data[['InvoiceDate', 'CustomerID', 'Quantity', 'UnitPrice']].dtypes)


After - Data types:
InvoiceDate    datetime64[ns]
CustomerID             object
Quantity                int64
UnitPrice             float64
dtype: object


In [40]:
# Enrich both datasets with the line total and the calendar parts of the
# invoice timestamp.
print("Adding Derived Columns")

for frame in (full_data, incremental_data):
    # Line total = units sold x price per unit
    frame['TotalAmount'] = frame['Quantity'].mul(frame['UnitPrice'])

    # Calendar components, all taken from the same datetime accessor
    invoice_ts = frame['InvoiceDate'].dt
    frame['Year'] = invoice_ts.year
    frame['Month'] = invoice_ts.month
    frame['DayOfWeek'] = invoice_ts.day_name()
    frame['Hour'] = invoice_ts.hour

print("New columns added: TotalAmount, Year, Month, DayOfWeek, Hour")
display(
    full_data.loc[
        :, ['InvoiceDate', 'Quantity', 'UnitPrice', 'TotalAmount', 'Year', 'Month', 'DayOfWeek']
    ].head()
)

Adding Derived Columns
New columns added: TotalAmount, Year, Month, DayOfWeek, Hour


Unnamed: 0,InvoiceDate,Quantity,UnitPrice,TotalAmount,Year,Month,DayOfWeek
0,2011-10-31,1,10.0,10.0,2011,10,Monday
1,2011-11-05,2,20.0,40.0,2011,11,Saturday
2,2011-11-20,1,30.0,30.0,2011,11,Sunday


In [41]:
# Create Sales Brackets
print("Creating Sales Brackets")

def categorize_sales(amount):
    """Map a line total to its sales-bracket label.

    Args:
        amount: the line total (Quantity * UnitPrice); may be missing.

    Returns:
        str: one of
            'Unknown'             - amount is missing (NaN / None)
            'Zero/Negative'       - amount <= 0
            'Small (≤£10)'        - 0 < amount <= 10
            'Medium (£11-£50)'    - 10 < amount <= 50
            'Large (£51-£100)'    - 50 < amount <= 100
            'Very Large (>£100)'  - amount > 100
    """
    # Every comparison with NaN is False, so without this guard a missing
    # amount would fall through all branches and be labelled 'Very Large'.
    if pd.isna(amount):
        return 'Unknown'
    if amount <= 0:
        return 'Zero/Negative'
    elif amount <= 10:
        return 'Small (≤£10)'
    elif amount <= 50:
        return 'Medium (£11-£50)'
    elif amount <= 100:
        return 'Large (£51-£100)'
    else:
        return 'Very Large (>£100)'

# Label every row of both datasets, then show how the combined data is distributed.
full_data['SalesBracket'] = full_data['TotalAmount'].map(categorize_sales)
incremental_data['SalesBracket'] = incremental_data['TotalAmount'].map(categorize_sales)

print("Sales brackets distribution:", full_data['SalesBracket'].value_counts(), sep="\n")


Creating Sales Brackets
Sales brackets distribution:
SalesBracket
Medium (£11-£50)    2
Small (≤£10)        1
Name: count, dtype: int64


In [42]:
# Filtering - Remove Invalid Records
print("Filtering Invalid Records")
initial_count = len(full_data)

# Keep only rows with a positive quantity (negative quantities are
# cancellations, excluded for this analysis) AND a positive unit price.
# Rows where either value is NaN are dropped too, because NaN > 0 is False.
# .copy() makes each result an independent frame, so the column assignments
# in later cells write to it directly rather than to a slice of the
# unfiltered data (the SettingWithCopy pattern).
full_data = full_data.loc[
    (full_data['Quantity'] > 0) & (full_data['UnitPrice'] > 0)
].copy()
incremental_data = incremental_data.loc[
    (incremental_data['Quantity'] > 0) & (incremental_data['UnitPrice'] > 0)
].copy()

final_count = len(full_data)
print(f"Records removed: {initial_count - final_count}")
print(f"Remaining valid records: {final_count}")

Filtering Invalid Records
Records removed: 0
Remaining valid records: 3


In [43]:
# Normalise free-text columns: trim surrounding whitespace and title-case the
# product descriptions and country names of both datasets.
print("Standardizing Text Data")

for frame in (full_data, incremental_data):
    for text_col in ('Description', 'Country'):
        frame[text_col] = frame[text_col].str.strip().str.title()

print("Sample of cleaned descriptions:", full_data['Description'].head(), sep="\n")

Standardizing Text Data
Sample of cleaned descriptions:
0    Widget A
1    Widget B
2    Widget C
Name: Description, dtype: object


In [44]:
# Final review of the transformed data: sample rows, structural info, and
# descriptive statistics of the numeric columns.
print("Final Transformed Data Sample and Summary", "Sample of transformed data:", sep="\n")
display(full_data.head())

print("\nTransformed data info:")
full_data.info()

print("\nSummary statistics for numeric columns:")
display(full_data.loc[:, ['Quantity', 'UnitPrice', 'TotalAmount']].describe())

Final Transformed Data Sample and Summary
Sample of transformed data:


Unnamed: 0,InvoiceNo,InvoiceDate,Quantity,UnitPrice,CustomerID,Description,Country,TotalAmount,Year,Month,DayOfWeek,Hour,SalesBracket
0,1001,2011-10-31,1,10.0,12345,Widget A,United Kingdom,10.0,2011,10,Monday,0,Small (≤£10)
1,1002,2011-11-05,2,20.0,23456,Widget B,United Kingdom,40.0,2011,11,Saturday,0,Medium (£11-£50)
2,1003,2011-11-20,1,30.0,34567,Widget C,United Kingdom,30.0,2011,11,Sunday,0,Medium (£11-£50)



Transformed data info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3 entries, 0 to 2
Data columns (total 13 columns):
 #   Column        Non-Null Count  Dtype         
---  ------        --------------  -----         
 0   InvoiceNo     3 non-null      int64         
 1   InvoiceDate   3 non-null      datetime64[ns]
 2   Quantity      3 non-null      int64         
 3   UnitPrice     3 non-null      float64       
 4   CustomerID    3 non-null      object        
 5   Description   3 non-null      object        
 6   Country       3 non-null      object        
 7   TotalAmount   3 non-null      float64       
 8   Year          3 non-null      int32         
 9   Month         3 non-null      int32         
 10  DayOfWeek     3 non-null      object        
 11  Hour          3 non-null      int32         
 12  SalesBracket  3 non-null      object        
dtypes: datetime64[ns](1), float64(2), int32(3), int64(2), object(5)
memory usage: 408.0+ bytes

Summary statistics for numeric columns:

Unnamed: 0,Quantity,UnitPrice,TotalAmount
count,3.0,3.0,3.0
mean,1.333333,20.0,26.666667
std,0.57735,10.0,15.275252
min,1.0,10.0,10.0
25%,1.0,15.0,20.0
50%,1.0,20.0,30.0
75%,1.5,25.0,35.0
max,2.0,30.0,40.0


In [45]:
# Persist both transformed datasets as CSV under ./transformed
# (the directory is created on demand).
from pathlib import Path

print("Saving transformed datasets")
transformed_dir = Path('transformed')
transformed_dir.mkdir(parents=True, exist_ok=True)
full_data.to_csv(transformed_dir.joinpath('transformed_full.csv'), index=False)
incremental_data.to_csv(transformed_dir.joinpath('transformed_incremental.csv'), index=False)

print(
    "Transform phase completed successfully",
    f"Final dataset shape: {full_data.shape}",
    sep="\n",
)

Saving transformed datasets
Transform phase completed successfully
Final dataset shape: (3, 13)
