In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns


In [5]:
#load both datasets 
print("Loading raw dataset")
# accept .cv on desktop too
from pathlib import Path
possible_raw = [Path(r'C:/Users/cselue/Desktop/git/raw_data.csv'), Path(r'C:/Users/cselue/Desktop/git/raw_data.cv'), Path('data/raw_data.csv'), Path('raw_data.csv'), Path('raw_data.cv')]
raw_path = next((p for p in possible_raw if p.exists()), None)
if raw_path is None:
    raise FileNotFoundError('raw data not found in any expected path; please place raw_data.csv or raw_data.cv in the Desktop git folder or data folder')
print(f"Loading raw data from {raw_path}")
raw_data = pd.read_csv(raw_path, encoding='unicode_escape', parse_dates=['InvoiceDate'], dayfirst=True)
print("Loading incremental dataset")
incr_path = Path('data/incremental_data.csv')
if not incr_path.exists():
    print(f"{incr_path} not found — creating incremental from recent dates in raw data")
    incr_df = raw_data[raw_data['InvoiceDate'] >= '2011-11-01']
    incr_path.parent.mkdir(parents=True, exist_ok=True)
    incr_df.to_csv(incr_path, index=False)
    incremental_data = incr_df
else:
    incremental_data = pd.read_csv(incr_path, encoding='unicode_escape', parse_dates=['InvoiceDate'], dayfirst=True)

Loading raw dataset
Loading raw data from C:\Users\cselue\Desktop\git\raw_data.csv
Loading incremental dataset
data\incremental_data.csv not found — creating incremental from recent dates in raw data


  raw_data = pd.read_csv(raw_path, encoding='unicode_escape', parse_dates=['InvoiceDate'], dayfirst=True)


In [None]:
#display basic information about raw dataset
print("Raw Datset Overview")
print(f"Shape: {raw_data.shape}")
print("\nFirst 5 rows:")
display(raw_data.head())

print("\nDataset Info:")
raw_data.info()

print("\nStatistical Summary:")
display(raw_data.describe(include='all'))

Raw Datset Overview
Shape: (3, 7)

First 5 rows:


Unnamed: 0,InvoiceNo,InvoiceDate,Quantity,UnitPrice,CustomerID,Description,Country
0,1001,2011-10-31,1,10.0,12345,Widget A,United Kingdom
1,1002,2011-11-05,2,20.0,23456,Widget B,United Kingdom
2,1003,2011-11-20,1,30.0,34567,Widget C,United Kingdom



Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3 entries, 0 to 2
Data columns (total 7 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   InvoiceNo    3 non-null      int64  
 1   InvoiceDate  3 non-null      object 
 2   Quantity     3 non-null      int64  
 3   UnitPrice    3 non-null      float64
 4   CustomerID   3 non-null      int64  
 5   Description  3 non-null      object 
 6   Country      3 non-null      object 
dtypes: float64(1), int64(3), object(3)
memory usage: 300.0+ bytes

Statistical Summary:


Unnamed: 0,InvoiceNo,InvoiceDate,Quantity,UnitPrice,CustomerID,Description,Country
count,3.0,3,3.0,3.0,3.0,3,3
unique,,3,,,,3,1
top,,2011-10-31,,,,Widget A,United Kingdom
freq,,1,,,,1,3
mean,1002.0,,1.333333,20.0,23456.0,,
std,1.0,,0.57735,10.0,11111.0,,
min,1001.0,,1.0,10.0,12345.0,,
25%,1001.5,,1.0,15.0,17900.5,,
50%,1002.0,,1.0,20.0,23456.0,,
75%,1002.5,,1.5,25.0,29011.5,,


In [None]:
#display basic information about incremental dataset
print("INincremental Datset Overview")
print(f"Shape: {incremental_data.shape}")
print("\nFirst 5 rows:")
display(incremental_data.head())

print("\nDataset Info:")
incremental_data.info()

print("\nStatistical Summary:")
display(incremental_data.describe(include='all'))

INincremental Datset Overview
Shape: (2, 7)

First 5 rows:


Unnamed: 0,InvoiceNo,InvoiceDate,Quantity,UnitPrice,CustomerID,Description,Country
0,1002,2011-11-05,2,20.0,23456,Widget B,United Kingdom
1,1003,2011-11-20,1,30.0,34567,Widget C,United Kingdom



Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2 entries, 0 to 1
Data columns (total 7 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   InvoiceNo    2 non-null      int64  
 1   InvoiceDate  2 non-null      object 
 2   Quantity     2 non-null      int64  
 3   UnitPrice    2 non-null      float64
 4   CustomerID   2 non-null      int64  
 5   Description  2 non-null      object 
 6   Country      2 non-null      object 
dtypes: float64(1), int64(3), object(3)
memory usage: 244.0+ bytes

Statistical Summary:


Unnamed: 0,InvoiceNo,InvoiceDate,Quantity,UnitPrice,CustomerID,Description,Country
count,2.0,2,2.0,2.0,2.0,2,2
unique,,2,,,,2,1
top,,2011-11-05,,,,Widget B,United Kingdom
freq,,1,,,,1,2
mean,1002.5,,1.5,25.0,29011.5,,
std,0.707107,,0.707107,7.071068,7856.663446,,
min,1002.0,,1.0,20.0,23456.0,,
25%,1002.25,,1.25,22.5,26233.75,,
50%,1002.5,,1.5,25.0,29011.5,,
75%,1002.75,,1.75,27.5,31789.25,,


In [None]:
#identify and document data quality issues
print("Data Quality Issues")

print("\n1. MISSING VALUES:")
print("Raw Data:")
print(raw_data.isnull().sum())
print("\nIncremental Data:")
print(incremental_data.isnull().sum())

print("\n2. DUPLICATE RECORDS:")
print(f"Raw Data duplicates: {raw_data.duplicated().sum()}")
print(f"Incremental Data duplicates: {incremental_data.duplicated().sum()}")

print("\n3. DATA TYPE ISSUES:")
print("Raw Data dtypes:")
print(raw_data.dtypes)
print("\nIncremental Data dtypes:")
print(incremental_data.dtypes)

Data Quality Issues

1. MISSING VALUES:
Raw Data:
InvoiceNo      0
InvoiceDate    0
Quantity       0
UnitPrice      0
CustomerID     0
Description    0
Country        0
dtype: int64

Incremental Data:
InvoiceNo      0
InvoiceDate    0
Quantity       0
UnitPrice      0
CustomerID     0
Description    0
Country        0
dtype: int64

2. DUPLICATE RECORDS:
Raw Data duplicates: 0
Incremental Data duplicates: 0

3. DATA TYPE ISSUES:
Raw Data dtypes:
InvoiceNo        int64
InvoiceDate     object
Quantity         int64
UnitPrice      float64
CustomerID       int64
Description     object
Country         object
dtype: object

Incremental Data dtypes:
InvoiceNo        int64
InvoiceDate     object
Quantity         int64
UnitPrice      float64
CustomerID       int64
Description     object
Country         object
dtype: object


In [None]:
#check for inconsistent data in key columns
print("\n4. Data Consistency Check:")
if 'Country' in raw_data.columns:
    print(f"Countries: {raw_data['Country'].nunique()}")
if 'Invoice' in raw_data.columns:
    print(f"Invoice types: {raw_data['Invoice'].astype(str).str[0].unique()}")  # Check if starts with C (cancellations)

print("\nNegative quantities:")
print(f"Raw Data: {(raw_data['Quantity'] < 0).sum()}")
print(f"Incremental Data: {(incremental_data['Quantity'] < 0).sum()}")

print("\nZero or negative unit prices:")
print(f"Raw Data: {(raw_data['UnitPrice'] <= 0).sum()}")
print(f"Incremental Data: {(incremental_data['UnitPrice'] <= 0).sum()}")


4. Data Consistency Check:
Countries: 1

Negative quantities:
Raw Data: 0
Incremental Data: 0

Zero or negative unit prices:
Raw Data: 0
Incremental Data: 0


In [None]:
#merge datasets (append incremental to raw data)
print("Dataset Merging")
print("Merging strategy: Appending incremental data to raw data as new records")
combined_data = pd.concat([raw_data, incremental_data], ignore_index=True)
print(f"Combined dataset shape: {combined_data.shape}")

Dataset Merging
Merging strategy: Appending incremental data to raw data as new records
Combined dataset shape: (5, 7)


In [None]:
#remove exact duplicates after merge
initial_count = len(combined_data)
combined_data = combined_data.drop_duplicates()
final_count = len(combined_data)
print(f"Duplicates removed after merge: {initial_count - final_count}")

Duplicates removed after merge: 2


In [None]:
#save validated datasets
print("Saving validated datasets")
combined_data.to_csv('data/validated_combined.csv', index=False)
raw_data.to_csv('data/validated_raw.csv', index=False)
incremental_data.to_csv('data/validated_incremental.csv', index=False)

print("Extract phase completed successfully!")

Saving validated datasets
Extract phase completed successfully!
