In [18]:
import pandas as pd

# Load the raw CSV export into `df`; every later cell filters this frame.
file_path = './../data/online_retail.csv'

# Explicit column dtypes so pandas does not have to infer them.
# 'Customer ID' is float64 rather than an integer type because the column
# contains missing values, which int64 cannot represent.
# 'InvoiceDate' is read as a plain string (object); it is not parsed here.
d_types = {
    'Invoice': 'object',
    'StockCode': 'object',
    'Description': 'object',
    'Quantity': 'int64',
    'InvoiceDate': 'object',
    'Price': 'float64',
    'Customer ID': 'float64',
    'Country': 'object'
}

# Each handler prints a short diagnosis and then re-raises: if the load fails,
# `df` would otherwise be undefined (or left over from an earlier run of the
# kernel) and the following cells would fail or silently use stale data.
try:
    df = pd.read_csv(file_path, dtype=d_types, encoding="ISO-8859-1")
except FileNotFoundError:
    print(f"Error: The file '{file_path}' was not found.")
    raise
except UnicodeDecodeError as ude:
    # Decode failures surface as UnicodeDecodeError (EncodingWarning is a
    # warning category, not an exception raised on bad bytes). This clause must
    # come before ValueError because UnicodeDecodeError is a ValueError subclass.
    print(f"Encoding error : {ude}")
    raise
except ValueError as ve:
    print(f"Data type mismatch : {ve}")
    raise
except Exception as e:
    print(f"Unexpected error : {e}")
    raise
else:
    # Only reached when read_csv succeeded, so `df` is guaranteed to exist.
    print(f"Data Loaded successfully! \n {df.head()}")
    

Data Loaded successfully! 
   Invoice StockCode                          Description  Quantity  \
0  489434     85048  15CM CHRISTMAS GLASS BALL 20 LIGHTS        12   
1  489434    79323P                   PINK CHERRY LIGHTS        12   
2  489434    79323W                  WHITE CHERRY LIGHTS        12   
3  489434     22041         RECORD FRAME 7" SINGLE SIZE         48   
4  489434     21232       STRAWBERRY CERAMIC TRINKET BOX        24   

           InvoiceDate  Price  Customer ID         Country  
0  2009-12-01 07:45:00   6.95      13085.0  United Kingdom  
1  2009-12-01 07:45:00   6.75      13085.0  United Kingdom  
2  2009-12-01 07:45:00   6.75      13085.0  United Kingdom  
3  2009-12-01 07:45:00   2.10      13085.0  United Kingdom  
4  2009-12-01 07:45:00   1.25      13085.0  United Kingdom  


In [19]:
# Quick overview of the loaded frame: schema, summary statistics, null counts.
print("\n=== DataFrame Information ===")
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() emits a stray "None" line.
df.info()

print("\n=== Statistical Summary ===")
# include='all' summarises object columns (unique/top/freq) as well as numeric ones.
print(df.describe(include='all'))

print("\n=== Missing Values ===")
# Per-column count of null entries.
print(df.isnull().sum())

    


=== DataFrame Information ===
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1067371 entries, 0 to 1067370
Data columns (total 8 columns):
 #   Column       Non-Null Count    Dtype  
---  ------       --------------    -----  
 0   Invoice      1067371 non-null  object 
 1   StockCode    1067371 non-null  object 
 2   Description  1062989 non-null  object 
 3   Quantity     1067371 non-null  int64  
 4   InvoiceDate  1067371 non-null  object 
 5   Price        1067371 non-null  float64
 6   Customer ID  824364 non-null   float64
 7   Country      1067371 non-null  object 
dtypes: float64(2), int64(1), object(5)
memory usage: 65.1+ MB
None

=== Statistical Summary ===
        Invoice StockCode                         Description      Quantity  \
count   1067371   1067371                             1062989  1.067371e+06   
unique    53628      5305                                5698           NaN   
top      537434    85123A  WHITE HANGING HEART T-LIGHT HOLDER           NaN   
freq

In [20]:
# Flag every row that is an exact repeat of an earlier row (the first
# occurrence is not flagged), report how many there are, then drop them.
is_duplicate_row = df.duplicated()
duplicates_count = is_duplicate_row.sum()
print(f"\nNumber of duplicate rows are: {duplicates_count}")

# Keeping the unflagged rows is what drop_duplicates() does with its defaults.
df = df[~is_duplicate_row]
print(f"Duplicate values removed. New shape is: {df.shape}")


Number of duplicate rows are: 34335
Duplicate values removed. New shape is: (1033036, 8)


In [21]:
# Report and discard the rows that have no 'Customer ID' value.
customer_id_is_null = df['Customer ID'].isna()
missing_cusid = customer_id_is_null.sum()
print(f"Number of rows missing customer id : {missing_cusid}")

# Retain only rows where the customer id is present.
df = df.loc[~customer_id_is_null]
print(f"Remove misssing customer id rows. New shape is : {df.shape}")

# Re-check the per-column null counts after the drop.
print(df.isna().sum())


Number of rows missing customer id : 235151
Remove misssing customer id rows. New shape is : (797885, 8)
Invoice        0
StockCode      0
Description    0
Quantity       0
InvoiceDate    0
Price          0
Customer ID    0
Country        0
dtype: int64


In [22]:
# Rows whose invoice number begins with 'C' are the ones this cell reports as
# cancelled; na=False makes a missing invoice value count as "not cancelled".
is_cancelled = df['Invoice'].str.startswith('C', na=False)
cancellsd_order_count = int(is_cancelled.sum())
print(f"\nNumber of cancelled orders: {cancellsd_order_count}")

# Keep everything that was not flagged above.
df = df.loc[~is_cancelled]
print(f"Remove cancelled orders. New shape is : {df.shape}")


Number of cancelled orders: 18390
Remove cancelled orders. New shape is : (779495, 8)


In [23]:
# Keep only rows with a strictly positive unit price.
has_positive_price = df['Price'] > 0

# Count exactly the rows the filter below removes. The comparison is False for
# zero, negative and missing prices alike, so the reported number always
# matches the change in row count (counting only `Price == 0` would under-report
# whenever negative prices are present).
zero_count = int((~has_positive_price).sum())
print(f"\nNumber of zero or negative price rows: {zero_count}")

df = df[has_positive_price]
print(f"Zero or negative price rows removed. New shape: {df.shape}")


Number of zero price rows: 70
Zero price rows removed. New shape: (779425, 8)


In [24]:
# Stock codes this cell treats as non-product line items.
# NOTE(review): presumably postage, manual adjustments, fees and similar --
# confirm the meaning and completeness of this list against the dataset docs.
non_product_codes = ['POST', 'M', 'BANK CHARGES', 'C2', 'DOT', 'CRUK']

# Exact-match membership test on the stock code, evaluated once and reused.
is_non_product = df['StockCode'].isin(non_product_codes)
non_product_count = int(is_non_product.sum())
print(f"\nNumber of non product rows: {non_product_count}")

df = df.loc[~is_non_product]
print(f"Non product rows removed. New shape: {df.shape}")


Number of non product rows: 2779
Non product rows removed. New shape: (776646, 8)


In [26]:
# Report how many rows have a quantity of zero or below, then keep only the
# rows with a strictly positive quantity.
quantity = df['Quantity']
zero_qty = int(quantity.le(0).sum())
print(f"\nNumber of negative or zero Quantity: {zero_qty}")

df = df.loc[quantity.gt(0)]
print(f"Negative or zero quantity rows removed. New shape: {df.shape}")


Number of negative or zero Quantity: 0
Negative or zero quantity rows removed. New shape: (776646, 8)
