### Importing pandas to explore dataset

In [1]:
import pandas as pd

In [2]:
# The dataset ships as an Excel workbook, so it is loaded with read_excel
retail_df = pd.read_excel(io="Online Retail.xlsx")

# Summarise column names, non-null counts and dtypes of the loaded frame
retail_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 541909 entries, 0 to 541908
Data columns (total 8 columns):
 #   Column       Non-Null Count   Dtype         
---  ------       --------------   -----         
 0   InvoiceNo    541909 non-null  object        
 1   StockCode    541909 non-null  object        
 2   Description  540455 non-null  object        
 3   Quantity     541909 non-null  int64         
 4   InvoiceDate  541909 non-null  datetime64[ns]
 5   UnitPrice    541909 non-null  float64       
 6   CustomerID   406829 non-null  float64       
 7   Country      541909 non-null  object        
dtypes: datetime64[ns](1), float64(2), int64(1), object(4)
memory usage: 33.1+ MB


There are missing values in the Description and CustomerID columns.

In [3]:
# Preview the first 100 rows of the raw data
retail_df.head(n=100)

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,2010-12-01 08:26:00,2.55,17850.0,United Kingdom
1,536365,71053,WHITE METAL LANTERN,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom
2,536365,84406B,CREAM CUPID HEARTS COAT HANGER,8,2010-12-01 08:26:00,2.75,17850.0,United Kingdom
3,536365,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom
4,536365,84029E,RED WOOLLY HOTTIE WHITE HEART.,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom
...,...,...,...,...,...,...,...,...
95,536378,22352,LUNCH BOX WITH CUTLERY RETROSPOT,6,2010-12-01 09:37:00,2.55,14688.0,United Kingdom
96,536378,21212,PACK OF 72 RETROSPOT CAKE CASES,120,2010-12-01 09:37:00,0.42,14688.0,United Kingdom
97,536378,21975,PACK OF 60 DINOSAUR CAKE CASES,24,2010-12-01 09:37:00,0.55,14688.0,United Kingdom
98,536378,21977,PACK OF 60 PINK PAISLEY CAKE CASES,24,2010-12-01 09:37:00,0.55,14688.0,United Kingdom


#### Dealing with missing Data

In [4]:
# Flag every column that holds at least one missing value, then keep the flagged names
has_missing = retail_df.isna().any()
columns_with_missing_data = has_missing[has_missing].index.tolist()
columns_with_missing_data

['Description', 'CustomerID']

In [5]:
# Report, per affected column, the share of rows whose value is missing (in percent)
total_rows = len(retail_df)
for col in columns_with_missing_data:
    missing_pct = retail_df[col].isna().sum() / total_rows * 100
    print(missing_pct.round(2), f"% of missing values in {col}")

0.27 % of missing values in Description
24.93 % of missing values in CustomerID


### Implementing Strategies for handling missing Data

In [6]:
# For the "Description" column, drop the rows with missing values because
# the percentage of missing values is small.
# .copy() makes the filtered result an independent DataFrame, so the column
# assignments in later cells (InvoiceNo, CustomerID) operate on a real frame
# instead of a derived slice and do not raise SettingWithCopyWarning.
retail_df = retail_df[retail_df['Description'].notna()].copy()


In [None]:
# Count how often each InvoiceNo value occurs to check the column for consistency
retail_df.loc[:, 'InvoiceNo'].value_counts()

In [10]:
# Remove every uppercase ASCII letter from the InvoiceNo values via a regex replace.
# NOTE(review): whatever meaning those letters carried is discarded here - confirm this is intended.
invoice_no_stripped = retail_df['InvoiceNo'].replace(to_replace="[A-Z]", value="", regex=True)
retail_df['InvoiceNo'] = invoice_no_stripped
retail_df['InvoiceNo'].value_counts()

InvoiceNo
573585    1114
581219     749
581492     731
580729     721
558475     705
          ... 
549545       1
549565       1
549580       1
573309       1
559309       1
Name: count, Length: 24446, dtype: int64

In [14]:
# Cast the cleaned InvoiceNo column to the integer dtype
invoice_no_as_int = retail_df['InvoiceNo'].astype(int)
retail_df['InvoiceNo'] = invoice_no_as_int

In [17]:
# Multiple imputation (MICE) is used on the assumption that the CustomerID
# values are missing at random (MAR).

from miceforest import ImputationKernel

# Only InvoiceNo and CustomerID are handed to the imputation kernel
imputation_input = retail_df[["InvoiceNo", "CustomerID"]]
mice_kernel = ImputationKernel(
    data=imputation_input,
    save_all_iterations=True,
    random_state=42
)

# Run two MICE iterations
mice_kernel.mice(2)

In [19]:
# Retrieve the completed (imputed) data from the kernel so that the CustomerID
# column has no missing values, then print its summary to confirm the non-null counts.
mice_file = mice_kernel.complete_data()
mice_file.info()

<class 'pandas.core.frame.DataFrame'>
Index: 540455 entries, 0 to 541908
Data columns (total 2 columns):
 #   Column      Non-Null Count   Dtype  
---  ------      --------------   -----  
 0   InvoiceNo   540455 non-null  int32  
 1   CustomerID  540455 non-null  float64
dtypes: float64(1), int32(1)
memory usage: 10.3 MB


In [20]:
# Replace CustomerID in retail_df with the imputed column from mice_file
# (values are matched on the row index), leaving no missing values.
retail_df = retail_df.assign(CustomerID=mice_file['CustomerID'])

In [21]:
# Confirming Completion of Data
retail_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 540455 entries, 0 to 541908
Data columns (total 8 columns):
 #   Column       Non-Null Count   Dtype         
---  ------       --------------   -----         
 0   InvoiceNo    540455 non-null  int32         
 1   StockCode    540455 non-null  object        
 2   Description  540455 non-null  object        
 3   Quantity     540455 non-null  int64         
 4   InvoiceDate  540455 non-null  datetime64[ns]
 5   UnitPrice    540455 non-null  float64       
 6   CustomerID   540455 non-null  float64       
 7   Country      540455 non-null  object        
dtypes: datetime64[ns](1), float64(2), int32(1), int64(1), object(3)
memory usage: 35.0+ MB


In [27]:
# Cast the CustomerID column from float to the integer dtype
customer_id_as_int = retail_df['CustomerID'].astype(int)
retail_df['CustomerID'] = customer_id_as_int

In [29]:
# Persist the cleaned frame as CSV.
# The DataFrame index is written as the first (unnamed) column because `index` is left at its default.
retail_df.to_csv(path_or_buf="New Online Retail.csv")

In [30]:
# Persist the same cleaned frame as an Excel workbook
retail_df.to_excel(excel_writer="New Online Retail.xlsx")