In [39]:
import pandas as pd
import warnings
warnings.filterwarnings('ignore')

# Replace 'your_file.xlsx' with the actual file path
file_path = 'data/kpmg.xlsx'

# Use the ExcelFile class to read the Excel file
xls = pd.ExcelFile(file_path)

# List the sheet names in the Excel file
sheet_names = xls.sheet_names

# Create DataFrames for each sheet
dataframes = {}  # Dictionary to store DataFrames

for sheet_name in sheet_names:
    dataframes[sheet_name] = pd.read_excel(xls, sheet_name)

# Now you have separate DataFrames for each sheet
# access the dataframes using their sheet names
customer_data = dataframes['Transactions']
demographic_data = dataframes['CustomerDemographic']
transaction_data = dataframes['CustomerAddress']

In [40]:
# Reading the first five rows of the customer dataset
customer_data.head()

Unnamed: 0,transaction_id,product_id,customer_id,transaction_date,online_order,order_status,brand,product_line,product_class,product_size,list_price,standard_cost,product_first_sold_date
0,1,2,2950,2017-02-25,0.0,Approved,Solex,Standard,medium,medium,71.49,53.62,41245.0
1,2,3,3120,2017-05-21,1.0,Approved,Trek Bicycles,Standard,medium,large,2091.47,388.92,41701.0
2,3,37,402,2017-10-16,0.0,Approved,OHM Cycles,Standard,low,medium,1793.43,248.82,36361.0
3,4,88,3135,2017-08-31,0.0,Approved,Norco Bicycles,Standard,medium,medium,1198.46,381.1,36145.0
4,5,78,787,2017-10-01,1.0,Approved,Giant Bicycles,Standard,medium,large,1765.3,709.48,42226.0


In [41]:
# Reading the first five rows of the demographic dataset
demographic_data.head()

Unnamed: 0,customer_id,first_name,last_name,gender,past_3_years_bike_related_purchases,DOB,job_title,job_industry_category,wealth_segment,deceased_indicator,default,owns_car,tenure
0,1,Laraine,Medendorp,F,93,1953-10-12,Executive Secretary,Health,Mass Customer,N,"""'",Yes,11.0
1,2,Eli,Bockman,Male,81,1980-12-16,Administrative Officer,Financial Services,Mass Customer,N,<script>alert('hi')</script>,Yes,16.0
2,3,Arlin,Dearle,Male,61,1954-01-20,Recruiting Manager,Property,Mass Customer,N,2018-02-01 00:00:00,Yes,15.0
3,4,Talbot,,Male,33,1961-10-03,,IT,Mass Customer,N,() { _; } >_[$($())] { touch /tmp/blns.shellsh...,No,7.0
4,5,Sheila-kathryn,Calton,Female,56,1977-05-13,Senior Editor,,Affluent Customer,N,NIL,Yes,8.0


In [42]:
# Get unique values in the 'gender' column and count the occurrences of each unique value
gender_counts = demographic_data['gender'].unique()
counts = demographic_data['gender'].value_counts()

# Print the unique values and their respective counts
print("Unique Gender Values:")
print(gender_counts)
print("\nGender Value Counts:")
print(counts)

Unique Gender Values:
['F' 'Male' 'Female' 'U' 'Femal' 'M']

Gender Value Counts:
Female    2037
Male      1872
U           88
F            1
Femal        1
M            1
Name: gender, dtype: int64


In [43]:
# Create a mapping dictionary to standardize gender values
gender_mapping = {
    'F': 'Female',
    'Male': 'Male',
    'Female': 'Female',
    'U': 'Unspecified',
    'Femal': 'Female',
    'M': 'Male'
}

# Use the mapping dictionary to replace gender values
demographic_data['gender'] = demographic_data['gender'].replace(gender_mapping)

# Check the corrected unique values and their counts
gender_counts_corrected = demographic_data['gender'].unique()
counts_corrected = demographic_data['gender'].value_counts()

# Print the corrected unique values and their counts
print("Unique Gender Values (Corrected):")
print(gender_counts_corrected)
print("\nGender Value Counts (Corrected):")
print(counts_corrected)

Unique Gender Values (Corrected):
['Female' 'Male' 'Unspecified']

Gender Value Counts (Corrected):
Female         2039
Male           1873
Unspecified      88
Name: gender, dtype: int64


In [44]:
# Reading the first five rows of the transaction dataset
transaction_data.head()

Unnamed: 0,customer_id,address,postcode,state,country,property_valuation
0,1,060 Morning Avenue,2016,New South Wales,Australia,10
1,2,6 Meadow Vale Court,2153,New South Wales,Australia,10
2,4,0 Holy Cross Court,4211,QLD,Australia,9
3,5,17979 Del Mar Point,2448,New South Wales,Australia,4
4,6,9 Oakridge Court,3216,VIC,Australia,9


## Accuracy and Completeness Assessment

***Check for data accuracy issues, such as missing values in critical columns.***

**Transaction Data**

In [45]:
transaction_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3999 entries, 0 to 3998
Data columns (total 6 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   customer_id         3999 non-null   int64 
 1   address             3999 non-null   object
 2   postcode            3999 non-null   int64 
 3   state               3999 non-null   object
 4   country             3999 non-null   object
 5   property_valuation  3999 non-null   int64 
dtypes: int64(3), object(3)
memory usage: 187.6+ KB


In [46]:
# Accuracy Assessment
import numpy as np
# Check for missing values in key columns
missing_values = transaction_data.isnull().sum()

# Calculate the percentage of missing values in each column
total_records = transaction_data.shape[0]
percentage_missing = (missing_values / total_records) * 100

# Print results
print(np.char.center('Missing Values in Transaction Dataset', 60, '*'))
print(missing_values)
print("\nPercentage of Missing Values:")
print(percentage_missing)

***********Missing Values in Transaction Dataset************
customer_id           0
address               0
postcode              0
state                 0
country               0
property_valuation    0
dtype: int64

Percentage of Missing Values:
customer_id           0.0
address               0.0
postcode              0.0
state                 0.0
country               0.0
property_valuation    0.0
dtype: float64


In [47]:
transaction_data['property_valuation'].describe()

count    3999.000000
mean        7.514379
std         2.824663
min         1.000000
25%         6.000000
50%         8.000000
75%        10.000000
max        12.000000
Name: property_valuation, dtype: float64

In [61]:
duplicate_rows = transaction_data[transaction_data.duplicated()]
# Print duplicate rows, if any
print("Duplicate Rows in Transaction Dataset:")
print(duplicate_rows)


Duplicate Rows in Transaction Dataset:
Empty DataFrame
Columns: [customer_id, address, postcode, state, country, property_valuation]
Index: []


**Demographic_data**

In [48]:
demographic_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4000 entries, 0 to 3999
Data columns (total 13 columns):
 #   Column                               Non-Null Count  Dtype         
---  ------                               --------------  -----         
 0   customer_id                          4000 non-null   int64         
 1   first_name                           4000 non-null   object        
 2   last_name                            3875 non-null   object        
 3   gender                               4000 non-null   object        
 4   past_3_years_bike_related_purchases  4000 non-null   int64         
 5   DOB                                  3913 non-null   datetime64[ns]
 6   job_title                            3494 non-null   object        
 7   job_industry_category                3344 non-null   object        
 8   wealth_segment                       4000 non-null   object        
 9   deceased_indicator                   4000 non-null   object        
 10  default     

In [49]:
# Check for missing values in key columns
missing_values = demographic_data.isnull().sum()

# Calculate the percentage of missing values in each column
total_records = demographic_data.shape[0]
percentage_missing = (missing_values / total_records) * 100

# Print results
print(np.char.center('Missing Values in Demographic Dataset', 60, '*'))
print(missing_values)
print("\nPercentage of Missing Values:")
print(percentage_missing)

***********Missing Values in Demographic Dataset************
customer_id                              0
first_name                               0
last_name                              125
gender                                   0
past_3_years_bike_related_purchases      0
DOB                                     87
job_title                              506
job_industry_category                  656
wealth_segment                           0
deceased_indicator                       0
default                                302
owns_car                                 0
tenure                                  87
dtype: int64

Percentage of Missing Values:
customer_id                             0.000
first_name                              0.000
last_name                               3.125
gender                                  0.000
past_3_years_bike_related_purchases     0.000
DOB                                     2.175
job_title                              12.650
job_industry_c

**Customer Data**

In [50]:
customer_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20000 entries, 0 to 19999
Data columns (total 13 columns):
 #   Column                   Non-Null Count  Dtype         
---  ------                   --------------  -----         
 0   transaction_id           20000 non-null  int64         
 1   product_id               20000 non-null  int64         
 2   customer_id              20000 non-null  int64         
 3   transaction_date         20000 non-null  datetime64[ns]
 4   online_order             19640 non-null  float64       
 5   order_status             20000 non-null  object        
 6   brand                    19803 non-null  object        
 7   product_line             19803 non-null  object        
 8   product_class            19803 non-null  object        
 9   product_size             19803 non-null  object        
 10  list_price               20000 non-null  float64       
 11  standard_cost            19803 non-null  float64       
 12  product_first_sold_date  19803 n

In [51]:
# Check for missing values in key columns
missing_values = customer_data.isnull().sum()

# Calculate the percentage of missing values in each column
total_records = customer_data.shape[0]
percentage_missing = (missing_values / total_records) * 100

# Print results
print(np.char.center('Missing Values in Demographic Dataset', 60, '*'))
print(missing_values)
print("\nPercentage of Missing Values:")
print(percentage_missing)

***********Missing Values in Demographic Dataset************
transaction_id               0
product_id                   0
customer_id                  0
transaction_date             0
online_order               360
order_status                 0
brand                      197
product_line               197
product_class              197
product_size               197
list_price                   0
standard_cost              197
product_first_sold_date    197
dtype: int64

Percentage of Missing Values:
transaction_id             0.000
product_id                 0.000
customer_id                0.000
transaction_date           0.000
online_order               1.800
order_status               0.000
brand                      0.985
product_line               0.985
product_class              0.985
product_size               0.985
list_price                 0.000
standard_cost              0.985
product_first_sold_date    0.985
dtype: float64


Since all the missing values are less than 5 percent for all the columns we will drop all the missing values

In [54]:
# Drop rows with missing values from the Demographic Dataset
customer_data_cleaned = customer_data.dropna()

# Verify that missing rows have been removed
print("Demographic Dataset After Dropping Missing Values:")
print(customer_data_cleaned)

# You can also reset the index if you want continuous index values
customer_data_cleaned.reset_index(drop=True, inplace=True)

Demographic Dataset After Dropping Missing Values:
       transaction_id  product_id  customer_id transaction_date  online_order  \
0                   1           2         2950       2017-02-25           0.0   
1                   2           3         3120       2017-05-21           1.0   
2                   3          37          402       2017-10-16           0.0   
3                   4          88         3135       2017-08-31           0.0   
4                   5          78          787       2017-10-01           1.0   
...               ...         ...          ...              ...           ...   
19995           19996          51         1018       2017-06-24           1.0   
19996           19997          41          127       2017-11-09           1.0   
19997           19998          87         2284       2017-04-14           1.0   
19998           19999           6         2764       2017-07-03           0.0   
19999           20000          11         1144       2017-

In [55]:
# Check for missing values in key columns
missing_values = customer_data_cleaned.isnull().sum()

# Calculate the percentage of missing values in each column
total_records = customer_data_cleaned.shape[0]
percentage_missing = (missing_values / total_records) * 100

# Print results
print(np.char.center('Missing Values in Demographic Dataset', 60, '*'))
print(missing_values)
print("\nPercentage of Missing Values:")
print(percentage_missing)

***********Missing Values in Demographic Dataset************
transaction_id             0
product_id                 0
customer_id                0
transaction_date           0
online_order               0
order_status               0
brand                      0
product_line               0
product_class              0
product_size               0
list_price                 0
standard_cost              0
product_first_sold_date    0
dtype: int64

Percentage of Missing Values:
transaction_id             0.0
product_id                 0.0
customer_id                0.0
transaction_date           0.0
online_order               0.0
order_status               0.0
brand                      0.0
product_line               0.0
product_class              0.0
product_size               0.0
list_price                 0.0
standard_cost              0.0
product_first_sold_date    0.0
dtype: float64


In [56]:
customer_data_cleaned.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19445 entries, 0 to 19444
Data columns (total 13 columns):
 #   Column                   Non-Null Count  Dtype         
---  ------                   --------------  -----         
 0   transaction_id           19445 non-null  int64         
 1   product_id               19445 non-null  int64         
 2   customer_id              19445 non-null  int64         
 3   transaction_date         19445 non-null  datetime64[ns]
 4   online_order             19445 non-null  float64       
 5   order_status             19445 non-null  object        
 6   brand                    19445 non-null  object        
 7   product_line             19445 non-null  object        
 8   product_class            19445 non-null  object        
 9   product_size             19445 non-null  object        
 10  list_price               19445 non-null  float64       
 11  standard_cost            19445 non-null  float64       
 12  product_first_sold_date  19445 n

In [57]:
# Check for duplicate rows in the Transaction Dataset
duplicate_rows= customer_data_cleaned[customer_data_cleaned.duplicated()]

# Print duplicate rows, if any
print("Duplicate Rows in Transaction Dataset:")
print(duplicate_rows)

Duplicate Rows in Transaction Dataset:
Empty DataFrame
Columns: [transaction_id, product_id, customer_id, transaction_date, online_order, order_status, brand, product_line, product_class, product_size, list_price, standard_cost, product_first_sold_date]
Index: []
