In [1]:
import pandas as pd

In [3]:
# Load the raw dataset into a DataFrame, decoding the CSV as ISO-8859-1.
main_df = pd.read_csv(
    'DataCoSupplyChainDataset.csv',
    encoding='ISO-8859-1',
)

In [4]:
# Per-column null counts, restricted to columns that actually contain nulls
# and ordered from most to least affected.
null_counts = main_df.isna().sum()
has_nulls = null_counts > 0
missing_values = null_counts[has_nulls].sort_values(ascending=False)

In [5]:
# Convert both timestamp columns to datetime; values that cannot be parsed
# become NaT (errors='coerce') instead of raising.
date_cols = ['order date (DateOrders)', 'shipping date (DateOrders)']
for date_col in date_cols:
    main_df[date_col] = pd.to_datetime(main_df[date_col], errors='coerce')

In [6]:
# Keep every row in which at least one of the date columns is null.
has_null_date = main_df[date_cols].isna().any(axis=1)
date_parse_issues = main_df[has_null_date]

In [10]:
# Inspect the rows whose order date or shipping date is null.
date_parse_issues

Unnamed: 0,Type,Days for shipping (real),Days for shipment (scheduled),Benefit per order,Sales per customer,Delivery Status,Late_delivery_risk,Category Id,Category Name,Customer City,...,Order Zipcode,Product Card Id,Product Category Id,Product Description,Product Image,Product Name,Product Price,Product Status,shipping date (DateOrders),Shipping Mode


In [7]:
def count_iqr_outliers(df, columns, k=1.5):
    """Count IQR-rule outliers in each of the given columns.

    A value is treated as an outlier when it lies below ``Q1 - k * IQR`` or
    above ``Q3 + k * IQR``, where Q1 and Q3 are the column's 25th and 75th
    percentiles and ``IQR = Q3 - Q1``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the columns to scan.
    columns : iterable
        Labels of the columns to check.
    k : float, default 1.5
        Multiplier applied to the IQR when computing the two fences.

    Returns
    -------
    dict
        Maps column label to the number of outlying rows (as a plain
        ``int``). Columns with no outliers are omitted.
    """
    counts = {}
    for col in columns:
        values = df[col]
        q1 = values.quantile(0.25)
        q3 = values.quantile(0.75)
        iqr = q3 - q1
        is_outlier = (values < q1 - k * iqr) | (values > q3 + k * iqr)
        # Summing the boolean mask counts the rows without building a filtered
        # copy of the whole frame; int() keeps the dict's repr as plain ints.
        n_outliers = int(is_outlier.sum())
        if n_outliers:
            counts[col] = n_outliers
    return counts


# NOTE(review): this scans every int64/float64 column, so identifier-like
# columns (e.g. 'Customer Id', 'Department Id') are included even though an
# IQR fence is not meaningful for them - consider excluding them.
numerical_cols = main_df.select_dtypes(include=['float64', 'int64']).columns
outlier_flags = count_iqr_outliers(main_df, numerical_cols)

In [9]:
# Show the per-column outlier counts (only columns with at least one outlier).
outlier_flags

{'Benefit per order': 18942,
 'Sales per customer': 1943,
 'Customer Id': 1198,
 'Department Id': 362,
 'Latitude': 9,
 'Longitude': 1414,
 'Order Customer Id': 1198,
 'Order Item Discount': 7537,
 'Order Item Product Price': 2048,
 'Order Item Profit Ratio': 17300,
 'Sales': 488,
 'Order Item Total': 1943,
 'Order Profit Per Order': 18942,
 'Product Price': 2048}

In [8]:
# Gather the results of the three checks into a single summary dictionary.
data_quality_issues = {}
data_quality_issues["Columns with Missing Values"] = missing_values.to_dict()
data_quality_issues["Invalid Date Entries"] = date_parse_issues.shape[0]
data_quality_issues["Outliers Detected in Numerical Columns"] = outlier_flags

data_quality_issues

{'Columns with Missing Values': {'Product Description': 180519,
  'Order Zipcode': 155679,
  'Customer Lname': 8,
  'Customer Zipcode': 3},
 'Invalid Date Entries': 0,
 'Outliers Detected in Numerical Columns': {'Benefit per order': 18942,
  'Sales per customer': 1943,
  'Customer Id': 1198,
  'Department Id': 362,
  'Latitude': 9,
  'Longitude': 1414,
  'Order Customer Id': 1198,
  'Order Item Discount': 7537,
  'Order Item Product Price': 2048,
  'Order Item Profit Ratio': 17300,
  'Sales': 488,
  'Order Item Total': 1943,
  'Order Profit Per Order': 18942,
  'Product Price': 2048}}

## Explanation

This notebook flags three kinds of obvious data-quality issues in the dataset:
- Columns with missing values
- Invalid or unparseable timestamps
- Outliers in numerical fields (using the interquartile range (IQR) method: values below Q1 − 1.5·IQR or above Q3 + 1.5·IQR)