# Library

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
# Silence every warning for the rest of the kernel session so cell outputs stay clean.
# NOTE(review): this also hides deprecation warnings raised by the libraries above;
# consider narrowing the filter to specific categories.
warnings.filterwarnings("ignore")

# Wildcard import of the helper module. The cells below call
# group_columns_by_type and save_metadata, which are not defined in this
# notebook and presumably come from here — TODO confirm and switch to
# explicit imports so the origin of each name is visible.
from utils import *

# Data

In [2]:
# Load the raw supply-chain CSV into `df`, the frame used by every later cell.
# The relative path is resolved against the kernel's working directory.
RAW_DATA_PATH = r'../../data/supply_chain_data.csv'
df = pd.read_csv(RAW_DATA_PATH, encoding='utf-8')

In [3]:
# Preview the first five rows to sanity-check the load.
df.head(n=5)

Unnamed: 0,Product type,SKU,Price,Availability,Number of products sold,Revenue generated,Customer demographics,Stock levels,Lead times,Order quantities,...,Location,Lead time,Production volumes,Manufacturing lead time,Manufacturing costs,Inspection results,Defect rates,Transportation modes,Routes,Costs
0,haircare,SKU0,69.808006,55,802,8661.996792,Non-binary,58,7,96,...,Mumbai,29,215,29,46.279879,Pending,0.22641,Road,Route B,187.752075
1,skincare,SKU1,14.843523,95,736,7460.900065,Female,53,30,37,...,Mumbai,23,517,30,33.616769,Pending,4.854068,Road,Route B,503.065579
2,haircare,SKU2,11.319683,34,8,9577.749626,Unknown,1,10,88,...,Mumbai,12,971,27,30.688019,Pending,4.580593,Air,Route C,141.920282
3,skincare,SKU3,61.163343,68,83,7766.836426,Non-binary,23,13,59,...,Kolkata,24,937,18,35.624741,Fail,4.746649,Rail,Route A,254.776159
4,skincare,SKU4,4.805496,26,871,2686.505152,Non-binary,5,3,56,...,Delhi,5,414,3,92.065161,Fail,3.14558,Air,Route A,923.440632


In [4]:
# Normalize the headers to snake_case: trim surrounding whitespace, lowercase,
# then swap spaces for underscores (e.g. "Product type" -> "product_type").
snake_case_columns = (
    df.columns
    .str.strip()
    .str.lower()
    .str.replace(' ', '_')
)
df.columns = snake_case_columns

# Overview

In [5]:
# Report the dataset's dimensions: column count first, then row count.
# The values are interpolated into the f-strings, so the printed text is
# unchanged while the `f` prefix now actually does something.
print(f"Num of variables: {df.shape[1]}")
print(f"Num of rows: {df.shape[0]}")

Num of variables: 24
Num of rows: 100


In [6]:
# Summarize each column's non-null count and dtype to spot missing data and mistyped columns.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 24 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   product_type             100 non-null    object 
 1   sku                      100 non-null    object 
 2   price                    100 non-null    float64
 3   availability             100 non-null    int64  
 4   number_of_products_sold  100 non-null    int64  
 5   revenue_generated        100 non-null    float64
 6   customer_demographics    100 non-null    object 
 7   stock_levels             100 non-null    int64  
 8   lead_times               100 non-null    int64  
 9   order_quantities         100 non-null    int64  
 10  shipping_times           100 non-null    int64  
 11  shipping_carriers        100 non-null    object 
 12  shipping_costs           100 non-null    float64
 13  supplier_name            100 non-null    object 
 14  location                 10

## Data

## Grouping variables by their respective data types

### Numerical and categorical

In [7]:
# Split the column names into three groups by data type.
# With display_info=True the helper prints a per-group summary (see the cell output).
# NOTE(review): group_columns_by_type is not defined in this notebook —
# presumably it comes from the `utils` wildcard import; confirm its return order.
(
    numerical_cols,
    categorical_cols,
    date_cols,
) = group_columns_by_type(df, display_info=True)

Total numeric columns: 15
Numeric columns: ['price', 'availability', 'number_of_products_sold', 'revenue_generated', 'stock_levels', 'lead_times', 'order_quantities', 'shipping_times', 'shipping_costs', 'lead_time', 'production_volumes', 'manufacturing_lead_time', 'manufacturing_costs', 'defect_rates', 'costs']

Total categorical columns: 9
Categorical columns: ['product_type', 'sku', 'customer_demographics', 'shipping_carriers', 'supplier_name', 'location', 'inspection_results', 'transportation_modes', 'routes']



## Datetime

In [8]:
# Inspect the non-numeric columns to check whether any of them actually hold dates.
df.loc[:, categorical_cols].head()

Unnamed: 0,product_type,sku,customer_demographics,shipping_carriers,supplier_name,location,inspection_results,transportation_modes,routes
0,haircare,SKU0,Non-binary,Carrier B,Supplier 3,Mumbai,Pending,Road,Route B
1,skincare,SKU1,Female,Carrier A,Supplier 3,Mumbai,Pending,Road,Route B
2,haircare,SKU2,Unknown,Carrier B,Supplier 1,Mumbai,Pending,Air,Route C
3,skincare,SKU3,Non-binary,Carrier C,Supplier 5,Kolkata,Fail,Rail,Route A
4,skincare,SKU4,Non-binary,Carrier A,Supplier 1,Delhi,Fail,Air,Route A


=> The dataset has no datetime columns.

# Duplicated

In [9]:
# Flag rows that are exact repeats of an earlier row, then count them.
duplicate_row_mask = df.duplicated()
duplicate_row_mask.sum()

np.int64(0)

# Missing values

In [10]:
# Count missing values per column (`isna` is the same method as `isnull`).
df.isna().sum()

product_type               0
sku                        0
price                      0
availability               0
number_of_products_sold    0
revenue_generated          0
customer_demographics      0
stock_levels               0
lead_times                 0
order_quantities           0
shipping_times             0
shipping_carriers          0
shipping_costs             0
supplier_name              0
location                   0
lead_time                  0
production_volumes         0
manufacturing_lead_time    0
manufacturing_costs        0
inspection_results         0
defect_rates               0
transportation_modes       0
routes                     0
costs                      0
dtype: int64

# Save

## Metadata `.json`

In [11]:
# Write metadata for the current frame to a JSON file.
# NOTE(review): save_metadata is not defined in this notebook — presumably it
# comes from the `utils` wildcard import; what it records is not visible here.
METADATA_PATH = r"../../data/metadata/metadata1.json"
save_metadata(df, METADATA_PATH)

## To `.csv`

In [12]:
# Persist the column-normalized frame as the cleaned dataset.
# The path uses forward slashes throughout: backslashes are path separators
# only on Windows, while on Linux/macOS they are treated as literal characters
# of the file name, which would put the file outside data/clean/.
# Forward slashes work on every platform.
# NOTE(review): assumes the ../../data/clean/ directory already exists.
df.to_csv(
    r"../../data/clean/clean1.csv",
    index=False,  # do not write the row index as an extra column
    encoding="utf-8",
)