# Dataset Exploration Overview 
Comprehensive analysis of retail inventory forecasting dataset

## Importing pandas to load Retail Sales .csv file and explore data

In [57]:
import pandas as pd

## Configuring display properties

In [58]:
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 100)

## Loading dataset into data frame for examination

In [59]:
df = pd.read_csv('../data/raw/retail_store_inventory.csv')

## Key analysis points

In [76]:
print(f"Dataset shape: {df.shape}")
print(f"Memory usage: {df.memory_usage().sum()/1024**2:.2f} MB")
print(f"Column types:\n{df.dtypes}\n\n")
print(f"Date range: {df['Date'].min()} to {df['Date'].max()}")
print(f"Number of stores: {df['Store ID'].nunique()}")
print(f"Number of products: {df['Product ID'].nunique()}")
#print(f"Number of Store-Product-Date combinations: {df[['Store ID', 'Product ID', 'Date']].nunique()}")
print(f"Product categories: {df['Category'].unique()}")
print(f"Regions: {df['Region'].unique()}")
print(f"Weather conditions: {df['Weather Condition'].unique()}")
print(f"Seasons: {df['Seasonality'].unique()}")

Dataset shape: (73100, 15)
Memory usage: 8.37 MB
Column types:
Date                   object
Store ID               object
Product ID             object
Category               object
Region                 object
Inventory Level         int64
Units Sold              int64
Units Ordered           int64
Demand Forecast       float64
Price                 float64
Discount                int64
Weather Condition      object
Holiday/Promotion       int64
Competitor Pricing    float64
Seasonality            object
dtype: object


Date range: 2022-01-01 to 2024-01-01
Number of stores: 5
Number of products: 20
Product categories: ['Groceries' 'Toys' 'Electronics' 'Furniture' 'Clothing']
Regions: ['North' 'South' 'West' 'East']
Weather conditions: ['Rainy' 'Sunny' 'Cloudy' 'Snowy']
Seasons: ['Autumn' 'Summer' 'Winter' 'Spring']


## Data quality

In [77]:
print(f"\nNumber of missing values: {df.isnull().sum().sum()}")
print(f"Number of duplicated values: {df.duplicated().sum()}")


Number of missing values: 0
Number of duplicated values: 0


## Date continuity

In [91]:
df['Date'] = pd.to_datetime(df['Date'])
df_sorted = df.sort_values('Date')
date_diff = df_sorted['Date'].diff()
gaps = date_diff[date_diff > pd.Timedelta(days=1)]

if len(gaps) > 0:
    print(f"Date gaps found: {len(gaps)} instances")
    print("Largest gaps:")
    print(gaps.sort_values(ascending=False).head())
else:
    print("No date gaps detected - continuous daily data")

No date gaps detected - continuous daily data


## First 5 rows of dataset

In [92]:
df.head(5)

Unnamed: 0,Date,Store ID,Product ID,Category,Region,Inventory Level,Units Sold,Units Ordered,Demand Forecast,Price,Discount,Weather Condition,Holiday/Promotion,Competitor Pricing,Seasonality,Day_of_Week,Month
0,2022-01-01,S001,P0001,Groceries,North,231,127,55,135.47,33.5,20,Rainy,0,29.69,Autumn,Saturday,January
1,2022-01-01,S001,P0002,Toys,South,204,150,66,144.04,63.01,20,Sunny,0,66.16,Autumn,Saturday,January
2,2022-01-01,S001,P0003,Toys,West,102,65,51,74.02,27.99,10,Sunny,1,31.32,Summer,Saturday,January
3,2022-01-01,S001,P0004,Toys,North,469,61,164,62.18,32.72,10,Cloudy,1,34.74,Autumn,Saturday,January
4,2022-01-01,S001,P0005,Electronics,East,166,14,135,9.26,73.64,0,Sunny,0,68.95,Summer,Saturday,January
