In [None]:
pip install missingno

: 

In [None]:
# Import required libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import missingno as msno
from scipy.stats import zscore, probplot

# Enable inline plotting for Jupyter Notebook
%matplotlib inline


In [None]:
# Read the inventory CSV (path is relative to the current working directory).
DATA_FILE = 'retail_store_inventory.csv'
df = pd.read_csv(DATA_FILE)

# Report (rows, columns), then preview the first five records.
print(df.shape)
df.head(5)

Basic Info and Summary

In [None]:
# Display basic dataset information.
# DataFrame.info() writes its report straight to stdout and returns None, so it
# is called bare -- wrapping it in print() would append a stray "None" line.
print("Basic Info:")
df.info()

# Summary statistics for numerical columns
print("\nSummary Statistics:")
display(df.describe())


Checking and Visualizing the Missing Values

In [None]:
# Visual overview of where values are missing across the frame
msno.matrix(df)
plt.title("Missing Values Matrix")
plt.show()

# Per-column null counts and the share of all rows they represent
missing_values = df.isnull().sum()
missing_percent = missing_values.div(len(df)).mul(100)
missing_data = pd.DataFrame(
    {
        "Missing Values": missing_values,
        "Percentage": missing_percent,
    }
)

# Restrict the report to columns that actually contain gaps
print("\nMissing Value Statistics:")
has_missing = missing_data["Missing Values"].gt(0)
display(missing_data[has_missing])


In [None]:
# Tally of nulls in each column
missing_values = df.isna().sum()

# Show just the columns where at least one value is missing
print("Missing Values Count:")
print(missing_values.loc[missing_values.gt(0)])


In [None]:
# Number of rows that exactly repeat an earlier row (the first occurrence is not counted)
df.duplicated(keep='first').sum()

In [None]:
import pandas as pd

# Convert 'Date' to datetime so the rows can be ordered chronologically
df['Date'] = pd.to_datetime(df['Date'])

# Sort by Date. A stable sort keeps rows that share a date in their original
# file order, so the row-based windows below do not depend on how an unstable
# sort happens to arrange ties.
df = df.sort_values(by='Date', kind='stable')

# NOTE(review): rolling(window=n) averages over the previous n *rows*, not n
# calendar days. If the file holds several records per date (e.g. one per
# store/product -- confirm against the data), every window mixes them and a
# groupby on those keys would be needed before rolling.
MA_SOURCE_COLUMNS = [
    'Units Sold',
    'Units Ordered',
    'Demand Forecast',
    'Price',
    'Competitor Pricing',
]
MA_WINDOWS = (7, 14)

# Apply Moving Averages: one '<Column_Name>_MA<window>' column per column/window pair
ma_columns = []
for source_col in MA_SOURCE_COLUMNS:
    for window in MA_WINDOWS:
        ma_col = f"{source_col.replace(' ', '_')}_MA{window}"
        df[ma_col] = df[source_col].rolling(window=window).mean()
        ma_columns.append(ma_col)

# Drop only the rows whose moving averages are undefined (the warm-up rows at
# the start of each window). Restricting dropna to the new columns avoids
# silently discarding rows that merely have a gap in an unrelated column.
# NOTE: df is overwritten here, so re-running this cell on the already-trimmed
# frame removes a further set of warm-up rows; re-run from the load cell instead.
df = df.dropna(subset=ma_columns)

# Check updated dataset
df.head()


In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Pairwise correlations between the numeric features only
numeric_cols = df.select_dtypes(include='number')
corr_matrix = numeric_cols.corr()

# Annotated heatmap on a 12x8 inch canvas, two decimals per cell
heatmap_options = dict(annot=True, fmt=".2f", cmap='coolwarm', linewidths=0.5)
fig, ax = plt.subplots(figsize=(12, 8))
sns.heatmap(corr_matrix, ax=ax, **heatmap_options)
ax.set_title("Correlation Heatmap of Features", fontsize=14)

# Render the figure
plt.show()


# Checking for Outliers: Z-Score Method

In [None]:
from scipy.stats import zscore

# Names of every numeric column (also used by the IQR and boxplot cells further down)
numeric_cols = df.select_dtypes(include='number').columns

# Magnitude of each value's Z-score, computed per column
z_scores = np.abs(zscore(df[numeric_cols]))

# A row is flagged when any of its columns has |Z| greater than 3
is_outlier_row = (z_scores > 3).any(axis=1)
outliers = df[is_outlier_row]

print(f"Number of Outliers Detected: {len(outliers)}")


IQR Method

In [None]:
# Quartiles and interquartile range of each numeric column
numeric_values = df[numeric_cols]
Q1 = numeric_values.quantile(0.25)
Q3 = numeric_values.quantile(0.75)
IQR = Q3 - Q1

# Fences sit 1.5*IQR beyond the quartiles; values outside them are marked True
lower_fence = Q1 - 1.5 * IQR
upper_fence = Q3 + 1.5 * IQR
outliers = (numeric_values < lower_fence) | (numeric_values > upper_fence)

# Number of flagged values in each column
print(outliers.sum())


Box Plot

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# One box per numeric feature, all drawn on a single shared axis
fig, ax = plt.subplots(figsize=(10, 6))
sns.boxplot(data=df[numeric_cols], ax=ax)
plt.xticks(rotation=90)  # vertical labels so long column names stay readable
ax.set_title("Boxplot for Outlier Detection")
plt.show()


Histogram Plot

In [None]:
# Distribution of 'Demand Forecast' with a KDE curve overlaid
fig, ax = plt.subplots(figsize=(8, 6))
sns.histplot(df['Demand Forecast'], kde=True, ax=ax)
ax.set_title("Distribution of Demand Forecast")
plt.show()


In [None]:
# Largest value currently in the forecast column
demand_forecast = df['Demand Forecast']
print(demand_forecast.max())

In [None]:
# Smallest value currently in the forecast column (a negative value here means bad records)
demand_forecast = df['Demand Forecast']
print(demand_forecast.min())

In [None]:
# Keep only rows whose forecast is zero or positive, printing the shape before and after.
# Rows with a missing forecast fail the comparison and are removed as well.
print(df.shape)
non_negative_forecast = df['Demand Forecast'] >= 0
df = df[non_negative_forecast]
print(df.shape)


In [None]:
# A row cannot sell more units than it had in stock: keep rows where
# 'Units Sold' does not exceed 'Inventory Level', printing the shape before and after.
print(df.shape)
sales_within_stock = df['Units Sold'] <= df['Inventory Level']
df = df[sales_within_stock]
print(df.shape)

In [None]:
from scipy.stats.mstats import winsorize

# Apply winsorization: the lowest 5% and highest 5% of each listed column are
# replaced by the nearest value that remains inside those limits.
WINSOR_LIMITS = (0.05, 0.05)
WINSOR_COLUMNS = ['Demand Forecast', 'Units Sold']

# df comes out of boolean filtering in earlier cells, so assigning columns on
# it in place can raise SettingWithCopyWarning; assign() builds a new frame
# instead. winsorize() returns a NumPy masked array, and np.asarray() hands
# pandas the plain underlying values.
# NOTE(review): the *_MA7 / *_MA14 columns computed earlier are not recomputed
# here, so they still reflect the values from before winsorization.
df = df.assign(**{
    col: np.asarray(winsorize(df[col].to_numpy(), limits=WINSOR_LIMITS))
    for col in WINSOR_COLUMNS
})
df.head()

In [None]:
# (rows, columns) remaining after the filtering and winsorization cells
n_rows, n_cols = df.shape
print((n_rows, n_cols))

In [None]:
# Upper end of the forecast column after winsorization
demand_forecast = df['Demand Forecast']
print(demand_forecast.max())

In [None]:
# Lower end of the forecast column after winsorization
demand_forecast = df['Demand Forecast']
print(demand_forecast.min())

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Forecast distribution again (30 bins, KDE overlay) now that the tails are capped
ax = sns.histplot(df['Demand Forecast'], bins=30, kde=True)
ax.set_title("Demand Forecast Distribution After Winsorization")
plt.show()


In [None]:
# Correlation is computed on numeric data only, so take that subset first
numeric_df = df.select_dtypes(include='number')
corr_matrix = numeric_df.corr()

# Draw the annotated heatmap of the cleaned dataset
fig, ax = plt.subplots(figsize=(12, 8))
sns.heatmap(corr_matrix, annot=True, fmt=".2f", cmap="coolwarm", linewidths=0.5, ax=ax)
ax.set_title("Correlation Heatmap of Retail Inventory Dataset")
plt.show()
