In [2]:
import pandas as pd 
import matplotlib.pyplot as plt 
import seaborn as sns 
from scipy.stats import ttest_ind 
import statsmodels.api as sm 

In [4]:
data = pd.read_csv('DSS Data.csv')

In [6]:
data

Unnamed: 0,Product type,SKU,Price,Availability,Number of products sold,Revenue generated,Customer demographics,Stock levels,Lead times,Order quantities,...,Location,Lead time,Production volumes,Manufacturing lead time,Manufacturing costs,Inspection results,Defect rates,Transportation modes,Routes,Costs
0,haircare,SKU0,69.808006,55,802,8661.996792,Non-binary,58,7,96,...,Mumbai,29,215,29,46.279879,Pending,0.426410,Road,Route B,197.752075
1,skincare,SKU1,14.843523,95,736,7460.900065,Female,53,30,37,...,Mumbai,23,517,30,33.616769,Pending,5.054068,Road,Route B,513.065579
2,haircare,SKU2,11.319683,34,8,9577.749626,Unknown,1,10,88,...,Mumbai,12,971,27,30.688019,Pending,4.780593,Air,Route C,151.920282
3,skincare,SKU3,61.163343,68,83,7766.836426,Non-binary,23,13,59,...,Kolkata,24,937,18,35.624741,Fail,4.946649,Rail,Route A,264.776159
4,skincare,SKU4,4.805496,26,871,2686.505152,Non-binary,5,3,56,...,Delhi,5,414,3,92.065161,Fail,3.345580,Air,Route A,933.440632
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,haircare,SKU95,77.903927,65,672,7386.363944,Unknown,15,14,26,...,Mumbai,18,450,26,58.890686,Pending,1.410882,Air,Route A,788.864241
96,cosmetics,SKU96,24.423131,29,324,7698.424766,Non-binary,67,2,32,...,Mumbai,28,648,28,17.803756,Pending,4.072048,Road,Route A,198.742141
97,haircare,SKU97,3.526111,56,62,4370.916580,Male,46,19,4,...,Mumbai,10,535,13,65.765156,Fail,3.576238,Road,Route A,550.132423
98,skincare,SKU98,19.754605,43,913,8525.952560,Female,53,1,27,...,Chennai,28,581,9,5.604691,Pending,3.108122,Rail,Route A,892.198864


In [8]:
print("Dataset Information:") 
print(data.info()) 
print("\nSummary Statistics:") 
print(data.describe()) 
print("\nMissing Values:") 
print(data.isnull().sum())

Dataset Information:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 24 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   Product type             100 non-null    object 
 1   SKU                      100 non-null    object 
 2   Price                    100 non-null    float64
 3   Availability             100 non-null    int64  
 4   Number of products sold  100 non-null    int64  
 5   Revenue generated        100 non-null    float64
 6   Customer demographics    100 non-null    object 
 7   Stock levels             100 non-null    int64  
 8   Lead times               100 non-null    int64  
 9   Order quantities         100 non-null    int64  
 10  Shipping times           100 non-null    int64  
 11  Shipping carriers        100 non-null    object 
 12  Shipping costs           100 non-null    float64
 13  Supplier name            100 non-null    object 
 14  Locati

In [13]:
correlation_columns = ['price', 'stock_levels', 'shipping_costs', 'revenue_generated', 'profit'] 
if all(col in data.columns for col in correlation_columns):
    correlation_matrix = data[correlation_columns].corr()
    plt.figure(figsize=(8, 6)) 
    sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt=".2f") 
    plt.title("Correlation Matrix of Supply Chain Metrics") 
    plt.show() 
else: 
    print("Not all columns for correlation analysis are available in the dataset.")  

Not all columns for correlation analysis are available in the dataset.


In [41]:
if "shipping_carriers" in data.columns and "shipping_costs" in data.columns: 
    unique_carriers = data['shipping_carriers'].unique() 
    if len(unique_carriers)>=2:
        carrier_1_data = data[data['shipping_carriers'] == unique_carriers[0]]['shipping_costs'] 
        carrier_2_data = data[data['shipping_carriers'] == unique_carriers[1]]['shipping_costs'] 
        t_stat, p_value = ttest_ind(carrier_1_data, carrier_2_data, equal_var=False) 
         
        print(f"\nT-Test Results for Shipping Costs between {unique_carriers[0]} and {unique_carriers[1]}:") 
        print(f"T-Statistic: {t_stat}, P-Value: {p_value}") 
        if p_value < 0.05: 
            print("Significant difference in shipping costs between the two carriers.") 
        else: 
            print("No significant difference in shipping costs between the two carriers.") 
    else: 
        print("Not enough unique carriers for hypothesis testing.") 
else: 
    print("Required columns for hypothesis testing are missing.") 

Required columns for hypothesis testing are missing.


In [49]:
if "product_type" in data.columns and "revenue_generated" in data.columns: 
    plt.figure(figsize=(10, 6)) 
    sns.barplot(x='product_type', y='revenue_generated', data=data, ci=None, estimator=sum) 
    plt.title('Total Revenue by Product Type') 
    plt.xlabel('Product Type') 
    plt.ylabel('Total Revenue') 
    plt.xticks(rotation=45) 
    plt.show() 
else: 
    print("Required columns for revenue analysis are missing.")

Required columns for revenue analysis are missing.
