# Superstore Data Analysis Notebook
## 1) Importing libraries

In [None]:
# Import the libraries used throughout the analysis
import pandas as pd 
import matplotlib.pyplot as plt 
import scipy.stats as stats
import seaborn as sns


## 2) Loading the dataset

In [None]:
# Read the Superstore CSV into a DataFrame, decoding it as ISO-8859-1,
# then preview the first rows.
df = pd.read_csv(
    r"C:/Users/Espadon/Desktop/Sample _Superstore.csv",
    encoding='ISO-8859-1',
)
df.head()



## 3) Data set cleaning

In [None]:
# Report how many rows are exact duplicates of an earlier row
n_duplicates = df.duplicated().sum()
print("Number of duplicated rows:", n_duplicates)

# Report the number of missing (NaN) entries in each column
missing_per_column = df.isna().sum()
print("\nMissing values per column:", missing_per_column)

# Keep only the first occurrence of each duplicated row
df = df.drop_duplicates()





## 4) Data preparation

In [None]:
# Dataset dimensions as a (rows, columns) tuple
print("Shape of the dataset (rows, columns):", df.shape)

# Column-by-column overview; info() prints its report itself
print("\nSummary of the dataset:")
df.info()

# Derived metric: profit expressed as a percentage of sales, row by row
df['Profit Margin'] = df['Profit'].div(df['Sales']).mul(100)
df['Profit Margin'].head()

## 5) Exploratory Data Analysis (EDA)
### a) Basic information and summary statistics

In [None]:
# Basic information
# DataFrame.info() writes its report straight to stdout and returns None, so
# it is called on its own; wrapping it in print() would emit a stray "None".
df.info()
print(df.describe())

# Distinct values of each categorical column used in the analysis below
print(df['Category'].unique())
print(df['Region'].unique())
print(df["Segment"].unique())
print(df['Sub-Category'].unique())

# Number of rows per region
print(df['Region'].value_counts())

# Summary statistics restricted to the key numeric columns
summary_stats = df[['Sales', 'Profit', 'Discount', 'Profit Margin']].describe()
summary_stats


### b) Profit margin 
#### * Across regions

In [None]:
# One-way ANOVA on profit margin, with one sample per region.
# The samples are passed in the order East, West, South, Central.
anova_result1 = stats.f_oneway(
    *(
        df.loc[df['Region'] == region, 'Profit Margin']
        for region in ('East', 'West', 'South', 'Central')
    )
)

# Report the test statistic and its p-value
print("F-value:", anova_result1.statistic)
print("P-value:", anova_result1.pvalue)

# Interpret the result at the 5% significance level
print(
    "Conclusion: There are significant differences in profit margins across regions."
    if anova_result1.pvalue < 0.05
    else "Conclusion: There are no significant differences in profit margins across regions."
)


In [None]:
# Draw one profit-margin box per region on a fresh 8x6 figure
plt.figure(figsize=(8, 6))
region_ax = sns.boxplot(x='Region', y='Profit Margin', data=df, palette='Set2')

# Title and axis labels are set through the Axes returned by seaborn
region_ax.set_title('Profit margin  Distribution by Region')
region_ax.set_xlabel('Region')
region_ax.set_ylabel('Profit margin')
plt.show()

#### * Across different categories

In [None]:
# One-way ANOVA on profit margin, with one sample per product category
# (Furniture, Office Supplies, Technology).
anova_result2 = stats.f_oneway(df[df["Category"] == "Furniture"]['Profit Margin'],
                               df[df["Category"] == "Office Supplies"]['Profit Margin'],
                               df[df["Category"] == "Technology"]['Profit Margin'])
# Output results
print("F-value:", anova_result2.statistic)
print("P-value:", anova_result2.pvalue)

# Interpret the result at the 5% significance level. Both messages use the
# same wording ("product category") so the two outcomes read consistently.
if anova_result2.pvalue < 0.05:
    print("Conclusion: There are significant differences in profit margins across product category.")
else:
    print("Conclusion: There are no significant differences in profit margins across product category.")

In [None]:
# Boxplot for profit margin by category
plt.figure(figsize=(8, 6))
# The column is named 'Category' (capital C); column lookups are
# case-sensitive, so the lowercase spelling would not be found in df.
sns.boxplot(data=df, x='Category', y='Profit Margin', palette='Set2')
plt.title('Profit margin Distribution by category')
plt.xlabel('Category')
plt.ylabel('Profit margin')
plt.show()

#### * Across different segments

In [None]:
# One-way ANOVA on profit margin, with one sample per customer segment
# (Consumer, Corporate, Home Office).
# All three filters use the 'Segment' column; column lookups are
# case-sensitive, so a lowercase "segment" key would raise KeyError.
anova_result3 = stats.f_oneway(df[df["Segment"] == "Consumer"]['Profit Margin'],
                               df[df["Segment"] == "Corporate"]['Profit Margin'],
                               df[df["Segment"] == "Home Office"]['Profit Margin'])
# Output results
print("F-value:", anova_result3.statistic)
print("P-value:", anova_result3.pvalue)

# Interpret the result at the 5% significance level
if anova_result3.pvalue < 0.05:
    print("Conclusion: There are significant differences in profit margins across segment.")
else:
    print("Conclusion: There are no significant differences in profit margins across segment.")


In [None]:
# Boxplot for profit margin by segment
plt.figure(figsize=(8, 6))
# The column is named 'Segment' (capital S); column lookups are
# case-sensitive, so the lowercase spelling would not be found in df.
sns.boxplot(data=df, x='Segment', y='Profit Margin', palette='Set2')
plt.title('Profit margin distribution by segment')
plt.xlabel('segment')
plt.ylabel('Profit margin')
plt.show()


#### * Across sub-categories

In [None]:
# Draw one profit-margin box per sub-category on a fresh 8x6 figure
plt.figure(figsize=(8, 6))
subcat_ax = sns.boxplot(x='Sub-Category', y='Profit Margin', data=df, palette='Set2')
subcat_ax.set_title('Profit Margin Distribution by Sub-category')

# Tilt and shrink the tick labels for readability
plt.xticks(rotation=45, fontsize=8)
subcat_ax.set_xlabel('Sub-category')
subcat_ax.set_ylabel('Profit Margin (%)')

# Re-fit the layout around the rotated labels before rendering
plt.tight_layout()
plt.show()

#### c) Product categories across regions

In [None]:
# Cross-tabulation: number of rows for each (Category, Region) pair
category_region = pd.crosstab(df['Category'], df['Region'])
print("Category vs. Region:")
# A bare expression is only rendered when it is the last statement of a
# notebook cell, so the table is printed explicitly here.
print(category_region)

# Stacked bar chart: one bar per category, stacked by region
category_region.plot(kind='bar', stacked=True, figsize=(10, 6))
plt.title('Category Distribution by Region')
plt.xlabel('Category')
plt.ylabel('Count')
plt.xticks(rotation=45)
plt.show()

#### d) Product sub-categories across regions

In [None]:
# Cross-tabulation: number of rows for each (Sub-Category, Region) pair.
# Built here so the heatmap below has its input table available.
subcategory_region = pd.crosstab(df['Sub-Category'], df['Region'])

# Heatmap: Sub-Category vs. Region
plt.figure(figsize=(10, 6))
# fmt='d' renders the annotations as integers (the table holds counts)
sns.heatmap(subcategory_region, annot=True, fmt='d', cmap='Blues')
plt.title('Sub-Category Distribution by Region')
plt.xlabel('Region')
plt.ylabel('Sub-Category')
plt.show()

#### e)  Profit margin based on category and regions 

In [None]:
# Mean profit margin for every (Category, Region) combination
ct = pd.crosstab(
    index=df['Category'],
    columns=df['Region'],
    values=df['Profit Margin'],
    aggfunc='mean',
)

# Annotated heatmap of those means on a diverging colour map
margin_ax = sns.heatmap(ct, annot=True, cmap='coolwarm')
margin_ax.set_title('Average Profit Margin by Category and Region')
plt.show()

#### f) Profit prediction based on sales and discount 

In [1]:
 # profit prediction based on sales and discount
import pandas as pd
import statsmodels.api as sm

# NOTE: this rebinds df to the raw CSV contents, so columns added or rows
# dropped earlier in the session are not present from here on.
df = pd.read_csv(
    r"C:/Users/Espadon/Desktop/Sample _Superstore.csv",
    encoding='ISO-8859-1',
)
df.head()

# Response variable, and predictors with an added intercept column
y = df['Profit']
X = sm.add_constant(df[['Sales', 'Discount']])

# Fit ordinary least squares and print the full regression report
model = sm.OLS(y, X).fit()
print(model.summary())

                            OLS Regression Results                            
Dep. Variable:                 Profit   R-squared:                       0.272
Model:                            OLS   Adj. R-squared:                  0.272
Method:                 Least Squares   F-statistic:                     1866.
Date:                Tue, 25 Feb 2025   Prob (F-statistic):               0.00
Time:                        13:20:19   Log-Likelihood:                -67126.
No. Observations:                9994   AIC:                         1.343e+05
Df Residuals:                    9991   BIC:                         1.343e+05
Df Model:                           2                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const         24.3071      2.626      9.256      0.0