## Statistical analysis on the superstore dataset

In [1]:
# Importing the required modules for the project.

import pandas as pd        # To create and manipulate pandas DataFrames 
import numpy as np         # for further statistical analysis of DataFrames
import xlrd                # for reading excel workbooks/worksheets.

In [2]:
# Load the Superstore workbook into a DataFrame named df.

df = pd.read_excel(io='Superstore.xls')

In [4]:
# Preview the first five records to get a feel for the columns and their values.

df.head(n=5)

Unnamed: 0,Row ID,Order ID,Order Date,Ship Date,Ship Mode,Customer ID,Customer Name,Segment,Country/Region,City,...,Postal Code,Region,Product ID,Category,Sub-Category,Product Name,Sales,Quantity,Discount,Profit
0,1,CA-2020-152156,2020-11-08,2020-11-11,Second Class,CG-12520,Claire Gute,Consumer,United States,Henderson,...,42420.0,South,FUR-BO-10001798,Furniture,Bookcases,Bush Somerset Collection Bookcase,261.96,2,0.0,41.9136
1,2,CA-2020-152156,2020-11-08,2020-11-11,Second Class,CG-12520,Claire Gute,Consumer,United States,Henderson,...,42420.0,South,FUR-CH-10000454,Furniture,Chairs,"Hon Deluxe Fabric Upholstered Stacking Chairs,...",731.94,3,0.0,219.582
2,3,CA-2020-138688,2020-06-12,2020-06-16,Second Class,DV-13045,Darrin Van Huff,Corporate,United States,Los Angeles,...,90036.0,West,OFF-LA-10000240,Office Supplies,Labels,Self-Adhesive Address Labels for Typewriters b...,14.62,2,0.0,6.8714
3,4,US-2019-108966,2019-10-11,2019-10-18,Standard Class,SO-20335,Sean O'Donnell,Consumer,United States,Fort Lauderdale,...,33311.0,South,FUR-TA-10000577,Furniture,Tables,Bretford CR4500 Series Slim Rectangular Table,957.5775,5,0.45,-383.031
4,5,US-2019-108966,2019-10-11,2019-10-18,Standard Class,SO-20335,Sean O'Donnell,Consumer,United States,Fort Lauderdale,...,33311.0,South,OFF-ST-10000760,Office Supplies,Storage,Eldon Fold 'N Roll Cart System,22.368,2,0.2,2.5164


In [5]:
# Flag every row that repeats an earlier row (True marks a duplicate; the first occurrence stays False).

print(df.duplicated(keep='first'))

0       False
1       False
2       False
3       False
4       False
        ...  
9989    False
9990    False
9991    False
9992    False
9993    False
Length: 9994, dtype: bool


In [6]:
# Count how many rows in the dataset are flagged as duplicates.

df.duplicated(keep='first').sum()

0

In [7]:
# Remove exact duplicate rows from the dataset, keeping the first occurrence of each.
# The result is assigned back to df instead of using inplace=True, so the frame is
# never mutated in place and re-running this cell always gives the same df.

df = df.drop_duplicates()

In [8]:
# List every column label in the dataset to see which fields are available for analysis.

df.columns

Index(['Row ID', 'Order ID', 'Order Date', 'Ship Date', 'Ship Mode',
       'Customer ID', 'Customer Name', 'Segment', 'Country/Region', 'City',
       'State', 'Postal Code', 'Region', 'Product ID', 'Category',
       'Sub-Category', 'Product Name', 'Sales', 'Quantity', 'Discount',
       'Profit'],
      dtype='object')

In [9]:
# Preview the first five rows of a handful of columns of interest.

print(df.loc[:, ['Product ID', 'Quantity', 'Sales', 'Profit']].head(n=5))

        Product ID  Quantity     Sales    Profit
0  FUR-BO-10001798         2  261.9600   41.9136
1  FUR-CH-10000454         3  731.9400  219.5820
2  OFF-LA-10000240         2   14.6200    6.8714
3  FUR-TA-10000577         5  957.5775 -383.0310
4  OFF-ST-10000760         2   22.3680    2.5164


In [10]:
# Overall total of the "Quantity" column across every row of the dataset.

print(df.loc[:, ['Quantity']].sum(axis=0))

Quantity    37873
dtype: int64


In [12]:
# Sample stats on the Sales column values.
# Sum every value in the "Sales" column, then truncate the total to a whole number with int().
# The column is selected with single brackets so that .sum() returns a scalar: calling
# int() on the one-element Series produced by df[['Sales']].sum() is deprecated in pandas
# and is slated to raise a TypeError.

print(int(df['Sales'].sum()))

2297200


### Working with the dataset to bring out the five-number summary, i.e. Min (lowest value of the dataset), Q1 (first quartile: the median of the lower half of the data), Q2 (median of the whole dataset), Q3 (third quartile: the median of the upper half of the data) and Max (highest value of the dataset), together with the Mean value (average value of the whole dataset)

### Chose to analyse the Quantity, Sales and Profit columns.

#### Sales column

#### Sorting the Sales Column(values) in ascending order

In [129]:
# Keep only the Sales column, order it from smallest to largest and round each value to 2 decimals.
# NOTE(review): the statistics computed from Sales afterwards therefore use the rounded values — confirm that is intended.
Sales = df.loc[:, ["Sales"]].sort_values("Sales").round(decimals=2)

In [124]:
# Show the 15 smallest values of the sorted Sales column.
print(Sales.iloc[:15].round(decimals=2))

      Sales
4101   0.44
9292   0.56
8658   0.84
4711   0.85
2106   0.88
7548   0.90
8033   0.98
2761   0.99
8024   1.04
1332   1.08
4933   1.08
976    1.08
987    1.11
2605   1.17
4874   1.19


In [131]:
# Show the 10 largest values of the sorted Sales column.
print(Sales.iloc[-10:])

         Sales
6425   8399.98
8488   8749.95
4277   9099.93
4098   9449.95
9039   9892.74
4190  10499.97
2623  11199.97
8153  13999.96
6826  17499.95
2697  22638.48


#### Mean Value of the Sales column data

In [133]:
# Arithmetic mean of the Sales values, rounded to 2 decimals (kept as a one-element Series).
average_sales = Sales.mean(axis=0).round(decimals=2)
print(average_sales)

Sales    229.86
dtype: float64


#### Highest Value in the Sales column

In [134]:
# Largest single value in the Sales column, rounded to 2 decimals (kept as a one-element Series).
maximum_sales = Sales.max(axis=0).round(decimals=2)
print(maximum_sales)

Sales    22638.48
dtype: float64


#### Lowest Value in the Sales column

In [135]:
# Smallest single value in the Sales column, rounded to 2 decimals (kept as a one-element Series).
minimum_sales = Sales.min(axis=0).round(decimals=2)
print(minimum_sales)

Sales    0.44
dtype: float64


#### Range of values between the highest and lowest values in the Sales column data

In [147]:
# Spread of the Sales data: the highest value minus the lowest value.
Range = maximum_sales.sub(minimum_sales)
print('The range between maximum_sales and minimum_sales is ', Range)

The range between maximum_sales and minimum_sales is  Sales    22638.04
dtype: float64


#### Q2(Median of the whole sales column)

In [136]:
# Q2: the middle value of the Sales column (kept as a one-element Series).
median_sales = Sales.median(axis=0)
print(median_sales)

Sales    54.49
dtype: float64


#### Q1 (median of the lower half of the data / 25th percentile of the data)

In [143]:
# First quartile: the 25th percentile of the Sales values.
Q1 = np.percentile(Sales.to_numpy(), q=25)
print('The Q1 value of the Sales column is ', Q1)

The Q1 value of the Sales column is  17.28


#### Q3 (median of the upper half of the data / 75th percentile of the data)

In [144]:
# Third quartile: the 75th percentile of the Sales values.
Q3 = np.percentile(Sales.to_numpy(), q=75)
print('The Q3 value of the Sales column is ', Q3)

The Q3 value of the Sales column is  209.94


#### Interquartile Range Value for the sales column

In [145]:
# Interquartile range: the third quartile minus the first quartile.
Interquartile_range = np.subtract(Q3, Q1)
print('The interquartile range of the Sales column data is ', Interquartile_range)

The interquartile range of the Sales column data is  192.66


#### Total sum of all sales values in the Sales column

In [149]:
# Add up every value in the Sales column and round the total to 2 decimals (kept as a one-element Series).
sum_of_total_sales = Sales.sum(axis=0).round(decimals=2)
print(sum_of_total_sales)

Sales    2297200.74
dtype: float64
