# Key Insights from the Superstore Dataset Exploration<br>

## by Ubogun O. Divine-Favour

## Investigation Overview 



## Dataset Overview 



In [1]:
#import libraries 
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt 
import seaborn as sb 
sb.set_style("darkgrid") 

%matplotlib inline 

# suppress warnings from final output
import warnings
warnings.simplefilter("ignore")


In [2]:
#read the Superstore sample CSV into a DataFrame
samp_superstore = pd.read_csv(filepath_or_buffer = "sample_superstore.csv")


## Data Wrangling

In [3]:
#collect the current column names as a plain Python list
col_names = samp_superstore.columns.tolist()


In [4]:
def replace_space(df, list_col_names):
    """
    Rename, in place, every listed column whose name contains a
    space so that each space becomes '_'.

    The same DataFrame object that was passed in is returned.
    """
    # map only the names that actually contain a space to their new spelling
    renames = {name: name.replace(" ", "_")
               for name in list_col_names if " " in name}
    if renames:
        df.rename(columns = renames, inplace = True)
    return df

In [5]:
#swap spaces for underscores in the column names
samp_superstore = replace_space(df = samp_superstore,
                                list_col_names = col_names)

#show the first row to check the new column names
samp_superstore.head(n = 1)

Unnamed: 0,Ship_Mode,Segment,Country,City,State,Postal_Code,Region,Category,Sub-Category,Sales,Quantity,Discount,Profit
0,Second Class,Consumer,United States,Henderson,Kentucky,42420,South,Furniture,Bookcases,261.96,2,0.0,41.9136


In [6]:
#remove the Country column from the frame
samp_superstore.drop(columns = ["Country"], inplace = True)

#list the remaining columns to check
samp_superstore.columns

Index(['Ship_Mode', 'Segment', 'City', 'State', 'Postal_Code', 'Region',
       'Category', 'Sub-Category', 'Sales', 'Quantity', 'Discount', 'Profit'],
      dtype='object')

In [7]:
#store Postal_Code as text instead of integers
samp_superstore["Postal_Code"] = samp_superstore["Postal_Code"].astype(str)

#check the resulting column types
samp_superstore.dtypes

Ship_Mode        object
Segment          object
City             object
State            object
Postal_Code      object
Region           object
Category         object
Sub-Category     object
Sales           float64
Quantity          int64
Discount        float64
Profit          float64
dtype: object

In [8]:
def incomplete_code_checker(col_name, min_length = 5):
    """Return the number of codes in a code column that are
    shorter than `min_length` characters.

    col_name   : iterable of strings (e.g. a column of postal
                 codes already converted to text)
    min_length : number of characters a complete code must have;
                 defaults to 5

    Returns an int; 0 for an empty column.
    """
    # count only the short codes; complete codes contribute nothing
    return sum(1 for code in col_name if len(code) < min_length)

In [9]:
#count the postal codes that have fewer than 5 characters
incomplete_code_checker(samp_superstore["Postal_Code"])


449

In [10]:
#add zeros at the beginning of incomplete postal codes (left-pad to 5 characters)
samp_superstore["Postal_Code"] = samp_superstore["Postal_Code"].str.rjust(5, fillchar = "0")

#confirm that no short codes remain
incomplete_code_checker(samp_superstore["Postal_Code"])


0

In [11]:
#replace the hyphen in the Sub-Category column name with an underscore
samp_superstore.rename({"Sub-Category" : "Sub_Category"},
                       axis = "columns", inplace = True)

#show the first row to check the new name
samp_superstore.head(n = 1)

Unnamed: 0,Ship_Mode,Segment,City,State,Postal_Code,Region,Category,Sub_Category,Sales,Quantity,Discount,Profit
0,Second Class,Consumer,Henderson,Kentucky,42420,South,Furniture,Bookcases,261.96,2,0.0,41.9136


In [12]:
#keep the first occurrence of each fully identical row and drop the rest
samp_superstore.drop_duplicates(keep = "first", inplace = True)

#count the duplicate rows that remain
samp_superstore.duplicated(keep = "first").sum()


0

In [13]:
#store Quantity as text ahead of the categorical conversion
samp_superstore["Quantity"] = samp_superstore["Quantity"].astype(str)

#check the resulting column types
samp_superstore.dtypes


Ship_Mode        object
Segment          object
City             object
State            object
Postal_Code      object
Region           object
Category         object
Sub_Category     object
Sales           float64
Quantity         object
Discount        float64
Profit          float64
dtype: object

In [14]:
#create Quantity order from the values actually present in the column,
#sorted numerically ("2" before "10").
#A hard-coded "1".."12" list would turn any quantity outside it into NaN
#once the categorical dtype is applied, and a later dropna would then
#silently discard those rows.
quantity_order = sorted(
    samp_superstore.Quantity.dropna().astype(str).unique(),
    key = int)


In [15]:
#build an ordered categorical dtype from the Quantity order
quantity_cat_dtype = pd.CategoricalDtype(categories = quantity_order,
                                         ordered = True)

#convert the Quantity column to that dtype
samp_superstore["Quantity"] = samp_superstore["Quantity"].astype(quantity_cat_dtype)


In [16]:
#order of the Ship Mode categories
ship_mode_order = [
    "Standard Class",
    "Second Class",
    "First Class",
    "Same Day",
]


In [17]:
#build an ordered categorical dtype from the Ship Mode order
ship_mode_dtype = pd.CategoricalDtype(categories = ship_mode_order,
                                      ordered = True)

#convert the Ship_Mode column to that dtype
samp_superstore["Ship_Mode"] = samp_superstore["Ship_Mode"].astype(ship_mode_dtype)

#check the Quantity and Ship_Mode column types
samp_superstore.dtypes


Ship_Mode       category
Segment           object
City              object
State             object
Postal_Code       object
Region            object
Category          object
Sub_Category      object
Sales            float64
Quantity        category
Discount         float64
Profit           float64
dtype: object

In [18]:
#store Profit as text so its sign character can be inspected
samp_superstore["Profit"] = samp_superstore["Profit"].astype(str)

#check the resulting column types
samp_superstore.dtypes


Ship_Mode       category
Segment           object
City              object
State             object
Postal_Code       object
Region            object
Category          object
Sub_Category      object
Sales            float64
Quantity        category
Discount         float64
Profit            object
dtype: object

In [19]:
def neg_checker(colname):
    """ This function returns the number of negative
    string values present in a column.

    colname : iterable of strings holding numeric text,
              e.g. "-12.5", "3.0", "1e-05"

    Returns an int; 0 for an empty column.
    """
    # Test for a leading "-" only: a "-" elsewhere can be the exponent
    # sign of a small positive number written in scientific notation
    # ("1e-05"), which must not be counted as negative.
    return sum(1 for value in colname if value.startswith("-"))
                      


In [20]:
#count the negative values in the Profit column
neg_checker(samp_superstore.Profit)


1869

In [21]:
def convert_to_positive(x):
    """ This function converts a negative string value
    to a positive string value by removing its leading "-".

    x : numeric text, e.g. "-12.5" or "3.0"

    Returns the text without its leading minus sign; any
    other text is returned unchanged.
    """
    # Strip only a leading "-": a "-" inside the text can be an exponent
    # sign ("1e-05"), and dropping the first character there would yield
    # "e-05", which can no longer be converted back to a float.
    if x.startswith("-"):
        return x[1:]
    return x
    

In [22]:
#remove the minus sign from every Profit value
#NOTE(review): this discards the sign, so rows that had a negative Profit
#can no longer be told apart from positive ones - confirm this is intended
samp_superstore["Profit"] = samp_superstore["Profit"].map(convert_to_positive)

#confirm that no negative values remain
neg_checker(samp_superstore.Profit)


0

In [23]:
#convert Profit from text back to a numeric column
samp_superstore["Profit"] = samp_superstore["Profit"].astype(float)

#check the resulting column types
samp_superstore.dtypes


Ship_Mode       category
Segment           object
City              object
State             object
Postal_Code       object
Region            object
Category          object
Sub_Category      object
Sales            float64
Quantity        category
Discount         float64
Profit           float64
dtype: object

In [24]:
#drop every row that has a missing value in any column
samp_superstore.dropna(axis = 0, how = "any", inplace = True)

#count the missing values left in each column
samp_superstore.isna().sum()

Ship_Mode       0
Segment         0
City            0
State           0
Postal_Code     0
Region          0
Category        0
Sub_Category    0
Sales           0
Quantity        0
Discount        0
Profit          0
dtype: int64