In [1]:
#import libraries 
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt 
import seaborn as sb 
sb.set_style("darkgrid") 

%matplotlib inline 

In [2]:
# Load the dataset from "sample_superstore.csv" (relative path, resolved against the current working directory)
samp_superstore = pd.read_csv("sample_superstore.csv")


## Data Wrangling

### Assessment

In [3]:
# View a random sample of 10 rows; the fixed seed keeps the sample
# identical across "Restart Kernel -> Run All" executions
samp_superstore.sample(10, random_state=42)

Unnamed: 0,Ship Mode,Segment,Country,City,State,Postal Code,Region,Category,Sub-Category,Sales,Quantity,Discount,Profit
8200,Standard Class,Consumer,United States,Chicago,Illinois,60610,Central,Office Supplies,Storage,72.784,1,0.2,-18.196
512,First Class,Consumer,United States,Newark,Ohio,43055,East,Office Supplies,Art,7.152,3,0.2,0.7152
2835,Standard Class,Consumer,United States,Los Angeles,California,90036,West,Office Supplies,Appliances,10.89,1,0.0,2.8314
9270,Standard Class,Consumer,United States,New York City,New York,10035,East,Office Supplies,Binders,4305.552,6,0.2,1453.1238
4208,Second Class,Consumer,United States,Cranston,Rhode Island,2920,East,Furniture,Chairs,1604.9,5,0.0,481.47
1719,Standard Class,Consumer,United States,New York City,New York,10011,East,Furniture,Furnishings,39.76,8,0.0,12.3256
7392,Standard Class,Consumer,United States,Los Angeles,California,90049,West,Office Supplies,Paper,15.7,5,0.0,7.065
4912,Standard Class,Consumer,United States,Johnson City,Tennessee,37604,South,Office Supplies,Binders,5.97,5,0.7,-4.577
9363,Standard Class,Home Office,United States,Seattle,Washington,98105,West,Furniture,Furnishings,22.14,3,0.0,6.4206
1333,Standard Class,Consumer,United States,Dallas,Texas,75220,Central,Office Supplies,Appliances,7.96,2,0.8,-13.93


In [4]:
# Print a concise structural summary of the dataset: column names, non-null counts, dtypes and memory usage
samp_superstore.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9994 entries, 0 to 9993
Data columns (total 13 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Ship Mode     9994 non-null   object 
 1   Segment       9994 non-null   object 
 2   Country       9994 non-null   object 
 3   City          9994 non-null   object 
 4   State         9994 non-null   object 
 5   Postal Code   9994 non-null   int64  
 6   Region        9994 non-null   object 
 7   Category      9994 non-null   object 
 8   Sub-Category  9994 non-null   object 
 9   Sales         9994 non-null   float64
 10  Quantity      9994 non-null   int64  
 11  Discount      9994 non-null   float64
 12  Profit        9994 non-null   float64
dtypes: float64(3), int64(2), object(8)
memory usage: 1015.1+ KB


In [5]:
# Count the rows that are exact duplicates of an earlier row (all columns compared)
samp_superstore.duplicated().sum()

17

In [6]:
# Obtain the unique values of the Ship Mode column
samp_superstore["Ship Mode"].unique()

array(['Second Class', 'Standard Class', 'First Class', 'Same Day'],
      dtype=object)

In [7]:
# Obtain the unique values of the Segment column
samp_superstore["Segment"].unique()

array(['Consumer', 'Corporate', 'Home Office'], dtype=object)

In [8]:
# Obtain the unique values of the Country column
samp_superstore["Country"].unique()


array(['United States'], dtype=object)

In [9]:
# Obtain the unique values of the Region column
samp_superstore["Region"].unique()


array(['South', 'West', 'Central', 'East'], dtype=object)

In [10]:
# Count how many rows fall into each unique Category value
samp_superstore["Category"].value_counts()


Office Supplies    6026
Furniture          2121
Technology         1847
Name: Category, dtype: int64

In [11]:
# Obtain the number of unique values in the Sub-Category column
samp_superstore["Sub-Category"].nunique()


17

In [12]:
# Count how many rows fall into each unique Sub-Category value
samp_superstore["Sub-Category"].value_counts()


Binders        1523
Paper          1370
Furnishings     957
Phones          889
Storage         846
Art             796
Accessories     775
Chairs          617
Appliances      466
Labels          364
Tables          319
Envelopes       254
Bookcases       228
Fasteners       217
Supplies        190
Machines        115
Copiers          68
Name: Sub-Category, dtype: int64

In [13]:
# Obtain the number of unique values in the State column
samp_superstore["State"].nunique()


49

In [14]:
# Obtain the number of unique values in the City column
samp_superstore["City"].nunique()


531

### Issues 

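- Spaces in some column names (e.g. "Ship Mode", "Postal Code"); put an underscore in place of the spaces
- All Country column values are "United States"
- Postal Code is stored with an integer dtype (int64)
- Zip codes with fewer than 5 digits (e.g. 2920), likely because leading zeros were lost when the column was read as an integer
- "-" in a column name (e.g. Sub-Category)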

### Cleaning

One of the issues noted in the assessment phase is the presence of a space between the words of some column names.

Below, each space in those column names is replaced with an underscore ("\_"), which makes the columns easier to work with (for example, they can then be accessed with dot notation such as `samp_superstore.Ship_Mode`).

In [15]:
# Collect the current column labels of the dataset into a plain Python list
col_names = samp_superstore.columns.tolist()


In [16]:
def replace_space(df, list_col_names=None):
    """
    Replace every space in the selected column names of `df` with '_'.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose column labels are renamed. It is modified in place.
    list_col_names : iterable of column labels, optional
        Labels to consider for renaming. Defaults to all columns of `df`.
        Labels that are not strings, contain no space, or are not present
        in `df` are left untouched.

    Returns
    -------
    pandas.DataFrame
        The same `df` object, returned so the call can be assigned or chained.
    """
    if list_col_names is None:
        list_col_names = df.columns

    # Build the whole old -> new mapping first so the frame is renamed once,
    # instead of once per matching column.
    # The isinstance guard skips non-string labels (e.g. integer column
    # names), for which `" " in label` would raise a TypeError.
    renames = {
        col_name: col_name.replace(" ", "_")
        for col_name in list_col_names
        if isinstance(col_name, str) and " " in col_name
    }

    # The in-place rename is kept deliberately: existing callers may rely on
    # `df` itself being updated, not only on the return value.
    df.rename(columns=renames, inplace=True)
    return df


In [17]:
# Apply the function; every original column name is passed as a rename candidate
samp_superstore = replace_space(samp_superstore, col_names)

# Confirm the change by displaying the first row with the updated column headers
samp_superstore.head(1)

Unnamed: 0,Ship_Mode,Segment,Country,City,State,Postal_Code,Region,Category,Sub-Category,Sales,Quantity,Discount,Profit
0,Second Class,Consumer,United States,Henderson,Kentucky,42420,South,Furniture,Bookcases,261.96,2,0.0,41.9136
