### Data Exploration and Visualization

In [1]:
# import all packages

import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt

In [9]:
# Load the sales records from the CSV file (the path is relative to the working directory).
store_df = pd.read_csv(filepath_or_buffer="Datasets/buybuy.csv")

In [10]:
# Preview the first five rows to see the columns and some sample values.
store_df.head(n=5)

Unnamed: 0.1,Unnamed: 0,Customer ID,Customer Age,Customer Gender,Country,State,Product Category,Sub Category,Product,Order Quantity,Unit Cost,Unit Price,Cost,Revenue
0,11/26/2013,11019,19,M,Canada,British Columbia,Accessories,Bike Racks,Hitch Rack - 4-Bike,8,45.0,120.0,360,950.0
1,11/26/2015,11019,19,M,Canada,British Columbia,Accessories,Bike Racks,Hitch Rack - 4-Bike,8,45.0,120.0,360,950.0
2,3/23/2014,11039,49,M,Australia,New South Wales,Accessories,Bike Racks,Hitch Rack - 4-Bike,23,45.0,120.0,1035,2401.0
3,3/23/2016,11039,49,M,Australia,New South Wales,Accessories,Bike Racks,Hitch Rack - 4-Bike,20,45.0,120.0,900,2088.0
4,5/15/2014,11046,47,F,Australia,New South Wales,Accessories,Bike Racks,Hitch Rack - 4-Bike,4,45.0,120.0,180,418.0


In [11]:
# Keep an independent deep copy of the frame as loaded, so the cleaned data can be
# compared against it later.
store_df_orginal = store_df.copy(deep=True)

In [12]:
# Check the dimensions of the dataframe as a (rows, columns) tuple
store_df.shape

(56, 14)

In [13]:
# Count the missing values in each column.
store_df.isna().sum()

Unnamed: 0          0
Customer ID         0
Customer Age        0
Customer Gender     0
Country             0
State               0
Product Category    0
Sub Category        0
Product             0
Order Quantity      0
Unit Cost           1
Unit Price          2
Cost                0
Revenue             1
dtype: int64

In [19]:
# Show the rows where Unit Cost is missing. isnull() already returns a boolean mask,
# so it is used directly instead of being compared against True.
store_df[store_df["Unit Cost"].isnull()]

Unnamed: 0.1,Unnamed: 0,Customer ID,Customer Age,Customer Gender,Country,State,Product Category,Sub Category,Product,Order Quantity,Unit Cost,Unit Price,Cost,Revenue
51,11/17/2015,11331,29,F,Canada,British Columbia,Accessories,Bike Racks,Hitch Rack - 4-Bike,14,,120.0,630,1663.0


In [20]:
# List the dtype pandas assigned to each column.
store_df.dtypes

Unnamed: 0           object
Customer ID           int64
Customer Age          int64
Customer Gender      object
Country              object
State                object
Product Category     object
Sub Category         object
Product              object
Order Quantity       object
Unit Cost           float64
Unit Price          float64
Cost                  int64
Revenue             float64
dtype: object

In [22]:
# List the distinct Unit Cost values (NaN included) to decide how to fill the gap.
store_df.loc[:, "Unit Cost"].unique()

array([45., nan])

In [23]:
# List the distinct Unit Price values (NaN included) to decide how to fill the gaps.
store_df.loc[:, "Unit Price"].unique()

array([120.,  nan])

In [24]:
# Count the rows that exactly repeat an earlier row (the first occurrence is not counted).
store_df.duplicated(keep="first").sum()

6

Things to change in the dataframe:
1. Replace null values in the Unit Cost column with 45
2. Replace null values in the Unit Price column with 120
3. Rename the date column (currently `Unnamed: 0`) to `Date`
4. Convert the Date column to datetime format
5. Create extra columns to contain the day, month and year
6. Drop the duplicated rows


In [26]:
# List the current column labels.
store_df.columns

Index(['Unnamed: 0', 'Customer ID', 'Customer Age', 'Customer Gender',
       'Country', 'State', 'Product Category', 'Sub Category', 'Product',
       'Order Quantity', 'Unit Cost', 'Unit Price', 'Cost', 'Revenue'],
      dtype='object')

In [27]:
# Give the date column a meaningful name: "Unnamed: 0" is the label it carries after
# loading, and it holds the order dates. The renamed frame is rebound to store_df
# instead of using inplace=True, which keeps the step chainable.
store_df = store_df.rename(columns={"Unnamed: 0": "Date"})

In [28]:
# Confirm the rename: the first label should now be "Date".
store_df.columns

Index(['Date', 'Customer ID', 'Customer Age', 'Customer Gender', 'Country',
       'State', 'Product Category', 'Sub Category', 'Product',
       'Order Quantity', 'Unit Cost', 'Unit Price', 'Cost', 'Revenue'],
      dtype='object')

In [29]:
# Date is still an object (string) column at this point.
store_df.dtypes

Date                 object
Customer ID           int64
Customer Age          int64
Customer Gender      object
Country              object
State                object
Product Category     object
Sub Category         object
Product              object
Order Quantity       object
Unit Cost           float64
Unit Price          float64
Cost                  int64
Revenue             float64
dtype: object

In [30]:
# Parse the Date strings into datetime64 values. The dates are written month/day/year
# (e.g. 11/26/2013, 3/23/2014), so the format is stated explicitly rather than left to
# inference. That rules out a day-first misreading of ambiguous values.
store_df["Date"] = pd.to_datetime(store_df["Date"], format="%m/%d/%Y")

In [31]:
# Verify the conversion: Date should now report datetime64[ns].
store_df.dtypes

Date                datetime64[ns]
Customer ID                  int64
Customer Age                 int64
Customer Gender             object
Country                     object
State                       object
Product Category            object
Sub Category                object
Product                     object
Order Quantity              object
Unit Cost                  float64
Unit Price                 float64
Cost                         int64
Revenue                    float64
dtype: object

In [35]:
# Spot-check the first row's Date value (row 0, column 0); it comes back as a Timestamp.
store_df.iloc[0,0]

Timestamp('2013-11-26 00:00:00')

In [38]:
# Fill the missing Unit Cost with 45, the only non-null value that column contains.
store_df["Unit Cost"] = store_df["Unit Cost"].fillna(value=45)

In [39]:
# Re-count missing values: Unit Cost should now be 0.
store_df.isna().sum()

Date                0
Customer ID         0
Customer Age        0
Customer Gender     0
Country             0
State               0
Product Category    0
Sub Category        0
Product             0
Order Quantity      0
Unit Cost           0
Unit Price          2
Cost                0
Revenue             1
dtype: int64

In [40]:
# Fill the missing Unit Price values with 120, the only non-null value that column contains.
store_df["Unit Price"] = store_df["Unit Price"].fillna(value=120)

In [41]:
# Re-count missing values: Unit Price should now be 0, leaving only Revenue.
store_df.isna().sum()

Date                0
Customer ID         0
Customer Age        0
Customer Gender     0
Country             0
State               0
Product Category    0
Sub Category        0
Product             0
Order Quantity      0
Unit Cost           0
Unit Price          0
Cost                0
Revenue             1
dtype: int64

In [42]:
# Preview the first five rows after the rename, date parsing and null filling.
store_df.head(n=5)

Unnamed: 0,Date,Customer ID,Customer Age,Customer Gender,Country,State,Product Category,Sub Category,Product,Order Quantity,Unit Cost,Unit Price,Cost,Revenue
0,2013-11-26,11019,19,M,Canada,British Columbia,Accessories,Bike Racks,Hitch Rack - 4-Bike,8,45.0,120.0,360,950.0
1,2015-11-26,11019,19,M,Canada,British Columbia,Accessories,Bike Racks,Hitch Rack - 4-Bike,8,45.0,120.0,360,950.0
2,2014-03-23,11039,49,M,Australia,New South Wales,Accessories,Bike Racks,Hitch Rack - 4-Bike,23,45.0,120.0,1035,2401.0
3,2016-03-23,11039,49,M,Australia,New South Wales,Accessories,Bike Racks,Hitch Rack - 4-Bike,20,45.0,120.0,900,2088.0
4,2014-05-15,11046,47,F,Australia,New South Wales,Accessories,Bike Racks,Hitch Rack - 4-Bike,4,45.0,120.0,180,418.0


In [52]:
# Sanity check for row 2: 23 units at a unit price of 120. The recorded Revenue for
# that row is 2401.0, so the stored figure does not equal quantity x unit price.
23 * 120

2760

In [54]:
# Check the dtypes before doing arithmetic: Order Quantity is still object, not numeric.
store_df.dtypes

Date                datetime64[ns]
Customer ID                  int64
Customer Age                 int64
Customer Gender             object
Country                     object
State                       object
Product Category            object
Sub Category                object
Product                     object
Order Quantity              object
Unit Cost                  float64
Unit Price                 float64
Cost                         int64
Revenue                    float64
dtype: object

In [53]:
# Order Quantity is an object (string) column at this point, so multiplying it by the
# float Unit Price column raises TypeError. The failure is caught and reported so the
# notebook can still run top to bottom. Revenue is left unchanged when it occurs.
try:
    store_df["Revenue"] = store_df["Order Quantity"] * store_df["Unit Price"]
except TypeError as err:
    print(f"TypeError: {err}")

TypeError: can't multiply sequence by non-int of type 'float'

In [45]:
# Casting Order Quantity to float fails while the column still contains the "-"
# placeholder. The ValueError is caught and reported instead of halting the notebook.
# Revenue is left unchanged when it occurs.
try:
    store_df["Revenue"] = store_df["Order Quantity"].astype("float") * store_df["Unit Price"]
except ValueError as err:
    print(f"ValueError: {err}")

ValueError: could not convert string to float: '-'

In [57]:
# Locate the rows of the untouched copy whose Order Quantity is the "-" placeholder.
# A boolean mask selects them in one vectorized step, so this no longer assumes the
# index labels are exactly 0..len-1. Each matching row is printed as a Series.
dash_rows = store_df_orginal[store_df_orginal["Order Quantity"] == "-"]
for row_label, record in dash_rows.iterrows():
    print(record)

Unnamed: 0                    2/22/2016
Customer ID                       11094
Customer Age                         35
Customer Gender                       M
Country                       Australia
State                          Victoria
Product Category            Accessories
Sub Category                 Bike Racks
Product             Hitch Rack - 4-Bike
Order Quantity                        -
Unit Cost                          45.0
Unit Price                        120.0
Cost                                945
Revenue                          1991.0
Name: 9, dtype: object


In [46]:
# Show the rows of the working frame whose Order Quantity is the "-" placeholder.
store_df.loc[store_df["Order Quantity"] == "-"]

Unnamed: 0,Date,Customer ID,Customer Age,Customer Gender,Country,State,Product Category,Sub Category,Product,Order Quantity,Unit Cost,Unit Price,Cost,Revenue
9,2016-02-22,11094,35,M,Australia,Victoria,Accessories,Bike Racks,Hitch Rack - 4-Bike,-,45.0,120.0,945,1991.0


In [47]:
# Back out the missing quantity for row 9: Cost (945) divided by Unit Cost (45.0).
945/45

21.0

In [67]:
# Repair every "-" placeholder in Order Quantity by deriving the quantity from
# Cost / Unit Cost (for row 9: 945 / 45.0 = 21), instead of hard-coding a single row
# label and value. When no placeholder is present the assignment is a no-op.
qty_missing = store_df["Order Quantity"] == "-"
store_df.loc[qty_missing, "Order Quantity"] = (
    store_df.loc[qty_missing, "Cost"] / store_df.loc[qty_missing, "Unit Cost"]
).round().astype(int)

In [66]:
# Look up the Cost recorded for the row labelled 9 in the working frame.
store_df["Cost"].loc[9]

945

In [62]:
# Read the same Cost value from the untouched copy. The column is selected by its
# label rather than by the positional index 12, so the lookup does not silently
# change meaning if the column order changes. The row is still taken by position (9).
store_df_orginal["Cost"].iloc[9]

945

In [50]:
# Confirm that no "-" placeholders remain in Order Quantity (an empty frame is expected).
store_df.loc[store_df["Order Quantity"] == "-"]

Unnamed: 0,Date,Customer ID,Customer Age,Customer Gender,Country,State,Product Category,Sub Category,Product,Order Quantity,Unit Cost,Unit Price,Cost,Revenue


In [58]:
# Recompute Revenue as Unit Price x Order Quantity (the quantity is cast to float first).
# This overwrites the Revenue values that came with the file, which differ from this
# product (e.g. 950.0 recorded vs 960.0 computed for row 0).
store_df["Revenue"] = store_df["Unit Price"] * store_df["Order Quantity"].astype(float)

In [68]:
# First five rows of the untouched copy, for comparison with the cleaned frame.
store_df_orginal.head(n=5)

Unnamed: 0.1,Unnamed: 0,Customer ID,Customer Age,Customer Gender,Country,State,Product Category,Sub Category,Product,Order Quantity,Unit Cost,Unit Price,Cost,Revenue
0,11/26/2013,11019,19,M,Canada,British Columbia,Accessories,Bike Racks,Hitch Rack - 4-Bike,8,45.0,120.0,360,950.0
1,11/26/2015,11019,19,M,Canada,British Columbia,Accessories,Bike Racks,Hitch Rack - 4-Bike,8,45.0,120.0,360,950.0
2,3/23/2014,11039,49,M,Australia,New South Wales,Accessories,Bike Racks,Hitch Rack - 4-Bike,23,45.0,120.0,1035,2401.0
3,3/23/2016,11039,49,M,Australia,New South Wales,Accessories,Bike Racks,Hitch Rack - 4-Bike,20,45.0,120.0,900,2088.0
4,5/15/2014,11046,47,F,Australia,New South Wales,Accessories,Bike Racks,Hitch Rack - 4-Bike,4,45.0,120.0,180,418.0


In [59]:
# First five rows of the cleaned frame; Revenue now holds the recomputed values.
store_df.head(n=5)

Unnamed: 0,Date,Customer ID,Customer Age,Customer Gender,Country,State,Product Category,Sub Category,Product,Order Quantity,Unit Cost,Unit Price,Cost,Revenue
0,2013-11-26,11019,19,M,Canada,British Columbia,Accessories,Bike Racks,Hitch Rack - 4-Bike,8,45.0,120.0,360,960.0
1,2015-11-26,11019,19,M,Canada,British Columbia,Accessories,Bike Racks,Hitch Rack - 4-Bike,8,45.0,120.0,360,960.0
2,2014-03-23,11039,49,M,Australia,New South Wales,Accessories,Bike Racks,Hitch Rack - 4-Bike,23,45.0,120.0,1035,2760.0
3,2016-03-23,11039,49,M,Australia,New South Wales,Accessories,Bike Racks,Hitch Rack - 4-Bike,20,45.0,120.0,900,2400.0
4,2014-05-15,11046,47,F,Australia,New South Wales,Accessories,Bike Racks,Hitch Rack - 4-Bike,4,45.0,120.0,180,480.0


In [69]:
# Missing-value counts in the untouched copy (unchanged by the cleaning steps).
store_df_orginal.isna().sum()

Unnamed: 0          0
Customer ID         0
Customer Age        0
Customer Gender     0
Country             0
State               0
Product Category    0
Sub Category        0
Product             0
Order Quantity      0
Unit Cost           1
Unit Price          2
Cost                0
Revenue             1
dtype: int64

In [60]:
# Missing-value counts in the cleaned frame; every column should now be 0.
store_df.isna().sum()

Date                0
Customer ID         0
Customer Age        0
Customer Gender     0
Country             0
State               0
Product Category    0
Sub Category        0
Product             0
Order Quantity      0
Unit Cost           0
Unit Price          0
Cost                0
Revenue             0
dtype: int64

In [70]:
# Count the rows that exactly repeat an earlier row, before dropping them.
store_df.duplicated(keep="first").sum()

6

In [72]:
# Drop the rows that exactly repeat an earlier row, keeping the first occurrence.
# The result is rebound to store_df instead of using inplace=True. Index labels are
# not reset, so they keep gaps where duplicates were removed.
store_df = store_df.drop_duplicates()

In [73]:
# Confirm that no duplicated rows remain (0 is expected).
store_df.duplicated(keep="first").sum()

0

In [74]:
# Dimensions after de-duplication: the 56 loaded rows minus the 6 duplicates leaves 50.
store_df.shape

(50, 14)

In [75]:
# The untouched copy still has all 56 rows.
store_df_orginal.shape

(56, 14)