<h2>Complete Statistics Implementation</h2>

In [70]:
# import required libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import datetime as dt

<h3>01. Exploratory Data Analysis (EDA)</h3>

In [71]:
# Load the raw sales records into a DataFrame; the path is relative to the
# notebook's working directory.
sales_data = pd.read_csv("archive/Sales_Data.csv")

In [72]:
# Dimensions of the loaded frame as a (rows, columns) tuple.
sales_data.shape

(185950, 10)

In [73]:
# Preview the first rows to see what each column holds.
sales_data.head()

Unnamed: 0,Order ID,Product,Quantity Ordered,Price Each,Order Date,Purchase Address,Month,Sales,City,Hour
0,295665,Macbook Pro Laptop,1,1700.0,12/30/2019 0:01,"136 Church St, New York City, NY 10001",12,1700.0,New York City,0
1,295666,LG Washing Machine,1,600.0,12/29/2019 7:03,"562 2nd St, New York City, NY 10001",12,600.0,New York City,7
2,295667,USB-C Charging Cable,1,11.95,12/12/2019 18:21,"277 Main St, New York City, NY 10001",12,11.95,New York City,18
3,295668,27in FHD Monitor,1,149.99,12/22/2019 15:13,"410 6th St, San Francisco, CA 94016",12,149.99,San Francisco,15
4,295669,USB-C Charging Cable,1,11.95,12/18/2019 12:38,"43 Hill St, Atlanta, GA 30301",12,11.95,Atlanta,12


In [74]:
# Inspect the dtype pandas inferred for each column; 'Order Date' is still a
# plain object (string) column at this point.
sales_data.dtypes

Order ID              int64
Product              object
Quantity Ordered      int64
Price Each          float64
Order Date           object
Purchase Address     object
Month                 int64
Sales               float64
City                 object
Hour                  int64
dtype: object

In [75]:
# Summary statistics for every column: include='all' covers the non-numeric
# columns as well, and .T transposes the result so each original column is a row.
sales_data.describe(include='all').T

Unnamed: 0,count,unique,top,freq,mean,std,min,25%,50%,75%,max
Order ID,185950.0,,,,230417.569379,51512.73711,141234.0,185831.25,230367.5,275035.75,319670.0
Product,185950.0,19.0,USB-C Charging Cable,21903.0,,,,,,,
Quantity Ordered,185950.0,,,,1.124383,0.442793,1.0,1.0,1.0,1.0,9.0
Price Each,185950.0,,,,184.399735,332.73133,2.99,11.95,14.95,150.0,1700.0
Order Date,185950.0,142395.0,12/15/2019 20:16,8.0,,,,,,,
Purchase Address,185950.0,140787.0,"193 Forest St, San Francisco, CA 94016",9.0,,,,,,,
Month,185950.0,,,,7.05914,3.502996,1.0,4.0,7.0,10.0,12.0
Sales,185950.0,,,,185.490917,332.919771,2.99,11.95,14.95,150.0,3400.0
City,185950.0,9.0,San Francisco,44732.0,,,,,,,
Hour,185950.0,,,,14.413305,5.423416,0.0,11.0,15.0,19.0,23.0


In [76]:
# Compact overview: index range, per-column non-null counts and dtypes, memory usage.
sales_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 185950 entries, 0 to 185949
Data columns (total 10 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   Order ID          185950 non-null  int64  
 1   Product           185950 non-null  object 
 2   Quantity Ordered  185950 non-null  int64  
 3   Price Each        185950 non-null  float64
 4   Order Date        185950 non-null  object 
 5   Purchase Address  185950 non-null  object 
 6   Month             185950 non-null  int64  
 7   Sales             185950 non-null  float64
 8   City              185950 non-null  object 
 9   Hour              185950 non-null  int64  
dtypes: float64(2), int64(4), object(4)
memory usage: 14.2+ MB


In [77]:
# Convert 'Order Date' from strings to datetime64 so the .dt accessor can be used
# on it; the year/day fields themselves are derived in a later cell.
# NOTE(review): no explicit format= is passed, so pandas infers it from the data —
# confirm every row follows the month/day/year layout seen in the preview before
# pinning a format.
sales_data['Order Date'] = pd.to_datetime(sales_data['Order Date'])
sales_data.dtypes

Order ID                     int64
Product                     object
Quantity Ordered             int64
Price Each                 float64
Order Date          datetime64[ns]
Purchase Address            object
Month                        int64
Sales                      float64
City                        object
Hour                         int64
dtype: object

In [78]:
# Derive calendar features from the parsed 'Order Date' timestamps: the year,
# the day of the month, and the weekday name. The three columns are appended
# in this order (or overwritten in place if the cell is re-run).
sales_data = sales_data.assign(
    Year=sales_data['Order Date'].dt.year,
    Day=sales_data['Order Date'].dt.day,
    Day_Name=sales_data['Order Date'].dt.day_name(),
)

In [79]:
# Extract the 5-digit ZIP code that terminates each purchase address.
# Anchoring a regex at the end of the string (instead of slicing the last five
# characters) tolerates trailing whitespace or a ZIP+4 suffix, and yields NaN
# for any address without a trailing ZIP, so malformed rows surface as missing
# values rather than being stored as arbitrary text.
sales_data['ZipCode'] = sales_data['Purchase Address'].str.extract(
    r'(\d{5})(?:-\d{4})?\s*$', expand=False
)

In [80]:
# Count missing values per column, including the derived Year, Day, Day_Name
# and ZipCode columns.
sales_data.isnull().sum()

Order ID            0
Product             0
Quantity Ordered    0
Price Each          0
Order Date          0
Purchase Address    0
Month               0
Sales               0
City                0
Hour                0
Year                0
Day                 0
Day_Name            0
ZipCode             0
dtype: int64

In [81]:
# Number of rows that exactly repeat an earlier row across all columns.
sales_data.duplicated().sum()

264

In [82]:
# Remove exact duplicate rows, retaining the first occurrence of each.
# The name is rebound to the de-duplicated frame; surviving rows keep their
# original index labels.
sales_data = sales_data.drop_duplicates(keep='first')

In [83]:
# Re-count duplicates to confirm that none remain after the drop.
sales_data.duplicated().sum()

0

In [84]:
# List the column labels now that the derived columns have been added.
sales_data.columns

Index(['Order ID', 'Product', 'Quantity Ordered', 'Price Each', 'Order Date',
       'Purchase Address', 'Month', 'Sales', 'City', 'Hour', 'Year', 'Day',
       'Day_Name', 'ZipCode'],
      dtype='object')

In [85]:
# Build the analysis frame: select the columns to keep and reorder them so the
# order/product fields come first, then location, then the date parts.
# 'Order Date' and 'Purchase Address' are omitted from this selection.
# The explicit .copy() makes final_data independent of sales_data, so later
# column assignments on it neither raise SettingWithCopyWarning nor depend on
# the source frame.
final_data = sales_data[['Order ID', 'Product', 'Quantity Ordered', 'Price Each', 'Sales', 'City', 'ZipCode',
                         'Year', 'Month', 'Day', 'Day_Name', 'Hour']].copy()

In [86]:

# Preview the reordered frame to confirm the column selection.
final_data.head()

Unnamed: 0,Order ID,Product,Quantity Ordered,Price Each,Sales,City,ZipCode,Year,Month,Day,Day_Name,Hour
0,295665,Macbook Pro Laptop,1,1700.0,1700.0,New York City,10001,2019,12,30,Monday,0
1,295666,LG Washing Machine,1,600.0,600.0,New York City,10001,2019,12,29,Sunday,7
2,295667,USB-C Charging Cable,1,11.95,11.95,New York City,10001,2019,12,12,Thursday,18
3,295668,27in FHD Monitor,1,149.99,149.99,San Francisco,94016,2019,12,22,Sunday,15
4,295669,USB-C Charging Cable,1,11.95,11.95,Atlanta,30301,2019,12,18,Wednesday,12


In [87]:
# save final data to a csv file
# NOTE(review): the export below is commented out, so running this cell writes
# nothing; uncomment it to persist final_data (index=False omits the row index).
# final_data.to_csv("archive/final_sales_data.csv", index=False)