In [1]:
## Import the libraries
# Data handling
import numpy as np
import pandas as pd

# Plotting (not used in the cells shown here)
import seaborn as sns
import matplotlib.pyplot as plt

# Silence every warning for the rest of the session so cell output stays clean.
# NOTE(review): this also hides pandas deprecation warnings — consider narrowing the filter.
from warnings import filterwarnings
filterwarnings('ignore')

In [2]:
## Load the data file
# parse_dates converts the 'Date' column to datetime while reading;
# dayfirst=True makes the parser read ambiguous dates as day/month/year.
df = pd.read_csv('dataset_group.csv', parse_dates=['Date'], dayfirst=True)

In [3]:
## Preview the data (the rendered output is truncated to the first and last rows)
df

Unnamed: 0,Date,Order_id,Product
0,2018-01-01,1,yogurt
1,2018-01-01,1,pork
2,2018-01-01,1,sandwich bags
3,2018-01-01,1,lunch meat
4,2018-01-01,1,all- purpose
...,...,...,...
20636,2020-02-25,1138,soda
20637,2020-02-25,1138,paper towels
20638,2020-02-26,1139,soda
20639,2020-02-26,1139,laundry detergent


In [4]:
## Shape of the data as (rows, columns)
df.shape

(20641, 3)

In [5]:
## Column dtypes, non-null counts and memory usage of the dataset
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20641 entries, 0 to 20640
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype         
---  ------    --------------  -----         
 0   Date      20641 non-null  datetime64[ns]
 1   Order_id  20641 non-null  int64         
 2   Product   20641 non-null  object        
dtypes: datetime64[ns](1), int64(1), object(1)
memory usage: 483.9+ KB


- Exploratory Data Analysis

In [6]:
## Count the missing values in each column
missing_mask = df.isna()
missing_mask.sum()

Date        0
Order_id    0
Product     0
dtype: int64

- No missing values found

In [7]:
## Count the rows that exactly repeat an earlier row (all columns equal)
duplicate_rows = df.duplicated()
print("Total duplicate values:", duplicate_rows.sum())

Total duplicate values: 4730


In [8]:
## Let's check whether the duplicates fall within the same order or belong to different orders.
## keep=False flags every copy of a repeated row, including its first occurrence.
is_repeated = df.duplicated(keep=False)
df.loc[is_repeated]

Unnamed: 0,Date,Order_id,Product
4,2018-01-01,1,all- purpose
10,2018-01-01,1,all- purpose
11,2018-01-01,1,dinner rolls
13,2018-01-01,1,all- purpose
18,2018-01-01,1,dinner rolls
...,...,...,...
20632,2020-02-25,1138,sandwich bags
20633,2020-02-25,1138,toilet paper
20634,2020-02-25,1138,soda
20635,2020-02-25,1138,soda


- 4730 duplicate rows were found; each one repeats a product already listed in the same order, so let's remove them

In [9]:
# Keep only the first occurrence of each fully identical (Date, Order_id, Product) row.
# NOTE(review): if a repeated product line encodes the quantity purchased, that information is lost here — confirm this is intended.
df = df.drop_duplicates()

In [10]:
## Descriptive summary of the dataset - numeric features (transposed: one row per column)
summary_stats = df.describe().round(2)

display(summary_stats.T)

Unnamed: 0,count,mean,min,25%,50%,75%,max,std
Date,15911.0,2018-12-20 06:48:10.226886912,2018-01-01 00:00:00,2018-05-27 12:00:00,2019-01-29 00:00:00,2019-06-18 00:00:00,2020-02-26 00:00:00,
Order_id,15911.0,574.15,1.0,289.5,579.0,859.0,1139.0,328.54


In [11]:
## Descriptive summary - categorical (object) columns, one row per column
categorical_summary = df.describe(include = 'object')
categorical_summary.T

Unnamed: 0,count,unique,top,freq
Product,15911,37,poultry,480


In [12]:
## Inspect the distinct product labels to spot anomalies in the data
# NOTE(review): the label 'all- purpose' contains a stray space — confirm whether it should be normalised.
product_labels = df['Product'].unique()
product_labels

array(['yogurt', 'pork', 'sandwich bags', 'lunch meat', 'all- purpose',
       'flour', 'soda', 'butter', 'beef', 'aluminum foil', 'dinner rolls',
       'shampoo', 'mixes', 'soap', 'laundry detergent', 'ice cream',
       'toilet paper', 'hand soap', 'waffles', 'cheeses', 'milk',
       'dishwashing liquid/detergent', 'individual meals', 'cereals',
       'tortillas', 'spaghetti sauce', 'ketchup', 'sandwich loaves',
       'poultry', 'bagels', 'eggs', 'juice', 'pasta', 'paper towels',
       'coffee/tea', 'fruits', 'sugar'], dtype=object)

In [15]:
## Check the numeric columns for negative values
# Test every numeric cell directly instead of averaging each row first: with
# more than one numeric column a row mean can stay >= 0 while hiding a negative
# entry, and selecting by dtype does not depend on the column order the way
# iloc[:, 1:] does.
numeric_columns = df.select_dtypes(include='number')
negatives_per_column = (numeric_columns < 0).sum()
## Total number of negative values (variable name kept because the next cell prints it)
count_negative_means = int(negatives_per_column.sum())

In [16]:
# Report the count produced by the negative-value check above (0 means nothing negative was found)
print(count_negative_means)

0


- The data looks fine; let's export it for further analysis

In [17]:
## Shape of the data after removing the duplicate rows
df.shape

(15911, 3)

In [18]:
# Write the de-duplicated frame to disk; index=False leaves the row index out of the CSV.
df.to_csv('dataset_group_processed.csv', index=False)


## The rest of the EDA is done in Tableau