# Importing Libraries and data sets

In [3]:
import pandas as pd
import numpy as np
import os

In [5]:
path = r'/Users/tanu/Desktop/Instacart Basket Analysis/02 Data'

In [7]:
df_prods = pd.read_csv(os.path.join(path,'Original data','products.csv'), index_col = False)

In [9]:
df_ords = pd.read_csv(os.path.join(path,'Prepared data','orders_wrangled.csv'), index_col = False)

In [11]:
# Find missing values (takes sum of Trues in a  column)
df_prods.isnull().sum()

product_id        0
product_name     16
aisle_id          0
department_id     0
prices            0
dtype: int64

In [13]:
# creating a subset that contains nulls
df_nan = df_prods[df_prods['product_name'].isnull() == True]

In [15]:
df_nan

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
33,34,,121,14,12.2
68,69,,26,7,11.8
115,116,,93,3,10.8
261,262,,110,13,12.1
525,525,,109,11,1.2
1511,1511,,84,16,14.3
1780,1780,,126,11,12.3
2240,2240,,52,1,14.2
2586,2586,,104,13,12.4
3159,3159,,126,11,13.1


Addressing missing Values

There a few ways to deal with missing data:
1.Create a new variable that acts like a flag based on the missing value.
2.Impute the value with the mean or median of the column (if the variable is numeric).
3. Remove or filter out the missing data.

In [18]:
df_nan.describe()

Unnamed: 0,product_id,aisle_id,department_id,prices
count,16.0,16.0,16.0,16.0
mean,6684.0,89.9375,10.9375,13.0125
std,12836.665242,33.731229,4.639953,3.881731
min,34.0,26.0,1.0,1.2
25%,459.25,70.75,7.75,12.175
50%,2413.0,98.5,11.5,13.65
75%,3872.75,120.0,14.5,14.425
max,40440.0,126.0,16.0,20.9


since missing values are strings there's not much we can do other than removing them

In [21]:
df_prods.shape

(49693, 5)

In [25]:
# create a new df without the nulls
df_prods_clean = df_prods[df_prods['product_name'].isnull() == False]

In [27]:
df_prods_clean.shape

(49677, 5)

49693-49677 = 16

# DUPLICATES

In [32]:
# Create subset of df_prods_clean that contains only rows of duplicates
df_dups = df_prods_clean[df_prods_clean.duplicated()]

In [34]:
df_dups

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
462,462,Fiber 4g Gummy Dietary Supplement,70,11,4.8
18459,18458,Ranger IPA,27,5,9.2
26810,26808,Black House Coffee Roasty Stout Beer,27,5,13.4
35309,35306,Gluten Free Organic Peanut Butter & Chocolate ...,121,14,6.8
35495,35491,Adore Forever Body Wash,127,11,9.9


In [36]:
# Create datafram that doesn't include the duplicates
df_prods_clean_no_dups = df_prods_clean.drop_duplicates()

In [38]:
df_prods_clean_no_dups.shape

(49672, 5)

# Exporting data

In [41]:
df_prods_clean_no_dups.to_csv(os.path.join(path,'Prepared data','products_checked.csv'))

# TASK

Step 2

In [47]:
df_ords.describe()

Unnamed: 0.1,Unnamed: 0,order_id,user_id,order_number,order_dow,order_hour_of_day,days_since_prior_order
count,3421083.0,3421083.0,3421083.0,3421083.0,3421083.0,3421083.0,3214874.0
mean,1710541.0,1710542.0,102978.2,17.15486,2.776219,13.45202,11.11484
std,987581.7,987581.7,59533.72,17.73316,2.046829,4.226088,9.206737
min,0.0,1.0,1.0,1.0,0.0,0.0,0.0
25%,855270.5,855271.5,51394.0,5.0,1.0,10.0,4.0
50%,1710541.0,1710542.0,102689.0,11.0,3.0,13.0,7.0
75%,2565812.0,2565812.0,154385.0,23.0,5.0,16.0,15.0
max,3421082.0,3421083.0,206209.0,100.0,6.0,23.0,30.0


'days_since prior order': Has a count less than all other rows inndicating missing values. It also has a measure of zero indicating that some orders could be places within the same day. The max value is 30, which may indicate that orders are capped at 30 days or 1 month. 'order_number': Has a max value of 100, which may mean orders are capped at 100 items. 'order_day of the week': Has min value of 0 and max value of 6 meaning the seven numbers (0-6) represent days of the week. 'order_hour_of_day': Has a min value of 0 and max value of 23 meaning the twenty-four numbers (0-23) represent 24 hrs of the day.

Step 3


In [53]:
for col in df_ords.columns.tolist():
    weird = (df_ords[col].map(type) != df_ords[col].iloc[0].__class__).any()
    if weird:
        print(col)

Unnamed: 0
order_id
user_id
order_number
order_dow
order_hour_of_day
days_since_prior_order


In [55]:
df_ords.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3421083 entries, 0 to 3421082
Data columns (total 7 columns):
 #   Column                  Dtype  
---  ------                  -----  
 0   Unnamed: 0              int64  
 1   order_id                int64  
 2   user_id                 int64  
 3   order_number            int64  
 4   order_dow               int64  
 5   order_hour_of_day       int64  
 6   days_since_prior_order  float64
dtypes: float64(1), int64(6)
memory usage: 182.7 MB


In [57]:
# Find missing values (takes sum of Trues in a  column)
df_ords.isnull().sum()

Unnamed: 0                     0
order_id                       0
user_id                        0
order_number                   0
order_dow                      0
order_hour_of_day              0
days_since_prior_order    206209
dtype: int64

As suspected, the variable that has missing values is the 'days_since_prior_order' which has 206209 values missing. 

In [60]:
# Create subset of the dataframe that contains the nulls
df_ords_nan = df_ords[df_ords['days_since_prior_order'].isnull() == True]

In [62]:
df_ords_nan

Unnamed: 0.1,Unnamed: 0,order_id,user_id,order_number,order_dow,order_hour_of_day,days_since_prior_order
0,0,2539329,1,1,2,8,
11,11,2168274,2,1,2,11,
26,26,1374495,3,1,1,14,
39,39,3343014,4,1,6,11,
45,45,2717275,5,1,3,12,
...,...,...,...,...,...,...,...
3420930,3420930,969311,206205,1,4,12,
3420934,3420934,3189322,206206,1,3,18,
3421002,3421002,2166133,206207,1,6,19,
3421019,3421019,2227043,206208,1,1,15,


In [64]:
df_ords['days_since_prior_order'].fillna(7, inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_ords['days_since_prior_order'].fillna(7, inplace=True)


In [66]:
df_ords[df_ords['order_number']==1]

Unnamed: 0.1,Unnamed: 0,order_id,user_id,order_number,order_dow,order_hour_of_day,days_since_prior_order
0,0,2539329,1,1,2,8,7.0
11,11,2168274,2,1,2,11,7.0
26,26,1374495,3,1,1,14,7.0
39,39,3343014,4,1,6,11,7.0
45,45,2717275,5,1,3,12,7.0
...,...,...,...,...,...,...,...
3420930,3420930,969311,206205,1,4,12,7.0
3420934,3420934,3189322,206206,1,3,18,7.0
3421002,3421002,2166133,206207,1,6,19,7.0
3421019,3421019,2227043,206208,1,1,15,7.0


Step 7

In [69]:
# Create subset of df_ords that contains only rows of duplicates
df_ords_dups = df_ords[df_ords.duplicated()]

In [71]:
df_ords_dups

Unnamed: 0.1,Unnamed: 0,order_id,user_id,order_number,order_dow,order_hour_of_day,days_since_prior_order


Step 8 : There are no duplicates

There are no duplicates in this set because it returned empty. however, if there were duplicates I would execute '.drop_duplicates()' to creat a new datafreame without duplicates.

Step 9: Export results


In [77]:
df_ords.to_csv(os.path.join(path,'Prepared data','orders_checked.csv'))