# Reading exercise

In [1]:
# Library Imports
import pandas as pd
import numpy as np
import os

#### DF Pathways

In [2]:
# Project root folder. Set the INSTACART_PROJECT_PATH environment variable to run
# this notebook on another machine; when it is not set, the original local folder
# is used, so existing behaviour is unchanged.
path = os.environ.get(
    'INSTACART_PROJECT_PATH',
    r'C:\Users\Chase\anaconda_projects\Exercise 4\07-2025 Instacart Basket Analysis',
)

In [3]:
# Load the raw orders table from "02 Data/Original Data" under the project folder
df_ords = pd.read_csv(
    os.path.join(path, '02 Data', 'Original Data', 'orders.csv'),
    index_col = False,
)

In [4]:
# Load the raw products table from "02 Data/Original Data" under the project folder
df_prods = pd.read_csv(
    os.path.join(path, '02 Data', 'Original Data', 'products.csv'),
    index_col = False,
)

In [5]:
# Load the raw departments table from "02 Data/Original Data" under the project folder
df_dep = pd.read_csv(
    os.path.join(path, '02 Data', 'Original Data', 'departments.csv'),
    index_col = False,
)

## Data Wrangling: Columns

#### Removing Columns

In [6]:
# Preview df_ords without the eval_set column. drop() returns a new DataFrame and
# the result is not assigned here, so df_ords itself is unchanged by this cell.
df_ords.drop(columns = ['eval_set'])

Unnamed: 0,order_id,user_id,order_number,order_dow,order_hour_of_day,days_since_prior_order
0,2539329,1,1,2,8,
1,2398795,1,2,3,7,15.0
2,473747,1,3,3,12,21.0
3,2254736,1,4,4,7,29.0
4,431534,1,5,4,15,28.0
...,...,...,...,...,...,...
3421078,2266710,206209,10,5,18,29.0
3421079,1854736,206209,11,4,10,30.0
3421080,626363,206209,12,1,12,18.0
3421081,2977660,206209,13,1,12,7.0


In [7]:
# Remove eval_set for good by rebinding df_ords to the frame returned by drop()
df_ords = df_ords.drop('eval_set', axis = 'columns')

#### Missing Column Values

In [8]:
# Frequency of every value in days_since_prior_order. dropna = False keeps NaN in
# the tally, so the number of missing values shows up as its own row.
df_ords['days_since_prior_order'].value_counts(dropna = False)

days_since_prior_order
30.0    369323
7.0     320608
6.0     240013
4.0     221696
3.0     217005
5.0     214503
NaN     206209
2.0     193206
8.0     181717
1.0     145247
9.0     118188
14.0    100230
10.0     95186
13.0     83214
11.0     80970
12.0     76146
0.0      67755
15.0     66579
16.0     46941
21.0     45470
17.0     39245
20.0     38527
18.0     35881
19.0     34384
22.0     32012
28.0     26777
23.0     23885
27.0     22013
24.0     20712
25.0     19234
29.0     19191
26.0     19016
Name: count, dtype: int64

#### Renaming Columns

In [9]:
# Give the day-of-week column a more descriptive name
df_ords = df_ords.rename(columns = {'order_dow' : 'orders_day_of_week'})

In [10]:
# Confirm that order_dow now appears as orders_day_of_week
df_ords.head()

Unnamed: 0,order_id,user_id,order_number,orders_day_of_week,order_hour_of_day,days_since_prior_order
0,2539329,1,1,2,8,
1,2398795,1,2,3,7,15.0
2,473747,1,3,3,12,21.0
3,2254736,1,4,4,7,29.0
4,431534,1,5,4,15,28.0


#### Changing Variable Type

In [11]:
# Store order_id as text so describe() leaves it out of the numeric summary
df_ords = df_ords.astype({'order_id' : 'str'})

In [12]:
# Verify the conversion: the column's dtype is now reported as 'O' (object)
df_ords['order_id'].dtype

dtype('O')

#### Transposing Data

In [13]:
# df_dep was loaded in the "DF Pathways" section; preview it to see its layout
# (one column per department id, a single row of department names)
df_dep.head()

Unnamed: 0,department_id,1,2,3,4,5,6,7,8,9,...,12,13,14,15,16,17,18,19,20,21
0,department,frozen,other,bakery,produce,alcohol,international,beverages,pets,dry goods pasta,...,meat seafood,pantry,breakfast,canned goods,dairy eggs,household,babies,snacks,deli,missing


In [14]:
# Preview the transpose (wide format -> long format). The result is not assigned,
# so df_dep is unchanged by this cell.
df_dep.T

Unnamed: 0,0
department_id,department
1,frozen
2,other
3,bakery
4,produce
5,alcohol
6,international
7,beverages
8,pets
9,dry goods pasta


In [15]:
# Keep the transposed (long-format) departments table in its own variable
df_dep_t = df_dep.transpose()

In [16]:
# Work check: display the stored transposed frame
df_dep_t

Unnamed: 0,0
department_id,department
1,frozen
2,other
3,bakery
4,produce
5,alcohol
6,international
7,beverages
8,pets
9,dry goods pasta


In [17]:
# Preview what reset_index() produces: the old index becomes a column named
# "index" and the data column is still labelled 0. The result is not assigned, so
# df_dep_t is unchanged; the headers are actually replaced in the cells below.
df_dep_t.reset_index()

Unnamed: 0,index,0
0,department_id,department
1,1,frozen
2,2,other
3,3,bakery
4,4,produce
5,5,alcohol
6,6,international
7,7,beverages
8,8,pets
9,9,dry goods pasta


In [18]:
# Take the first row of df_dep_t (the row holding the text 'department') to use
# as the header; iloc[0] returns that row as a Series
new_header = df_dep_t.iloc[0]

In [19]:
# Inspect the extracted header row
new_header

0    department
Name: department_id, dtype: object

In [20]:
# Everything below the header row becomes the data of a new frame
df_dep_t_new = df_dep_t.iloc[1:]

In [21]:
# Display the data rows; the single column is still labelled 0 at this point
df_dep_t_new

Unnamed: 0,0
1,frozen
2,other
3,bakery
4,produce
5,alcohol
6,international
7,beverages
8,pets
9,dry goods pasta
10,bulk


In [22]:
# Same assignment as in cell [18]: new_header is the first row of df_dep_t
new_header = df_dep_t.iloc[0]

In [23]:
# Use the extracted row as the column labels of df_dep_t_new. Because new_header
# is a Series named 'department_id', that name is carried over as the name of the
# columns axis; the department ids themselves remain in the row index.
df_dep_t_new.columns = new_header

In [24]:
# Display the frame again: the data column is now labelled 'department'
df_dep_t_new

department_id,department
1,frozen
2,other
3,bakery
4,produce
5,alcohol
6,international
7,beverages
8,pets
9,dry goods pasta
10,bulk


## Data Dictionaries

In [25]:
# Build a lookup keyed by department id: {id: {'department': name}}
data_dict = df_dep_t_new.to_dict(orient = 'index')

In [26]:
# Display the dictionary; note that the department ids are string keys
data_dict

{'1': {'department': 'frozen'},
 '2': {'department': 'other'},
 '3': {'department': 'bakery'},
 '4': {'department': 'produce'},
 '5': {'department': 'alcohol'},
 '6': {'department': 'international'},
 '7': {'department': 'beverages'},
 '8': {'department': 'pets'},
 '9': {'department': 'dry goods pasta'},
 '10': {'department': 'bulk'},
 '11': {'department': 'personal care'},
 '12': {'department': 'meat seafood'},
 '13': {'department': 'pantry'},
 '14': {'department': 'breakfast'},
 '15': {'department': 'canned goods'},
 '16': {'department': 'dairy eggs'},
 '17': {'department': 'household'},
 '18': {'department': 'babies'},
 '19': {'department': 'snacks'},
 '20': {'department': 'deli'},
 '21': {'department': 'missing'}}

In [27]:
# Preview the products table, including its department_id column
df_prods.head()

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
0,1,Chocolate Sandwich Cookies,61,19,5.8
1,2,All-Seasons Salt,104,13,9.3
2,3,Robust Golden Unsweetened Oolong Tea,94,7,4.5
3,4,Smart Ones Classic Favorites Mini Rigatoni Wit...,38,1,10.5
4,5,Green Chile Anytime Sauce,5,13,4.3


In [28]:
# Look up department 19. The key is passed as the string '19' because the
# dictionary keys are strings; get() returns None for a key that is not present.
print(data_dict.get('19'))

{'department': 'snacks'}


#### Subsets

In [29]:
# Create a subset containing only the products whose department_id is 19 (snacks)
df_snacks =  df_prods[df_prods['department_id']==19]

In [30]:
# The inner expression on its own: a boolean Series that is True where
# department_id equals 19
df_prods['department_id']==19

0         True
1        False
2        False
3        False
4        False
         ...  
49688    False
49689    False
49690    False
49691    False
49692    False
Name: department_id, Length: 49693, dtype: bool

In [31]:
# Indexing df_prods with that boolean Series keeps only the rows marked True
df_prods[df_prods['department_id']==19]

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
0,1,Chocolate Sandwich Cookies,61,19,5.8
15,16,Mint Chocolate Flavored Syrup,103,19,5.2
24,25,Salted Caramel Lean Protein & Fiber Bar,3,19,1.9
31,32,Nacho Cheese White Bean Chips,107,19,4.9
40,41,Organic Sourdough Einkorn Crackers Rosemary,78,19,6.5
...,...,...,...,...,...
49666,49662,Bacon Cheddar Pretzel Pieces,107,19,3.6
49669,49665,Super Dark Coconut Ash & Banana Chocolate Bar,45,19,6.9
49670,49666,Ginger Snaps Snacking Cookies,61,19,5.2
49675,49671,Milk Chocolate Drops,45,19,3.0


In [32]:
# Store the filtered rows as df_snacks (same assignment as in cell [29])
df_snacks = df_prods[df_prods['department_id']==19]

In [33]:
# Preview the snacks subset
df_snacks.head()

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
0,1,Chocolate Sandwich Cookies,61,19,5.8
15,16,Mint Chocolate Flavored Syrup,103,19,5.2
24,25,Salted Caramel Lean Protein & Fiber Bar,3,19,1.9
31,32,Nacho Cheese White Bean Chips,107,19,4.9
40,41,Organic Sourdough Einkorn Crackers Rosemary,78,19,6.5


In [34]:
# 2nd way to select department 19: pass the boolean mask through .loc
df_snacks_2 = df_prods.loc[df_prods['department_id'] == 19]

In [35]:
# 3rd way to select department 19: isin() takes a list of ids, so the same
# pattern can match several departments at once
df_snacks_3 = df_prods.loc[df_prods['department_id'].isin([19])]

# Exercise 4.4 Wrangling Steps

#### Question 2

In [36]:
# Look at the columns to decide which identifier variable still needs its type changed
df_ords.head()

Unnamed: 0,order_id,user_id,order_number,orders_day_of_week,order_hour_of_day,days_since_prior_order
0,2539329,1,1,2,8,
1,2398795,1,2,3,7,15.0
2,473747,1,3,3,12,21.0
3,2254736,1,4,4,7,29.0
4,431534,1,5,4,15,28.0


In [37]:
# user_id is an identifier, not a quantity: store it as text
df_ords = df_ords.astype({'user_id' : 'str'})

In [38]:
# Verify the conversion: the column's dtype is now reported as 'O' (object)
df_ords['user_id'].dtype

dtype('O')

#### Question 3

In [39]:
# Restores the original header after an accidental in-place rename to
# days_between_orders (the rename was meant to be temporary).
# NOTE(review): on a fresh top-to-bottom run df_ords has no 'days_between_orders'
# column, so this rename matches nothing and leaves the frame unchanged.
df_ords.rename(columns = {'days_between_orders' : 'days_since_prior_order'}, inplace = True)

In [40]:
# Restores the original header after an accidental in-place rename to order_hour
# (the rename was meant to be temporary).
# NOTE(review): on a fresh top-to-bottom run df_ords has no 'order_hour' column,
# so this rename matches nothing and leaves the frame unchanged.
df_ords.rename(columns = {'order_hour' : 'order_hour_of_day'}, inplace = True)

In [41]:
# Restores the day-of-week header after an accidental in-place rename to order_day
# (the rename was meant to be temporary). The target is 'orders_day_of_week', the
# name used by every other cell in this notebook; restoring to 'order_day_of_week'
# would leave a column the later cells cannot find.
# On a fresh top-to-bottom run there is no 'order_day' column, so this rename
# matches nothing and leaves the frame unchanged.
df_ords.rename(columns = {'order_day' : 'orders_day_of_week'}, inplace = True)

In [42]:
# Double-check that df_ords carries the original headers
# (orders_day_of_week, order_hour_of_day, days_since_prior_order)
df_ords.head()

Unnamed: 0,order_id,user_id,order_number,orders_day_of_week,order_hour_of_day,days_since_prior_order
0,2539329,1,1,2,8,
1,2398795,1,2,3,7,15.0
2,473747,1,3,3,12,21.0
3,2254736,1,4,4,7,29.0
4,431534,1,5,4,15,28.0


In [43]:
# Temporary frame with shorter column names; df_ords keeps its own headers because
# rename() returns a new DataFrame here. All three renames are applied in a single
# call (approach suggested by Copilot after separate df_temp steps did not work).
df_temp = df_ords.rename(
    {
        'orders_day_of_week': 'order_day',
        'order_hour_of_day': 'order_hour',
        'days_since_prior_order': 'days_between_orders',
    },
    axis = 'columns',
)



In [44]:
# Confirm the temporary column names on df_temp
print(df_temp.columns)

Index(['order_id', 'user_id', 'order_number', 'order_day', 'order_hour',
       'days_between_orders'],
      dtype='object')


#### Question 4

In [45]:
# Number of orders per hour of day, listed in hour order. The shortened names
# ('order_day', 'order_hour') exist only on df_temp, so the original column name
# is used on df_ords.
df_ords['order_hour_of_day'].value_counts().sort_index()

order_hour_of_day
0      22758
1      12398
2       7539
3       5474
4       5527
5       9569
6      30529
7      91868
8     178201
9     257812
10    288418
11    284728
12    272841
13    277999
14    283042
15    283639
16    272553
17    228795
18    182912
19    140569
20    104292
21     78109
22     61468
23     40043
Name: count, dtype: int64

In [46]:
# Busiest hour = the hour label that carries the largest order count
hour_counts = df_ords['order_hour_of_day'].value_counts()
peak_hour = hour_counts.idxmax()

In [47]:
# Show the busiest hour of the day
print(peak_hour)

10


#### Question 5

In [48]:
# Reused from the reading: department lookup keyed by department id
data_dict = df_dep_t_new.to_dict(orient = 'index')

In [49]:
# Answer to question 5: key '4' maps to 'produce', so department 4 is produce
data_dict

{'1': {'department': 'frozen'},
 '2': {'department': 'other'},
 '3': {'department': 'bakery'},
 '4': {'department': 'produce'},
 '5': {'department': 'alcohol'},
 '6': {'department': 'international'},
 '7': {'department': 'beverages'},
 '8': {'department': 'pets'},
 '9': {'department': 'dry goods pasta'},
 '10': {'department': 'bulk'},
 '11': {'department': 'personal care'},
 '12': {'department': 'meat seafood'},
 '13': {'department': 'pantry'},
 '14': {'department': 'breakfast'},
 '15': {'department': 'canned goods'},
 '16': {'department': 'dairy eggs'},
 '17': {'department': 'household'},
 '18': {'department': 'babies'},
 '19': {'department': 'snacks'},
 '20': {'department': 'deli'},
 '21': {'department': 'missing'}}

#### Question 6

In [50]:
# Breakfast is department 14: keep only the products from that department
df_breakfast = df_prods.loc[df_prods['department_id'].eq(14)]

In [51]:
# Print the breakfast subset as plain text
print(df_breakfast)

       product_id                                       product_name  \
27             28                                  Wheat Chex Cereal   
33             34                                                NaN   
67             68                            Pancake Mix, Buttermilk   
89             90                                       Smorz Cereal   
210           211   Gluten Free Organic Cereal Coconut Maple Vanilla   
...           ...                                                ...   
49330       49326                            Cereal Variety Fun Pack   
49395       49391            Light and Fluffy Buttermilk Pancake Mix   
49547       49543                          Chocolate Cheerios Cereal   
49637       49633               Shake 'N Pour Buttermilk Pancake Mix   
49667       49663  Ultra Protein Power Crunch Peanut Butter N' Ho...   

       aisle_id  department_id  prices  
27          121             14    10.1  
33          121             14    12.2  
67          

#### Question 7

In [52]:
# Dinner-party departments by id: alcohol = 5, beverages = 7, meat/seafood = 12, deli = 20
dinner_party_depts = [5, 7, 12, 20]

In [53]:
# Keep the products whose department_id is one of the dinner-party departments
df_dinner_party = df_prods.loc[df_prods['department_id'].isin(dinner_party_depts)]

In [54]:
# Print the dinner-party subset as plain text
print(df_dinner_party)

       product_id                                    product_name  aisle_id  \
2               3            Robust Golden Unsweetened Oolong Tea        94   
6               7                  Pure Coconut Water With Orange        98   
9              10  Sparkling Orange Juice & Prickly Pear Beverage       115   
10             11                               Peach Mango Juice        31   
16             17                               Rendered Duck Fat        35   
...           ...                                             ...       ...   
49676       49672                          Cafe Mocha K-Cup Packs        26   
49679       49675             Cinnamon Dolce Keurig Brewed K Cups        26   
49680       49676                          Ultra Red Energy Drink        64   
49686       49682                              California Limeade        98   
49688       49684       Vodka, Triple Distilled, Twist of Vanilla       124   

       department_id  prices  
2                  7

#### Question 8

In [55]:
# (rows, columns) of the dinner-party subset; the row count is the number of products
df_dinner_party.shape

(7650, 5)

#### Question 9

In [56]:
# All orders placed by user 1; user_id was converted to text earlier, so the
# comparison value is the string '1'
df_user1 = df_ords[df_ords['user_id'].eq('1')]

In [57]:
# Display user 1's orders
df_user1

Unnamed: 0,order_id,user_id,order_number,orders_day_of_week,order_hour_of_day,days_since_prior_order
0,2539329,1,1,2,8,
1,2398795,1,2,3,7,15.0
2,473747,1,3,3,12,21.0
3,2254736,1,4,4,7,29.0
4,431534,1,5,4,15,28.0
5,3367565,1,6,2,7,19.0
6,550135,1,7,1,9,20.0
7,3108588,1,8,1,14,14.0
8,2295261,1,9,1,16,0.0
9,2550362,1,10,4,8,30.0


#### Question 10

In [58]:
# Summary statistics for user 1's orders; order_id and user_id were converted to
# strings earlier and therefore do not appear in the numeric summary
df_user1.describe()

Unnamed: 0,order_number,orders_day_of_week,order_hour_of_day,days_since_prior_order
count,11.0,11.0,11.0,10.0
mean,6.0,2.636364,10.090909,19.0
std,3.316625,1.286291,3.477198,9.030811
min,1.0,1.0,7.0,0.0
25%,3.5,1.5,7.5,14.25
50%,6.0,3.0,8.0,19.5
75%,8.5,4.0,13.0,26.25
max,11.0,4.0,16.0,30.0


#### Question 12

In [59]:
# Write the wrangled orders table to "02 Data/Prepared Data"; index = False leaves
# the row index out of the file
df_ords.to_csv(
    os.path.join(path, '02 Data', 'Prepared Data', 'orders_wrangled.csv'),
    index = False,
)

#### Question 13

In [60]:
# Export the departments table. The department ids live in the row index of
# df_dep_t_new (its only data column is 'department'), so the index is written as
# well, under a 'department_id' header. With index=False the file would contain
# the department names only and the ids would be lost.
df_dep_t_new.to_csv(
    os.path.join(path, '02 Data', 'Prepared Data', 'departments_wrangled.csv'),
    index = True,
    index_label = 'department_id',
)

In [61]:
# Final (rows, columns) check of the orders table
print(df_ords.shape)


(3421083, 6)


# Reshape departments

In [62]:
# Work on an independent copy named df_dep_clean, so the department_id column
# added in the next cell does not modify df_dep_t_new
df_dep_clean = df_dep_t_new.copy()

In [64]:
# New column: copy the department ids out of the index into a regular
# 'department_id' column, converting the index labels to integers
df_dep_clean['department_id'] = df_dep_clean.index.astype(int)

In [65]:
# Check the column dtypes and the first rows of the cleaned departments table
print(df_dep_clean.dtypes)
print(df_dep_clean.head())

department_id
department       object
department_id     int64
dtype: object
department_id department  department_id
1                 frozen              1
2                  other              2
3                 bakery              3
4                produce              4
5                alcohol              5


In [68]:
# Export the cleaned departments table; the ids are now a regular column, so the
# row index is left out of the file
df_dep_clean.to_csv(
    os.path.join(path, '02 Data', 'Prepared Data', 'departments_wrangled.csv'),
    index = False,
)