# Step 1: Merge DataFrames (df_ords and df_ords_prior)

In [2]:
import pandas as pd
import os


In [4]:
# Root folder of the project. The default is the original local folder; set the
# INSTACART_PROJECT_PATH environment variable to point the notebook at a copy of
# the project elsewhere without editing this cell.
project_path = os.environ.get(
    'INSTACART_PROJECT_PATH',
    '/Users/dela/Documents/15-01-2025 Instacart Basket Analysis',
)

# Path to Prepared Data (sub-folder '02 Data/Prepared Data' of the project root)
prepared_data_path = os.path.join(project_path, '02 Data', 'Prepared Data')


In [6]:
# Read orders_wrangled.csv from the Prepared Data folder into df_ords
orders_csv = os.path.join(prepared_data_path, 'orders_wrangled.csv')
df_ords = pd.read_csv(orders_csv)

# Print the top rows as a quick sanity check that the load worked
print(df_ords.head())


   order_id  user_id  order_number  order_day_of_week  order_hour_of_day  \
0   2539329        1             1                  2                  8   
1   2398795        1             2                  3                  7   
2    473747        1             3                  3                 12   
3   2254736        1             4                  4                  7   
4    431534        1             5                  4                 15   

   days_since_prior_order  
0                     NaN  
1                    15.0  
2                    21.0  
3                    29.0  
4                    28.0  


## Import the New Dataset (orders_products_prior.csv)

In [10]:
# Read orders_products_prior.csv from the Original Data folder into df_ords_prior
prior_csv = os.path.join(project_path, '02 Data', 'Original Data', 'orders_products_prior.csv')
df_ords_prior = pd.read_csv(prior_csv)

# Show the first rows, then the (rows, columns) dimensions
print(df_ords_prior.head())
print(df_ords_prior.shape)


   order_id  product_id  add_to_cart_order  reordered
0         2       33120                  1          1
1         2       28985                  2          1
2         2        9327                  3          0
3         2       45918                  4          1
4         2       30035                  5          0
(32434489, 4)


## Re-import the Prepared Orders Data (df_ords)

In [12]:
# Read orders_wrangled.csv from the Prepared Data folder into df_ords
# (this is the same file the earlier df_ords load cell reads)
df_ords = pd.read_csv(os.path.join(prepared_data_path, 'orders_wrangled.csv'))

# Show the first rows, then the (rows, columns) dimensions
print(df_ords.head())
print(df_ords.shape)


   order_id  user_id  order_number  order_day_of_week  order_hour_of_day  \
0   2539329        1             1                  2                  8   
1   2398795        1             2                  3                  7   
2    473747        1             3                  3                 12   
3   2254736        1             4                  4                  7   
4    431534        1             5                  4                 15   

   days_since_prior_order  
0                     NaN  
1                    15.0  
2                    21.0  
3                    29.0  
4                    28.0  
(3421083, 6)


## Merge the Two DataFrames

In [14]:
# Merge the orders (left) with the prior order-products rows (right) on order_id.
# - how='inner' is the pandas default, spelled out so the join type is explicit.
# - validate='one_to_many' makes pandas raise a MergeError if order_id is
#   duplicated in df_ords, which would otherwise silently multiply rows.
# - indicator=True adds a '_merge' column recording which frame(s) each row
#   came from.
df_merged_large = df_ords.merge(
    df_ords_prior,
    on='order_id',
    how='inner',
    validate='one_to_many',
    indicator=True,
)

# Preview the merged data
print(df_merged_large.head())

# Check the shape of the merged DataFrame
print(df_merged_large.shape)


   order_id  user_id  order_number  order_day_of_week  order_hour_of_day  \
0   2539329        1             1                  2                  8   
1   2539329        1             1                  2                  8   
2   2539329        1             1                  2                  8   
3   2539329        1             1                  2                  8   
4   2539329        1             1                  2                  8   

   days_since_prior_order  product_id  add_to_cart_order  reordered _merge  
0                     NaN         196                  1          0   both  
1                     NaN       14084                  2          0   both  
2                     NaN       12427                  3          0   both  
3                     NaN       26088                  4          0   both  
4                     NaN       26405                  5          0   both  
(32434489, 10)


In [15]:
# Tally the '_merge' indicator values of the merged frame
merge_flag_counts = df_merged_large['_merge'].value_counts()
print(merge_flag_counts)


_merge
both          32434489
left_only            0
right_only           0
Name: count, dtype: int64


## Double-Check with an Outer Join

In [18]:
# Outer join of the same two frames on order_id: keys present on only one side
# are kept and flagged in '_merge' instead of being dropped
df_merged_check = pd.merge(
    df_ords,
    df_ords_prior,
    how='outer',
    on='order_id',
    indicator=True,
)

# Count rows per '_merge' category for the outer join
outer_flag_counts = df_merged_check['_merge'].value_counts()
print(outer_flag_counts)


_merge
both          32434489
left_only       206209
right_only           0
Name: count, dtype: int64


## Save the Merged DataFrame

In [20]:
# Destination folder shared by both export formats
export_dir = os.path.join(project_path, '02 Data', 'Prepared Data')

# Write the merged frame as CSV, leaving out the row index
df_merged_large.to_csv(os.path.join(export_dir, 'orders_products_combined.csv'), index=False)

# Write the same frame as a pickle file
df_merged_large.to_pickle(os.path.join(export_dir, 'orders_products_combined.pkl'))

print("Merged data exported successfully!")


Merged data exported successfully!


# Step 2: Export the merged file in pickle format as “orders_products_combined.pkl”

In [22]:
# Save the merged frame as orders_products_combined.pkl in the Prepared Data folder.
# NOTE(review): the preceding export cell already writes a pickle with this
# filename under '02 Data/Prepared Data' — presumably the same file; confirm
# whether both writes are needed.
pickle_path = os.path.join(prepared_data_path, 'orders_products_combined.pkl')
df_merged_large.to_pickle(pickle_path)
print("Pickle file saved successfully!")


Pickle file saved successfully!
