# 01. Importing libraries

In [1]:
import pandas as pd
import numpy as np
import os

# 02. Importing data

In [2]:
# Python shortcut - reusable path
path = r'C:\Users\thoma\OneDrive\Dokumente\data analytics\Data Project 4\06-2025 Instacart Basket Analysis'

In [3]:
path

'C:\\Users\\thoma\\OneDrive\\Dokumente\\data analytics\\Data Project 4\\06-2025 Instacart Basket Analysis'

In [4]:
# Importing dataframe
df_merged_filtered_final = pd.read_pickle(os.path.join(path, 'Data', 'Prepared Data', 'Pr_merged_filtered_task4.10(1).pkl'))

In [5]:
# Frequency check
df_merged_filtered_final['Age'].value_counts(dropna = False)

Age
NaN     1340113
48.0     382739
31.0     378955
79.0     378911
64.0     377797
         ...   
60.0     342716
65.0     339767
66.0     338049
25.0     331688
41.0     331677
Name: count, Length: 65, dtype: int64

In [6]:
# Cleaning data
# Creating a subset consisting of the non-missing values
df_merged_filtered_final_cleaned = df_merged_filtered_final[df_merged_filtered_final['Age'].isnull() == False]

In [7]:
# Frequency check
df_merged_filtered_final_cleaned['Age'].value_counts(dropna = False)

Age
48.0    382739
31.0    378955
79.0    378911
64.0    377797
49.0    377303
         ...  
60.0    342716
65.0    339767
66.0    338049
25.0    331688
41.0    331677
Name: count, Length: 64, dtype: int64

In [8]:
# Frequency check
df_merged_filtered_final_cleaned['income'].value_counts(dropna = False)

income
72344.0     4084
57501.0     3945
112610.0    3764
115620.0    3595
36163.0     3530
            ... 
89248.0        1
52568.0        1
58619.0        1
67318.0        1
119963.0       1
Name: count, Length: 93182, dtype: int64

In [9]:
# Frequency check
df_merged_filtered_final_cleaned['n_dependants'].value_counts(dropna = False)

n_dependants
3.0    5794501
2.0    5773629
0.0    5772373
1.0    5756848
Name: count, dtype: int64

In [10]:
# Frequency check
df_merged_filtered_final_cleaned['product_name'].value_counts(dropna = False)

product_name
Banana                                                  339323
Bag of Organic Bananas                                  286170
Organic Strawberries                                    191406
Organic Baby Spinach                                    174533
Organic Hass Avocado                                    160349
                                                         ...  
Hot Cocoa Mix, Rich Chocolate Flavor, No Sugar Added         1
Fruit Thin Drizzled Banana Cookies                           1
Vanilla with Twix Pieces Lowfat Yogurt                       1
Organic Mixed Berry                                          1
Flavored Vodka, Twist of Raspberry                           1
Name: count, Length: 49341, dtype: int64

In [11]:
df_merged_filtered_final_cleaned.columns

Index(['order_id', 'user_id', 'order_number', 'order_dow', 'order_hour_of_day',
       'days_since_prior_order', 'product_id', 'add_to_cart_order',
       'reordered', '_merge1', 'product_name', 'aisle_id', 'department_id',
       'prices', '_merge2', 'Busiest days', 'busiest_period_of_day',
       'max_order', 'loyalty_flag', 'average_price', 'order_frequency',
       'First Name', 'Gender', 'STATE', 'Age', 'date_joined', 'n_dependants',
       'fam_status', 'income', '_merge', 'Region', 'exclusion_flag'],
      dtype='object')

In [12]:
# top 10 most expensive products
top_expensive = df_merged_filtered_final_cleaned[['product_name', 'prices']].sort_values(by='prices', ascending=False).drop_duplicates().head(10)
print(top_expensive)

                              product_name  prices
2241659   Boneless Skinless Chicken Thighs    25.0
16058295     Naturally Smoked Trout Fillet    25.0
9642686                    Chicken Tenders    25.0
14218303            Wild Caught Raw Shrimp    25.0
24463565         Turkey Breast Tenderloins    25.0
19652569    Uncured Applewood Smoked Bacon    24.9
1083704            Smok Cured Turkey Bacon    24.9
14395928                     Lobster Tails    24.9
16240282          Sugar Free Dry Rub Bacon    24.9
13012115                  Angus Roast Beef    24.9


In [13]:
# 5) Creating a profiling variable based on age, income, certain goods in the “department_id” column, and number of dependents
def classify_customer(row):
    age = row['Age']
    income = row['income']
    n_dep = row['n_dependants']
    product = row['product_name']
    
    if age >= 30 and income >= 3000 and n_dep <= 1 and product in ['Boneless Skinless Chicken Thighs', 'Naturally Smoked Trout Fillet', 'Chicken Tenders', 'Wild Caught Raw Shrimp', 'Turkey Breast Tenderloins', 'Uncured Applewood Smoked Bacon', 'Smok Cured Turkey Bacon', 'Lobster Tails', 'Sugar Free Dry Rub Bacon', 'Angus Roast Beef']:
        return 'High profile customer'
    elif age < 30 or income < 3000 or n_dep > 1:
        return 'Low profile customer'
    else:
        return 'Other'

In [14]:
df_merged_filtered_final_cleaned['customer_profile'] = df_merged_filtered_final_cleaned.apply(classify_customer, axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_merged_filtered_final_cleaned['customer_profile'] = df_merged_filtered_final_cleaned.apply(classify_customer, axis=1)


In [15]:
df_merged_filtered_final_cleaned.shape

(23097351, 33)

In [16]:
df_merged_filtered_final_cleaned['customer_profile'].value_counts(dropna = False)

customer_profile
Low profile customer     13734199
Other                     9357610
High profile customer        5542
Name: count, dtype: int64

In [17]:
# Exporting the dataframe
df_merged_filtered_final_cleaned.to_pickle(os.path.join(path, 'Data', 'Prepared Data', 'Pr_merged_filtered_cleaned_newvar_task4.10(2).pkl'))