<a href="https://colab.research.google.com/github/DATA601-CustSegment-Nov2025-to-Feb2026/New-CustSegment-Project/blob/main/Customer_Segmentation_(Preprocessing).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Import Libraries & Load Datasets

In [1]:
# Import necessary libraries
import pandas as pd
import numpy as np

from google.colab import drive

In [2]:
# Mount Google Drive at '/content/drive' so that files kept under MyDrive
# can be opened by ordinary file paths from this Colab runtime
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
# Path to the final merged dataset (a single CSV file) stored in Google Drive.
# It lives under the mounted Drive root, so the Drive mount must have run first.
final_merged_path = '/content/drive/MyDrive/0. Colab Notebooks (DATA 60X Project)/customer_segmentation_after_merging.csv'

# Read the dataset from its CSV file into a DataFrame.
# low_memory=False: per the pandas read_csv documentation this turns off
# internal chunked parsing, so each column's dtype is inferred from the whole
# file rather than chunk by chunk (at the cost of higher peak memory).
final_merged_df = pd.read_csv(final_merged_path, low_memory=False)

# Overview of the Dataset

In [4]:
# Remove the cap on how many columns pandas renders, so wide frames are
# displayed in full (this is a global option and stays in effect for later cells)
pd.options.display.max_columns = None

# Preview the top five records of the dataset (five is the default for head())
final_merged_df.head()

Unnamed: 0,household_key,basket_id,day,product_id,quantity,sales_value,store_id,retail_disc,trans_time,week_no,coupon_disc,coupon_match_disc,age_desc,marital_status_code,income_desc,homeowner_desc,hh_comp_desc,household_size_desc,kid_category_desc,manufacturer,department,brand,commodity_desc,sub_commodity_desc,curr_size_of_product,display,mailer,coupon_upc,campaign,description,start_day,end_day
0,1228,31553560338,308,10118257,8,57.52,327,-6.4,1113,45,-55.93,0.0,45-54,U,100-124K,Unknown,Single Female,1.0,None/Unknown,69,MEAT-PCKGD,Private,MEAT - MISC,GRND/PATTY - FROZEN,2LB,9.0,0,,,,,
1,679,32957147025,403,5978656,0,0.0,447,0.0,1826,58,-37.93,0.0,,,,,,,,1,,National,,,,,,,,,,
2,679,32957147063,403,5978656,0,0.0,447,0.0,1837,58,-36.5,0.0,,,,,,,,1,,National,,,,,,,,,,
3,1901,33919382569,465,13511457,24,60.0,319,-52.56,1613,67,-34.0,0.0,45-54,U,35-49K,Homeowner,2 Adults Kids,3.0,1,1208,GROCERY,National,SOFT DRINKS,SOFT DRINKS 12/18&15PK CAN CAR,12 OZ,7.0,D,,,,,
4,400,41124522239,627,5978656,0,0.0,388,0.0,1508,90,-31.46,0.0,35-44,A,150-174K,Homeowner,2 Adults Kids,3.0,1,1,,National,,,,,,,,,,


In [5]:
# Record the dimensions of 'final_merged_df': .shape is (rows, columns)
num_rows_final_merged_df = final_merged_df.shape[0]
num_cols_final_merged_df = final_merged_df.shape[1]

# Report both dimensions
print("Number of rows in final_merged_df:", num_rows_final_merged_df)
print("Number of columns in final_merged_df:", num_cols_final_merged_df)

Number of rows in final_merged_df: 2595732
Number of columns in final_merged_df: 32


In [6]:
# Print the dtype of every column in 'final_merged_df'.
# No dtypes were specified when the CSV was read, so these are the types
# pandas inferred from the file contents.
print(final_merged_df.dtypes)

household_key             int64
basket_id                 int64
day                       int64
product_id                int64
quantity                  int64
sales_value             float64
store_id                  int64
retail_disc             float64
trans_time                int64
week_no                   int64
coupon_disc             float64
coupon_match_disc       float64
age_desc                 object
marital_status_code      object
income_desc              object
homeowner_desc           object
hh_comp_desc             object
household_size_desc      object
kid_category_desc        object
manufacturer              int64
department               object
brand                    object
commodity_desc           object
sub_commodity_desc       object
curr_size_of_product     object
display                  object
mailer                   object
coupon_upc              float64
campaign                float64
description              object
start_day               float64
end_day 

In [7]:
# Count rows that are exact repeats (across all columns) of an earlier row;
# keep='first' is the pandas default and leaves the first occurrence unflagged
num_duplicates_final_merged_df = final_merged_df.duplicated(keep='first').sum()
print(f"Number of duplicate rows: {num_duplicates_final_merged_df}")

Number of duplicate rows: 0


In [8]:
# Tally the null entries per column of 'final_merged_df'
# (isnull is the pandas alias of isna; axis=0 sums down each column)
missing_counts_final_merged_df = final_merged_df.isnull().sum(axis=0)
print(missing_counts_final_merged_df)

household_key                 0
basket_id                     0
day                           0
product_id                    0
quantity                      0
sales_value                   0
store_id                      0
retail_disc                   0
trans_time                    0
week_no                       0
coupon_disc                   0
coupon_match_disc             0
age_desc                1168429
marital_status_code     1168429
income_desc             1168429
homeowner_desc          1168429
hh_comp_desc            1168429
household_size_desc     1168429
kid_category_desc       1168429
manufacturer                  0
department                    0
brand                         0
commodity_desc                0
sub_commodity_desc            0
curr_size_of_product          0
display                 2031958
mailer                  2031958
coupon_upc              2591522
campaign                2591522
description             2591522
start_day               2591522
end_day 

# Create New Variables

## 1. Time-Of-Day and Cyclical Encoding

In [9]:
# Normalise 'trans_time' to a fixed-width, zero-padded 4-character HHMM string
# (e.g. 5 -> '0005'); the '%H%M' parse below needs all four digits
_hhmm_text = final_merged_df['trans_time'].astype(str)
final_merged_df['trans_time'] = _hhmm_text.str.zfill(4)

# Parse the HHMM strings into datetimes; only the time-of-day part is meaningful
# (the displayed output shows pandas filling the date part with 1900-01-01)
final_merged_df['trans_time_dt'] = pd.to_datetime(
    final_merged_df['trans_time'],
    format='%H%M',
)

# Minutes elapsed since midnight, the input to the cyclical encoding
_clock = final_merged_df['trans_time_dt'].dt
final_merged_df['minutes'] = _clock.hour * 60 + _clock.minute

# Cyclical encoding: place each time on a circle with a period of 1440 minutes
# (one day), so that 23:59 and 00:00 end up next to each other
_day_angle = 2 * np.pi * final_merged_df['minutes'] / 1440
final_merged_df['trans_time_sin'] = np.sin(_day_angle)
final_merged_df['trans_time_cos'] = np.cos(_day_angle)

# Drop the scratch names so the cell leaves only the new columns behind
del _hhmm_text, _clock, _day_angle

In [10]:
# Create a function to categorize time-of-day category based on hour
def categorize_time_of_day(dt):
    """Return the time-of-day label for a datetime-like value.

    Parameters
    ----------
    dt : object exposing an ``hour`` attribute (e.g. a pandas Timestamp)

    Returns
    -------
    str
        'Midnight'  for hours 0-5,
        'Morning'   for hours 6-11,
        'Afternoon' for hours 12-16,
        'Evening'   for hours 17-20,
        'Night'     for anything else (hours 21-23).
    """
    hour = dt.hour

    # Half-open [start, stop) hour bands, checked in chronological order
    hour_bands = (
        (0, 6, 'Midnight'),
        (6, 12, 'Morning'),
        (12, 17, 'Afternoon'),
        (17, 21, 'Evening'),
    )
    for start, stop, label in hour_bands:
        if start <= hour < stop:
            return label

    # No band matched: 21 <= hour <= 23
    return 'Night'

# Create new column 'trans_time_day': the time-of-day label of 'trans_time_dt'.
# There are only 24 possible hours, so label each hour once with
# categorize_time_of_day() and map the hour column through that lookup. This
# yields the same labels as calling the function per row with .apply(), without
# a Python-level call per row.
# NOTE: a missing timestamp (NaT) would map to NaN here, whereas the row-wise
# call fell through to 'Night'.
hour_to_time_of_day = {
    hour: categorize_time_of_day(pd.Timestamp(year=1900, month=1, day=1, hour=hour))
    for hour in range(24)
}
final_merged_df['trans_time_day'] = final_merged_df['trans_time_dt'].dt.hour.map(hour_to_time_of_day)

In [11]:
# Re-measure 'final_merged_df' after the new columns were added:
# rows from the index length, columns from the column-label length
num_rows_final_merged_df = len(final_merged_df.index)
num_cols_final_merged_df = len(final_merged_df.columns)

# Report both dimensions
print("Number of rows in final_merged_df:", num_rows_final_merged_df)
print("Number of columns in final_merged_df:", num_cols_final_merged_df)

Number of rows in final_merged_df: 2595732
Number of columns in final_merged_df: 37


In [12]:
# Display first 5 rows of the dataset to inspect the newly derived columns
# ('trans_time_dt', 'minutes', 'trans_time_sin', 'trans_time_cos', 'trans_time_day')
final_merged_df.head(5)

Unnamed: 0,household_key,basket_id,day,product_id,quantity,sales_value,store_id,retail_disc,trans_time,week_no,coupon_disc,coupon_match_disc,age_desc,marital_status_code,income_desc,homeowner_desc,hh_comp_desc,household_size_desc,kid_category_desc,manufacturer,department,brand,commodity_desc,sub_commodity_desc,curr_size_of_product,display,mailer,coupon_upc,campaign,description,start_day,end_day,trans_time_dt,minutes,trans_time_sin,trans_time_cos,trans_time_day
0,1228,31553560338,308,10118257,8,57.52,327,-6.4,1113,45,-55.93,0.0,45-54,U,100-124K,Unknown,Single Female,1.0,None/Unknown,69,MEAT-PCKGD,Private,MEAT - MISC,GRND/PATTY - FROZEN,2LB,9.0,0,,,,,,1900-01-01 11:13:00,673,0.203642,-0.979045,Morning
1,679,32957147025,403,5978656,0,0.0,447,0.0,1826,58,-37.93,0.0,,,,,,,,1,,National,,,,,,,,,,,1900-01-01 18:26:00,1106,-0.993572,0.113203,Evening
2,679,32957147063,403,5978656,0,0.0,447,0.0,1837,58,-36.5,0.0,,,,,,,,1,,National,,,,,,,,,,,1900-01-01 18:37:00,1117,-0.986996,0.160743,Evening
3,1901,33919382569,465,13511457,24,60.0,319,-52.56,1613,67,-34.0,0.0,45-54,U,35-49K,Homeowner,2 Adults Kids,3.0,1,1208,GROCERY,National,SOFT DRINKS,SOFT DRINKS 12/18&15PK CAN CAR,12 OZ,7.0,D,,,,,,1900-01-01 16:13:00,973,-0.892979,-0.450098,Afternoon
4,400,41124522239,627,5978656,0,0.0,388,0.0,1508,90,-31.46,0.0,35-44,A,150-174K,Homeowner,2 Adults Kids,3.0,1,1,,National,,,,,,,,,,,1900-01-01 15:08:00,908,-0.731354,-0.681998,Afternoon


In [13]:
# Count how many distinct values the 'week_no' column contains
unique_count = final_merged_df['week_no'].nunique()
print(unique_count)

102


In [14]:
# Count how many distinct values the 'day' column contains
# (this reassigns 'unique_count', replacing the 'week_no' count)
unique_count = final_merged_df['day'].nunique()
print(unique_count)

711
