In [23]:
# Import required libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_theme()

# 1. Load and copy data
df = pd.read_csv(r'C:\Users\hp\Desktop\Retail-App-Analytics\data\retail_app_data.csv')

In [24]:
df_processed= df.copy()

In [25]:
df_processed.head()

Unnamed: 0,user_id,first_visit_date,age,platform,screen_list,session_count,total_screens_viewed,used_search_feature,wrote_review,added_to_wishlist,made_purchase,purchase_date,user_segment,region,acquisition_channel,app_version
0,100000,2023-09-19,66,Android,"CategoryBrowse,Search,WishList,ReturnPolicy,Pr...",5,10,1,1,0,1,2023-09-19 01:00:00,Senior Buyer,Europe,Referral,2.0.8
1,100001,2023-04-29,68,iOS,"OrderTracking,Promotions,Notifications,Product...",8,39,0,0,0,0,,Senior Browser,Latin America,Organic Search,2.0.5
2,100002,2023-10-20,25,iOS,"ShoppingCart,WishList,ProductList,PaymentMetho...",9,19,1,0,1,0,,Adult Browser,Europe,Social Media,2.0.6
3,100003,2023-07-04,39,Android,"Checkout,ColorPicker,OrderTracking,WishList,Or...",8,47,0,0,0,0,,Adult Browser,Asia Pacific,Social Media,2.2.1
4,100004,2023-01-02,28,iOS,"ColorPicker,ShoppingCart,Reviews,Account,Categ...",9,29,0,1,1,0,,Adult Browser,Latin America,Paid Search,2.2.8


In [26]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 16 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   user_id               10000 non-null  int64 
 1   first_visit_date      10000 non-null  object
 2   age                   10000 non-null  int64 
 3   platform              10000 non-null  object
 4   screen_list           10000 non-null  object
 5   session_count         10000 non-null  int64 
 6   total_screens_viewed  10000 non-null  int64 
 7   used_search_feature   10000 non-null  int64 
 8   wrote_review          10000 non-null  int64 
 9   added_to_wishlist     10000 non-null  int64 
 10  made_purchase         10000 non-null  int64 
 11  purchase_date         5289 non-null   object
 12  user_segment          10000 non-null  object
 13  region                10000 non-null  object
 14  acquisition_channel   10000 non-null  object
 15  app_version           10000 non-null 

### Time-based Processing Description

1. **Convert datetime columns**  
   - Converts the `first_visit_date` and `purchase_date` columns to `datetime` objects.  
   - Enables date-based operations such as calculating differences and extracting specific date components.

2. **Calculate time difference and create target**  
   - Computes the time elapsed between `purchase_date` and `first_visit_date` in hours.  
   - **Formula**:  
     $$
     \text{time\_to\_purchase} = \frac{(\text{purchase\_date} - \text{first\_visit\_date}).\text{total\_seconds}}{3600}
     $$
   - Creates a binary target variable `purchase_24h`:  
     - `1` if the purchase occurred within 24 hours of the first visit.  
     - `0` otherwise.  
   - **Formula**:  
     $$
     \text{purchase\_24h} = 
     \begin{cases} 
     1 & \text{if } \text{time\_to\_purchase} \leq 24 \\ 
     0 & \text{otherwise}
     \end{cases}
     $$

3. **Extract time features**  
   - Extracts temporal features from the `first_visit_date`:  
     - **Hour**:  
       $$
       \text{hour} = \text{first\_visit\_date.hour}
       $$
     - **Day of the week**:  
       $$
       \text{dayofweek} = \text{first\_visit\_date.dayofweek}
       $$
     - **Weekend indicator**:  
       $$
       \text{is\_weekend} = 
       \begin{cases} 
       1 & \text{if dayofweek is Saturday (5) or Sunday (6)} \\ 
       0 & \text{otherwise}
       \end{cases}
       $$

In [27]:
# 2. Time-based Processing
# Convert datetime columns
df_processed['first_visit_date'] = pd.to_datetime(df_processed['first_visit_date'])
df_processed['purchase_date'] = pd.to_datetime(df_processed['purchase_date'])

# Calculate time difference and create target
df_processed['time_to_purchase'] = (df_processed['purchase_date'] - 
                                  df_processed['first_visit_date']).dt.total_seconds() / 3600

# Create 24-hour purchase target
df_processed['purchase_24h'] = np.where(df_processed['time_to_purchase'] <= 24, 1, 0)

# Extract time features
df_processed['hour'] = df_processed['first_visit_date'].dt.hour
df_processed['dayofweek'] = df_processed['first_visit_date'].dt.dayofweek
df_processed['is_weekend'] = df_processed['dayofweek'].isin([5,6]).astype(int)



In [28]:
pd.set_option('display.max_columns', None)
pd.set_option('display.width', 2000)

In [29]:
df_processed.head()

Unnamed: 0,user_id,first_visit_date,age,platform,screen_list,session_count,total_screens_viewed,used_search_feature,wrote_review,added_to_wishlist,made_purchase,purchase_date,user_segment,region,acquisition_channel,app_version,time_to_purchase,purchase_24h,hour,dayofweek,is_weekend
0,100000,2023-09-19,66,Android,"CategoryBrowse,Search,WishList,ReturnPolicy,Pr...",5,10,1,1,0,1,2023-09-19 01:00:00,Senior Buyer,Europe,Referral,2.0.8,1.0,1,0,1,0
1,100001,2023-04-29,68,iOS,"OrderTracking,Promotions,Notifications,Product...",8,39,0,0,0,0,NaT,Senior Browser,Latin America,Organic Search,2.0.5,,0,0,5,1
2,100002,2023-10-20,25,iOS,"ShoppingCart,WishList,ProductList,PaymentMetho...",9,19,1,0,1,0,NaT,Adult Browser,Europe,Social Media,2.0.6,,0,0,4,0
3,100003,2023-07-04,39,Android,"Checkout,ColorPicker,OrderTracking,WishList,Or...",8,47,0,0,0,0,NaT,Adult Browser,Asia Pacific,Social Media,2.2.1,,0,0,1,0
4,100004,2023-01-02,28,iOS,"ColorPicker,ShoppingCart,Reviews,Account,Categ...",9,29,0,1,1,0,NaT,Adult Browser,Latin America,Paid Search,2.2.8,,0,0,0,0


In [30]:
df_processed.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 21 columns):
 #   Column                Non-Null Count  Dtype         
---  ------                --------------  -----         
 0   user_id               10000 non-null  int64         
 1   first_visit_date      10000 non-null  datetime64[ns]
 2   age                   10000 non-null  int64         
 3   platform              10000 non-null  object        
 4   screen_list           10000 non-null  object        
 5   session_count         10000 non-null  int64         
 6   total_screens_viewed  10000 non-null  int64         
 7   used_search_feature   10000 non-null  int64         
 8   wrote_review          10000 non-null  int64         
 9   added_to_wishlist     10000 non-null  int64         
 10  made_purchase         10000 non-null  int64         
 11  purchase_date         5289 non-null   datetime64[ns]
 12  user_segment          10000 non-null  object        
 13  region           

## **Screen List Processing - Detailed Explanation**

This section processes the `screen_list` column to categorize user interactions into different screen types and extract meaningful features.

### **Step 1: Ensuring Consistency in Screen List Processing**
```python
df_processed['screen_list'] = df_processed['screen_list'].astype(str) + ','
```
- Ensures that every value in the `screen_list` column is a string and appends a **comma (`','`)** to each entry.
- This makes it easier to search for screen names reliably, avoiding substring matching errors.


### **Step 2: Defining Screen Categories**
```python
shopping_screens = ['ProductList', 'ProductDetail', 'CategoryBrowse', 'Search']
cart_screens = ['ShoppingCart', 'Checkout', 'PaymentMethods', 'DeliveryOptions']
engagement_screens = ['WishList', 'Reviews', 'Promotions']
account_screens = ['Account', 'AddressBook', 'OrderTracking']
```
- Screens are categorized into **four groups**:
  - **Shopping Screens** → Related to browsing and searching for products.
  - **Cart Screens** → Related to purchasing and checkout.
  - **Engagement Screens** → Related to wishlists, reviews, and promotions.
  - **Account Screens** → Related to user accounts and order tracking.

### **Step 3: Creating Binary Indicators for Each Screen**
```python
for screen in (shopping_screens + cart_screens + engagement_screens + account_screens):
    df_processed[screen.lower()] = df_processed['screen_list'].str.contains(screen).astype(int)
```
- Creates a **new column** for each screen name in lowercase.
- **`df_processed['screen_list'].str.contains(screen).astype(int)`**:
  - **Checks if the screen name appears** in `screen_list` (returns `True`/`False`).
  - **Converts `True` to 1 and `False` to 0**, creating a binary feature.
- Example: If `screen_list` contains `'ProductList,ShoppingCart'`, then:
  - `df_processed['productlist'] = 1`
  - `df_processed['shoppingcart'] = 1`
  - Other screen columns remain `0`.

### **Step 4: Creating Count Features for Each Category**
```python
df_processed['shopping_count'] = df_processed[[s.lower() for s in shopping_screens]].sum(axis=1)
df_processed['cart_count'] = df_processed[[s.lower() for s in cart_screens]].sum(axis=1)
df_processed['engagement_count'] = df_processed[[s.lower() for s in engagement_screens]].sum(axis=1)
df_processed['account_count'] = df_processed[[s.lower() for s in account_screens]].sum(axis=1)
```
- Computes each category's count by summing the binary indicators from **Step 3**.
- Provides a **numerical representation** of how often a user interacts with different types of screens.
- Example: If a user visited `'ProductList'` and `'CategoryBrowse'`, then:
  - `df_processed['shopping_count'] = 2`
  - Other counts remain unchanged.


### **Step 5: Creating an "Other Screens" Category**
```python
all_tracked_screens = shopping_screens + cart_screens + engagement_screens + account_screens
df_processed['other_screens'] = df_processed['screen_list'].apply(
    lambda x: len([s for s in x.split(',') if s and s not in all_tracked_screens])
)
```
- **Identifies screens that do not belong to any predefined category**.
- **`x.split(',')`** → Splits the `screen_list` into individual screen names.
- **List comprehension filters out known screens**, counting only unknown screens.
- The **final count** is stored in `df_processed['other_screens']`.


### **Final Output**
After running this code, `df_processed` will contain:
1. **Binary indicators** (0/1) for each screen name.
2. **Count columns** for each screen category.
3. **A count of "other" screens** that don't fit predefined categories.

This structured approach allows for better **feature engineering** when analyzing user behavior based on screen interactions.

# ______________________Another Explanation__________________________

This code processes a dataset (df_processed) containing user screen interactions and extracts meaningful insights by categorizing and counting screen visits. Let me break it down step by step with a simple example.

⸻

Step-by-Step Explanation:

1. Ensure Consistent Processing by Adding a Comma

df_processed['screen_list'] = df_processed['screen_list'].astype(str) + ','

	•	Ensures that each entry in screen_list (a list of screens visited by a user) always ends with a comma.
	•	This makes it easier to search for screen names later.

📌 Example Input (df_processed['screen_list']):

User  | screen_list
------|--------------------------------------
A     | "ProductList,ShoppingCart,WishList"
B     | "Account,OrderTracking,PaymentMethods"

📌 After Processing (df_processed['screen_list']):

User  | screen_list
------|----------------------------------------------
A     | "ProductList,ShoppingCart,WishList,"
B     | "Account,OrderTracking,PaymentMethods,"



⸻

2. Define Categories of Screens

- shopping_screens = ['ProductList', 'ProductDetail', 'CategoryBrowse', 'Search']

- cart_screens = ['ShoppingCart', 'Checkout', 'PaymentMethods', 'DeliveryOptions']
- engagement_screens = ['WishList', 'Reviews', 'Promotions']
-  = ['Account', 'AddressBook', 'OrderTracking']

	•	Defines different screen categories based on their purpose.
	•	Example:
	•	Shopping-related screens: ProductList, Search
	•	Cart-related screens: ShoppingCart, Checkout
	•	Engagement screens: WishList, Reviews
	•	Account-related screens: Account, OrderTracking

⸻

3. Create Binary Indicators for Each Screen

for screen in (shopping_screens + cart_screens + engagement_screens + account_screens):
    df_processed[screen.lower()] = df_processed['screen_list'].str.contains(screen).astype(int)

	•	Creates a new column for each screen name, setting 1 if the user visited that screen, else 0.

📌 Example Output:

User  | screen_list                           | productlist | shoppingcart | wishlist | account | ordertracking
------|---------------------------------------|-------------|-------------|----------|---------|--------------
A     | "ProductList,ShoppingCart,WishList," | 1           | 1           | 1        | 0       | 0
B     | "Account,OrderTracking,PaymentMethods," | 0           | 0           | 0        | 1       | 1



⸻

4. Count Total Visits for Each Category

df_processed['shopping_count'] = df_processed[[s.lower() for s in shopping_screens]].sum(axis=1)
df_processed['cart_count'] = df_processed[[s.lower() for s in cart_screens]].sum(axis=1)
df_processed['engagement_count'] = df_processed[[s.lower() for s in engagement_screens]].sum(axis=1)
df_processed['account_count'] = df_processed[[s.lower() for s in account_screens]].sum(axis=1)

	•	Counts how many screens a user visited in each category.

📌 Example Output:

User  | shopping_count | cart_count | engagement_count | account_count
------|---------------|------------|------------------|--------------
A     | 1             | 1          | 1                | 0
B     | 0             | 1          | 0                | 2



⸻

5. Identify “Other” Screens Not in Defined Categories

all_tracked_screens = shopping_screens + cart_screens + engagement_screens + account_screens
df_processed['other_screens'] = df_processed['screen_list'].apply(
    lambda x: len([s for s in x.split(',') if s and s not in all_tracked_screens])
)

	•	Counts how many screens the user visited that are NOT part of the predefined categories.

📌 Example:

User  | screen_list                                  | other_screens
------|--------------------------------------------|--------------
A     | "ProductList,ShoppingCart,WishList,Home"  | 1
B     | "Account,OrderTracking,PaymentMethods,FAQ" | 1

	•	User A visited Home, which is not in the predefined categories, so other_screens = 1.
	•	User B visited FAQ, which is also not tracked, so other_screens = 1.

⸻

Final Output Table

User  | screen_list                            | productlist | shoppingcart | wishlist | account | ordertracking | shopping_count | cart_count | engagement_count | account_count | other_screens
------|----------------------------------------|-------------|-------------|----------|---------|--------------|---------------|------------|------------------|--------------|--------------
A     | "ProductList,ShoppingCart,WishList,Home"  | 1           | 1           | 1        | 0       | 0            | 1             | 1          | 1                | 0            | 1
B     | "Account,OrderTracking,PaymentMethods,FAQ" | 0           | 0           | 0        | 1       | 1            | 0             | 1          | 0                | 2            | 1



⸻

Summary

This script extracts screen visit data from screen_list, categorizes screens, and counts visits per category.
It helps analyze user behavior by showing:
✅ Which types of screens users visit most.
✅ How often users visit shopping, cart, engagement, or account-related screens.
✅ Any screens that do not fit into predefined categories.

Would you like any modifications or further explanations? 🚀

In [31]:
# 3. Screen List Processing
# Add comma for consistent processing
df_processed['screen_list'] = df_processed['screen_list'].astype(str) + ','

# Define screen categories
shopping_screens = ['ProductList', 'ProductDetail', 'CategoryBrowse', 'Search']
cart_screens = ['ShoppingCart', 'Checkout', 'PaymentMethods', 'DeliveryOptions']
engagement_screens = ['WishList', 'Reviews', 'Promotions']
account_screens = ['Account', 'AddressBook', 'OrderTracking']

# Create binary indicators for each screen
for screen in (shopping_screens + cart_screens + engagement_screens + account_screens):
    df_processed[screen.lower()] = df_processed['screen_list'].str.contains(screen).astype(int)

# Create count features for each category
df_processed['shopping_count'] = df_processed[[s.lower() for s in shopping_screens]].sum(axis=1)
df_processed['cart_count'] = df_processed[[s.lower() for s in cart_screens]].sum(axis=1)
df_processed['engagement_count'] = df_processed[[s.lower() for s in engagement_screens]].sum(axis=1)
df_processed['account_count'] = df_processed[[s.lower() for s in account_screens]].sum(axis=1)

# Create Other category
all_tracked_screens = shopping_screens + cart_screens + engagement_screens + account_screens
df_processed['other_screens'] = df_processed['screen_list'].apply(
    lambda x: len([s for s in x.split(',') if s and s not in all_tracked_screens])
)



In [32]:
pd.set_option('display.max_columns', None)
pd.set_option('display.width', 200000)
df_processed

Unnamed: 0,user_id,first_visit_date,age,platform,screen_list,session_count,total_screens_viewed,used_search_feature,wrote_review,added_to_wishlist,made_purchase,purchase_date,user_segment,region,acquisition_channel,app_version,time_to_purchase,purchase_24h,hour,dayofweek,is_weekend,productlist,productdetail,categorybrowse,search,shoppingcart,checkout,paymentmethods,deliveryoptions,wishlist,reviews,promotions,account,addressbook,ordertracking,shopping_count,cart_count,engagement_count,account_count,other_screens
0,100000,2023-09-19,66,Android,"CategoryBrowse,Search,WishList,ReturnPolicy,Pr...",5,10,1,1,0,1,2023-09-19 01:00:00,Senior Buyer,Europe,Referral,2.0.8,1.0,1,0,1,0,0,1,1,1,0,0,0,0,1,0,0,1,0,0,3,0,1,1,1
1,100001,2023-04-29,68,iOS,"OrderTracking,Promotions,Notifications,Product...",8,39,0,0,0,0,NaT,Senior Browser,Latin America,Organic Search,2.0.5,,0,0,5,1,1,1,0,1,0,1,1,1,0,0,1,0,0,1,3,3,1,1,2
2,100002,2023-10-20,25,iOS,"ShoppingCart,WishList,ProductList,PaymentMetho...",9,19,1,0,1,0,NaT,Adult Browser,Europe,Social Media,2.0.6,,0,0,4,0,1,0,0,0,1,1,1,1,1,1,0,0,1,0,1,4,2,1,1
3,100003,2023-07-04,39,Android,"Checkout,ColorPicker,OrderTracking,WishList,Or...",8,47,0,0,0,0,NaT,Adult Browser,Asia Pacific,Social Media,2.2.1,,0,0,1,0,1,0,0,0,0,1,0,0,1,0,1,1,0,1,1,1,2,2,3
4,100004,2023-01-02,28,iOS,"ColorPicker,ShoppingCart,Reviews,Account,Categ...",9,29,0,1,1,0,NaT,Adult Browser,Latin America,Paid Search,2.2.8,,0,0,0,0,1,0,1,0,1,1,0,0,0,1,0,1,0,1,2,2,1,2,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,109995,2023-12-04,51,iOS,"DeliveryOptions,Notifications,ProductDetail,Co...",9,22,1,1,0,0,NaT,Senior Browser,North America,Paid Search,2.2.9,,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,1,1,0,0,2
9996,109996,2023-05-01,28,Android,"Reviews,Search,SizeGuide,SizeGuide,Account,",2,11,1,1,1,1,2023-05-01 17:00:00,Adult Buyer,Latin America,Organic Search,2.1.1,17.0,1,0,0,0,0,0,0,1,0,0,0,0,0,1,0,1,0,0,1,0,1,1,2
9997,109997,2023-02-23,56,iOS,"DeliveryOptions,ProductList,Checkout,AddressBo...",4,23,1,0,0,1,2023-02-23 01:00:00,Senior Buyer,Europe,Social Media,2.1.1,1.0,1,0,3,0,1,0,0,0,0,1,0,1,0,0,1,1,1,1,1,2,1,3,2
9998,109998,2023-07-08,33,iOS,"StoreLocator,ProductList,CustomerService,Shopp...",8,13,1,0,0,0,NaT,Adult Browser,Asia Pacific,Organic Search,1.1.3,,0,0,5,1,1,0,0,0,1,0,0,0,0,0,0,0,0,0,1,1,0,0,2


In [33]:
print(df_processed.columns)

Index(['user_id', 'first_visit_date', 'age', 'platform', 'screen_list', 'session_count', 'total_screens_viewed', 'used_search_feature', 'wrote_review', 'added_to_wishlist', 'made_purchase', 'purchase_date', 'user_segment', 'region', 'acquisition_channel', 'app_version', 'time_to_purchase', 'purchase_24h', 'hour', 'dayofweek', 'is_weekend', 'productlist', 'productdetail', 'categorybrowse', 'search', 'shoppingcart', 'checkout', 'paymentmethods', 'deliveryoptions', 'wishlist', 'reviews', 'promotions', 'account', 'addressbook', 'ordertracking', 'shopping_count', 'cart_count', 'engagement_count', 'account_count', 'other_screens'], dtype='object')


In [34]:
df_processed[['user_id',	'first_visit_date',	'age','added_to_wishlist', 'made_purchase', 'purchase_date', 'shoppingcart', 'checkout', 'paymentmethods', 'deliveryoptions', 'wishlist', 'reviews', 'promotions', 'account']]

Unnamed: 0,user_id,first_visit_date,age,added_to_wishlist,made_purchase,purchase_date,shoppingcart,checkout,paymentmethods,deliveryoptions,wishlist,reviews,promotions,account
0,100000,2023-09-19,66,0,1,2023-09-19 01:00:00,0,0,0,0,1,0,0,1
1,100001,2023-04-29,68,0,0,NaT,0,1,1,1,0,0,1,0
2,100002,2023-10-20,25,1,0,NaT,1,1,1,1,1,1,0,0
3,100003,2023-07-04,39,0,0,NaT,0,1,0,0,1,0,1,1
4,100004,2023-01-02,28,1,0,NaT,1,1,0,0,0,1,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,109995,2023-12-04,51,0,0,NaT,0,0,0,1,0,0,0,0
9996,109996,2023-05-01,28,1,1,2023-05-01 17:00:00,0,0,0,0,0,1,0,1
9997,109997,2023-02-23,56,0,1,2023-02-23 01:00:00,0,1,0,1,0,0,1,1
9998,109998,2023-07-08,33,0,0,NaT,1,0,0,0,0,0,0,0


### Feature Engineering Description

1. **Create engagement score**  
   - Constructs an `engagement_score` metric by combining various user activity features.  
   - Each feature is weighted based on its importance:  
     - `session_count` (30%)  
     - `used_search_feature` (20%)  
     - `wrote_review` (15%)  
     - `added_to_wishlist` (15%)  
     - `total_screens_viewed` (20%)  
   - **Formula**:  
     $$
     \text{engagement\_score} = 
     (\text{session\_count} \times 0.3) + 
     (\text{used\_search\_feature} \times 0.2) + 
     (\text{wrote\_review} \times 0.15) + 
     (\text{added\_to\_wishlist} \times 0.15) + 
     (\text{total\_screens\_viewed} \times 0.2)
     $$

2. **Create screen diversity score**  
   - Measures the diversity of user interactions across different types of screens.  
   - Checks whether each of the following screen-related features is greater than zero:  
     - `shopping_count`  
     - `cart_count`  
     - `engagement_count`  
     - `account_count`  
   - Sums the number of screens interacted with, giving a score for screen diversity.  
   - **Formula**:  
     $$
     \text{screen\_diversity} = \text{Count of non-zero values in } [\text{shopping\_count, cart\_count, engagement\_count, account\_count}]
     $$

3. **Create purchase intent score**  
   - Constructs a `purchase_intent` score to estimate the likelihood of a user making a purchase.  
   - Combines features related to purchase behaviors with specific weights:  
     - `cart_count` (40%)  
     - `shopping_count` (30%)  
     - `engagement_count` (20%)  
     - `added_to_wishlist` (10%)  
   - **Formula**:  
     $$
     \text{purchase\_intent} = 
     (\text{cart\_count} \times 0.4) + 
     (\text{shopping\_count} \times 0.3) + 
     (\text{engagement\_count} \times 0.2) + 
     (\text{added\_to\_wishlist} \times 0.1)
     $$

# MOre explaination

This code is performing Feature Engineering, which means it is creating new numerical features based on existing ones to better understand user behavior. Below is a step-by-step explanation with examples.

⸻

1. Creating the Engagement Score

df_processed['engagement_score'] = (
    df_processed['session_count'] * 0.3 +
    df_processed['used_search_feature'] * 0.2 +
    df_processed['wrote_review'] * 0.15 +
    df_processed['added_to_wishlist'] * 0.15 +
    df_processed['total_screens_viewed'] * 0.2
)

What does this do?
	•	It calculates an engagement score by weighting different user activities that indicate engagement.
	•	More engaged users:
	•	Have more sessions (session_count)
	•	Use the search feature
	•	Write reviews
	•	Add products to their wishlist
	•	View more screens

📌 Example Data Before Processing

User  | session_count | used_search_feature | wrote_review | added_to_wishlist | total_screens_viewed
------|--------------|---------------------|--------------|--------------------|----------------------
A     | 5            | 1                   | 0            | 2                  | 10
B     | 2            | 0                   | 1            | 1                  | 5

📌 Engagement Score Calculation
For User A:

(5 * 0.3) + (1 * 0.2) + (0 * 0.15) + (2 * 0.15) + (10 * 0.2)
= 1.5 + 0.2 + 0 + 0.3 + 2.0
= **4.0**

For User B:

(2 * 0.3) + (0 * 0.2) + (1 * 0.15) + (1 * 0.15) + (5 * 0.2)
= 0.6 + 0 + 0.15 + 0.15 + 1.0
= **1.9**

📌 Final Data After Processing

User  | session_count | used_search_feature | wrote_review | added_to_wishlist | total_screens_viewed | engagement_score
------|--------------|---------------------|--------------|--------------------|----------------------|-----------------
A     | 5            | 1                   | 0            | 2                  | 10                   | 4.0
B     | 2            | 0                   | 1            | 1                  | 5                    | 1.9

✅ Higher scores mean the user is more engaged.

⸻

2. Creating the Screen Diversity Score

df_processed['screen_diversity'] = (
    df_processed[['shopping_count', 'cart_count', 
                 'engagement_count', 'account_count']].gt(0).sum(axis=1)
)

What does this do?
	•	This measures how many different screen categories a user interacted with.
	•	If a user has visited at least one screen from a category, it is counted.

📌 Example Data

User  | shopping_count | cart_count | engagement_count | account_count
------|---------------|------------|------------------|--------------
A     | 2             | 1          | 1                | 0
B     | 0             | 1          | 0                | 1

📌 Screen Diversity Calculation
For User A:
	•	They visited Shopping, Cart, and Engagement categories (3 categories).
	•	Screen Diversity Score = 3.

For User B:
	•	They visited Cart and Account categories (2 categories).
	•	Screen Diversity Score = 2.

📌 Final Output

User  | shopping_count | cart_count | engagement_count | account_count | screen_diversity
------|---------------|------------|------------------|--------------|-----------------
A     | 2             | 1          | 1                | 0            | 3
B     | 0             | 1          | 0                | 1            | 2

✅ Higher values mean a user explored a variety of features in the app.

⸻

3. Creating the Purchase Intent Score

df_processed['purchase_intent'] = (
    df_processed['cart_count'] * 0.4 +
    df_processed['shopping_count'] * 0.3 +
    df_processed['engagement_count'] * 0.2 +
    df_processed['added_to_wishlist'] * 0.1
)

What does this do?
	•	This calculates how likely a user is to make a purchase.
	•	Higher weight for “cart_count” (40%) because users who add items to their cart are closer to buying.
	•	“Shopping” screens (30%) show product interest.
	•	Engagement (20%) and Wishlist (10%) indicate user intent.

📌 Example Data

User  | shopping_count | cart_count | engagement_count | added_to_wishlist
------|---------------|------------|------------------|--------------------
A     | 2             | 3          | 1                | 2
B     | 1             | 1          | 0                | 1

📌 Purchase Intent Calculation
For User A:

(3 * 0.4) + (2 * 0.3) + (1 * 0.2) + (2 * 0.1)
= 1.2 + 0.6 + 0.2 + 0.2
= **2.2**

For User B:

(1 * 0.4) + (1 * 0.3) + (0 * 0.2) + (1 * 0.1)
= 0.4 + 0.3 + 0 + 0.1
= **0.8**

📌 Final Output

User  | shopping_count | cart_count | engagement_count | added_to_wishlist | purchase_intent
------|---------------|------------|------------------|--------------------|----------------
A     | 2             | 3          | 1                | 2                  | 2.2
B     | 1             | 1          | 0                | 1                  | 0.8

✅ Higher values indicate higher likelihood of making a purchase.

⸻

Final Summary

Feature	What It Measures	Example Values
Engagement Score	How engaged a user is (sessions, reviews, wishlist, etc.)	4.0 (high) vs. 1.9 (low)
Screen Diversity	How many types of screens a user visited	3 (high diversity) vs. 2 (low diversity)
Purchase Intent	How likely a user is to buy (cart, shopping, wishlist, etc.)	2.2 (high intent) vs. 0.8 (low intent)

🔹 High “engagement_score” → Active user
🔹 High “screen_diversity” → Exploring different features
🔹 High “purchase_intent” → More likely to buy something

Would you like to add more weights or new features to improve predictions? 🚀

In [35]:
# 4. Feature Engineering
# Create engagement score
df_processed['engagement_score'] = (
    df_processed['session_count'] * 0.3 +
    df_processed['used_search_feature'] * 0.2 +
    df_processed['wrote_review'] * 0.15 +
    df_processed['added_to_wishlist'] * 0.15 +
    df_processed['total_screens_viewed'] * 0.2
)

# Create screen diversity score
df_processed['screen_diversity'] = (
    df_processed[['shopping_count', 'cart_count', 
                 'engagement_count', 'account_count']].gt(0).sum(axis=1)
)

# Create purchase intent score
df_processed['purchase_intent'] = (
    df_processed['cart_count'] * 0.4 +
    df_processed['shopping_count'] * 0.3 +
    df_processed['engagement_count'] * 0.2 +
    df_processed['added_to_wishlist'] * 0.1
)



# Categorical Feature Processing Documentation : HW

## Overview
This document explains the step-by-step process of transforming categorical features in our retail app dataset into numerical formats suitable for machine learning models.

## Processing Steps

### 1. Platform Encoding
```python
df_processed['platform'] = df_processed['platform'].map({'iOS': 1, 'Android': 0})
```

**Purpose**: Convert binary categorical platform data into numerical format
- iOS → 1
- Android → 0

**Rationale**: Simple binary encoding is used because:
- Only two categories exist
- No ordinal relationship needs to be preserved
- Maintains interpretability without adding complexity

### 2. Region Encoding
```python
region_dummies = pd.get_dummies(df_processed['region'], prefix='region')
df_processed = pd.concat([df_processed, region_dummies], axis=1)
```

**Purpose**: Transform region categories into binary columns using one-hot encoding

**Example Output**:
```
region_north   region_south   region_east   region_west
     1              0              0              0    # North
     0              1              0              0    # South
     0              0              1              0    # East
     0              0              0              1    # West
```

**Rationale**:
- Avoids arbitrary numerical rankings
- Preserves categorical nature of the data
- Allows model to treat each region as a distinct category

### 3. Acquisition Channel Encoding
```python
channel_dummies = pd.get_dummies(df_processed['acquisition_channel'], prefix='channel')
df_processed = pd.concat([df_processed, channel_dummies], axis=1)
```

**Purpose**: Convert acquisition channel categories into binary features

**Example Output**:
```
channel_organic   channel_paid   channel_referral
       1                0                0        # Organic
       0                1                0        # Paid
       0                0                1        # Referral
```

**Rationale**:
- Maintains categorical distinction between channels
- Prevents implicit ordinal relationships
- Creates model-compatible numerical features

### 4. User Segment Processing
```python
# Split combined information
df_processed['age_group'] = df_processed['user_segment'].apply(lambda x: x.split()[0])
df_processed['user_type'] = df_processed['user_segment'].apply(lambda x: ' '.join(x.split()[1:]))

# One-hot encode both components
age_group_dummies = pd.get_dummies(df_processed['age_group'], prefix='age_group')
user_type_dummies = pd.get_dummies(df_processed['user_type'], prefix='user_type')
df_processed = pd.concat([df_processed, age_group_dummies, user_type_dummies], axis=1)
```

**Purpose**: 
- Separate compound user segment information
- Create distinct features for age group and user type
- Convert both into binary features

**Rationale**:
- Improves feature granularity
- Enhances model interpretability
- Allows independent analysis of age groups and user types

### 5. App Version Processing
```python
# Extract major version
df_processed['app_major_version'] = df_processed['app_version'].apply(lambda x: int(x.split('.')[0]))

# Create version recency score
df_processed['version_score'] = df_processed['app_version'].apply(
    lambda x: sum(float(i) / (10**n) for n, i in enumerate(x.split('.')))
)
```

**Purpose**: Convert semantic versioning (X.Y.Z) into numerical features

**Approach**:
1. Extract major version number (X)
2. Calculate version score where:
   - Major version has highest weight
   - Minor version has medium weight
   - Patch version has lowest weight

**Rationale**:
- Preserves version hierarchy
- Creates numerical representation of version recency
- Maintains interpretability

### 6. Final Dataset Preparation
```python
# Remove original and redundant columns
columns_to_drop = [
    'screen_list', 'purchase_date', 'first_visit_date', 
    'time_to_purchase', 'made_purchase', 'region', 
    'acquisition_channel', 'user_segment', 'app_version',
    'age_group', 'user_type'
]
df_processed = df_processed.drop(columns=columns_to_drop)

# Standardize column names
df_processed.columns = df_processed.columns.str.lower()
```

**Purpose**: Clean up and finalize the processed dataset

**Actions**:
- Remove original categorical columns
- Remove redundant features
- Standardize column naming convention

## Final Output
The processed dataset now contains:
- Binary platform encoding
- One-hot encoded regions
- One-hot encoded acquisition channels
- Separated and encoded user segments
- Numerical app version features
- All lowercase column names

All features are now in numerical format, ready for machine learning model training.

In [36]:
# 5. Categorical Feature Processing
# Platform encoding (keeping existing)
df_processed['platform'] = df_processed['platform'].map({'iOS': 1, 'Android': 0})

# Process new categorical columns
# Region encoding
region_dummies = pd.get_dummies(df_processed['region'], prefix='region')
df_processed = pd.concat([df_processed, region_dummies], axis=1)

# Acquisition channel encoding
channel_dummies = pd.get_dummies(df_processed['acquisition_channel'], prefix='channel')
df_processed = pd.concat([df_processed, channel_dummies], axis=1)

# User segment processing
# Extract age group and user type separately for more granular analysis
df_processed['age_group'] = df_processed['user_segment'].apply(lambda x: x.split()[0])
df_processed['user_type'] = df_processed['user_segment'].apply(lambda x: ' '.join(x.split()[1:]))

age_group_dummies = pd.get_dummies(df_processed['age_group'], prefix='age_group')
user_type_dummies = pd.get_dummies(df_processed['user_type'], prefix='user_type')
df_processed = pd.concat([df_processed, age_group_dummies, user_type_dummies], axis=1)

# App version processing
# Extract major version for simplified analysis
df_processed['app_major_version'] = df_processed['app_version'].apply(lambda x: int(x.split('.')[0]))

# Create version recency score (higher = newer version)
df_processed['version_score'] = df_processed['app_version'].apply(
    lambda x: sum(float(i)/(10**n) for n, i in enumerate(x.split('.')))
)

# 6. Clean up and prepare final dataset
# Drop original columns that have been processed
columns_to_drop = [
    'screen_list', 'purchase_date', 'first_visit_date', 
    'time_to_purchase', 'made_purchase', 'region', 
    'acquisition_channel', 'user_segment', 'app_version',
    'age_group', 'user_type'
]
df_processed = df_processed.drop(columns=columns_to_drop)

# Ensure all column names are lowercase
df_processed.columns = df_processed.columns.str.lower()



# Quality Checks - Explanation

The quality check step performs three essential verifications:

## Shape Check
```python
print(f"Shape: {df_processed.shape}")
```
This prints the dimensions of our processed dataframe, showing the number of rows and columns. It's crucial for verifying that our transformations haven't unexpectedly altered the dataset size.

## Missing Values Check
```python
print(f"\nNull values:\n{df_processed.isnull().sum()[df_processed.isnull().sum() > 0]}")
```
This identifies any columns containing missing values and shows their count. We need this to ensure all data gaps are handled before model training.

## Purchase Rate Check
```python
print(f"\nPurchase rate (24h): {df_processed['purchase_24h'].mean():.2%}")
```
This calculates the percentage of users who made a purchase within 24 hours. The calculation uses the mean of the 'purchase_24h' column, multiplied by 100 for a percentage value. This gives us insight into conversion rates and helps in performance analysis.

In [37]:
# 7. Quality Checks
print("Data Quality Report")
print("-" * 50)
print(f"Shape: {df_processed.shape}")
print(f"\nNull values:\n{df_processed.isnull().sum()[df_processed.isnull().sum() > 0]}")
print(f"\nPurchase rate (24h): {df_processed['purchase_24h'].mean():.2%}")



Data Quality Report
--------------------------------------------------
Shape: (10000, 52)

Null values:
Series([], dtype: int64)

Purchase rate (24h): 43.73%


In [38]:
# 8. Feature Correlations
correlation_matrix = df_processed.corr()['purchase_24h'].sort_values(ascending=False)
print("\nTop 10 Features by Correlation with Purchase:")
print(correlation_matrix[:10])




Top 10 Features by Correlation with Purchase:
purchase_24h            1.000000
user_type_buyer         0.625666
user_type_power user    0.316745
added_to_wishlist       0.146355
session_count           0.135707
used_search_feature     0.084549
engagement_score        0.042127
dayofweek               0.020391
wrote_review            0.019688
region_middle east      0.018871
Name: purchase_24h, dtype: float64


In [39]:
# 9. Save processed dataset
df_processed.to_csv('retail_processed.csv', index=False)
print("\nProcessed data saved to 'retail_processed_enhanced.csv'")




Processed data saved to 'retail_processed_enhanced.csv'


In [40]:
# Display sample of final dataset
print("\nSample of processed dataset:")
df_processed.head()


Sample of processed dataset:


Unnamed: 0,user_id,age,platform,session_count,total_screens_viewed,used_search_feature,wrote_review,added_to_wishlist,purchase_24h,hour,dayofweek,is_weekend,productlist,productdetail,categorybrowse,search,shoppingcart,checkout,paymentmethods,deliveryoptions,wishlist,reviews,promotions,account,addressbook,ordertracking,shopping_count,cart_count,engagement_count,account_count,other_screens,engagement_score,screen_diversity,purchase_intent,region_asia pacific,region_europe,region_latin america,region_middle east,region_north america,channel_email,channel_organic search,channel_paid search,channel_referral,channel_social media,age_group_adult,age_group_senior,age_group_young,user_type_browser,user_type_buyer,user_type_power user,app_major_version,version_score
0,100000,66,0,5,10,1,1,0,1,0,1,0,0,1,1,1,0,0,0,0,1,0,0,1,0,0,3,0,1,1,1,3.85,3,1.1,False,True,False,False,False,False,False,False,True,False,False,True,False,False,True,False,2,2.08
1,100001,68,1,8,39,0,0,0,0,0,5,1,1,1,0,1,0,1,1,1,0,0,1,0,0,1,3,3,1,1,2,10.2,4,2.3,False,False,True,False,False,False,True,False,False,False,False,True,False,True,False,False,2,2.05
2,100002,25,1,9,19,1,0,1,0,0,4,0,1,0,0,0,1,1,1,1,1,1,0,0,1,0,1,4,2,1,1,6.85,4,2.4,False,True,False,False,False,False,False,False,False,True,True,False,False,True,False,False,2,2.06
3,100003,39,0,8,47,0,0,0,0,0,1,0,1,0,0,0,0,1,0,0,1,0,1,1,0,1,1,1,2,2,3,11.8,4,1.1,True,False,False,False,False,False,False,False,False,True,True,False,False,True,False,False,2,2.21
4,100004,28,1,9,29,0,1,1,0,0,0,0,1,0,1,0,1,1,0,0,0,1,0,1,0,1,2,2,1,2,3,8.8,4,1.7,False,False,True,False,False,False,False,True,False,False,True,False,False,True,False,False,2,2.28


In [41]:
df_processed.columns

Index(['user_id', 'age', 'platform', 'session_count', 'total_screens_viewed', 'used_search_feature', 'wrote_review', 'added_to_wishlist', 'purchase_24h', 'hour', 'dayofweek', 'is_weekend', 'productlist', 'productdetail', 'categorybrowse', 'search', 'shoppingcart', 'checkout', 'paymentmethods', 'deliveryoptions', 'wishlist', 'reviews', 'promotions', 'account', 'addressbook', 'ordertracking', 'shopping_count', 'cart_count', 'engagement_count', 'account_count', 'other_screens', 'engagement_score', 'screen_diversity', 'purchase_intent', 'region_asia pacific', 'region_europe', 'region_latin america', 'region_middle east', 'region_north america', 'channel_email', 'channel_organic search', 'channel_paid search', 'channel_referral', 'channel_social media', 'age_group_adult', 'age_group_senior', 'age_group_young', 'user_type_browser', 'user_type_buyer', 'user_type_power user', 'app_major_version', 'version_score'], dtype='object')

In [42]:
df_processed.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 52 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   user_id                 10000 non-null  int64  
 1   age                     10000 non-null  int64  
 2   platform                10000 non-null  int64  
 3   session_count           10000 non-null  int64  
 4   total_screens_viewed    10000 non-null  int64  
 5   used_search_feature     10000 non-null  int64  
 6   wrote_review            10000 non-null  int64  
 7   added_to_wishlist       10000 non-null  int64  
 8   purchase_24h            10000 non-null  int64  
 9   hour                    10000 non-null  int32  
 10  dayofweek               10000 non-null  int32  
 11  is_weekend              10000 non-null  int64  
 12  productlist             10000 non-null  int64  
 13  productdetail           10000 non-null  int64  
 14  categorybrowse          10000 non-null 

In [43]:
df_processed.shape

(10000, 52)