## Importing the libraries

In [1]:
from datetime import datetime
import numpy as np
import pandas as pd

## Data Preprocessing

In [17]:
users = pd.read_csv('Fake_Data/user_data.csv')
products = pd.read_csv('Fake_Data/product_data.csv')
behavior = pd.read_csv('Fake_Data/user_behavior_data.csv')
ratings = pd.read_csv('Fake_Data/user_ratings.csv')

In [18]:
# Find the missing values:
users.isnull().sum()
products.isnull().sum()
ratings.isnull().sum()
behavior.isnull().sum()     # Missing values in the 'purchase_timestamp' column

user_id                  0
product_id               0
view_timestamp           0
purchase_timestamp    3502
dtype: int64

In [19]:
# Adding a new column for indicating purchases 
behavior["purchase_made"] = behavior["purchase_timestamp"].notnull().astype(int)
behavior['purchase_timestamp'].fillna(value='No Purchase', inplace=True)
behavior["pur"]

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  behavior['purchase_timestamp'].fillna(value='No Purchase', inplace=True)


Unnamed: 0,user_id,product_id,view_timestamp,purchase_timestamp,purchase_made
0,1493,2130,2023-12-15 22:54:36,No Purchase,0
1,1634,2871,2023-11-05 02:02:47,No Purchase,0
2,1375,2582,2023-12-18 13:43:28,No Purchase,0
3,1099,2594,2023-12-05 22:24:09,2023-12-06 12:40:09,1
4,1192,2325,2023-09-23 05:58:36,2023-09-23 21:58:36,1
...,...,...,...,...,...
4995,1591,2216,2024-03-28 00:36:08,2024-03-28 11:36:08,1
4996,1356,2843,2024-04-30 02:22:18,No Purchase,0
4997,1579,2275,2024-03-03 05:16:54,No Purchase,0
4998,1451,2420,2024-07-09 13:41:01,No Purchase,0


In [None]:
# 
behavior = behavior.merge(products[['product_id', 'price', 'category']], on='product_id', how='left')
ratings = ratings.merge(products[['product_id', 'avg_rating']], on='product_id', how='left')


### Adding RFM (Recency, Frequency, Monetary) features for user behavior analysis.
- Recency: Number of days since the user's last purchase to the present.
- Frequency: Total number of purchases made by the user within a given period.
- Monetary: Total amount spent by the user based on their purchases.

In [22]:
# Convert 'purchase_timestamp' to datetime objects and calculate recency
behavior['purchase_date'] = pd.to_datetime(behavior['purchase_timestamp'], errors='coerce').dt.date
behavior['recency'] = behavior.groupby('user_id')['purchase_date'].transform(
    lambda x: (
            datetime.now().date() - pd.to_datetime(x).max()
    ).days
)

# Calculate the frequency of purchases for each user
behavior['frequency'] = behavior.groupby('user_id')['purchase_timestamp'].transform('count')

# Enrich with product prices and calculate the monetary value (total amount spent) for each user
behavior = behavior.merge(products[['product_id', 'price']], on='product_id', how='left')
behavior['monetary'] = behavior.groupby('user_id')['price'].transform('sum')


  behavior['purchase_date'] = pd.to_datetime(behavior['purchase_timestamp'], errors='coerce').dt.date


TypeError: unsupported operand type(s) for -: 'datetime.date' and 'Timestamp'