## Importing the libraries

In [20]:
from datetime import datetime
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler

## Data Preprocessing

In [7]:
users = pd.read_csv('../Fake_Data/user_data.csv')
products = pd.read_csv('../Fake_Data/product_data.csv')
behavior = pd.read_csv('../Fake_Data/user_behavior_data.csv')
ratings = pd.read_csv('../Fake_Data/user_ratings.csv')

In [8]:
# Find the missing values:
users.isnull().sum()
products.isnull().sum()
ratings.isnull().sum()
behavior.isnull().sum()     # Missing values in the 'purchase_timestamp' column

user_id                  0
product_id               0
view_timestamp           0
purchase_timestamp    3502
dtype: int64

In [9]:
# Adding a new column for indicating purchases 
behavior["purchase_made"] = behavior["purchase_timestamp"].notnull().astype(int)
behavior['purchase_timestamp'].fillna(value='No Purchase', inplace=True)
behavior.iloc[:,3:]     # 0 means user not buy product, 1 means user busy

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  behavior['purchase_timestamp'].fillna(value='No Purchase', inplace=True)


Unnamed: 0,purchase_timestamp,purchase_made
0,No Purchase,0
1,No Purchase,0
2,No Purchase,0
3,2023-12-06 12:40:09,1
4,2023-09-23 21:58:36,1
...,...,...
4995,2024-03-28 11:36:08,1
4996,No Purchase,0
4997,No Purchase,0
4998,No Purchase,0


In [14]:
# Combining data into a single csv file
combined_data = behavior.merge(users, on='user_id')
combined_data = combined_data.merge(products, on='product_id')
combined_data = combined_data.merge(ratings, on=['user_id', 'product_id'])
combined_data.isnull().sum()    # Verify missing date

user_id               0
product_id            0
view_timestamp        0
purchase_timestamp    0
purchase_made         0
age                   0
gender                0
location              0
join_date             0
category              0
price                 0
brand                 0
avg_rating            0
rating                0
timestamp             0
dtype: int64

In [25]:
final_data = combined_data[
    ['user_id', 'product_id', 'age', 'gender', 'location', 'category', 'price', 'brand', 'avg_rating', 'rating', 'purchase_made']
]
# final_data = pd.get_dummies(
#     final_data, columns=['gender', 'location', 'category', 'brand']
# )
# 
# scaler = StandardScaler()
# final_data[['age', 'price', 'avg_rating', 'rating']] = scaler.fit_transform(final_data[['age', 'price', 'avg_rating', 'rating']])

final_data

Unnamed: 0,user_id,product_id,age,gender,location,category,price,brand,avg_rating,rating,purchase_made
0,1596,2276,67,M,Joneston,Toys,295.76,Pena Ltd,4.6,3,0
1,1655,2678,42,M,Nelsonside,Home & Kitchen,840.14,Wilson-Pacheco,3.5,1,0
2,1996,2100,60,F,Campbellstad,Beauty,466.61,Griffin-Chase,4.7,1,0
3,1575,2029,30,F,Bradyland,Health,412.86,"Mccormick, Boone and Hubbard",2.3,5,0
4,1044,2155,24,F,Nicholasstad,Electronics,658.5,Lee Inc,1.5,3,1
5,1619,2250,49,F,Lake Jeffreyhaven,Health,38.02,York-Barnett,1.1,4,0
6,1963,2762,64,M,New Andrea,Toys,963.68,"Edwards, Robinson and Shannon",4.7,2,0
7,1044,2155,24,F,Nicholasstad,Electronics,658.5,Lee Inc,1.5,3,0
8,1731,2257,66,F,South Timothyside,Sports,666.66,Boyd PLC,2.0,2,0
9,1408,2269,39,F,Matthewmouth,Home & Kitchen,115.17,"Mccormick, Boone and Hubbard",3.6,5,0


In [26]:
final_data.to_csv(
    path_or_buf='../data/processed/final_data.csv', index=False
)