## Importing the libraries

In [1]:
import pandas as pd

## Data Preprocessing

In [2]:
users = pd.read_csv('../Fake_Data/user_data.csv')
products = pd.read_csv('../Fake_Data/product_data.csv')
behavior = pd.read_csv('../Fake_Data/user_behavior_data.csv')
ratings = pd.read_csv('../Fake_Data/user_ratings.csv')

In [3]:
# Find the missing values:
users.isnull().sum()
products.isnull().sum()
ratings.isnull().sum()
behavior.isnull().sum()     # Missing values in the 'purchase_timestamp' column

user_id                  0
product_id               0
view_timestamp           0
purchase_timestamp    3502
dtype: int64

In [4]:
# Combining data into a single csv file
combined_data = behavior.merge(users, on='user_id', how='outer')
combined_data = combined_data.merge(products, on='product_id', how='outer')
combined_data = combined_data.merge(ratings, on=['user_id', 'product_id'], how='outer')

combined_data.isnull().sum()    # Verify missing date

user_id                  5
product_id               5
view_timestamp        1998
purchase_timestamp    5500
age                   1993
gender                1993
location              1993
join_date             1993
category              1993
price                 1993
brand                 1993
avg_rating            1993
rating                4997
timestamp             4997
dtype: int64

# Filling Missing Values for Numeric Variables 

In [5]:
# Age column filling:
mean_age = round(combined_data['age'].mean())    # Calculate the mean of 'age' and round it to the nearest integer
combined_data['age'].fillna(mean_age, inplace=True)     # Fill missing 'age' values with the rounded mean and ensure the type is integer
combined_data['age'] = combined_data['age'].astype(int)     # Ensure the entire column is of integer type

# Price column filling:
combined_data['price'].fillna(
    combined_data['price'].median(), inplace=True
)

# avg_rating column filling:
combined_data['avg_rating'].fillna(
    combined_data['avg_rating'].median(), inplace=True
)

# Rating column filling:
combined_data['rating'].fillna(
    combined_data['rating'].median(), inplace=True
)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  combined_data['age'].fillna(mean_age, inplace=True)     # Fill missing 'age' values with the rounded mean and ensure the type is integer
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  combined_data['price'].fillna(
The behavior will change in pandas 3.0. This inplace method will

# Filling Missing Values for Categorical Variables

In [6]:
# Gender column filling:
combined_data['gender'].fillna(combined_data['gender'].mode()[0], inplace=True)

# Location column filling:
# combined_data['location'].fillna('Unknown', inplace=True)
combined_data['location'].fillna(combined_data['location'].mode()[0], inplace=True)

# Category column filling:
# combined_data['category'].fillna('Unknown', inplace=True)
combined_data['category'].fillna(combined_data['category'].mode()[0], inplace=True)

# Brand column filling:
# combined_data['brand'].fillna('Unknown', inplace=True)
combined_data['brand'].fillna(combined_data['brand'].mode()[0], inplace=True)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  combined_data['gender'].fillna(combined_data['gender'].mode()[0], inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  combined_data['location'].fillna(combined_data['location'].mode()[0], inplace=True)
The behavior will change in pandas 3.0. This inplace method will nev

In [7]:
# Adding a new column for indicating purchases 
combined_data["purchase_made"] = combined_data["purchase_timestamp"].notnull().astype(int)
combined_data['purchase_timestamp'].fillna(value='No Purchase', inplace=True)
combined_data.iloc[:, 14:]     # 0 means user not buy product, 1 means user busy

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  combined_data['purchase_timestamp'].fillna(value='No Purchase', inplace=True)


Unnamed: 0,purchase_made
0,0
1,0
2,1
3,0
4,0
...,...
6993,0
6994,0
6995,0
6996,0


In [8]:
final_data = combined_data[
    ['user_id', 'product_id', 'location', 'category', 'brand', 'gender', 'price', 'age', 'avg_rating', 'rating', 'purchase_made']
]
final_data.dropna(inplace=True)
final_data

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  final_data.dropna(inplace=True)


Unnamed: 0,user_id,product_id,location,category,brand,gender,price,age,avg_rating,rating,purchase_made
0,1001.0,2011.0,East Lawrencefurt,Electronics,Chapman Ltd,M,709.14,56,4.7,3.0,0
1,1001.0,2062.0,West Deborah,Clothing,"Anderson, Soto and Dixon",M,503.12,44,2.9,1.0,0
2,1001.0,2358.0,East Lawrencefurt,Clothing,"Mccormick, Boone and Hubbard",M,612.73,56,4.2,3.0,1
3,1001.0,2553.0,East Lawrencefurt,Sports,Duarte PLC,M,981.08,56,1.5,3.0,0
4,1001.0,2555.0,East Lawrencefurt,Automotive,Pruitt Group,M,864.23,56,4.3,3.0,0
...,...,...,...,...,...,...,...,...,...,...,...
6988,2000.0,2455.0,Jonesport,Toys,White LLC,M,603.07,55,4.9,3.0,1
6989,2000.0,2498.0,Jonesport,Sports,Wilson-Pacheco,M,604.93,55,2.0,3.0,1
6990,2000.0,2529.0,Jonesport,Clothing,Lee Inc,M,880.91,55,4.2,3.0,0
6991,2000.0,2689.0,Jonesport,Electronics,Hess-Mullins,M,892.41,55,1.4,3.0,0


In [9]:
final_data.isnull().sum()   # Verify missing date

user_id          0
product_id       0
location         0
category         0
brand            0
gender           0
price            0
age              0
avg_rating       0
rating           0
purchase_made    0
dtype: int64

In [10]:
final_data.to_csv(
    path_or_buf='../data/processed/final_data.csv', index=False
)