In [6]:
# Load required libraries
import pandas as pd

# Assuming the DataFrame 'df' is loaded from the previous step
df = pd.read_csv('shopping_behavior_updated.csv')

# Check for and handle missing values
print("Missing values per column:")
missing_values = df.isnull().sum()
print(missing_values[missing_values > 0])

# Based on the info output, no missing values are present.
if missing_values.sum() == 0:
    print("\nNo missing values found. Data is clean.")

Missing values per column:
Series([], dtype: int64)

No missing values found. Data is clean.


In [7]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Select continuous numerical columns
continuous_cols = ['Age', 'Purchase Amount (USD)', 'Review Rating', 'Previous Purchases']

print("--- Skewness and Kurtosis BEFORE Transformation ---")
for col in continuous_cols:
    print(f"\n{col}:")
    print(f"  Skewness: {df[col].skew():.4f}")
    print(f"  Kurtosis: {df[col].kurtosis():.4f}")
    
    

print("\nSaved distribution plots BEFORE transformation.")

--- Skewness and Kurtosis BEFORE Transformation ---

Age:
  Skewness: -0.0064
  Kurtosis: -1.1951

Purchase Amount (USD):
  Skewness: 0.0127
  Kurtosis: -1.2366

Review Rating:
  Skewness: 0.0045
  Kurtosis: -1.1796

Previous Purchases:
  Skewness: 0.0031
  Kurtosis: -1.1902

Saved distribution plots BEFORE transformation.


In [8]:
# Identify categorical columns (object dtype)
# We use df.select_dtypes(include='object') to grab all categorical columns
categorical_cols = df.select_dtypes(include='object').columns.tolist()

print(f"Columns to be One-Hot Encoded: {categorical_cols}")

# Apply One-Hot Encoding and drop the first category level for each feature
# This helps prevent multicollinearity (the Dummy Variable Trap).
df_encoded = pd.get_dummies(df, columns=categorical_cols, drop_first=True)

print("\nDataFrame shape BEFORE encoding:", df.shape)
print("DataFrame shape AFTER encoding:", df_encoded.shape)
print("\nFirst 5 rows of encoded data (showing new columns):")
df_encoded

# The resulting DataFrame 'df_encoded' is ready for further processing or modeling.

Columns to be One-Hot Encoded: ['Gender', 'Item Purchased', 'Category', 'Location', 'Size', 'Color', 'Season', 'Subscription Status', 'Shipping Type', 'Discount Applied', 'Promo Code Used', 'Payment Method', 'Frequency of Purchases']

DataFrame shape BEFORE encoding: (3900, 18)
DataFrame shape AFTER encoding: (3900, 131)

First 5 rows of encoded data (showing new columns):


Unnamed: 0,Customer ID,Age,Purchase Amount (USD),Review Rating,Previous Purchases,Gender_Male,Item Purchased_Belt,Item Purchased_Blouse,Item Purchased_Boots,Item Purchased_Coat,...,Payment Method_Credit Card,Payment Method_Debit Card,Payment Method_PayPal,Payment Method_Venmo,Frequency of Purchases_Bi-Weekly,Frequency of Purchases_Every 3 Months,Frequency of Purchases_Fortnightly,Frequency of Purchases_Monthly,Frequency of Purchases_Quarterly,Frequency of Purchases_Weekly
0,1,55,53,3.1,14,True,False,True,False,False,...,False,False,False,True,False,False,True,False,False,False
1,2,19,64,3.1,2,True,False,False,False,False,...,False,False,False,False,False,False,True,False,False,False
2,3,50,73,3.1,23,True,False,False,False,False,...,True,False,False,False,False,False,False,False,False,True
3,4,21,90,3.5,49,True,False,False,False,False,...,False,False,True,False,False,False,False,False,False,True
4,5,45,49,2.7,31,True,False,True,False,False,...,False,False,True,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3895,3896,40,28,4.2,32,False,False,False,False,False,...,False,False,False,True,False,False,False,False,False,True
3896,3897,52,49,4.5,41,False,False,False,False,False,...,False,False,False,False,True,False,False,False,False,False
3897,3898,46,33,2.9,24,False,True,False,False,False,...,False,False,False,True,False,False,False,False,True,False
3898,3899,44,77,3.8,24,False,False,False,False,False,...,False,False,False,True,False,False,False,False,False,True


In [9]:
from sklearn.preprocessing import PowerTransformer
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Assuming the DataFrame 'df' is loaded and the columns exist

# --- Data Transformation (Yeo-Johnson) ---

# Identify columns to transform (based on typically skewed features)
cols_to_transform = ['Purchase Amount (USD)', 'Previous Purchases']

# Initialize the Power Transformer with the Yeo-Johnson method
# We use standardize=False to only transform the distribution shape without scaling it to zero mean/unit variance.
pt = PowerTransformer(method='yeo-johnson', standardize=False)

print("Applying Yeo-Johnson Transformation...")

for col in cols_to_transform:
    # The transformer expects a 2D array, so we reshape the column data
    data_to_transform = df[[col]]
    
    # Apply transformation and create a new column with the suffix '_YJ'
    df[f'{col}_YJ'] = pt.fit_transform(data_to_transform)
    
print("Yeo-Johnson Transformation applied to: Purchase Amount (USD) and Previous Purchases.")

# --- Post-Transformation Analysis ---

# Select transformed numerical columns
transformed_cols = ['Purchase Amount (USD)_YJ', 'Previous Purchases_YJ']

print("\n--- Skewness and Kurtosis AFTER Yeo-Johnson Transformation ---")
for col in transformed_cols:
    original_col = col.replace('_YJ', '')
    print(f"\n{original_col} (Transformed):")
    print(f"  Skewness: {df[col].skew():.4f}")
    print(f"  Kurtosis: {df[col].kurtosis():.4f}")
    
  

Applying Yeo-Johnson Transformation...
Yeo-Johnson Transformation applied to: Purchase Amount (USD) and Previous Purchases.

--- Skewness and Kurtosis AFTER Yeo-Johnson Transformation ---

Purchase Amount (USD) (Transformed):
  Skewness: -0.1295
  Kurtosis: -1.1986

Previous Purchases (Transformed):
  Skewness: -0.2343
  Kurtosis: -1.0708
