### Step 1.1 Imports 

In [None]:
# Importing necessary libraries
# Data manipulation
import pandas as pd
import numpy as np
# Data visualization
import matplotlib.pyplot as plt
import seaborn as sns
sns.set(style="whitegrid")
# Preprocessing
from sklearn.preprocessing import StandardScaler,LabelEncoder, OneHotEncoder
from sklearn.model_selection import train_test_split
# Modeling
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
# Evaluation
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
import math
from warnings import filterwarnings
filterwarnings('ignore')

### Step 1.2 Data Loading

In [None]:
# Read the shopping-behaviour CSV (path is relative to the current working
# directory) into a DataFrame, then preview its first rows.
df = pd.read_csv('shopping_behavior_updated.csv')
df.head()

### Step 2 Data Understanding

In [None]:
# Summarise the DataFrame: column names, dtypes and non-null counts
# (columns whose non-null count is below the row count contain missing values).
df.info()

In [None]:
# Summary statistics for every column; include='all' also covers the
# non-numeric columns rather than only the numeric ones.
df.describe(include='all')

In [None]:
# Count the missing entries per column and print the resulting Series
missing_values = df.isna().sum()
print("Missing values in each column:\n", missing_values)

In [None]:
# Number of distinct values in each column (missing values are not counted
# by default) - useful for telling identifiers and categoricals apart.
df.nunique()

### Step 3 Target Engineering
#### Transforming the Target variable

In [None]:
# Converting (1-5 Star Ratings) into Low , Medium , High
def categorize_rating(rating):
    """Bucket a numeric review rating into 'Low', 'Medium' or 'High'.

    The buckets are contiguous so fractional ratings are handled too:

    * rating <= 2      -> 'Low'
    * 2 < rating <= 3  -> 'Medium'
    * rating > 3       -> 'High'

    Whole-star ratings (1-5) map exactly as before. Previously only a rating
    exactly equal to 3 was 'Medium', so values such as 2.5 fell through to
    'High'.

    Parameters
    ----------
    rating : int or float
        The review rating to categorise.

    Returns
    -------
    str or float
        The category name, or NaN when ``rating`` is missing so that a
        missing rating is not silently labelled 'High'.
    """
    # Comparisons with NaN are always False, which would otherwise fall
    # through to the final 'High' branch.
    if pd.isna(rating):
        return float('nan')
    if rating <= 2:
        return 'Low'
    if rating <= 3:
        return 'Medium'
    return 'High'
# Derive the categorical target from the raw review rating
df['Rating_Category'] = df['Review Rating'].map(categorize_rating)

# Encode the target numerically for modeling (ordinal: Low < Medium < High)
target_mapper = dict(Low=0, Medium=1, High=2)
df['Rating_label'] = df['Rating_Category'].map(target_mapper)
df.loc[:, ['Review Rating', 'Rating_Category', 'Rating_label']].head()


### Step 4 Exploratory Data Analysis

### 4.1 Target Variable distribution

In [None]:
# Bar chart of how many rows fall into each rating category.
# A lopsided chart signals class imbalance, which matters for model
# fairness and performance.
plt.figure(figsize=(8, 6))
sns.countplot(
    data=df,
    x='Rating_Category',
    order=['Low', 'Medium', 'High'],
    palette='viridis',
).set_title('Distribution of Rating Categories')
plt.show()

### 4.2 Numerical Variable distribution

In [None]:
# Plot one histogram per numerical variable.
# Shows skewness, normality and value spread.
numerical_cols = df.select_dtypes(include=['int64', 'float64']).columns

# Size the subplot grid from the number of numerical columns instead of
# hard-coding 3x3: a fixed layout raises once there are more than 9 columns
# and leaves empty rows when there are fewer.
hist_grid_cols = 3
hist_grid_rows = max(1, math.ceil(len(numerical_cols) / hist_grid_cols))

df[numerical_cols].hist(
    bins=15,
    figsize=(15, hist_grid_rows * 4),
    layout=(hist_grid_rows, hist_grid_cols),
)
plt.tight_layout()
plt.suptitle('Numerical Variable Distributions', y=1.02)
plt.show()

### 4.3 Numerical Outliers 

In [None]:
# Plot one boxplot per numerical variable to identify outliers.
# Highlights extreme values that may need scaling or winsorizing.

# Derive the grid from the column count: a fixed 3x3 grid makes
# plt.subplot raise as soon as a 10th numerical column exists.
box_grid_cols = 3
box_grid_rows = max(1, math.ceil(len(numerical_cols) / box_grid_cols))

plt.figure(figsize=(15, box_grid_rows * 4))
for i, col in enumerate(numerical_cols, start=1):
    plt.subplot(box_grid_rows, box_grid_cols, i)
    sns.boxplot(y=df[col], color='skyblue')
    plt.title(f'Boxplot of {col}')
# Keep titles and axis labels from overlapping, then render explicitly,
# matching the other plotting cells.
plt.tight_layout()
plt.show()

### 4.4 Categorical Feature Exploration

In [None]:
# Draw a horizontal count plot for every categorical (object-dtype) feature.
# Helps identify dominant categories and feature imbalance.
categorical_cols = df.select_dtypes(include='object').columns

# Three plots per row; use as many rows as needed to fit every column
num_cols, cols = len(categorical_cols), 3
rows = math.ceil(num_cols / cols)

plt.figure(figsize=(15, rows * 4))

for i, col in enumerate(categorical_cols, start=1):
    # Bars are ordered from the most to the least frequent category
    sns.countplot(
        data=df,
        y=col,
        order=df[col].value_counts().index,
        palette='pastel',
        ax=plt.subplot(rows, cols, i),
    ).set_title(f'Countplot of {col}')

plt.tight_layout()
plt.show()