In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
# Load the dataset
reviews = pd.read_csv("reviews.csv")
# Preview the data
reviews.head()
# Check dataset structure and data types
reviews.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 9 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   clothing_id      5000 non-null   int64 
 1   age              5000 non-null   int64 
 2   review_title     4174 non-null   object
 3   review_text      4804 non-null   object
 4   recommended      5000 non-null   bool  
 5   division_name    4996 non-null   object
 6   department_name  4996 non-null   object
 7   review_date      5000 non-null   object
 8   rating           5000 non-null   object
dtypes: bool(1), int64(2), object(6)
memory usage: 317.5+ KB


In [2]:
# View original value counts
reviews['recommended'].value_counts()
# Create binary mapping
binary_dict = {True: 1, False: 0}
# Apply transformation
reviews['recommended'] = reviews['recommended'].map(binary_dict)

# Verify transformation
reviews['recommended'].value_counts()


recommended
1    4166
0     834
Name: count, dtype: int64

In [3]:
# View original rating distribution
reviews['rating'].value_counts()
# Create ordinal mapping for ratings
rating_dict = {
    "Hated it": 1,
    "Not great": 2,
    "Was okay": 3,
    "Liked it": 4,
    "Loved it": 5
}

# Apply transformation
reviews['rating'] = reviews['rating'].map(rating_dict)

# Verify transformation
reviews['rating'].value_counts()


rating
5    2798
4    1141
3     564
2     304
1     193
Name: count, dtype: int64

In [4]:
# Check category distribution
reviews['department_name'].value_counts()
# One-hot encode department_name
one_hot = pd.get_dummies(reviews['department_name'])

# Join encoded columns to original dataframe
reviews = reviews.join(one_hot)

# View updated column names
reviews.columns


Index(['clothing_id', 'age', 'review_title', 'review_text', 'recommended',
       'division_name', 'department_name', 'review_date', 'rating', 'Bottoms',
       'Dresses', 'Intimate', 'Jackets', 'Tops', 'Trend'],
      dtype='object')

In [6]:
# Convert review_date to datetime
reviews['review_date'] = pd.to_datetime(reviews['review_date'])

# Verify data type
print(reviews['review_date'].dtype)


datetime64[ns]


In [7]:
# Select relevant numerical and encoded features
reviews = reviews[
    [
        'clothing_id',
        'age',
        'recommended',
        'rating',
        'Bottoms',
        'Dresses',
        'Intimate',
        'Jackets',
        'Tops',
        'Trend'
    ]
].copy()
# Set clothing_id as index
reviews = reviews.set_index('clothing_id')

reviews.head()

Unnamed: 0_level_0,age,recommended,rating,Bottoms,Dresses,Intimate,Jackets,Tops,Trend
clothing_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1095,39,1,4,False,True,False,False,False,False
1095,28,1,5,False,True,False,False,False,False
699,37,1,5,False,False,True,False,False,False
1072,36,1,5,False,True,False,False,False,False
1094,32,1,5,False,True,False,False,False,False


In [8]:
# Initialize scaler
scaler = StandardScaler()

# Scale the dataset
scaled_reviews = scaler.fit_transform(reviews)

scaled_reviews


array([[-0.34814459,  0.44742824, -0.1896478 , ..., -0.21656679,
        -0.88496718, -0.07504356],
       [-1.24475223,  0.44742824,  0.71602461, ..., -0.21656679,
        -0.88496718, -0.07504356],
       [-0.51116416,  0.44742824,  0.71602461, ..., -0.21656679,
        -0.88496718, -0.07504356],
       ...,
       [-0.59267395,  0.44742824,  0.71602461, ..., -0.21656679,
        -0.88496718, -0.07504356],
       [-1.24475223,  0.44742824,  0.71602461, ..., -0.21656679,
        -0.88496718, -0.07504356],
       [ 1.68960003,  0.44742824,  0.71602461, ..., -0.21656679,
         1.12998541, -0.07504356]])