In [15]:
import pandas as pd

# Load dataset
# change the path if needed
df = pd.read_csv("D:/University/5th Semester/Machine Learning/Project/makeup.csv")

# Basic info
print("Shape of dataset:", df.shape)
print("\nColumn names:\n", df.columns)

# Show first 5 rows
print("\nFirst 5 rows:")
print(df.head())

# Check data types
print("\nData types:")
print(df.dtypes)


Shape of dataset: (931, 9)

Column names:
 Index(['Unnamed: 0', 'id', 'brand', 'name', 'price', 'currency', 'category',
       'created_at', 'description'],
      dtype='object')

First 5 rows:
   Unnamed: 0    id      brand                  name  price currency  \
0           0  1048  colourpop         Lippie Pencil    5.0      CAD   
1           1  1047  colourpop           Blotted Lip    5.5      CAD   
2           2  1046  colourpop           Lippie Stix    5.5      CAD   
3           3  1045  colourpop  No Filter Foundation   12.0      CAD   
4           4  1044      boosh              Lipstick   26.0      CAD   

   category                created_at  \
0    pencil  2018-07-08T23:45:08.056Z   
1  lipstick  2018-07-08T22:01:20.178Z   
2  lipstick  2018-07-08T21:47:49.858Z   
3    liquid  2018-07-08T18:22:25.273Z   
4  lipstick  2018-07-08T17:32:28.088Z   

                                         description  
0  Lippie Pencil A long-wearing and high-intensit...  
1  Blotted Lip S

In [16]:
import numpy as np

# Make a copy
data = df.copy()

# Drop non numeric or useless columns
# adjust names if your dataset uses different ones
drop_cols = ['product_name', 'brand', 'category', 'description']
for col in drop_cols:
    if col in data.columns:
        data.drop(col, axis=1, inplace=True)

# Handle missing values
data.fillna(data.mean(numeric_only=True), inplace=True)

# Create price classes
# Low   -> bottom 33%
# Medium-> middle 33%
# High  -> top 33%

data['price_class'] = pd.qcut(
    data['price'],
    q=3,
    labels=[0, 1, 2]
)

# Drop original price column
data.drop('price', axis=1, inplace=True)

# Check result
print("New shape:", data.shape)
print("\nClass distribution:")
print(data['price_class'].value_counts())
print("\nFirst 5 rows:")
print(data.head())


New shape: (931, 6)

Class distribution:
price_class
0    328
1    306
2    297
Name: count, dtype: int64

First 5 rows:
   Unnamed: 0    id                  name currency                created_at  \
0           0  1048         Lippie Pencil      CAD  2018-07-08T23:45:08.056Z   
1           1  1047           Blotted Lip      CAD  2018-07-08T22:01:20.178Z   
2           2  1046           Lippie Stix      CAD  2018-07-08T21:47:49.858Z   
3           3  1045  No Filter Foundation      CAD  2018-07-08T18:22:25.273Z   
4           4  1044              Lipstick      CAD  2018-07-08T17:32:28.088Z   

  price_class  
0           0  
1           0  
2           0  
3           1  
4           2  


In [17]:
from sklearn.model_selection import train_test_split

# Drop non numeric columns explicitly
non_numeric_cols = ['name', 'currency', 'created_at']
for col in non_numeric_cols:
    if col in data.columns:
        data.drop(col, axis=1, inplace=True)

# Drop index like columns if present
for col in ['id', 'Unnamed: 0']:
    if col in data.columns:
        data.drop(col, axis=1, inplace=True)

# Separate features and target
X = data.drop('price_class', axis=1)
y = data['price_class']

print("Final feature columns:")
print(X.columns)

# Train test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42,
    stratify=y
)

print("\nTrain size:", X_train.shape)
print("Test size:", X_test.shape)


Final feature columns:
Index([], dtype='object')

Train size: (744, 0)
Test size: (187, 0)
