In [33]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [34]:
# loading data
try:
    data_path = '../data/train.csv'
    print(f"Attempting to load data from: {data_path}")

    # --- Load the data ---
    df = pd.read_csv(data_path)

    # --- ESSENTIAL: Perform initial checks ---
    print("\n✅ Data loaded successfully!")
    print(f"Shape of the data (rows, columns): {df.shape}")

    print("\n--- First 5 Rows ---")
    display(df.head())

    print("\n--- Data Info (Column Names, Non-Null Counts, Data Types) ---")
    df.info()

except FileNotFoundError:
    print(f"❌ Error: The file was not found at the path: {data_path}")
    print("Please check that the file exists and the path is correct.")
except Exception as e:
    print(f"❌ An unexpected error occurred: {e}")


Attempting to load data from: ../data/train.csv

✅ Data loaded successfully!
Shape of the data (rows, columns): (50000, 4)

--- First 5 Rows ---


Unnamed: 0,sample_id,catalog_content,image_link,price
0,158784,"Item Name: Log Cabin Sugar Free Syrup, 24 FL O...",https://m.media-amazon.com/images/I/71QD2OFXqD...,12.195
1,4095,Item Name: Raspberry Ginseng Oolong Tea (50 te...,https://m.media-amazon.com/images/I/813OiT8mdJ...,38.54
2,172021,Item Name: Walden Farms Honey Dijon Dressing -...,https://m.media-amazon.com/images/I/71HGx42QmU...,17.86
3,268276,Item Name: Vlasic Ovals Hamburger Dill Pickle ...,https://m.media-amazon.com/images/I/71AbnhXOTA...,2.94
4,154791,"Item Name: Amoretti Premium Syrup, Grand Orang...",https://m.media-amazon.com/images/I/61c+aTE6TY...,25.99



--- Data Info (Column Names, Non-Null Counts, Data Types) ---
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 4 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   sample_id        50000 non-null  int64  
 1   catalog_content  50000 non-null  object 
 2   image_link       50000 non-null  object 
 3   price            50000 non-null  float64
dtypes: float64(1), int64(1), object(2)
memory usage: 1.5+ MB


In [35]:
df_cleaned=df.copy()
df_cleaned

Unnamed: 0,sample_id,catalog_content,image_link,price
0,158784,"Item Name: Log Cabin Sugar Free Syrup, 24 FL O...",https://m.media-amazon.com/images/I/71QD2OFXqD...,12.195
1,4095,Item Name: Raspberry Ginseng Oolong Tea (50 te...,https://m.media-amazon.com/images/I/813OiT8mdJ...,38.540
2,172021,Item Name: Walden Farms Honey Dijon Dressing -...,https://m.media-amazon.com/images/I/71HGx42QmU...,17.860
3,268276,Item Name: Vlasic Ovals Hamburger Dill Pickle ...,https://m.media-amazon.com/images/I/71AbnhXOTA...,2.940
4,154791,"Item Name: Amoretti Premium Syrup, Grand Orang...",https://m.media-amazon.com/images/I/61c+aTE6TY...,25.990
...,...,...,...,...
49995,43460,"Item Name: Java Monster Coffee + Energy Drink,...",https://m.media-amazon.com/images/I/51jGEnLv8-...,29.520
49996,96214,Item Name: Gits Upma Mix 7 Oz by Gits\nValue: ...,https://m.media-amazon.com/images/I/41OmUcWbNB...,7.990
49997,196377,Item Name: Black Cohosh and Dong Quai Combinat...,https://m.media-amazon.com/images/I/811HtPUwUw...,48.560
49998,291807,Item Name: WOODSTOCK FARMS Organic Wheat Berri...,https://m.media-amazon.com/images/I/61w1+b+Unj...,103.240


In [36]:
df_cleaned.isnull().sum()

sample_id          0
catalog_content    0
image_link         0
price              0
dtype: int64

In [37]:
df_cleaned.describe()

Unnamed: 0,sample_id,price
count,50000.0,50000.0
mean,149631.39898,23.605707
std,86443.518122,34.401896
min,4.0,0.3
25%,74010.75,6.75
50%,150016.5,13.99
75%,224629.0,28.49
max,299428.0,2796.0


In [38]:
df_cleaned.shape

(50000, 4)

In [39]:
df['image_link']=df['image_link'].fillna('no_image')

In [40]:
## handling duplicates
df_cleaned = df_cleaned.drop_duplicates()

In [41]:
df_cleaned.shape

(50000, 4)

In [42]:
df_cleaned['catalog_content'] = df_cleaned['catalog_content'].str.strip().str.title()

In [43]:
# IMPROVED: This regex captures the first word and an optional second word.
df_cleaned['brand'] = df_cleaned['catalog_content'].str.extract(r'^(\w+(?:\s+\w+)?)', expand=False)

In [44]:
df_cleaned.head()

Unnamed: 0,sample_id,catalog_content,image_link,price,brand
0,158784,"Item Name: Log Cabin Sugar Free Syrup, 24 Fl O...",https://m.media-amazon.com/images/I/71QD2OFXqD...,12.195,Item Name
1,4095,Item Name: Raspberry Ginseng Oolong Tea (50 Te...,https://m.media-amazon.com/images/I/813OiT8mdJ...,38.54,Item Name
2,172021,Item Name: Walden Farms Honey Dijon Dressing -...,https://m.media-amazon.com/images/I/71HGx42QmU...,17.86,Item Name
3,268276,Item Name: Vlasic Ovals Hamburger Dill Pickle ...,https://m.media-amazon.com/images/I/71AbnhXOTA...,2.94,Item Name
4,154791,"Item Name: Amoretti Premium Syrup, Grand Orang...",https://m.media-amazon.com/images/I/61c+aTE6TY...,25.99,Item Name


In [45]:
df_cleaned['pack_size'] = (
    pd.to_numeric(df_cleaned['catalog_content'].str.extract(r'Pack of (\d+)', expand=False), errors='coerce')
    .fillna(1)
    .astype(int)
)

In [47]:
df_cleaned

Unnamed: 0,sample_id,catalog_content,image_link,price,brand,pack_size
0,158784,"Item Name: Log Cabin Sugar Free Syrup, 24 Fl O...",https://m.media-amazon.com/images/I/71QD2OFXqD...,12.195,Item Name,1
1,4095,Item Name: Raspberry Ginseng Oolong Tea (50 Te...,https://m.media-amazon.com/images/I/813OiT8mdJ...,38.540,Item Name,1
2,172021,Item Name: Walden Farms Honey Dijon Dressing -...,https://m.media-amazon.com/images/I/71HGx42QmU...,17.860,Item Name,1
3,268276,Item Name: Vlasic Ovals Hamburger Dill Pickle ...,https://m.media-amazon.com/images/I/71AbnhXOTA...,2.940,Item Name,1
4,154791,"Item Name: Amoretti Premium Syrup, Grand Orang...",https://m.media-amazon.com/images/I/61c+aTE6TY...,25.990,Item Name,1
...,...,...,...,...,...,...
49995,43460,"Item Name: Java Monster Coffee + Energy Drink,...",https://m.media-amazon.com/images/I/51jGEnLv8-...,29.520,Item Name,1
49996,96214,Item Name: Gits Upma Mix 7 Oz By Gits\nValue: ...,https://m.media-amazon.com/images/I/41OmUcWbNB...,7.990,Item Name,1
49997,196377,Item Name: Black Cohosh And Dong Quai Combinat...,https://m.media-amazon.com/images/I/811HtPUwUw...,48.560,Item Name,1
49998,291807,Item Name: Woodstock Farms Organic Wheat Berri...,https://m.media-amazon.com/images/I/61w1+b+Unj...,103.240,Item Name,1


In [48]:
import re
df_cleaned['product_size'] = (
    df_cleaned['catalog_content']
    .str.extract(r'((\d+\.?\d*)\s*(Fl\.?\s*Oz|Oz|Gallon|L|ml|g))', re.IGNORECASE)[0]
)

In [49]:
df_cleaned

Unnamed: 0,sample_id,catalog_content,image_link,price,brand,pack_size,product_size
0,158784,"Item Name: Log Cabin Sugar Free Syrup, 24 Fl O...",https://m.media-amazon.com/images/I/71QD2OFXqD...,12.195,Item Name,1,24 Fl Oz
1,4095,Item Name: Raspberry Ginseng Oolong Tea (50 Te...,https://m.media-amazon.com/images/I/813OiT8mdJ...,38.540,Item Name,1,1 L
2,172021,Item Name: Walden Farms Honey Dijon Dressing -...,https://m.media-amazon.com/images/I/71HGx42QmU...,17.860,Item Name,1,12 Oz
3,268276,Item Name: Vlasic Ovals Hamburger Dill Pickle ...,https://m.media-amazon.com/images/I/71AbnhXOTA...,2.940,Item Name,1,16 Fl Oz
4,154791,"Item Name: Amoretti Premium Syrup, Grand Orang...",https://m.media-amazon.com/images/I/61c+aTE6TY...,25.990,Item Name,1,
...,...,...,...,...,...,...,...
49995,43460,"Item Name: Java Monster Coffee + Energy Drink,...",https://m.media-amazon.com/images/I/51jGEnLv8-...,29.520,Item Name,1,
49996,96214,Item Name: Gits Upma Mix 7 Oz By Gits\nValue: ...,https://m.media-amazon.com/images/I/41OmUcWbNB...,7.990,Item Name,1,7 Oz
49997,196377,Item Name: Black Cohosh And Dong Quai Combinat...,https://m.media-amazon.com/images/I/811HtPUwUw...,48.560,Item Name,1,4 Oz
49998,291807,Item Name: Woodstock Farms Organic Wheat Berri...,https://m.media-amazon.com/images/I/61w1+b+Unj...,103.240,Item Name,1,50 L
