In [None]:
import pandas as pd
import numpy as np

# Read the raw Amazon product listing CSV into a DataFrame.
RAW_CSV_PATH = "/amazon.csv"
df = pd.read_csv(RAW_CSV_PATH)

# =============================
# DATA CLEANING
# =============================

# --- Clean price columns ---
def clean_price(x):
    """Convert a rupee-formatted price such as "₹1,099" to a number.

    Parameters
    ----------
    x : str, number, or missing value
        Raw cell value. Strings may carry the "₹" symbol and "," thousands
        separators; numeric values are accepted as-is.

    Returns
    -------
    number or NaN
        The parsed numeric value, or NaN when ``x`` is missing or cannot be
        parsed as a number.
    """
    if pd.isna(x):
        return np.nan
    # Only strings have currency symbols / separators to strip. Values that are
    # already numeric (e.g. when this function is applied to an already-cleaned
    # column) go straight to pd.to_numeric instead of failing on .replace().
    if isinstance(x, str):
        x = x.replace("₹", "").replace(",", "").strip()
    return pd.to_numeric(x, errors="coerce")

# Parse the rupee-formatted price strings into numeric values.
df["discounted_price"] = df["discounted_price"].apply(clean_price)
df["actual_price"] = df["actual_price"].apply(clean_price)

# --- Clean discount percentage ---
# Cast to str before using the .str accessor so a non-string column does not
# raise, and coerce unparseable values to NaN (same policy as "rating" below).
# The trailing astype(float) keeps the column float64 even if every value is a
# whole number.
df["discount_percentage"] = pd.to_numeric(
    df["discount_percentage"].astype(str).str.replace("%", "", regex=False),
    errors="coerce",
).astype(float)

# --- Clean rating ---
# Non-numeric ratings become NaN and are dropped further down.
df["rating"] = pd.to_numeric(df["rating"], errors="coerce")

# --- Clean rating_count ---
# Strip thousands separators; missing or malformed counts become NaN.
df["rating_count"] = pd.to_numeric(
    df["rating_count"].astype(str).str.replace(",", "", regex=False),
    errors="coerce",
).astype(float)

# --- Extract main category ---
# "category" is a "|"-separated path; the main category is its FIRST segment
# (the last segment is the most specific sub-category). Non-string values are
# passed through unchanged.
df["main_category"] = df["category"].apply(
    lambda x: x.split("|")[0] if isinstance(x, str) else x
)

# --- Remove rows with missing essential values ---
# .copy() makes df_clean an independent frame rather than a derived view of df.
essential_columns = ["discounted_price", "actual_price", "rating", "rating_count"]
df_clean = df.dropna(subset=essential_columns).copy()

# Show cleaned data (bare last expression -> rich notebook table display)
df_clean.head()


   product_id                                       product_name  \
0  B07JW9H4J1  Wayona Nylon Braided USB to Lightning Fast Cha...   
1  B098NS6PVG  Ambrane Unbreakable 60W / 3A Fast Charging 1.5...   
2  B096MSW6CT  Sounce Fast Phone Charging Cable & Data Sync U...   
3  B08HDJ86NZ  boAt Deuce USB 300 2 in 1 Type-C & Micro USB S...   
4  B08CF3B7N1  Portronics Konnect L 1.2M Fast Charging 3A 8 P...   

                                            category  discounted_price  \
0  Computers&Accessories|Accessories&Peripherals|...             399.0   
1  Computers&Accessories|Accessories&Peripherals|...             199.0   
2  Computers&Accessories|Accessories&Peripherals|...             199.0   
3  Computers&Accessories|Accessories&Peripherals|...             329.0   
4  Computers&Accessories|Accessories&Peripherals|...             154.0   

   actual_price  discount_percentage  rating  rating_count  \
0        1099.0                 64.0     4.2       24269.0   
1         349.0       

In [None]:
# Write the cleaned frame to CSV in the current working directory, leaving out
# the row index, then print a confirmation message.
clean_csv_name = "amazon_clean.csv"
df_clean.to_csv(clean_csv_name, index=False)

print("File berhasil dibuat: amazon_clean.csv")

File berhasil dibuat: amazon_clean.csv


In [None]:
# Colab-only helper that triggers a browser download of a runtime file.
from google.colab import files

# The export cell writes "amazon_clean.csv" relative to the working directory,
# so download it through the same relative path instead of assuming the
# working directory is /content.
files.download('amazon_clean.csv')


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>