In [None]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt # type: ignore
import re
from collections import Counter

# Create the directories this notebook writes into (no-op when they exist).
for folder in ("data/processed", "output"):
    os.makedirs(folder, exist_ok=True)

# CSV file holding the raw scraped data.
INPUT_CSV = "data/raw/indiamart_industrial_machinery.csv"


In [None]:
# Read the raw CSV, failing early with a clear message when it is missing.
if os.path.exists(INPUT_CSV):
    df = pd.read_csv(INPUT_CSV)
else:
    raise FileNotFoundError(f"File not found: {INPUT_CSV}")

# Preview the first rows.
df.head()


In [None]:
# Normalise column names: trim, lowercase, and replace spaces with underscores.
df.columns = [col.strip().lower().replace(" ", "_") for col in df.columns]

# Trim surrounding whitespace in every object-dtype column.
# astype(str) also stringifies missing values (NaN becomes the text "nan"),
# which would then be counted as a real product/company and written to the
# cleaned CSV. Mask those positions back to missing after stripping.
for col in df.columns:
    if df[col].dtype == "object":
        trimmed = df[col].astype(str).str.strip()
        df[col] = trimmed.where(df[col].notna())

df.info()


In [None]:
# First run of digits, optionally followed by a decimal part.
price_pattern = re.compile(r"(\d+(?:\.\d+)?)")

def parse_price(price):
    """Extract the first numeric amount from a raw price value.

    Parameters
    ----------
    price : str, number, or missing
        Raw price cell. Strings may contain currency symbols, thousands
        separators (commas) and trailing text such as a unit.

    Returns
    -------
    float
        The first number found after removing commas, the value itself when
        ``price`` is already numeric, or ``np.nan`` when ``price`` is missing
        or contains no digits.
    """
    if pd.isna(price):
        return np.nan
    # A numeric column (or a mixed one) hands us real numbers; they have no
    # .replace() method, so pass them through instead of crashing.
    if isinstance(price, (int, float, np.number)):
        return float(price)
    # Commas are dropped so "1,25,000" is read as one number, not three.
    # NOTE(review): only the first number is used and unit words are not
    # scaled - confirm this matches the raw data (ranges, "Lakh", etc.).
    match = price_pattern.search(str(price).replace(",", ""))
    if match:
        return float(match.group(1))
    return np.nan

# Derive a numeric price column from the raw price text.
df["price_clean"] = df["price"].apply(parse_price)

# Show raw and parsed prices side by side for a quick sanity check.
df.loc[:, ["price", "price_clean"]].head()


In [None]:
# Headline counts for the dataset.
has_price = df["price_clean"].notna()

summary = dict(
    total_records=len(df),
    unique_products=df["product_name"].nunique(),
    unique_companies=df["company"].nunique(),
    records_with_price=has_price.sum(),
)

summary


In [None]:
# Histogram of the parsed prices; rows without a parsed price are left out.
fig, ax = plt.subplots(figsize=(8, 5))
df["price_clean"].dropna().hist(bins=40, ax=ax)
ax.set_title("Price Distribution")
ax.set_xlabel("Price")
ax.set_ylabel("Count")
# Save before showing the figure.
fig.savefig("output/price_distribution.png")
plt.show()


In [None]:
# The ten companies with the most listings, drawn as horizontal bars.
top_companies = df["company"].value_counts().head(10)
ax = top_companies.plot.barh(figsize=(8, 5))
ax.set_title("Top 10 Companies by Listing Count")
ax.set_xlabel("Listings")
# Save before showing the figure.
ax.figure.savefig("output/top_companies.png")
plt.show()


In [None]:
# Split every product name into lowercase alphanumeric tokens.
word_pattern = re.compile(r"[A-Za-z0-9]+")
tokens = [
    word
    for name in df["product_name"].dropna()
    for word in word_pattern.findall(name.lower())
]

# The 20 most frequent tokens across all product names.
Counter(tokens).most_common(20)


In [None]:
# Write the cleaned dataset; index=False keeps the row index out of the file.
# The path is defined once so the confirmation message cannot drift from the
# file actually written. The arrow replaces a mis-encoded character sequence.
OUTPUT_CSV = "data/processed/indiamart_cleaned.csv"
df.to_csv(OUTPUT_CSV, index=False)
print(f"Saved cleaned dataset → {OUTPUT_CSV}")
