<a href="https://colab.research.google.com/github/741yagna/Amazon-ML-challenge-2k25/blob/main/ML_challange_best.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install transformers xgboost scikit-learn --quiet
!pip install tensorflow --quiet

In [None]:
import pandas as pd
import numpy as np
import re
import warnings
import torch
from sklearn.preprocessing import StandardScaler
from xgboost import XGBRegressor
from transformers import DistilBertTokenizer, DistilBertModel
from tensorflow.keras.applications import ResNet50
from tensorflow.keras.preprocessing import image
from tensorflow.keras.applications.resnet50 import preprocess_input

warnings.filterwarnings("ignore")

In [None]:
class DataPreprocessor:
    """Load the train/test CSV files and derive cleaned text plus length features.

    ``handle_missing_values``, ``preprocess_catalog_content`` and
    ``prepare_final_data`` modify the frames they receive in place and also
    return them, so the returned objects are the same frames that were passed in.
    """

    def __init__(self):
        # Not used by the methods of this class; kept so existing callers that
        # reach for ``preprocessor.scaler`` keep working.
        self.scaler = StandardScaler()

    def load_data(self, train_path, test_path):
        """Read both CSV files and verify the columns the pipeline relies on.

        Args:
            train_path: Path of the training CSV (needs 'catalog_content' and 'price').
            test_path: Path of the test CSV (needs 'catalog_content').

        Returns:
            Tuple ``(train_df, test_df)`` of pandas DataFrames.

        Raises:
            AssertionError: If a required column is missing.
        """
        print("üìÇ Loading data...")
        # on_bad_lines="skip" drops malformed rows instead of raising, so the
        # printed shapes can be smaller than the raw line counts of the files.
        train_df = pd.read_csv(train_path, engine="python", on_bad_lines="skip")
        test_df = pd.read_csv(test_path, engine="python", on_bad_lines="skip")
        print(f"Training data shape: {train_df.shape}")
        print(f"Test data shape: {test_df.shape}")

        assert 'catalog_content' in train_df.columns, "Missing 'catalog_content' column in train data"
        assert 'price' in train_df.columns, "Missing 'price' column in train data"
        # Fail here with a clear message rather than with a KeyError later on.
        assert 'catalog_content' in test_df.columns, "Missing 'catalog_content' column in test data"

        return train_df, test_df

    def clean_text(self, text):
        """Lower-case ``text`` and keep only word characters, whitespace and ``. , - +``.

        Missing values (``None``/NaN) become the empty string; runs of
        whitespace are collapsed to a single space and the result is stripped.
        """
        if pd.isna(text):
            return ""
        text = str(text).lower()
        # Every character outside the allowed set is replaced by a space.
        text = re.sub(r"[^\w\s\d.,\-+]", " ", text)
        text = re.sub(r"\s+", " ", text).strip()
        return text

    def preprocess_catalog_content(self, df):
        """Add 'clean_text', 'text_length' and 'word_count' columns to ``df``.

        'text_length' is the character count and 'word_count' the number of
        whitespace-separated tokens of the cleaned text.
        """
        df["clean_text"] = df["catalog_content"].fillna("").apply(self.clean_text)
        df["text_length"] = df["clean_text"].str.len()
        df["word_count"] = df["clean_text"].str.split().str.len()
        return df

    def handle_missing_values(self, df):
        """Replace missing 'catalog_content' / 'image_link' values with "".

        If the frame has no 'image_link' column at all, one is created and
        filled with empty strings so downstream code can rely on it.
        """
        df["catalog_content"] = df["catalog_content"].fillna("")
        # ``df.get("image_link", "")`` returned the plain string default when the
        # column was absent, and ``str`` has no ``fillna`` -> AttributeError.
        if "image_link" in df.columns:
            df["image_link"] = df["image_link"].fillna("")
        else:
            df["image_link"] = ""
        return df

    def prepare_final_data(self, train_df, test_df):
        """Run missing-value handling and text preprocessing on both frames."""
        train_df = self.handle_missing_values(train_df)
        test_df = self.handle_missing_values(test_df)
        train_df = self.preprocess_catalog_content(train_df)
        test_df = self.preprocess_catalog_content(test_df)
        return train_df, test_df


In [None]:
import pandas as pd
import numpy as np
import re
import warnings
import torch
from sklearn.preprocessing import StandardScaler
from xgboost import XGBRegressor
from transformers import DistilBertTokenizer, DistilBertModel
from tensorflow.keras.applications import ResNet50
from tensorflow.keras.preprocessing import image
from tensorflow.keras.applications.resnet50 import preprocess_input
from tqdm import tqdm # Import tqdm

warnings.filterwarnings("ignore")

class TextFeatureExtractor:
    """Extracts text embeddings using DistilBERT (lighter and faster)"""

    def __init__(self, model_name='distilbert-base-uncased', batch_size=128):
        """Load tokenizer and model, move the model to GPU when available.

        Args:
            model_name: Hugging Face model identifier passed to ``from_pretrained``.
            batch_size: Number of texts encoded per forward pass (must be >= 1).

        Raises:
            ValueError: If ``batch_size`` is smaller than 1.
        """
        if batch_size < 1:
            raise ValueError("batch_size must be >= 1")
        print(f" Initializing {model_name} model...")
        self.tokenizer = DistilBertTokenizer.from_pretrained(model_name)
        self.model = DistilBertModel.from_pretrained(model_name)
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.model.to(self.device)
        self.model.eval()  # inference only: disables dropout
        self.batch_size = batch_size
        print(f"Model loaded on {self.device}\n")

    def transform(self, texts):
        """Convert an iterable of strings into one embedding row per text.

        Each text is truncated to 128 tokens and embedded as the mean of the
        last hidden states over its real (non-padding) tokens.

        Args:
            texts: Iterable of strings.

        Returns:
            2-D numpy array with one row per input text; an empty
            ``(0, hidden_size)`` array when ``texts`` is empty.
        """
        texts = list(texts)  # accept any iterable, slice by position below
        if not texts:
            # np.vstack([]) would raise; return a well-shaped empty result instead.
            return np.empty((0, self.model.config.hidden_size), dtype=np.float32)

        all_embeddings = []

        for i in tqdm(range(0, len(texts), self.batch_size), desc="üîπ Extracting text embeddings"):
            batch_texts = texts[i:i+self.batch_size]
            encoded = self.tokenizer(
                batch_texts,
                padding=True,
                truncation=True,
                max_length=128,
                return_tensors="pt"
            ).to(self.device)

            with torch.no_grad():
                hidden = self.model(**encoded).last_hidden_state
                # Weight by the attention mask so padding positions do not
                # enter the average; a plain mean over dim=1 made each
                # embedding depend on the longest text in its batch.
                mask = encoded["attention_mask"].unsqueeze(-1).to(hidden.dtype)
                token_sum = (hidden * mask).sum(dim=1)
                token_count = mask.sum(dim=1).clamp(min=1.0)
                embeddings = (token_sum / token_count).cpu().numpy()
            all_embeddings.append(embeddings)

        all_embeddings = np.vstack(all_embeddings)
        print(f" Completed embeddings extraction. Shape: {all_embeddings.shape}\n")
        return all_embeddings

In [None]:
print("\n Loading ResNet50 for image feature extraction...")
# include_top=False with pooling="avg" yields one pooled feature vector per image.
cnn_model = ResNet50(weights="imagenet", include_top=False, pooling="avg")

def extract_image_features(img_path):
    """Return the ResNet50 feature vector for one image, or zeros on failure.

    Args:
        img_path: Local file path or an http(s) URL of the image.

    Returns:
        1-D numpy array of pooled ResNet50 features; ``np.zeros(2048)`` when
        the image cannot be fetched, opened or processed.
    """
    import io
    import urllib.request

    try:
        source = img_path
        # The dataset stores URLs, which load_img cannot open directly, so every
        # row used to fall through to the zero fallback. Download remote images
        # into memory first (http/https only).
        if isinstance(img_path, str) and img_path.lower().startswith(("http://", "https://")):
            with urllib.request.urlopen(img_path, timeout=10) as response:
                source = io.BytesIO(response.read())
        img = image.load_img(source, target_size=(224, 224))
        x = image.img_to_array(img)
        x = np.expand_dims(x, axis=0)  # add the batch dimension
        x = preprocess_input(x)
        # verbose=0: avoid one progress bar per image when applied to a column.
        return cnn_model.predict(x, verbose=0).flatten()
    except Exception:
        # Best-effort fallback; unlike a bare ``except`` this still lets
        # KeyboardInterrupt / SystemExit stop a long-running extraction.
        return np.zeros(2048)  # fallback if image not found



 Loading ResNet50 for image feature extraction...
Downloading data from https://storage.googleapis.com/tensorflow/keras-applications/resnet/resnet50_weights_tf_dim_ordering_tf_kernels_notop.h5
[1m94765736/94765736[0m [32m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m[37m[0m [1m3s[0m 0us/step


In [None]:
class ModelTrainer:
    """Small wrapper around ``XGBRegressor`` with preset hyper-parameters."""

    def __init__(self, **xgb_params):
        """Create the regressor.

        Args:
            **xgb_params: Optional keyword overrides for the defaults below;
                they are forwarded to ``XGBRegressor``.
        """
        params = dict(
            n_estimators=600,
            learning_rate=0.03,
            # Was 810, which looks like a typo (an "8" edited to "10" without
            # deleting the old digit). A limit that large effectively removes
            # the depth cap. NOTE(review): confirm the intended value.
            max_depth=10,
            subsample=0.9,
            colsample_bytree=0.9,
            reg_lambda=1.0,
            reg_alpha=0.3,
            random_state=42,
        )
        params.update(xgb_params)
        self.model = XGBRegressor(**params)

    def train(self, X_train, y_train):
        """Fit the wrapped model on ``X_train`` / ``y_train``."""
        print("\n Training XGBoost model...")
        self.model.fit(X_train, y_train)
        print("Model training completed.")

    def predict(self, X):
        """Return the wrapped model's predictions for ``X``."""
        return self.model.predict(X)

In [None]:
def smape(y_true, y_pred):
    """Symmetric mean absolute percentage error, in percent.

    Each element contributes ``|y_true - y_pred| / ((|y_true| + |y_pred|) / 2)``;
    elements where both values are zero contribute 0.

    Args:
        y_true: Actual values (scalar, sequence or array).
        y_pred: Predicted values, broadcastable against ``y_true``.

    Returns:
        The mean of the per-element terms times 100 as a float, or NaN for
        empty input.
    """
    # asarray lets lists, Series and scalars work, not only ndarrays.
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    denominator = (np.abs(y_true) + np.abs(y_pred)) / 2.0
    abs_error = np.abs(y_true - y_pred)
    # Divide only where the denominator is non-zero; the other entries keep the
    # 0.0 from ``out`` so no 0/0 is ever computed.
    diff = np.divide(
        abs_error,
        denominator,
        out=np.zeros_like(denominator),
        where=denominator != 0,
    )
    if diff.size == 0:
        return float("nan")
    return float(np.mean(diff) * 100)

In [None]:
if __name__ == "__main__":
    import pandas as pd
    import re # Import re
    from sklearn.preprocessing import StandardScaler # Import StandardScaler
    class DataPreprocessor:
        """Load the train/test CSV files and derive cleaned text plus length features.

        The frame-processing methods modify the frames they receive in place
        and also return them.
        """

        def __init__(self):
            # Not used by the methods of this class; kept for existing callers.
            self.scaler = StandardScaler()

        def load_data(self, train_path, test_path):
            """Read both CSV files and verify the required columns.

            Returns:
                Tuple ``(train_df, test_df)`` of pandas DataFrames.

            Raises:
                AssertionError: If a required column is missing.
            """
            print("üìÇ Loading data...")
            # on_bad_lines="skip" drops malformed rows instead of raising.
            train_df = pd.read_csv(train_path, engine="python", on_bad_lines="skip")
            test_df = pd.read_csv(test_path, engine="python", on_bad_lines="skip")
            print(f"Training data shape: {train_df.shape}")
            print(f"Test data shape: {test_df.shape}")

            assert 'catalog_content' in train_df.columns, "Missing 'catalog_content' column in train data"
            assert 'price' in train_df.columns, "Missing 'price' column in train data"
            # Fail here with a clear message rather than with a KeyError later on.
            assert 'catalog_content' in test_df.columns, "Missing 'catalog_content' column in test data"

            return train_df, test_df

        def clean_text(self, text):
            """Lower-case ``text`` and keep only word characters, whitespace and ``. , - +``.

            Missing values become ""; whitespace runs collapse to one space.
            """
            if pd.isna(text):
                return ""
            text = str(text).lower()
            # Every character outside the allowed set is replaced by a space.
            text = re.sub(r"[^\w\s\d.,\-+]", " ", text)
            text = re.sub(r"\s+", " ", text).strip()
            return text

        def preprocess_catalog_content(self, df):
            """Add 'clean_text', 'text_length' and 'word_count' columns to ``df``."""
            df["clean_text"] = df["catalog_content"].fillna("").apply(self.clean_text)
            df["text_length"] = df["clean_text"].str.len()
            df["word_count"] = df["clean_text"].str.split().str.len()
            return df

        def handle_missing_values(self, df):
            """Replace missing 'catalog_content' / 'image_link' values with "".

            A missing 'image_link' column is created and filled with "".
            """
            df["catalog_content"] = df["catalog_content"].fillna("")
            # ``df.get("image_link", "")`` returned the plain string default when
            # the column was absent, and ``str`` has no ``fillna``.
            if "image_link" in df.columns:
                df["image_link"] = df["image_link"].fillna("")
            else:
                df["image_link"] = ""
            return df

        def prepare_final_data(self, train_df, test_df):
            """Run missing-value handling and text preprocessing on both frames."""
            train_df = self.handle_missing_values(train_df)
            test_df = self.handle_missing_values(test_df)
            train_df = self.preprocess_catalog_content(train_df)
            test_df = self.preprocess_catalog_content(test_df)
            return train_df, test_df

In [None]:

    # Read the train/test CSVs from /content (nothing is uploaded here), then
    # fill missing values and derive the clean_text / text_length / word_count
    # columns. prepare_final_data works in place, so train_processed and
    # test_processed are the same frames as train_df and test_df.
    preprocessor = DataPreprocessor()
    train_df, test_df = preprocessor.load_data("/content/train.csv", "/content/test.csv")
    train_processed, test_processed = preprocessor.prepare_final_data(train_df, test_df)


üìÇ Loading data...
Training data shape: (75000, 4)
Test data shape: (75000, 3)


In [None]:
import pandas as pd
import numpy as np
import re
import warnings
import torch
from sklearn.preprocessing import StandardScaler
from xgboost import XGBRegressor
from transformers import DistilBertTokenizer, DistilBertModel
from tensorflow.keras.applications import ResNet50
from tensorflow.keras.preprocessing import image
from tensorflow.keras.applications.resnet50 import preprocess_input
from tqdm import tqdm # Import tqdm

warnings.filterwarnings("ignore")

class TextFeatureExtractor:
    """Extracts text embeddings using DistilBERT (lighter and faster)"""

    def __init__(self, model_name='distilbert-base-uncased', batch_size=128):
        """Load tokenizer and model, move the model to GPU when available.

        Args:
            model_name: Hugging Face model identifier passed to ``from_pretrained``.
            batch_size: Number of texts encoded per forward pass (must be >= 1).

        Raises:
            ValueError: If ``batch_size`` is smaller than 1.
        """
        if batch_size < 1:
            raise ValueError("batch_size must be >= 1")
        print(f" Initializing {model_name} model...")
        self.tokenizer = DistilBertTokenizer.from_pretrained(model_name)
        self.model = DistilBertModel.from_pretrained(model_name)
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.model.to(self.device)
        self.model.eval()  # inference only: disables dropout
        self.batch_size = batch_size
        print(f"Model loaded on {self.device}\n")

    def transform(self, texts):
        """Convert an iterable of strings into one embedding row per text.

        Each text is truncated to 128 tokens and embedded as the mean of the
        last hidden states over its real (non-padding) tokens.

        Args:
            texts: Iterable of strings.

        Returns:
            2-D numpy array with one row per input text; an empty
            ``(0, hidden_size)`` array when ``texts`` is empty.
        """
        texts = list(texts)  # accept any iterable, slice by position below
        if not texts:
            # np.vstack([]) would raise; return a well-shaped empty result instead.
            return np.empty((0, self.model.config.hidden_size), dtype=np.float32)

        all_embeddings = []

        for i in tqdm(range(0, len(texts), self.batch_size), desc="üîπ Extracting text embeddings"):
            batch_texts = texts[i:i+self.batch_size]
            encoded = self.tokenizer(
                batch_texts,
                padding=True,
                truncation=True,
                max_length=128,
                return_tensors="pt"
            ).to(self.device)

            with torch.no_grad():
                hidden = self.model(**encoded).last_hidden_state
                # Weight by the attention mask so padding positions do not
                # enter the average; a plain mean over dim=1 made each
                # embedding depend on the longest text in its batch.
                mask = encoded["attention_mask"].unsqueeze(-1).to(hidden.dtype)
                token_sum = (hidden * mask).sum(dim=1)
                token_count = mask.sum(dim=1).clamp(min=1.0)
                embeddings = (token_sum / token_count).cpu().numpy()
            all_embeddings.append(embeddings)

        all_embeddings = np.vstack(all_embeddings)
        print(f" Completed embeddings extraction. Shape: {all_embeddings.shape}\n")
        return all_embeddings

In [None]:

    # Image embeddings
    print("\n Extracting image features (this may take time)...")

    print("\n Loading ResNet50 for image feature extraction...")
    # include_top=False with pooling="avg" yields one pooled feature vector per image.
    cnn_model = ResNet50(weights="imagenet", include_top=False, pooling="avg")

    def extract_image_features(img_path):
        """Return the ResNet50 feature vector for one image, or zeros on failure.

        Args:
            img_path: Local file path or an http(s) URL of the image.

        Returns:
            1-D numpy array of pooled ResNet50 features; ``np.zeros(2048)``
            when the image cannot be fetched, opened or processed.
        """
        import io
        import urllib.request

        try:
            source = img_path
            # The dataset stores URLs, which load_img cannot open directly, so
            # every row used to fall through to the zero fallback. Download
            # remote images into memory first (http/https only).
            if isinstance(img_path, str) and img_path.lower().startswith(("http://", "https://")):
                with urllib.request.urlopen(img_path, timeout=10) as response:
                    source = io.BytesIO(response.read())
            img = image.load_img(source, target_size=(224, 224))
            x = image.img_to_array(img)
            x = np.expand_dims(x, axis=0)  # add the batch dimension
            x = preprocess_input(x)
            # verbose=0: avoid one progress bar per image when applied to a column.
            return cnn_model.predict(x, verbose=0).flatten()
        except Exception:
            # Best-effort fallback; unlike a bare ``except`` this still lets
            # KeyboardInterrupt / SystemExit stop a long-running extraction.
            return np.zeros(2048)  # fallback if image not found


 Extracting image features (this may take time)...

 Loading ResNet50 for image feature extraction...


In [None]:

    # Inspect the image_link column
    print("\n Inspecting 'image_link' column:")
    print("Train image_link data type:", train_processed["image_link"].dtype)
    print("Train image_link sample:", train_processed["image_link"].head())
    print("Test image_link data type:", test_processed["image_link"].dtype)
    print("Test image_link sample:", test_processed["image_link"].head())

    # The image features are extracted once, in the feature-pipeline cell that
    # follows (train_img_features / test_img_features). This inspection cell
    # used to run the same extraction over every row as well, doubling the
    # most expensive step of the notebook for no change in the result.



 Inspecting 'image_link' column:
Train image_link data type: object
Train image_link sample: 0    https://m.media-amazon.com/images/I/51mo8htwTH...
1    https://m.media-amazon.com/images/I/71YtriIHAA...
2    https://m.media-amazon.com/images/I/51+PFEe-w-...
3    https://m.media-amazon.com/images/I/41mu0HAToD...
4    https://m.media-amazon.com/images/I/41sA037+Qv...
Name: image_link, dtype: object
Test image_link data type: object
Test image_link sample: 0    https://m.media-amazon.com/images/I/71hoAn78AW...
1    https://m.media-amazon.com/images/I/61ex8NHCIj...
2    https://m.media-amazon.com/images/I/61KCM61J8e...
3    https://m.media-amazon.com/images/I/51Ex6uOH7y...
4    https://m.media-amazon.com/images/I/71QYlrOMoS...
Name: image_link, dtype: object


In [None]:
  # 1Ô∏è‚É£ Extract Text Embeddings
text_extractor = TextFeatureExtractor()
train_texts = train_processed["clean_text"].tolist()
test_texts = test_processed["clean_text"].tolist()
train_embeddings = text_extractor.transform(train_texts)
test_embeddings = text_extractor.transform(test_texts)

# Step 2: one image feature vector per image_link entry, stacked row-wise
train_img_rows = train_processed["image_link"].apply(extract_image_features)
train_img_features = np.vstack(train_img_rows)
test_img_rows = test_processed["image_link"].apply(extract_image_features)
test_img_features = np.vstack(test_img_rows)

# Step 3: PCA to 100 components per modality, fitted on train only and
# then applied to test
print("\nüìâ Reducing dimensionality with PCA...")
from sklearn.decomposition import PCA

pca_text = PCA(n_components=100, random_state=42)
pca_img = PCA(n_components=100, random_state=42)

train_text_reduced = pca_text.fit_transform(train_embeddings)
test_text_reduced = pca_text.transform(test_embeddings)

train_img_reduced = pca_img.fit_transform(train_img_features)
test_img_reduced = pca_img.transform(test_img_features)

# Step 4: standardise the two length features with train statistics
from sklearn.preprocessing import StandardScaler
basic_columns = ["text_length", "word_count"]
scaler = StandardScaler()
train_basic = scaler.fit_transform(train_processed[basic_columns])
test_basic = scaler.transform(test_processed[basic_columns])


 Initializing distilbert-base-uncased model...


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Model loaded on cuda



üîπ Extracting text embeddings: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 586/586 [07:18<00:00,  1.33it/s]


 Completed embeddings extraction. Shape: (75000, 768)



üîπ Extracting text embeddings: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 586/586 [07:18<00:00,  1.34it/s]


 Completed embeddings extraction. Shape: (75000, 768)


üìâ Reducing dimensionality with PCA...


In [None]:
# ‚úÖ Combine all
# Column order: scaled length features, text PCA components, image PCA components.
train_parts = [train_basic, train_text_reduced, train_img_reduced]
test_parts = [test_basic, test_text_reduced, test_img_reduced]
X_train = np.concatenate(train_parts, axis=1)
X_test = np.concatenate(test_parts, axis=1)

In [None]:
class ModelTrainer:
    """Holds an ``XGBRegressor`` with preset hyper-parameters and exposes fit/predict."""

    def __init__(self):
        # Hyper-parameters are collected in one mapping and unpacked into the estimator.
        hyperparams = {
            "n_estimators": 500,
            "learning_rate": 0.05,
            "max_depth": 8,
            "subsample": 0.8,
            "colsample_bytree": 0.8,
            "random_state": 42,
        }
        self.model = XGBRegressor(**hyperparams)

    def train(self, X_train, y_train):
        """Fit the wrapped regressor, printing a message before and after."""
        print("\n Training XGBoost model...")
        regressor = self.model
        regressor.fit(X_train, y_train)
        print("Model training completed.")

    def predict(self, X):
        """Return the wrapped regressor's predictions for ``X``."""
        predictions = self.model.predict(X)
        return predictions


In [None]:
# ‚úÖ Log-transform target
# The model is fitted on log1p(price); predictions are mapped back with expm1 later.
trainer = ModelTrainer()

y_train = train_processed["price"].to_numpy()
y_train_log = np.log1p(y_train)

trainer.train(X_train, y_train_log)


 Training XGBoost model...
Model training completed.


In [None]:
class ModelTrainer:
    """Small wrapper around ``XGBRegressor`` with preset hyper-parameters."""

    def __init__(self, **xgb_params):
        """Create the regressor.

        Args:
            **xgb_params: Optional keyword overrides for the defaults below;
                they are forwarded to ``XGBRegressor``.
        """
        params = dict(
            n_estimators=600,
            learning_rate=0.03,
            # Was 810, which looks like a typo (an "8" edited to "10" without
            # deleting the old digit). A limit that large effectively removes
            # the depth cap. NOTE(review): confirm the intended value.
            max_depth=10,
            subsample=0.9,
            colsample_bytree=0.9,
            reg_lambda=1.0,
            reg_alpha=0.3,
            random_state=42,
        )
        params.update(xgb_params)
        self.model = XGBRegressor(**params)

    def train(self, X_train, y_train):
        """Fit the wrapped model on ``X_train`` / ``y_train``."""
        print("\n Training XGBoost model...")
        self.model.fit(X_train, y_train)
        print("Model training completed.")

    def predict(self, X):
        """Return the wrapped model's predictions for ``X``."""
        return self.model.predict(X)

In [None]:
import numpy as np

def smape(y_true, y_pred):
    """Symmetric mean absolute percentage error, in percent.

    Each element contributes ``|y_true - y_pred| / ((|y_true| + |y_pred|) / 2)``;
    elements where both values are zero contribute 0.

    Args:
        y_true: Actual values (scalar, sequence or array).
        y_pred: Predicted values, broadcastable against ``y_true``.

    Returns:
        The mean of the per-element terms times 100 as a float, or NaN for
        empty input.
    """
    # asarray lets lists, Series and scalars work, not only ndarrays.
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    denominator = (np.abs(y_true) + np.abs(y_pred)) / 2.0
    abs_error = np.abs(y_true - y_pred)
    # Divide only where the denominator is non-zero; the other entries keep the
    # 0.0 from ``out`` so no 0/0 is ever computed.
    diff = np.divide(
        abs_error,
        denominator,
        out=np.zeros_like(denominator),
        where=denominator != 0,
    )
    if diff.size == 0:
        return float("nan")
    return float(np.mean(diff) * 100)


In [None]:
# Score the fitted model on the rows it was trained on: predict in log space,
# map back with expm1, then compare against the untransformed prices.
train_preds_log = trainer.predict(X_train)
train_preds = np.expm1(train_preds_log)

smape_train = smape(y_train, train_preds)
print(f"\n SMAPE on Training Data: {smape_train:.2f}%")


 SMAPE on Training Data: 35.00%


In [None]:
import numpy as np
import pandas as pd

# ‚úÖ Predict on test data (using the trained model)
test_preds_log = trainer.predict(X_test)

# Undo the log1p applied to the training target. expm1 of a negative
# log-prediction is a negative number, which is not a valid price, so the
# result is clipped at zero.
test_preds = np.maximum(np.expm1(test_preds_log), 0.0)

# The submission is keyed by sample_id; fail with an explicit message if the
# test file did not provide it.
if "sample_id" not in test_processed.columns:
    raise KeyError("Missing 'sample_id' column in test data; cannot build the submission")

submission = pd.DataFrame({
    "sample_id": test_processed["sample_id"],
    "price": test_preds
})

# Write the predictions without the DataFrame index.
output_path = "/content/test_predictions.csv"
submission.to_csv(output_path, index=False)

print(f"‚úÖ Predictions saved to {output_path}")
print(submission.head())


‚úÖ Predictions saved to /content/test_predictions.csv
   sample_id      price
0     100179  17.442541
1     245611  21.246830
2     146263  18.076269
3      95658  22.304014
4      36806  33.987297


In [None]:
# The browser download helper only exists inside Google Colab. Elsewhere the
# CSV has already been written to output_path, so report that instead of
# ending the run with an import error.
try:
    from google.colab import files
except ImportError:
    print(f"Not running in Google Colab; predictions are available at {output_path}")
else:
    files.download(output_path)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>