In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from cuml.feature_extraction.text import TfidfVectorizer  # GPU-accelerated TF-IDF
from cuml.linear_model import LogisticRegression  # GPU-accelerated Logistic Regression
from sklearn.multioutput import MultiOutputClassifier
from sklearn.metrics import classification_report
import joblib
import cudf

# 📂 Load the datasets
def load_data(features_path='train-features.csv', labels_path='train-labels.csv'):
    """Load the training features and labels and join them on ``walmart_id``.

    Args:
        features_path: CSV path (or file-like object) holding the training
            features. Defaults to ``'train-features.csv'``.
        labels_path: CSV path (or file-like object) holding the training
            labels. Defaults to ``'train-labels.csv'``.

    Returns:
        A DataFrame containing the rows whose ``walmart_id`` appears in both
        inputs (pandas' default inner join), with the columns of both files.
    """
    train_data = pd.read_csv(features_path)
    train_labels = pd.read_csv(labels_path)

    # Inner join: rows without a matching walmart_id on the other side are dropped.
    train_df = train_data.merge(train_labels, on='walmart_id')
    print("Training data shape:", train_df.shape)
    return train_df

# 🔍 Preprocess the text data
def preprocess_text(df):
    """Add a ``text_features`` column built from title and manufacturer.

    Missing values in ``title`` or ``details_Manufacturer`` are replaced by
    empty strings, and the two parts are joined with a single space. The
    frame is modified in place and also returned.
    """
    title = df['title'].fillna('')
    manufacturer = df['details_Manufacturer'].fillna('')
    df['text_features'] = title + ' ' + manufacturer
    return df

# 🏷️ Create label encoders
def create_label_encoders(df):
    """Fit one ``LabelEncoder`` per category level and encode the labels.

    Returns:
        A tuple ``(y, encoders)`` where ``y`` has one column of integer codes
        per category level (L0 through L4, in that order) and ``encoders``
        maps each column name to its fitted encoder, in the same order.
    """
    level_names = ('L0_category', 'L1_category', 'L2_category', 'L3_category', 'L4_category')

    # One independent encoder per hierarchy level, keyed by column name.
    encoders = {name: LabelEncoder() for name in level_names}

    # Fit and encode level by level; each result becomes one column of y.
    codes = [encoders[name].fit_transform(df[name]) for name in level_names]
    return np.column_stack(codes), encoders

# 🚀 Train the model
def train_model(X, y):
    """Fit and return a multi-output classifier (one logistic regression per label column)."""
    # fit() returns the fitted estimator itself, so it can be returned directly.
    return MultiOutputClassifier(LogisticRegression(max_iter=1000)).fit(X, y)

# 🛠️ Function to make predictions
def predict_categories(text, vectorizer, model, encoders):
    """Return the predicted category name at every level for one product text.

    The result maps each key of ``encoders`` to the decoded prediction. The
    i-th key of ``encoders`` is paired with the i-th model output, so the
    mapping must be in the same order as the label columns used for training.
    """
    # Vectorize the single text, then call .get() on the result
    # (presumably a GPU-to-host copy; confirm against the cuML/CuPy docs).
    features = vectorizer.transform(cudf.Series([text])).get()
    first_row = model.predict(features)[0]

    return {
        name: encoder.inverse_transform([first_row[position]])[0]
        for position, (name, encoder) in enumerate(encoders.items())
    }

# 📌 Main execution
def main():
    """Run the training pipeline end to end.

    Steps: load and merge the training CSVs, build the combined text column,
    fit a TF-IDF vectorizer, encode the category labels, hold out 20% of the
    rows for validation, train the multi-output classifier, print one
    classification report per category level, persist the model, vectorizer
    and encoders with joblib, and print an example prediction.
    """
    print("\n📂 Loading data...")
    df = load_data()

    print("\n🔍 Preprocessing text...")
    df = preprocess_text(df)

    print("\n📊 Creating TF-IDF vectors...")
    vectorizer = TfidfVectorizer(max_features=5000)  # GPU-accelerated
    X = vectorizer.fit_transform(df['text_features'])
    # Presumably copies the GPU matrix to host memory so the scikit-learn
    # utilities below can consume it (confirm against the cuML/CuPy docs).
    X = X.get()

    print("\n🏷️ Encoding labels...")
    y, encoders = create_label_encoders(df)

    print("\n✂️ Splitting data...")
    X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

    print("\n🚀 Training model...")
    model = train_model(X_train, y_train)

    print("\n📈 Evaluating model...")
    y_pred = model.predict(X_val)
    # The encoders dict is keyed by label column in the same order as the
    # columns of y, so reuse it instead of repeating the column list.
    label_columns = list(encoders)
    for i, col in enumerate(label_columns):
        print(f"\n🔹 Classification Report for {col}:")
        # zero_division=0 keeps the reported 0.00 scores for classes that are
        # never predicted but silences the per-class undefined-metric warnings.
        print(classification_report(y_val[:, i], y_pred[:, i], zero_division=0))

    print("\n💾 Saving model and vectorizer...")
    joblib.dump(model, 'product_classifier.joblib')
    joblib.dump(vectorizer, 'tfidf_vectorizer.joblib')
    joblib.dump(encoders, 'label_encoders.joblib')

    print("\n🔮 Example prediction:")
    test_text = "Nike Air Max Running Shoes for Men"
    predictions = predict_categories(test_text, vectorizer, model, encoders)
    print("\n🎯 Predicted categories for:", test_text)
    for category, prediction in predictions.items():
        print(f"{category}: {prediction}")

# Run the training pipeline only when this code is executed directly
# (as a script or notebook cell), not when it is imported as a module.
if __name__ == "__main__":
    main()



📂 Loading data...
Training data shape: (200000, 10)

🔍 Preprocessing text...

📊 Creating TF-IDF vectors...

🏷️ Encoding labels...

✂️ Splitting data...

🚀 Training model...

📈 Evaluating model...

🔹 Classification Report for L0_category:
              precision    recall  f1-score   support

           0       0.96      0.94      0.95      1663
           1       0.97      0.99      0.98     13258
           2       0.97      0.78      0.86       241
           3       0.95      0.95      0.95       998
           4       1.00      0.75      0.86         8
           5       0.00      0.00      0.00         3
           6       0.97      0.96      0.96      3931
           7       0.98      0.74      0.84       158
           8       0.89      0.86      0.87       769
           9       0.94      0.97      0.96      6100
          10       0.00      0.00      0.00         3
          11       0.92      0.88      0.90       548
          12       0.00      0.00      0.00         2
    

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

           0       1.00      1.00      1.00        27
           1       0.82      0.92      0.87       139
           2       0.00      0.00      0.00         2
           3       0.97      0.81      0.88        37
           4       0.85      0.90      0.87       710
           5       0.92      0.97      0.94       134
           6       0.86      0.75      0.80        32
           7       0.83      0.77      0.80        13
           8       0.00      0.00      0.00         4
           9       0.00      0.00      0.00         6
          10       0.47      0.78      0.58        18
          11       1.00      0.80      0.89        15
          12       0.00      0.00      0.00         4
          13       1.00      0.20      0.33        10
          14       1.00      0.50      0.67         2
          15       0.96      0.48      0.64        54
          16       0.00      0.00      0.00         2
          17       0.00    

In [None]:
# 📂 Read the test features that need category predictions
test_df = pd.read_csv("walmart-extremeclassification/test-features.csv")

# 💾 Restore the trained artifacts (classifier, TF-IDF vectorizer, label encoders).
# NOTE: joblib files are pickles; only load artifacts from a trusted source.
model, vectorizer, encoders = (
    joblib.load(artifact_path)
    for artifact_path in (
        "product_classifier.joblib",
        "tfidf_vectorizer.joblib",
        "label_encoders.joblib",
    )
)

# 🔮 Batch prediction function
def batch_predict(test_texts, vectorizer, model, encoders, batch_size=20000):
    """Predict decoded categories for many product descriptions.

    The texts are processed in chunks of at most ``batch_size`` rows so that
    only one chunk is vectorized and predicted at a time.

    Args:
        test_texts: Iterable of text strings to classify.
        vectorizer: Fitted TF-IDF vectorizer exposing ``transform``.
        model: Fitted multi-output classifier exposing ``predict``.
        encoders: Mapping of label column name to its fitted encoder. The
            i-th key is paired with the i-th column of the model output, so
            the order must match the label order used for training.
        batch_size: Maximum number of texts handled per chunk.

    Returns:
        A list with one dict per input text, mapping each label column name
        to the decoded category. Empty input yields an empty list.

    Raises:
        ValueError: If ``batch_size`` is not positive.
    """
    if batch_size <= 0:
        raise ValueError("batch_size must be a positive integer")

    texts = list(test_texts)
    columns = list(encoders)
    results = []

    for start in range(0, len(texts), batch_size):
        # Build a cuDF Series for this chunk, as the vectorizer is given
        # everywhere else in this file.
        chunk = cudf.Series(texts[start:start + batch_size])

        # Vectorize, then call .get() on the result (presumably a GPU-to-host
        # copy for the scikit-learn wrapper; confirm against the cuML docs).
        X = vectorizer.transform(chunk).get()
        predictions = np.asarray(model.predict(X))

        # Decode a whole column per call instead of one call per cell.
        decoded = [
            encoders[col].inverse_transform(predictions[:, i])
            for i, col in enumerate(columns)
        ]
        results.extend(
            {col: values[row] for col, values in zip(columns, decoded)}
            for row in range(predictions.shape[0])
        )

    return results

# 📝 Prepare text features for vectorization
test_df["text_features"] = (
    test_df["title"].fillna("") + " " + test_df["details_Manufacturer"].fillna("")
)

# 🚀 Predict in batch
predictions = batch_predict(test_df["text_features"].tolist(), vectorizer, model, encoders)

# 📊 Convert predictions to DataFrame
predictions_df = pd.DataFrame(predictions)

# 🏷️ Add necessary columns
submission_df = pd.concat([test_df[["walmart_id", "details_Manufacturer"]], predictions_df], axis=1)
submission_df.rename(columns={"details_Manufacturer": "details_Brand"}, inplace=True)

# 💾 Save to CSV
submission_df.to_csv("/kaggle/working/sample_submission.csv", index=False)
print("✅ Sample submission saved at /kaggle/working/sample_submission.csv")

✅ Sample submission saved at /kaggle/working/sample_submission.csv
