In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer  # CPU version
from sklearn.linear_model import LogisticRegression      # CPU version
from sklearn.multioutput import MultiOutputClassifier
from sklearn.metrics import classification_report
import joblib

#############################################
# 1. Data Loading and Preprocessing
#############################################
def load_data(features_path='train-features.csv', labels_path='train-labels.csv'):
    """
    Load the training features and labels and join them into one DataFrame.

    Parameters
    ----------
    features_path : str or path-like, default 'train-features.csv'
        CSV file with the product features; must contain a 'walmart_id' column.
    labels_path : str or path-like, default 'train-labels.csv'
        CSV file with the target labels; must contain a 'walmart_id' column.

    Returns
    -------
    pandas.DataFrame
        The two files merged on 'walmart_id'. pd.merge is called with its
        default join type (inner), so ids present in only one file are dropped.
    """
    train_features = pd.read_csv(features_path)
    train_labels = pd.read_csv(labels_path)
    df = pd.merge(train_features, train_labels, on='walmart_id')
    print("Training data shape:", df.shape)
    return df

def preprocess_text(df):
    """
    Add a lower-cased 'text_features' column built from three text columns.

    The layout is "<title> <manufacturer> <manufacturer> <store>": the
    details_Manufacturer text appears twice so that manufacturer tokens carry
    more weight (useful for brand prediction). Missing values in any of the
    three columns are treated as empty strings.

    The input frame is modified in place and also returned.
    """
    title_part = df['title'].fillna('')
    # Each manufacturer copy carries its own trailing space as a separator.
    manufacturer_part = df['details_Manufacturer'].fillna('') + ' '
    store_part = df['store'].fillna('')

    combined = title_part + ' ' + manufacturer_part + manufacturer_part + store_part
    df['text_features'] = combined.str.lower()
    return df

# Training frame: merged features + labels, with the combined lower-cased
# 'text_features' column added by preprocess_text.
df = preprocess_text(load_data())

In [None]:
#############################################
# 2. TF‑IDF Vectorization
#############################################
# Use a moderate feature space to balance nuance with memory
vectorizer = TfidfVectorizer(max_features=5000, ngram_range=(1,2), 
                             stop_words='english', sublinear_tf=True)
X_sparse = vectorizer.fit_transform(df['text_features'])
# Keep the matrix sparse and only downcast to float32. Calling .toarray()
# would allocate a dense n_samples x n_features array, which is
# overwhelmingly zeros for TF-IDF output; train_test_split and the
# scikit-learn estimators used below accept SciPy sparse matrices directly.
X = X_sparse.astype(np.float32)
print("TF-IDF matrix shape:", X.shape)



In [None]:
#############################################
# 3. Label Encoding for All Targets
#############################################
# Six targets in total:
# - main_cols are predicted together by one multi-output model
# - L3_category and L4_category each get their own model
all_labels = ['L0_category', 'L1_category', 'L2_category', 'L3_category', 'L4_category', 'details_Brand']
main_cols = ['L0_category', 'L1_category', 'L2_category', 'details_Brand']

# Replace every target column in df with its integer codes and keep the fitted
# encoder so predictions can be mapped back to the original labels later.
# NOTE: this overwrites df in place, so the cell is not safe to run twice on
# the same frame (a second run would fit the encoders on the integer codes).
encoders = {}
for target_name in all_labels:
    target_encoder = LabelEncoder()
    df[target_name] = target_encoder.fit_transform(df[target_name])
    encoders[target_name] = target_encoder

# Label arrays: one 2-D array for the main targets, 1-D arrays for L3 / L4.
y_main = np.column_stack([df[target_name] for target_name in main_cols])
y_L3 = df['L3_category'].values
y_L4 = df['L4_category'].values



In [None]:
#############################################
# 4. Train/Test Split
#############################################
# Row positions are split together with X so that every label array can be
# sliced with exactly the same train/validation partition.
indices = np.arange(X.shape[0])
X_train, X_val, idx_train, idx_val = train_test_split(
    X, indices, test_size=0.2, random_state=42
)

# Main targets: one column per entry of main_cols, taken from the encoded df.
y_main_train, y_main_val = (
    np.column_stack([df[name].values[part] for name in main_cols])
    for part in (idx_train, idx_val)
)

# Deeper category levels, each handled by its own model.
y_L3_train, y_L3_val = y_L3[idx_train], y_L3[idx_val]
y_L4_train, y_L4_val = y_L4[idx_train], y_L4[idx_val]



In [None]:
#############################################
# 5. Train Main Multi‑Output Model for Main Outputs
#############################################
# One logistic-regression configuration is wrapped in MultiOutputClassifier,
# which is then fitted against all columns of y_main_train (the main_cols
# targets) in a single call.
base_classifier = LogisticRegression(C=20.0, max_iter=1000)
model_main = MultiOutputClassifier(estimator=base_classifier)
model_main.fit(X_train, y_main_train)



In [None]:
#############################################
# 6. Train Separate Models for L3 and L4
#############################################
# L3 and L4 use weaker regularisation (C=50.0 instead of the 20.0 used for
# the main outputs) to give these models extra flexibility.
model_L3 = LogisticRegression(C=50.0, max_iter=1000)
model_L4 = LogisticRegression(C=50.0, max_iter=1000)

model_L3.fit(X_train, y_L3_train)
model_L4.fit(X_train, y_L4_train)


In [None]:
#############################################
# 7. Evaluate the Models on the Validation Set
#############################################
# Main outputs: one report per target column (columns follow main_cols order).
print("\n--- Main Outputs Evaluation ---")
y_main_pred = model_main.predict(X_val)
for col, truth, guess in zip(main_cols, y_main_val.T, y_main_pred.T):
    print(f"\nClassification Report for {col}:")
    print(classification_report(truth, guess))

# L3 and L4 are scored with their dedicated single-output models.
print("\n--- L3 Evaluation ---")
y_L3_pred = model_L3.predict(X_val)
print(classification_report(y_L3_val, y_L3_pred))

print("\n--- L4 Evaluation ---")
y_L4_pred = model_L4.predict(X_val)
print(classification_report(y_L4_val, y_L4_pred))



In [None]:
#############################################
# 8. Save the Models and Vectorizer
#############################################
# Everything needed for inference: the three models, the fitted vectorizer,
# and the label encoders used to decode predictions.
artifacts = (
    (model_main, 'product_classifier_main.joblib'),
    (model_L3, 'product_classifier_L3.joblib'),
    (model_L4, 'product_classifier_L4.joblib'),
    (vectorizer, 'tfidf_vectorizer.joblib'),
    (encoders, 'label_encoders.joblib'),
)
for artifact, filename in artifacts:
    joblib.dump(artifact, filename)
print("\nModels, vectorizer, and encoders saved.")



In [None]:
#############################################
# 9. Inference Example Function
#############################################
def predict_categories(text, vectorizer, model_main, model_L3, model_L4, encoders,
                       main_cols=('L0_category', 'L1_category', 'L2_category', 'details_Brand')):
    """
    Predict all six targets for a single piece of product text.

    Parameters
    ----------
    text : str
        Raw product text; it is passed to `vectorizer.transform` as a
        one-element list.
    vectorizer : fitted vectorizer
        Object whose `transform` returns a matrix exposing `.toarray()`.
    model_main : fitted multi-output model
        Its `predict` must return one row with one code per entry of
        `main_cols`, in the same order.
    model_L3, model_L4 : fitted single-output models
        Predict the encoded L3_category / L4_category.
    encoders : dict
        Maps each target name to an encoder providing `inverse_transform`.
    main_cols : sequence of str, optional
        Names of the targets produced by `model_main`, in output order.
        Passed explicitly (with the notebook's four main targets as default)
        so the function does not depend on a notebook-level global and can be
        used with artifacts reloaded in a fresh process.

    Returns
    -------
    dict
        Decoded label per target: the `main_cols` entries first, followed by
        'L3_category' and 'L4_category'.
    """
    X_new = vectorizer.transform([text]).toarray().astype(np.float32)

    # Single input row, so only the first row of each prediction is used.
    main_codes = model_main.predict(X_new)[0]
    code_L3 = model_L3.predict(X_new)[0]
    code_L4 = model_L4.predict(X_new)[0]

    results = {}
    for col, code in zip(main_cols, main_codes):
        results[col] = encoders[col].inverse_transform([code])[0]
    results['L3_category'] = encoders['L3_category'].inverse_transform([code_L3])[0]
    results['L4_category'] = encoders['L4_category'].inverse_transform([code_L4])[0]
    return results

# Smoke-test the inference helper on one hand-written product title.
test_text = "Nike Air Max Running Shoes for Men"
predictions = predict_categories(
    text=test_text,
    vectorizer=vectorizer,
    model_main=model_main,
    model_L3=model_L3,
    model_L4=model_L4,
    encoders=encoders,
)
print("\nPredicted categories for:", test_text)
for target, label in predictions.items():
    print(f"{target}: {label}")



In [None]:
#############################################
# 10. Batch Prediction and Submission File Creation
#############################################
test_df = pd.read_csv("test-features.csv")
# Reuse the training-time text builder so train and test features cannot
# drift apart (it adds the lower-cased 'text_features' column).
test_df = preprocess_text(test_df)

# Keep the test matrix sparse; the fitted models' predict methods accept
# sparse input, so there is no need to materialise a dense array.
X_test = vectorizer.transform(test_df["text_features"]).astype(np.float32)
preds_main = model_main.predict(X_test)
preds_L3 = model_L3.predict(X_test)
preds_L4 = model_L4.predict(X_test)

# Decode one whole column per inverse_transform call instead of calling it
# once per row and target. Column order: main_cols, then L3, then L4.
decoded = {
    col: encoders[col].inverse_transform(preds_main[:, j])
    for j, col in enumerate(main_cols)
}
decoded['L3_category'] = encoders['L3_category'].inverse_transform(preds_L3)
decoded['L4_category'] = encoders['L4_category'].inverse_transform(preds_L4)

predictions_df = pd.DataFrame(decoded, index=test_df.index)
submission_df = pd.concat([test_df[["walmart_id"]], predictions_df], axis=1)
submission_df.to_csv("submission.csv", index=False)
print("Submission file saved as submission.csv.")

# Optionally, if using Google Colab, download the file. Outside Colab the
# google.colab package is not installed, so the download step is skipped
# instead of failing the cell after the submission has been written.
try:
    from google.colab import files
except ImportError:
    files = None
if files is not None:
    files.download("submission.csv")