In [1]:
# Import necessary libraries
import pandas as pd
from catboost import CatBoostClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns
import joblib



In [2]:
# Load the dataset
train_df = pd.read_parquet(r'C:\Users\aarek\OneDrive\문서\DCU\ETSY\data\data\2024\train')
test_df = pd.read_parquet(r'C:\Users\aarek\OneDrive\문서\DCU\ETSY\data\data\2024\test')



In [3]:
# Identify all potential categorical features
categorical_features = ['type', 'room', 'craft_type', 'recipient', 'material', 'occasion', 'holiday', 
                        'art_subject', 'style', 'shape', 'pattern', 'title', 'description', 'tags']  # Adjust if there are more categorical features



In [4]:
# Convert all potential categorical features to 'category' data type
for feature in categorical_features:
    train_df[feature] = train_df[feature].astype('category')
    test_df[feature] = test_df[feature].astype('category')



In [5]:
# Split data into features and target
X_train, X_val, y_train, y_val = train_test_split(train_df.drop(columns=['top_category_id', 'top_category_text', 
                                                                         'bottom_category_id', 'bottom_category_text', 
                                                                         'primary_color_id', 'primary_color_text', 
                                                                         'secondary_color_id', 'secondary_color_text', 
                                                                         'image/encoded']),
                                                  train_df['bottom_category_id'], test_size=0.2, random_state=42)



In [6]:
# Train the model with the specified categorical features
model = CatBoostClassifier(
    iterations=1000,
    learning_rate=0.1,
    depth=10,
    cat_features=categorical_features,
    eval_metric='TotalF1',
    verbose=100
)
model.fit(X_train, y_train, eval_set=(X_val, y_val), plot=True)



MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

CatBoostError: bad allocation

In [None]:
# Predict and evaluate the model
y_pred = model.predict(X_val)
print(classification_report(y_val, y_pred))



In [None]:
# Confusion Matrix
cm = confusion_matrix(y_val, y_pred)
plt.figure(figsize=(10,7))
sns.heatmap(cm, annot=True, fmt='g')
plt.title('Confusion Matrix')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.show()



In [None]:
# Save the model and predictions
joblib.dump(model, 'bottom_category_id_catboost_model.pkl')
predictions = model.predict(test_df.drop(columns=['image/encoded']))
test_df['predicted_bottom_category_id'] = predictions
test_df[['product_id', 'predicted_bottom_category_id']].to_csv('bottom_category_id_predictions.csv', index=False)

print("Model and predictions saved successfully!")
