In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.metrics import accuracy_score, classification_report
from catboost import CatBoostClassifier

# Load dataset
file_path = "Dataset/processed_file.csv"
df = pd.read_csv(file_path)

# Build the feature matrix in one step: one-hot encoded brand followed by the
# notable-effect and skin-type columns taken as-is from the CSV
# (presumably 0/1 indicators -- confirm against the preprocessing step).
notable_effects_columns = [col for col in df.columns if col.startswith('notable_effects_')]
skin_type_columns = ['Sensitive', 'Combination', 'Oily', 'Dry', 'Normal']
indicator_columns = notable_effects_columns + skin_type_columns
X = pd.concat(
    [pd.get_dummies(df[['brand']], columns=['brand']), df[indicator_columns]],
    axis=1,
)

# Encoding product_type. The fitted encoder is kept so the integer labels can
# be mapped back to the original class names when reporting.
# NOTE(review): astype(str) can turn missing product_type values into a literal
# "nan" class; confirm the CSV has no missing targets.
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(df['product_type'].astype(str))
class_names = list(label_encoder.classes_)

# Splitting dataset into training and testing sets. Stratifying keeps the
# class proportions the same in both partitions.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Scaling features using MinMaxScaler. The scaler is fitted on the training
# partition only so that no statistics from the test rows leak into training.
scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)
# Full scaled matrix, kept under its original name for any later cell that uses it.
X_scaled = scaler.transform(X)

# Hold out part of the training data for early stopping. Using the test set as
# eval_set would let it choose the best iteration and inflate the final score.
X_fit, X_val, y_fit, y_val = train_test_split(
    X_train, y_train, test_size=0.2, random_state=42, stratify=y_train
)

# Training CatBoost Classifier
catboost_classifier = CatBoostClassifier(
    iterations=2000,  # Upper bound; early stopping normally ends training sooner
    depth=8,
    learning_rate=0.05,
    l2_leaf_reg=3,  # L2 regularization on leaf values
    loss_function='MultiClass',
    random_seed=42,
    eval_metric="Accuracy",
    verbose=200,
)

catboost_classifier.fit(
    X_fit,
    y_fit,
    eval_set=(X_val, y_val),
    early_stopping_rounds=50,
    use_best_model=True,
)

# Making predictions on the untouched test set
y_pred = catboost_classifier.predict(X_test)

# Evaluating the model
accuracy = accuracy_score(y_test, y_pred)
print(f'CatBoost Accuracy: {accuracy:.2f}')
print("Classification Report:")
# Passing explicit labels keeps the report aligned with class_names even if a
# class happens to be absent from y_test or y_pred.
print(
    classification_report(
        y_test,
        y_pred,
        labels=list(range(len(class_names))),
        target_names=class_names,
    )
)


0:	learn: 1.5621552	test: 1.5573345	best: 1.5573345 (0)	total: 3.59ms	remaining: 3.58s
200:	learn: 0.8859542	test: 1.0636321	best: 1.0636321 (200)	total: 589ms	remaining: 2.34s
400:	learn: 0.7577501	test: 1.0421580	best: 1.0418915 (390)	total: 1.21s	remaining: 1.8s
Stopped by overfitting detector  (50 iterations wait)

bestTest = 1.041891516
bestIteration = 390

Shrink model to first 391 iterations.
CatBoost Accuracy: 0.60
Classification Report:
              precision    recall  f1-score   support

           0       0.74      0.50      0.60        40
           1       0.41      0.39      0.40        41
           2       0.58      0.67      0.62        79
           3       0.88      0.83      0.85        42
           4       0.45      0.50      0.48        40

    accuracy                           0.60       242
   macro avg       0.61      0.58      0.59       242
weighted avg       0.61      0.60      0.60       242

