New methods: predicting product type with Random Forest, Naïve Bayes, and Gradient Boosting

In [7]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, classification_report

# --- Load the cleaned dataset -------------------------------------------------
# Relative path to the CSV; edit this if your copy lives somewhere else.
file_path = "Dataset/cleaned_dataset03.csv"
df = pd.read_csv(file_path)

# --- Rebuild a single target column -------------------------------------------
# The target is spread over several 'product_type_<label>' columns. For each
# row, pick the column holding the largest value, strip the shared prefix to
# get the label, and then drop the now-redundant indicator columns.
# NOTE(review): idxmax resolves ties by position, so a row with no indicator
# set would presumably receive the first label - confirm every row in the CSV
# has exactly one 'product_type_*' flag.
product_type_cols = [col for col in df.columns if col.startswith('product_type_')]
winning_col = df[product_type_cols].idxmax(axis=1)
df = (
    df
    .assign(product_type=winning_col.str.replace('product_type_', ''))
    .drop(columns=product_type_cols)
)

# --- Assemble features (X) and target (Y) -------------------------------------
# Features: the brand, five fixed columns, and every 'notable_effects_*' column.
base_feature_cols = ['brand', 'Sensitive', 'Combination', 'Oily', 'Dry', 'Normal']
effect_cols = [name for name in df.columns if name.startswith('notable_effects_')]
Y = df['product_type']

# 'brand' is categorical: expand it into dummy columns, dropping the first level.
X = pd.get_dummies(df[base_feature_cols + effect_cols], columns=['brand'], drop_first=True)

# --- Hold out 20% of the rows for testing (fixed seed for repeatability) ------
# NOTE(review): the split is not stratified; consider stratify=Y if the class
# sizes differ a lot. Left unchanged here so the reported numbers still apply.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=42,
)

# Baseline classifiers: both ensembles use 100 estimators and a fixed seed.
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
nb_model = GaussianNB()
boost_model = GradientBoostingClassifier(n_estimators=100, random_state=42)
baseline_models = (rf_model, nb_model, boost_model)

# Fit every classifier on the same training split.
for estimator in baseline_models:
    estimator.fit(X_train, y_train)

# Predict the held-out rows with each fitted classifier.
rf_preds, nb_preds, boost_preds = [
    estimator.predict(X_test) for estimator in baseline_models
]

# Accuracy of each classifier on the test split.
rf_acc, nb_acc, boost_acc = [
    accuracy_score(y_test, predicted)
    for predicted in (rf_preds, nb_preds, boost_preds)
]

# Print the accuracy followed by the per-class report, one model at a time.
for heading, model_acc, model_preds in (
    ("Random Forest Accuracy:", rf_acc, rf_preds),
    ("Naïve Bayes Accuracy:", nb_acc, nb_preds),
    ("Gradient Boosting Accuracy:", boost_acc, boost_preds),
):
    print(heading, model_acc)
    print(classification_report(y_test, model_preds))


Random Forest Accuracy: 0.5661157024793388
              precision    recall  f1-score   support

   face wash       0.55      0.45      0.49        40
 moisturizer       0.37      0.32      0.34        41
       serum       0.64      0.71      0.67        79
   sunscreen       0.86      0.86      0.86        42
       toner       0.32      0.35      0.33        40

    accuracy                           0.57       242
   macro avg       0.55      0.54      0.54       242
weighted avg       0.56      0.57      0.56       242

Naïve Bayes Accuracy: 0.33884297520661155
              precision    recall  f1-score   support

   face wash       0.23      0.82      0.36        40
 moisturizer       0.25      0.07      0.11        41
       serum       0.58      0.27      0.37        79
   sunscreen       0.54      0.60      0.57        42
       toner       0.00      0.00      0.00        40

    accuracy                           0.34       242
   macro avg       0.32      0.35      0.28   

In [8]:
from sklearn.model_selection import GridSearchCV

# Candidate values explored for the Random Forest.
rf_param_grid = {
    # number of trees in the forest
    'n_estimators': [100, 200, 300],
    # maximum depth of each tree
    'max_depth': [10, 15, 20],
    # minimum samples required to split a node
    'min_samples_split': [2, 4, 6],
    # minimum samples required at a leaf
    'min_samples_leaf': [1, 2, 4],
    # rule for how many features are considered per split
    'max_features': ['sqrt', 'log2'],
}

# Candidate values explored for Gradient Boosting.
boost_param_grid = {
    # number of boosting iterations
    'n_estimators': [100, 200, 300],
    # step size applied to each iteration
    'learning_rate': [0.01, 0.05, 0.1],
    # depth of the individual trees
    'max_depth': [3, 5, 7],
    # fraction of rows sampled per iteration (values below 1.0 reduce overfitting)
    'subsample': [0.8, 1.0],
}

# Unfitted template estimators for the searches, both with a fixed seed.
rf_model = RandomForestClassifier(random_state=42)
boost_model = GradientBoostingClassifier(random_state=42)

# Options shared by both searches: 3-fold CV, accuracy scoring, n_jobs=-1.
search_options = {'cv': 3, 'scoring': 'accuracy', 'n_jobs': -1}

# Exhaustive search over the Random Forest grid, using the training split only.
rf_grid = GridSearchCV(rf_model, rf_param_grid, **search_options)
rf_grid.fit(X_train, y_train)

# Exhaustive search over the Gradient Boosting grid, same training split.
boost_grid = GridSearchCV(boost_model, boost_param_grid, **search_options)
boost_grid.fit(X_train, y_train)

# Report the winning parameters and their score for each search.
# best_score_ is the cross-validated score of the best candidate, measured on
# the training split - it is not the hold-out test accuracy.
for caption, value in (
    ("Best Random Forest Parameters:", rf_grid.best_params_),
    ("Best Random Forest Accuracy:", rf_grid.best_score_),
    ("Best Gradient Boosting Parameters:", boost_grid.best_params_),
    ("Best Gradient Boosting Accuracy:", boost_grid.best_score_),
):
    print(caption, value)

# Reuse the tuned models that the grid searches have already trained.
# Both searches were created with the default refit=True, so after fit() each
# one has cloned its template estimator (random_state=42), applied
# best_params_, and fitted the result on the whole of X_train / y_train.
# best_estimator_ is therefore the same model that rebuilding a classifier
# from best_params_ and calling fit() again would produce; taking it directly
# avoids a redundant second training run and keeps the evaluated model and
# the searched model from ever drifting apart.
rf_best = rf_grid.best_estimator_
boost_best = boost_grid.best_estimator_

# Predictions on the held-out test split (not used during the search)
rf_preds = rf_best.predict(X_test)
boost_preds = boost_best.predict(X_test)

# Evaluation: hold-out accuracy plus the per-class precision/recall/F1 report
print("Tuned Random Forest Accuracy:", accuracy_score(y_test, rf_preds))
print(classification_report(y_test, rf_preds))

print("Tuned Gradient Boosting Accuracy:", accuracy_score(y_test, boost_preds))
print(classification_report(y_test, boost_preds))


  _data = np.array(data, dtype=dtype, copy=copy,


Best Random Forest Parameters: {'max_depth': 15, 'max_features': 'log2', 'min_samples_leaf': 1, 'min_samples_split': 6, 'n_estimators': 300}
Best Random Forest Accuracy: 0.5537180547276118
Best Gradient Boosting Parameters: {'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 200, 'subsample': 0.8}
Best Gradient Boosting Accuracy: 0.5454396861719516
Tuned Random Forest Accuracy: 0.6074380165289256
              precision    recall  f1-score   support

   face wash       0.64      0.40      0.49        40
 moisturizer       0.53      0.41      0.47        41
       serum       0.60      0.77      0.67        79
   sunscreen       0.84      0.86      0.85        42
       toner       0.42      0.42      0.42        40

    accuracy                           0.61       242
   macro avg       0.61      0.57      0.58       242
weighted avg       0.61      0.61      0.60       242

Tuned Gradient Boosting Accuracy: 0.5702479338842975
              precision    recall  f1-score   support

