In [23]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.multioutput import MultiOutputClassifier
from xgboost import XGBClassifier

# Load your dataset
file_path = "Dataset/processed_file.csv"  
df = pd.read_csv(file_path)



# Extract relevant columns
df_filtered = df[['notable_effects', 'skintype', 'brand', 'product_type']].copy()

# Convert string lists to actual lists
df_filtered['notable_effects'] = df_filtered['notable_effects'].apply(eval)
df_filtered['skintype'] = df_filtered['skintype'].apply(eval)
df_filtered['product_type'] = df_filtered['product_type'].apply(eval)

# Convert categorical variables into binary format
mlb_effects = MultiLabelBinarizer()
mlb_skin = MultiLabelBinarizer()
mlb_brand = MultiLabelBinarizer()
mlb_product_type = MultiLabelBinarizer()

X_effects = mlb_effects.fit_transform(df_filtered['notable_effects'])
X_skin = mlb_skin.fit_transform(df_filtered['skintype'])
X_brand = mlb_brand.fit_transform(df_filtered['brand'])
y_product_type = mlb_product_type.fit_transform(df_filtered['product_type'])

# Combine features
X = pd.concat([
    pd.DataFrame(X_effects, columns=mlb_effects.classes_),
    pd.DataFrame(X_skin, columns=mlb_skin.classes_),
    pd.DataFrame(X_brand, columns=mlb_brand.classes_)
], axis=1)

# Split dataset
X_train, X_test, y_train, y_test = train_test_split(X, y_product_type, test_size=0.2, random_state=42)

# Define XGBoost classifier
optimized_xgb = MultiOutputClassifier(
    XGBClassifier(
        objective="binary:logistic",
        eval_metric="logloss",
        n_estimators=50,
        max_depth=5,
        learning_rate=0.1,
        subsample=0.8,
        colsample_bytree=0.8
    )
)

# Train model
optimized_xgb.fit(X_train, y_train)

# Evaluate model
accuracy = optimized_xgb.score(X_test, y_test)
print(f"Model Accuracy: {accuracy:.2%}")

# Function to predict product type
def predict_product_type(notable_effects, skintype, brand):
    input_data = {
        "notable_effects": [notable_effects],
        "skintype": [skintype],
        "brand": [brand]
    }
    
    input_df = pd.DataFrame(input_data)
    input_df['notable_effects'] = input_df['notable_effects'].apply(eval)
    input_df['skintype'] = input_df['skintype'].apply(eval)
    
    X_input_effects = mlb_effects.transform(input_df['notable_effects'])
    X_input_skin = mlb_skin.transform(input_df['skintype'])
    X_input_brand = mlb_brand.transform(input_df['brand'])
    
    X_input = pd.concat([
        pd.DataFrame(X_input_effects, columns=mlb_effects.classes_),
        pd.DataFrame(X_input_skin, columns=mlb_skin.classes_),
        pd.DataFrame(X_input_brand, columns=mlb_brand.classes_)
    ], axis=1)
    
    predicted = optimized_xgb.predict(X_input)
    predicted_product_types = mlb_product_type.inverse_transform(predicted)
    
    return predicted_product_types

# Example usage
example_effects = ["hydrating", "soothing"]
example_skin = ["dry"]
example_brand = ["BrandX"]

print("Predicted Product Type:", predict_product_type(example_effects, example_skin, example_brand))



Model Accuracy: 40.50%


TypeError: eval() arg 1 must be a string, bytes or code object

Rain Forest

In [27]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.preprocessing import LabelEncoder
from category_encoders import TargetEncoder
import xgboost as xgb
from sklearn.metrics import accuracy_score

# Load your dataset
file_path = "Dataset/processed_file.csv"  
df = pd.read_csv(file_path)

# Extract relevant columns
df_filtered = df[['notable_effects', 'skintype', 'brand', 'product_type']].copy()

# Convert string lists to actual lists
df_filtered['notable_effects'] = df_filtered['notable_effects'].apply(eval)
df_filtered['skintype'] = df_filtered['skintype'].apply(eval)
df_filtered['product_type'] = df_filtered['product_type'].apply(eval)

# Encode 'product_type' as classification labels
le_product_type = LabelEncoder()
df_filtered['product_type'] = le_product_type.fit_transform(df_filtered['product_type'].astype(str))

# Feature Engineering
all_effects = list(set(effect for sublist in df_filtered['notable_effects'] for effect in sublist))
all_skin_types = list(set(st for sublist in df_filtered['skintype'] for st in sublist))

def encode_multilabel(column, all_labels):
    return df_filtered[column].apply(lambda x: [1 if label in x else 0 for label in all_labels])

# Encode 'notable_effects' and 'skintype' as numerical features
df_encoded = pd.DataFrame(encode_multilabel('notable_effects', all_effects).tolist(), columns=all_effects)
df_encoded = pd.concat([df_encoded, pd.DataFrame(encode_multilabel('skintype', all_skin_types).tolist(), columns=all_skin_types)], axis=1)

# Target Encoding for 'brand'
te = TargetEncoder()
df_encoded['brand'] = te.fit_transform(df_filtered['brand'], df_filtered['product_type'])

# Define features and target
X = df_encoded
y = df_filtered['product_type']

# Split dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# XGBoost model with optimized parameters
xgb_model = xgb.XGBClassifier(
    n_estimators=500,
    max_depth=15,
    learning_rate=0.05,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42,
    eval_metric="mlogloss"
)

# Train model
xgb_model.fit(X_train, y_train)

# Predict
y_pred = xgb_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Optimized XGBoost Model Accuracy: {accuracy:.2%}")

# Function to predict product type
def predict_product_type(notable_effects, skintype, brand):
    input_data = {}
    for effect in all_effects:
        input_data[effect] = 1 if effect in notable_effects else 0
    for skin in all_skin_types:
        input_data[skin] = 1 if skin in skintype else 0
    input_data['brand'] = te.transform([brand])[0]
    
    input_df = pd.DataFrame([input_data])
    predicted = xgb_model.predict(input_df)
    return le_product_type.inverse_transform(predicted)

# Example usage
example_effects = ["hydrating", "soothing"]
example_skin = ["dry"]
example_brand = "BrandX"

print("Predicted Product Type:", predict_product_type(example_effects, example_skin, example_brand))


Optimized XGBoost Model Accuracy: 56.20%


KeyError: 'brand'

In [31]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.metrics import accuracy_score

# Load dataset
file_path = "Dataset/processed_file.csv"
df = pd.read_csv(file_path)

# Selecting relevant features
X = df[['brand']].copy()

# Encoding categorical variables
X = pd.get_dummies(X, columns=['brand'])

# Adding binary encoded columns for notable effects and skin type
notable_effects_columns = [col for col in df.columns if col.startswith('notable_effects_')]
skin_type_columns = ['Sensitive', 'Combination', 'Oily', 'Dry', 'Normal']
X[notable_effects_columns + skin_type_columns] = df[notable_effects_columns + skin_type_columns]

# Scaling numerical features using MinMaxScaler
scaler = MinMaxScaler()
X_scaled = scaler.fit_transform(X)

# Encoding product_type
y = LabelEncoder().fit_transform(df['product_type'].astype(str))

# Splitting dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

# Training Random Forest Classifier
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)
rf_classifier.fit(X_train, y_train)

# Making predictions
y_pred = rf_classifier.predict(X_test)

# Evaluating the model
accuracy = accuracy_score(y_test, y_pred)
print(f'Random Forest Accuracy: {accuracy:.2f}')


Random Forest Accuracy: 0.58


In [32]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.metrics import accuracy_score

# Load dataset
file_path = "Dataset/processed_file.csv"
df = pd.read_csv(file_path)

# Selecting relevant features
X = df[['brand']].copy()

# Encoding categorical variables
X = pd.get_dummies(X, columns=['brand'])

# Adding binary encoded columns for notable effects and skin type
notable_effects_columns = [col for col in df.columns if col.startswith('notable_effects_')]
skin_type_columns = ['Sensitive', 'Combination', 'Oily', 'Dry', 'Normal']
X[notable_effects_columns + skin_type_columns] = df[notable_effects_columns + skin_type_columns]

# Scaling numerical features using MinMaxScaler
scaler = MinMaxScaler()
X_scaled = scaler.fit_transform(X)

# Encoding product_type
y = LabelEncoder().fit_transform(df['product_type'].astype(str))

# Splitting dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

# Training Support Vector Machine Classifier
svm_classifier = SVC(kernel='linear', random_state=42)
svm_classifier.fit(X_train, y_train)

# Making predictions
y_pred = svm_classifier.predict(X_test)

# Evaluating the model
accuracy = accuracy_score(y_test, y_pred)
print(f'SVM Accuracy: {accuracy:.2f}')


SVM Accuracy: 0.55
