Rain Forest

In [9]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
import warnings
from sklearn.preprocessing import OneHotEncoder, LabelEncoder

# Load the dataset
df = pd.read_csv("Dataset/processed_file.csv")
# Store text-based columns separately for recommendations
text_columns = ['product_name', 'brand', 'product_type', 'description', 'product_href', 'picture_src']
text_data = df[text_columns]
# Ignore warnings from OneHotEncoder
warnings.simplefilter(action='ignore', category=FutureWarning)
# Encode categorical variables
encoder = OneHotEncoder(sparse_output=False, drop='first')

categorical_cols = ['product_type', 'brand', 'notable_effects']
encoded_features = pd.DataFrame(encoder.fit_transform(df[categorical_cols]))
encoded_features.columns = encoder.get_feature_names_out(categorical_cols)

# Label encode skintype if it exists
df['skintype'] = LabelEncoder().fit_transform(df['skintype']) if 'skintype' in df else df['skintype']

# Merge encoded features back to the dataset
df = df.drop(columns=categorical_cols).join(encoded_features)

# Define features (X) and target (y), keeping only numeric columns for training
X = df.select_dtypes(include=['number']).drop(columns=['labels'])  # Features: skin type, price, and notable effects
y = df['labels']  # Target: Product category labels

# Split dataset into training (80%) and testing (20%)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train Random Forest model
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)

# Predict on test set
y_pred = rf_model.predict(X_test)

# Evaluate model accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"Random Forest Model Accuracy: {accuracy:.2%}")

# Function to retrieve product recommendations
def recommend_products(predictions, df, text_data):
    recommended_products = df[df['labels'].isin(predictions)].reset_index(drop=True)
    return recommended_products[text_columns]


# Example usage (for the test set predictions)
recommended = recommend_products(y_pred, df, text_data)
print(recommended.head())


Random Forest Model Accuracy: 88.84%


KeyError: "['brand', 'product_type'] not in index"