In [5]:
# Import necessary libraries
import warnings
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV, StratifiedKFold, train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score
import pandas as pd

# Suppress warnings
warnings.filterwarnings("ignore", category=UserWarning)
warnings.filterwarnings("ignore", category=FutureWarning)

# Load the dataset
file_path = 'C:/Users/gunde/OneDrive/Desktop/project/Dataset/food_ingredients_and_allergens(final).csv'
data = pd.read_csv(file_path)

# Prepare data
X = data.drop(columns=['Prediction'])
y = data['Prediction']

# Encode categorical features and target
X_encoded = X.apply(LabelEncoder().fit_transform)
y_encoded = LabelEncoder().fit_transform(y)

# Perform 80:20 train-test split
X_train_80, X_test_80, y_train_80, y_test_80 = train_test_split(X_encoded, y_encoded, test_size=0.2, random_state=42)

# Define the parameter grid for Random Forest
param_grid = {
    'n_estimators': [50, 100, 150],
    'max_depth': [10, 20, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Initialize GridSearchCV with Random Forest and parameter grid, using StratifiedKFold with 3 splits
grid_search = GridSearchCV(
    estimator=RandomForestClassifier(random_state=42),
    param_grid=param_grid,
    scoring='accuracy',
    cv=StratifiedKFold(n_splits=3),  # Use 3-fold cross-validation to avoid the warning
    n_jobs=-1,
    verbose=1
)

# Fit GridSearchCV using the 80:20 training data split
grid_search.fit(X_train_80, y_train_80)

# Get the best parameters and accuracy from the grid search
best_params = grid_search.best_params_
best_score = grid_search.best_score_

# Train the best model on the full 80:20 training data and test it
best_model = grid_search.best_estimator_
train_accuracy = accuracy_score(y_train_80, best_model.predict(X_train_80)) * 100
test_accuracy = accuracy_score(y_test_80, best_model.predict(X_test_80)) * 100

# Print results
print("Best Parameters:", best_params)
print("Best Cross-Validated Score:", best_score)
print("Training Accuracy:", train_accuracy)
print("Testing Accuracy:", test_accuracy)



Fitting 3 folds for each of 81 candidates, totalling 243 fits
Best Parameters: {'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 50}
Best Cross-Validated Score: 0.9811614647622408
Training Accuracy: 99.37304075235109
Testing Accuracy: 98.75


In [6]:
# Import necessary libraries
import warnings
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score
import pandas as pd

# Suppress warnings
warnings.filterwarnings("ignore", category=UserWarning)
warnings.filterwarnings("ignore", category=FutureWarning)

# Load the dataset
file_path = 'C:/Users/gunde/OneDrive/Desktop/project/Dataset/food_ingredients_and_allergens(final).csv'
data = pd.read_csv(file_path)

# Prepare data
X = data.drop(columns=['Prediction'])
y = data['Prediction']

# Encode categorical features and target
X_encoded = X.apply(LabelEncoder().fit_transform)
y_encoded = LabelEncoder().fit_transform(y)

# Perform 80:20 train-test split
X_train, X_test, y_train, y_test = train_test_split(X_encoded, y_encoded, test_size=0.2, random_state=42)

# Best parameters from previous GridSearchCV
best_params = {
    'n_estimators': 100,   # Example: replace with grid_search.best_params_['n_estimators']
    'max_depth': 20,       # Example: replace with grid_search.best_params_['max_depth']
    'min_samples_split': 5,# Example: replace with grid_search.best_params_['min_samples_split']
    'min_samples_leaf': 2  # Example: replace with grid_search.best_params_['min_samples_leaf']
}

# Initialize and train a new RandomForestClassifier with best parameters
best_model = RandomForestClassifier(
    random_state=42,
    **best_params
)
best_model.fit(X_train, y_train)

# Evaluate the retrained model on both training and test sets
train_accuracy = accuracy_score(y_train, best_model.predict(X_train)) * 100
test_accuracy = accuracy_score(y_test, best_model.predict(X_test)) * 100

# Print retrained model results
print("Retrained Model - Training Accuracy:", train_accuracy)
print("Retrained Model - Testing Accuracy:", test_accuracy)


Retrained Model - Training Accuracy: 98.7460815047022
Retrained Model - Testing Accuracy: 98.75


In [7]:
# Import necessary libraries
import warnings
import pickle  # Import pickle for saving the model
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score
import pandas as pd

# Suppress warnings
warnings.filterwarnings("ignore", category=UserWarning)
warnings.filterwarnings("ignore", category=FutureWarning)

# Load the dataset
file_path = 'C:/Users/gunde/OneDrive/Desktop/project/Dataset/food_ingredients_and_allergens(final).csv'
data = pd.read_csv(file_path)

# Prepare data
X = data.drop(columns=['Prediction'])
y = data['Prediction']

# Encode categorical features and target
X_encoded = X.apply(LabelEncoder().fit_transform)
y_encoded = LabelEncoder().fit_transform(y)

# Perform 80:20 train-test split
X_train, X_test, y_train, y_test = train_test_split(X_encoded, y_encoded, test_size=0.2, random_state=42)

# Best parameters from previous GridSearchCV (replace these with the actual best parameters found)
best_params = {
    'n_estimators': 100,   # Example: replace with grid_search.best_params_['n_estimators']
    'max_depth': 20,       # Example: replace with grid_search.best_params_['max_depth']
    'min_samples_split': 5,# Example: replace with grid_search.best_params_['min_samples_split']
    'min_samples_leaf': 2  # Example: replace with grid_search.best_params_['min_samples_leaf']
}

# Initialize and train a new RandomForestClassifier with best parameters
best_model = RandomForestClassifier(
    random_state=42,
    **best_params
)
best_model.fit(X_train, y_train)

# Evaluate the retrained model on both training and test sets
train_accuracy = accuracy_score(y_train, best_model.predict(X_train)) * 100
test_accuracy = accuracy_score(y_test, best_model.predict(X_test)) * 100

# Print retrained model results
print("Retrained Model - Training Accuracy:", train_accuracy)
print("Retrained Model - Testing Accuracy:", test_accuracy)

# Save the retrained model as a pickle file
model_path = 'retrained_random_forest_model.pkl'
with open(model_path, 'wb') as file:
    pickle.dump(best_model, file)

print(f"Model saved to {model_path}")

# Load the model from the pickle file to verify
with open(model_path, 'rb') as file:
    loaded_model = pickle.load(file)

# Verify by making predictions with the loaded model (optional)
loaded_test_accuracy = accuracy_score(y_test, loaded_model.predict(X_test)) * 100
print("Loaded Model - Testing Accuracy:", loaded_test_accuracy)


Retrained Model - Training Accuracy: 98.7460815047022
Retrained Model - Testing Accuracy: 98.75
Model saved to retrained_random_forest_model.pkl
Loaded Model - Testing Accuracy: 98.75


In [11]:
import pandas as pd

# Load the uploaded dataset
file_path = 'C:/Users/gunde/Downloads/sample (1).csv'
uploaded_data = pd.read_csv(file_path)

# Display the first few rows to understand the structure
uploaded_data.head(), uploaded_data.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10 entries, 0 to 9
Data columns (total 8 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   Food Product                10 non-null     object 
 1   Main Ingredient             10 non-null     object 
 2   Sweetener                   5 non-null      object 
 3   Fat/Oil                     9 non-null      object 
 4   Seasoning                   10 non-null     object 
 5   Allergens                   5 non-null      object 
 6   Price ($)                   10 non-null     float64
 7   Customer rating (Out of 5)  10 non-null     float64
dtypes: float64(2), object(6)
memory usage: 772.0+ bytes


(            Food Product Main Ingredient      Sweetener       Fat/Oil  \
 0  Blueberry Protein Bar     Blueberries          Honey    Almond oil   
 1     Mango Chia Pudding           Mango  Coconut sugar  Coconut milk   
 2         Beetroot Chips        Beetroot            NaN     Olive oil   
 3   Pumpkin Spice Muffin         Pumpkin    Maple syrup        Butter   
 4            Lentil Soup         Lentils            NaN     Olive oil   
 
            Seasoning     Allergens  Price ($)  Customer rating (Out of 5)  
 0    Vanilla extract     Tree nuts       2.99                         4.2  
 1           Cardamom       Coconut       4.49                         4.5  
 2           Sea salt           NaN       3.25                         3.8  
 3   Cinnamon, Nutmeg  Wheat, Dairy       5.75                         4.7  
 4  Herbs de Provence           NaN       4.00                         4.0  ,
 None)

In [15]:
# Install necessary library
try:
    from category_encoders import TargetEncoder, LeaveOneOutEncoder
except ModuleNotFoundError:
    !pip install category-encoders
    from category_encoders import TargetEncoder, LeaveOneOutEncoder

# Step 1: Import Libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
import joblib

# Step 2: Load the Dataset
file_path = 'C:/Users/gunde/Downloads/sample (1).csv'  # Adjust the path to your dataset
data = pd.read_csv(file_path)

# Assume a target column named 'Target' for this example. Replace or adjust as needed.
data['Target'] = (data['Customer rating (Out of 5)'] > 4.0).astype(int)

X = data.drop(columns=['Target'])
y = data['Target']

# Step 3: Separate Categorical Columns
categorical_cols = X.select_dtypes(include=['object']).columns

# Step 4: Train-Test Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Step 5: Apply TargetEncoder or LeaveOneOutEncoder
# Select an encoder: Either TargetEncoder or LeaveOneOutEncoder
encoder = TargetEncoder()  # or LeaveOneOutEncoder()

# Fit the encoder on training data
X_train_encoded = X_train.copy()
X_train_encoded[categorical_cols] = encoder.fit_transform(X_train[categorical_cols], y_train)

# Apply the same encoder to the test data
X_test_encoded = X_test.copy()
X_test_encoded[categorical_cols] = encoder.transform(X_test[categorical_cols])

# Save the encoder for future use
encoder_path = 'categorical_encoder.pkl'
joblib.dump(encoder, encoder_path)
print(f"Encoder saved to {encoder_path}")

# Step 6: Train a Model
model = RandomForestClassifier(random_state=42, n_estimators=100, max_depth=20)
model.fit(X_train_encoded, y_train)

# Save the model
model_path = 'random_forest_model.pkl'
joblib.dump(model, model_path)
print(f"Model saved to {model_path}")

# Step 7: Evaluate the Model
train_accuracy = accuracy_score(y_train, model.predict(X_train_encoded)) * 100
test_accuracy = accuracy_score(y_test, model.predict(X_test_encoded)) * 100

print(f"Training Accuracy: {train_accuracy:.2f}%")
print(f"Testing Accuracy: {test_accuracy:.2f}%")

# Step 8: Load and Predict on Unseen Data (Optional)
# Load the saved encoder and model
encoder = joblib.load(encoder_path)
model = joblib.load(model_path)

# Apply encoder and make predictions on new data
unseen_data = X_test.copy()  # Replace with actual unseen data
unseen_data_encoded = unseen_data.copy()
unseen_data_encoded[categorical_cols] = encoder.transform(unseen_data[categorical_cols])

predictions = model.predict(unseen_data_encoded)
print(f"Predictions: {predictions}")


Encoder saved to categorical_encoder.pkl
Model saved to random_forest_model.pkl
Training Accuracy: 100.00%
Testing Accuracy: 100.00%
Predictions: [1 1]
