In [1]:
import pandas as pd

file_path = r"C:\Users\ariji\Downloads\Allergen_Status_of_Food_Products.csv"
data = pd.read_csv(file_path)

# Ensure all required columns exist
required_columns = ['Food Product', 'Main Ingredient', 'Sweetener', 'Fat/Oil', 'Seasoning', 'Allergens', 'Prediction']
missing_columns = [col for col in required_columns if col not in data.columns]
if missing_columns:
    raise ValueError(f"Missing required columns in dataset: {missing_columns}")


In [2]:
# Fill missing values in specific columns
fill_cols = ['Prediction', 'Sweetener', 'Fat/Oil', 'Seasoning', 'Allergens']
for col in fill_cols:
    data[col].fillna('Unknown', inplace=True)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data[col].fillna('Unknown', inplace=True)


In [4]:
print(data.isnull().sum())

Food Product                  0
Main Ingredient               0
Sweetener                     0
Fat/Oil                       0
Seasoning                     0
Allergens                     0
Price ($)                     0
Customer rating (Out of 5)    0
Prediction                    0
dtype: int64


In [5]:
data['Prediction'] = data['Prediction'].map({'Contains': 1, 'Does not contain': 0})
data = data.dropna(subset=['Prediction'])  # Drop rows with NaN in Prediction


In [6]:
print("Unique values in 'Prediction':", data['Prediction'].unique())


Unique values in 'Prediction': [1. 0.]


In [7]:
valid_predictions = {'Contains', 'Does not contain'}
unexpected_values = set(data['Prediction']) - valid_predictions
if unexpected_values:
    print(f"Unexpected values in 'Prediction': {unexpected_values}")


Unexpected values in 'Prediction': {0.0, 1.0}


In [8]:
import category_encoders as ce

# Define categorical columns and target column
cat_cols = ['Food Product', 'Main Ingredient', 'Sweetener', 'Fat/Oil', 'Seasoning', 'Allergens']
target_col = 'Prediction'

# Initialize LeaveOneOutEncoder
encoder = ce.LeaveOneOutEncoder(cols=cat_cols)

# Fit and transform the data (requires target variable)
encoded_data = encoder.fit_transform(data[cat_cols], data[target_col])

# Merge the encoded columns with the rest of the dataset
final_data = pd.concat([data.drop(columns=cat_cols), encoded_data], axis=1)


In [9]:
import joblib

# Save the encoder
encoder_path = r'C:\Users\ariji\SafeBite\Models\encoder2.pkl'
joblib.dump(encoder, encoder_path)
print(f"Encoder saved to {encoder_path}")

# Save the final encoded dataset
output_path = r'C:\Users\ariji\SafeBite\Datasets\Allergen_Status_Encoded2.csv'
final_data.to_csv(output_path, index=False)
print(f"Encoded dataset saved to {output_path}")


Encoder saved to C:\Users\ariji\SafeBite\Models\encoder2.pkl
Encoded dataset saved to C:\Users\ariji\SafeBite\Datasets\Allergen_Status_Encoded2.csv


In [10]:
print(encoded_data.head())
print("Encoder categories:", encoder.get_params())


   Food Product  Main Ingredient  Sweetener   Fat/Oil  Seasoning  Allergens
0      1.000000              1.0   0.890110  0.940476        1.0        1.0
1      1.000000              1.0   0.890110  0.940476        1.0        1.0
2      0.333333              1.0   0.543165  0.631579        0.9        1.0
3      0.333333              1.0   0.543165  0.631579        0.9        1.0
4      0.640704              1.0   0.543165  0.631579        0.9        1.0
Encoder categories: {'cols': ['Food Product', 'Main Ingredient', 'Sweetener', 'Fat/Oil', 'Seasoning', 'Allergens'], 'drop_invariant': False, 'handle_missing': 'value', 'handle_unknown': 'value', 'random_state': None, 'return_df': True, 'sigma': None, 'verbose': 0}


In [11]:
print("Columns in final dataset:", final_data.columns)


Columns in final dataset: Index(['Price ($)', 'Customer rating (Out of 5)', 'Prediction', 'Food Product',
       'Main Ingredient', 'Sweetener', 'Fat/Oil', 'Seasoning', 'Allergens'],
      dtype='object')
