In [1]:
import pandas as pd

# Load the dataset
file_path = 'E:/my files/project/food-allergen/Allergen_Status_of_Food_Products.csv'   
data = pd.read_csv(file_path,keep_default_na=False, na_values = [""])
'''
keep_default_na=False:
By default, pandas treats certain strings (e.g., 'NA', 'N/A', 'NULL') as NaN when reading data.
Setting keep_default_na=False disables this behavior, meaning only the strings explicitly specified in na_values will be treated as NaN.

na_values=[""]:
Specifies a list of additional strings to be treated as NaN. In this case, only empty strings ("") are treated as NaN.
'''

'\nkeep_default_na=False:\nBy default, pandas treats certain strings (e.g., \'NA\', \'N/A\', \'NULL\') as NaN when reading data.\nSetting keep_default_na=False disables this behavior, meaning only the strings explicitly specified in na_values will be treated as NaN.\n\nna_values=[""]:\nSpecifies a list of additional strings to be treated as NaN. In this case, only empty strings ("") are treated as NaN.\n'

In [2]:
# Display the first few rows
print("Dataset Preview:")
data.head()

Dataset Preview:


Unnamed: 0,Food Product,Main Ingredient,Sweetener,Fat/Oil,Seasoning,Allergens,Price ($),Customer rating (Out of 5),Prediction
0,Almond Cookies,Almonds,Sugar,Butter,Flour,"Almonds, Wheat, Dairy",10.15,3.1,Contains
1,Almond Cookies,Almonds,Sugar,Butter,Flour,"Almonds, Wheat, Dairy",6.17,4.5,Contains
2,Chicken Noodle Soup,Chicken broth,,,Salt,"Chicken, Wheat, Celery",19.65,4.1,Contains
3,Chicken Noodle Soup,Chicken broth,,,Salt,"Chicken, Wheat, Celery",17.48,4.7,Contains
4,Cheddar Cheese,Cheese,,,Salt,Dairy,10.83,3.7,Contains


In [3]:
                                                            # Handling Missing value(Drop or impute)
# checking for missing values
print("Missing Values Summary:")
print(data.isnull().sum())

Missing Values Summary:
Food Product                  0
Main Ingredient               0
Sweetener                     0
Fat/Oil                       0
Seasoning                     0
Allergens                     0
Price ($)                     0
Customer rating (Out of 5)    0
Prediction                    1
dtype: int64


In [4]:
# Identify rows with any missing values 
missing_values = data.isna()
# Drop rows with any missing values
dropped_rows = data[missing_values.any(axis=1)]
print("Rows Dropped Due to Missing Values:")
dropped_rows

Rows Dropped Due to Missing Values:


Unnamed: 0,Food Product,Main Ingredient,Sweetener,Fat/Oil,Seasoning,Allergens,Price ($),Customer rating (Out of 5),Prediction
338,Baked Ziti,Pasta,,Cheese,Tomato sauce,"Wheat, Dairy",14.3,4.3,


In [5]:
# Dropping rows with any missing values
data_dropped_rows = data.dropna()
print("Final Dataset After Dropping Rows with Missing Values:")
data_dropped_rows

Final Dataset After Dropping Rows with Missing Values:


Unnamed: 0,Food Product,Main Ingredient,Sweetener,Fat/Oil,Seasoning,Allergens,Price ($),Customer rating (Out of 5),Prediction
0,Almond Cookies,Almonds,Sugar,Butter,Flour,"Almonds, Wheat, Dairy",10.15,3.1,Contains
1,Almond Cookies,Almonds,Sugar,Butter,Flour,"Almonds, Wheat, Dairy",6.17,4.5,Contains
2,Chicken Noodle Soup,Chicken broth,,,Salt,"Chicken, Wheat, Celery",19.65,4.1,Contains
3,Chicken Noodle Soup,Chicken broth,,,Salt,"Chicken, Wheat, Celery",17.48,4.7,Contains
4,Cheddar Cheese,Cheese,,,Salt,Dairy,10.83,3.7,Contains
...,...,...,...,...,...,...,...,...,...
394,Lemon Bars,Lemon juice,Sugar,Butter,"Flour, eggs","Wheat, Dairy, Eggs",5.07,2.9,Contains
395,Pecan Pie,Pecans,Sugar,Butter,Corn syrup,"Wheat, Dairy, Nuts",11.95,4.4,Contains
396,Zucchini Bread,Zucchini,Sugar,Butter,"Cinnamon, nuts","Wheat, Dairy, Nuts",12.67,3.4,Contains
397,Banana Bread,Bananas,Sugar,Butter,"Cinnamon, nuts","Wheat, Dairy, Nuts",15.83,2.4,Contains


In [6]:
# Imputation - Mean for numeric, Mode for categorical
data_imputed = data.copy()
# Fill numeric columns with mean
numeric_cols = data.select_dtypes(include=['number']).columns
data_imputed[numeric_cols] = data_imputed[numeric_cols].fillna(data[numeric_cols].mean())

print("Final Dataset After Mean:")
# data_imputed.head(250)
data_imputed

Final Dataset After Mean:


Unnamed: 0,Food Product,Main Ingredient,Sweetener,Fat/Oil,Seasoning,Allergens,Price ($),Customer rating (Out of 5),Prediction
0,Almond Cookies,Almonds,Sugar,Butter,Flour,"Almonds, Wheat, Dairy",10.15,3.1,Contains
1,Almond Cookies,Almonds,Sugar,Butter,Flour,"Almonds, Wheat, Dairy",6.17,4.5,Contains
2,Chicken Noodle Soup,Chicken broth,,,Salt,"Chicken, Wheat, Celery",19.65,4.1,Contains
3,Chicken Noodle Soup,Chicken broth,,,Salt,"Chicken, Wheat, Celery",17.48,4.7,Contains
4,Cheddar Cheese,Cheese,,,Salt,Dairy,10.83,3.7,Contains
...,...,...,...,...,...,...,...,...,...
394,Lemon Bars,Lemon juice,Sugar,Butter,"Flour, eggs","Wheat, Dairy, Eggs",5.07,2.9,Contains
395,Pecan Pie,Pecans,Sugar,Butter,Corn syrup,"Wheat, Dairy, Nuts",11.95,4.4,Contains
396,Zucchini Bread,Zucchini,Sugar,Butter,"Cinnamon, nuts","Wheat, Dairy, Nuts",12.67,3.4,Contains
397,Banana Bread,Bananas,Sugar,Butter,"Cinnamon, nuts","Wheat, Dairy, Nuts",15.83,2.4,Contains


In [7]:
                                                            # Check for duplicates
duplicates = data_imputed[data_imputed.duplicated()]
if duplicates.empty:
    print("No duplicates found in the dataset.")
else:
    print("Duplicates found:")
    print(duplicates)
    
    # Drop duplicates
    data_no_duplicates = data_imputed.drop_duplicates()
    print("\nDataset after removing duplicates:")
    print(data_no_duplicates)

No duplicates found in the dataset.


In [8]:
                                                            # check for outliers
                        # "Price ($)" column, the lower bound is -2.53, and the upper bound is 27.29. 
                        # "Customer rating (Out of 5)" column, the lower bound is -1.25, and the upper bound is 7.15. 
# Function to find outliers using IQR(Interquartile Range) method
def find_outliers_iqr(df, column):
    Q1 = df[column].quantile(0.25)
    Q3 = df[column].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    outliers = df[(df[column] < lower_bound) | (df[column] > upper_bound)]
    return outliers, lower_bound, upper_bound

# Identify outliers in "Price ($)" and "Customer rating (Out of 5)"
outliers_price, lower_price, upper_price = find_outliers_iqr(data_imputed, "Price ($)")   # changed data to data_imputed
outliers_rating, lower_rating, upper_rating = find_outliers_iqr(data_imputed, "Customer rating (Out of 5)")   # changed data to data_imputed

# Check if there are any outliers and print the message
if outliers_price.empty and outliers_rating.empty:
    print("No outliers are present in 'Price ($)' and 'Customer rating (Out of 5)'.")
else:
    print("Outliers detected!")
    if not outliers_price.empty:
        print("Outliers in 'Price ($)':")
        print(outliers_price)
    if not outliers_rating.empty:
        print("Outliers in 'Customer rating (Out of 5)':")
        print(outliers_rating)


No outliers are present in 'Price ($)' and 'Customer rating (Out of 5)'.


In [9]:
'''
                                    Encoding in Machine Learning
Definition: Converting categorical data into numerical data so that algorithms can process it.
Examples:
One-Hot Encoding: Represents categories as binary vectors.
Label Encoding: Assigns a unique integer to each category.

ex1: 
from sklearn.preprocessing import LabelEncoder
data = ['cat', 'dog', 'fish']
encoder = LabelEncoder()
encoded_data = encoder.fit_transform(data)
print(encoded_data)  # Output: [0, 1, 2]

ex2:
One-hot encoding is a method used to convert categorical data into a binary matrix (a series of 0s and 1s) that represents each category 
as a separate column. It is commonly used in machine learning to handle categorical variables in a way that algorithms can process.

Example Scenario:
Imagine you have a dataset with a categorical column "Color" containing three unique values: Red, Blue, and Green.

Original Data:
Color
Red
Blue
Green
Red
Blue

One-Hot Encoded Representation:
Red	Blue	Green
1	0	0
0	1	0
0	0	1
1	0	0
0	1	0

Here:
Red is represented as [1, 0, 0].
Blue is represented as [0, 1, 0].
Green is represented as [0, 0, 1].
'''

'\n                                    Encoding in Machine Learning\nDefinition: Converting categorical data into numerical data so that algorithms can process it.\nExamples:\nOne-Hot Encoding: Represents categories as binary vectors.\nLabel Encoding: Assigns a unique integer to each category.\n\nex1: \nfrom sklearn.preprocessing import LabelEncoder\ndata = [\'cat\', \'dog\', \'fish\']\nencoder = LabelEncoder()\nencoded_data = encoder.fit_transform(data)\nprint(encoded_data)  # Output: [0, 1, 2]\n\nex2:\nOne-hot encoding is a method used to convert categorical data into a binary matrix (a series of 0s and 1s) that represents each category \nas a separate column. It is commonly used in machine learning to handle categorical variables in a way that algorithms can process.\n\nExample Scenario:\nImagine you have a dataset with a categorical column "Color" containing three unique values: Red, Blue, and Green.\n\nOriginal Data:\nColor\nRed\nBlue\nGreen\nRed\nBlue\n\nOne-Hot Encoded Represent

In [10]:
data_imputed.head(250)

Unnamed: 0,Food Product,Main Ingredient,Sweetener,Fat/Oil,Seasoning,Allergens,Price ($),Customer rating (Out of 5),Prediction
0,Almond Cookies,Almonds,Sugar,Butter,Flour,"Almonds, Wheat, Dairy",10.15,3.1,Contains
1,Almond Cookies,Almonds,Sugar,Butter,Flour,"Almonds, Wheat, Dairy",6.17,4.5,Contains
2,Chicken Noodle Soup,Chicken broth,,,Salt,"Chicken, Wheat, Celery",19.65,4.1,Contains
3,Chicken Noodle Soup,Chicken broth,,,Salt,"Chicken, Wheat, Celery",17.48,4.7,Contains
4,Cheddar Cheese,Cheese,,,Salt,Dairy,10.83,3.7,Contains
...,...,...,...,...,...,...,...,...,...
245,Spinach Salad,Spinach,,Olive oil,Balsamic vinaigrette,,6.55,1.2,Does not contain
246,Caramelized Onions,Onions,Sugar,Butter,,,8.10,4.2,Does not contain
247,Beef Stir-Fry,Beef,,Vegetable oil,Soy sauce,,16.70,4.8,Does not contain
248,Strawberry Spinach Salad,Spinach,,,"Strawberries, vinaigrette",,11.52,2.0,Does not contain


In [11]:
                                                                # Encoding
from sklearn.preprocessing import LabelEncoder
# 1. Label/Binary Encoding for "Prediction" column
label_encoder = LabelEncoder()
data_imputed['Prediction'] = label_encoder.fit_transform(data_imputed['Prediction'])   
# Map 'Prediction' to 1 for 'Contains' and 0 for 'Does Not Contain'
data_imputed['Prediction'] = data_imputed['Prediction'].map({
    1: 0,
    0: 1
})

# Fill NaN values with 0 (or any other appropriate value)
data_imputed['Prediction'] = data_imputed['Prediction'].fillna(0).astype(int)
data_imputed

Unnamed: 0,Food Product,Main Ingredient,Sweetener,Fat/Oil,Seasoning,Allergens,Price ($),Customer rating (Out of 5),Prediction
0,Almond Cookies,Almonds,Sugar,Butter,Flour,"Almonds, Wheat, Dairy",10.15,3.1,1
1,Almond Cookies,Almonds,Sugar,Butter,Flour,"Almonds, Wheat, Dairy",6.17,4.5,1
2,Chicken Noodle Soup,Chicken broth,,,Salt,"Chicken, Wheat, Celery",19.65,4.1,1
3,Chicken Noodle Soup,Chicken broth,,,Salt,"Chicken, Wheat, Celery",17.48,4.7,1
4,Cheddar Cheese,Cheese,,,Salt,Dairy,10.83,3.7,1
...,...,...,...,...,...,...,...,...,...
394,Lemon Bars,Lemon juice,Sugar,Butter,"Flour, eggs","Wheat, Dairy, Eggs",5.07,2.9,1
395,Pecan Pie,Pecans,Sugar,Butter,Corn syrup,"Wheat, Dairy, Nuts",11.95,4.4,1
396,Zucchini Bread,Zucchini,Sugar,Butter,"Cinnamon, nuts","Wheat, Dairy, Nuts",12.67,3.4,1
397,Banana Bread,Bananas,Sugar,Butter,"Cinnamon, nuts","Wheat, Dairy, Nuts",15.83,2.4,1


In [12]:
from category_encoders import LeaveOneOutEncoder

# Columns to apply Leave-One-Out Encoding
columns = ["Food Product", "Main Ingredient", "Sweetener", "Fat/Oil", "Seasoning", "Allergens"]


# Initialize Leave-One-Out Encoder
encoder = LeaveOneOutEncoder(cols=columns)

# Apply the encoder on the dataset
data_imputed_encoded = encoder.fit_transform(data_imputed[columns], data_imputed['Price ($)'])  # Assuming 'target' is your target column
data_imputed = pd.concat([data_imputed.drop(columns, axis=1), data_imputed_encoded], axis=1)
   

# Display the first few rows of the dataset with Leave-One-Out encoded columns
data_imputed.head(250)

Unnamed: 0,Price ($),Customer rating (Out of 5),Prediction,Food Product,Main Ingredient,Sweetener,Fat/Oil,Seasoning,Allergens
0,10.15,3.1,1,6.170000,6.170000,12.343736,12.471786,13.408000,6.170000
1,6.17,4.5,1,10.150000,10.150000,12.387473,12.519167,13.806000,10.150000
2,19.65,4.1,1,12.500000,17.480000,12.378423,12.031228,11.584000,17.480000
3,17.48,4.7,1,13.223333,19.650000,12.386201,12.069298,11.801000,19.650000
4,10.83,3.7,1,12.402356,17.925000,12.410036,12.185965,12.466000,12.072073
...,...,...,...,...,...,...,...,...,...
245,6.55,1.2,0,14.610000,10.967000,12.425376,13.167528,12.850000,12.203810
246,8.10,4.2,0,11.820000,10.187500,12.366264,12.496190,13.009474,12.193265
247,16.70,4.8,0,14.015000,14.148571,12.388996,11.695952,12.812143,12.134762
248,11.52,2.0,0,12.770000,10.470000,12.407563,12.173860,12.770000,12.170000


In [13]:
# 1. Split your dataset into X (features) and y (target variable).
# Define X as all columns except the target column (e.g., 'Prediction') and y as the target column
X = data_imputed.drop(columns=['Prediction'])  # Replace 'Prediction' with your target variable's name if different
y = data_imputed['Prediction']

# 2. Print the shapes of X and y
print("Shape of X (features):", X.shape)
print("Shape of y (target):", y.shape)

Shape of X (features): (399, 8)
Shape of y (target): (399,)


In [14]:
from sklearn.model_selection import train_test_split

# 3. Perform 80:20 train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Print the shapes of the split data
print("Shape of X_train (80%):", X_train.shape)
print("Shape of X_test (20%):", X_test.shape)
print("Shape of y_train (80%):", y_train.shape)
print("Shape of y_test (20%):", y_test.shape)

Shape of X_train (80%): (319, 8)
Shape of X_test (20%): (80, 8)
Shape of y_train (80%): (319,)
Shape of y_test (20%): (80,)


In [15]:
# # 4. Perform 70:30 train-test split
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# # Print the shapes of the split data
# print("Shape of X_train (70%):", X_train.shape)
# print("Shape of X_test (30%):", X_test.shape)
# print("Shape of y_train (70%):", y_train.shape)
# print("Shape of y_test (30%):", y_test.shape)

In [16]:
                                                    # Accuracy using Logistic Regression
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
# # Separating the features and target variable
# X = data.drop(columns=["Prediction"])  # Features
# y = data["Prediction"]                 # Target

# Define a function to train, test, and get accuracy scores
def get_accuracy(X, y, test_size):
    # Split data
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=42)
    
    # Initialize the Logistic Regression model
    model = LogisticRegression(max_iter=1000, random_state=42)
    
    # Train the model
    model.fit(X_train, y_train)
    
    # Predict on training and testing data
    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)
    
    # Calculate accuracy
    train_accuracy = accuracy_score(y_train, y_train_pred)
    test_accuracy = accuracy_score(y_test, y_test_pred)
    
    return train_accuracy, test_accuracy

print("Training and testing accuracy using logistic regression")
# Get accuracies for 70:30 split
# train_accuracy_70, test_accuracy_70 = get_accuracy(X, y, test_size=0.3)
# print(f"70:30 Split - Training Accuracy: {train_accuracy_70:.2f}, Testing Accuracy: {test_accuracy_70:.2f}")

# Get accuracies for 80:20 split
train_accuracy_80, test_accuracy_80 = get_accuracy(X, y, test_size=0.2)
print(f"80:20 Split - Training Accuracy: {train_accuracy_80:.2f}, Testing Accuracy: {test_accuracy_80:.2f}")

Training and testing accuracy using logistic regression
80:20 Split - Training Accuracy: 0.59, Testing Accuracy: 0.65


In [17]:
X_train

Unnamed: 0,Price ($),Customer rating (Out of 5),Food Product,Main Ingredient,Sweetener,Fat/Oil,Seasoning,Allergens
3,17.48,4.7,13.223333,19.650000,12.386201,12.069298,11.801000,19.650000
18,12.97,4.5,19.830000,9.797500,12.312747,12.152500,19.830000,12.045976
377,15.56,3.1,8.580000,10.363333,12.393082,13.066292,11.272500,13.032817
248,11.52,2.0,12.770000,10.470000,12.407563,12.173860,12.770000,12.170000
177,15.56,3.9,12.402356,12.030000,12.393082,12.102982,12.402356,12.142517
...,...,...,...,...,...,...,...,...
71,16.93,4.6,12.402356,10.505000,12.388172,11.406538,12.402356,11.997683
106,5.96,4.2,15.930000,13.594000,12.427491,11.951667,11.990000,9.985000
270,17.80,1.3,12.790000,11.722500,12.385054,11.823333,14.585000,12.127279
348,13.56,3.3,17.680000,17.306667,14.255000,12.431190,12.402356,13.060986


In [18]:
                                                            # Accuracy using Random Forest
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Separating the features and target variable
# X = data.drop(columns=["Prediction_Encoded"])  # Features
# y = data["Prediction_Encoded"]                 # Target

# Define a function to train, test, and get accuracy scores
def get_accuracy(X, y, test_size):
    # Split data
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=42)
    
    # Initialize the Random Forest model
    model = RandomForestClassifier(random_state=42)
    
    # Train the model
    model.fit(X_train, y_train)
    
    # Predict on training and testing data
    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)
    
    # Calculate accuracy
    train_accuracy = accuracy_score(y_train, y_train_pred)
    test_accuracy = accuracy_score(y_test, y_test_pred)
    
    return train_accuracy, test_accuracy

print("Training and testing accuracy using random forest")
# # Get accuracies for 70:30 split
# train_accuracy_70, test_accuracy_70 = get_accuracy(X, y, test_size=0.3)
# print(f"70:30 Split - Training Accuracy: {train_accuracy_70:.2f}, Testing Accuracy: {test_accuracy_70:.2f}")

# Get accuracies for 80:20 split
train_accuracy_80, test_accuracy_80 = get_accuracy(X, y, test_size=0.2)
print(f"80:20 Split - Training Accuracy: {train_accuracy_80:.2f}, Testing Accuracy: {test_accuracy_80:.2f}")

Training and testing accuracy using random forest
80:20 Split - Training Accuracy: 1.00, Testing Accuracy: 0.99


In [19]:
X_train

Unnamed: 0,Price ($),Customer rating (Out of 5),Food Product,Main Ingredient,Sweetener,Fat/Oil,Seasoning,Allergens
3,17.48,4.7,13.223333,19.650000,12.386201,12.069298,11.801000,19.650000
18,12.97,4.5,19.830000,9.797500,12.312747,12.152500,19.830000,12.045976
377,15.56,3.1,8.580000,10.363333,12.393082,13.066292,11.272500,13.032817
248,11.52,2.0,12.770000,10.470000,12.407563,12.173860,12.770000,12.170000
177,15.56,3.9,12.402356,12.030000,12.393082,12.102982,12.402356,12.142517
...,...,...,...,...,...,...,...,...
71,16.93,4.6,12.402356,10.505000,12.388172,11.406538,12.402356,11.997683
106,5.96,4.2,15.930000,13.594000,12.427491,11.951667,11.990000,9.985000
270,17.80,1.3,12.790000,11.722500,12.385054,11.823333,14.585000,12.127279
348,13.56,3.3,17.680000,17.306667,14.255000,12.431190,12.402356,13.060986


In [20]:
                                                            # Accuracy using Decision Tree
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

# Define a function to train, test, and get accuracy scores
def get_accuracy(X, y, test_size):
    # Split data
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=42)
    
    # Initialize the Decision Tree model
    model = DecisionTreeClassifier(random_state=42)
    
    # Train the model
    model.fit(X_train, y_train)
    
    # Predict on training and testing data
    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)
    
    # Calculate accuracy
    train_accuracy = accuracy_score(y_train, y_train_pred)
    test_accuracy = accuracy_score(y_test, y_test_pred)
    
    return train_accuracy, test_accuracy

print("Training and testing accuracy using Decision Tree")
# Get accuracies for 70:30 split
train_accuracy_70, test_accuracy_70 = get_accuracy(X, y, test_size=0.3)
print(f"70:30 Split - Training Accuracy: {train_accuracy_70:.2f}, Testing Accuracy: {test_accuracy_70:.2f}")

# Get accuracies for 80:20 split
train_accuracy_80, test_accuracy_80 = get_accuracy(X, y, test_size=0.2)
print(f"80:20 Split - Training Accuracy: {train_accuracy_80:.2f}, Testing Accuracy: {test_accuracy_80:.2f}")

Training and testing accuracy using Decision Tree
70:30 Split - Training Accuracy: 1.00, Testing Accuracy: 0.94
80:20 Split - Training Accuracy: 1.00, Testing Accuracy: 0.96


In [21]:
X_train

Unnamed: 0,Price ($),Customer rating (Out of 5),Food Product,Main Ingredient,Sweetener,Fat/Oil,Seasoning,Allergens
3,17.48,4.7,13.223333,19.650000,12.386201,12.069298,11.801000,19.650000
18,12.97,4.5,19.830000,9.797500,12.312747,12.152500,19.830000,12.045976
377,15.56,3.1,8.580000,10.363333,12.393082,13.066292,11.272500,13.032817
248,11.52,2.0,12.770000,10.470000,12.407563,12.173860,12.770000,12.170000
177,15.56,3.9,12.402356,12.030000,12.393082,12.102982,12.402356,12.142517
...,...,...,...,...,...,...,...,...
71,16.93,4.6,12.402356,10.505000,12.388172,11.406538,12.402356,11.997683
106,5.96,4.2,15.930000,13.594000,12.427491,11.951667,11.990000,9.985000
270,17.80,1.3,12.790000,11.722500,12.385054,11.823333,14.585000,12.127279
348,13.56,3.3,17.680000,17.306667,14.255000,12.431190,12.402356,13.060986


In [22]:
                                                            # Accuracy using KNN Model
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score

# Define a function to train, test, and get accuracy scores
def get_accuracy(X, y, test_size, n_neighbors=5):
    # Split data
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=42)
    
    # Initialize the KNN model
    model = KNeighborsClassifier(n_neighbors=n_neighbors)
    
    # Train the model
    model.fit(X_train, y_train)
    
    # Predict on training and testing data
    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)
    
    # Calculate accuracy
    train_accuracy = accuracy_score(y_train, y_train_pred)
    test_accuracy = accuracy_score(y_test, y_test_pred)
    
    return train_accuracy, test_accuracy

print("Training and testing accuracy using KNN Model")
# # Get accuracies for 70:30 split
# train_accuracy_70, test_accuracy_70 = get_accuracy(X, y, test_size=0.3)
# print(f"70:30 Split - Training Accuracy: {train_accuracy_70:.2f}, Testing Accuracy: {test_accuracy_70:.2f}")

# Get accuracies for 80:20 split
train_accuracy_80, test_accuracy_80 = get_accuracy(X, y, test_size=0.2)
print(f"80:20 Split - Training Accuracy: {train_accuracy_80:.2f}, Testing Accuracy: {test_accuracy_80:.2f}")


Training and testing accuracy using KNN Model
80:20 Split - Training Accuracy: 0.71, Testing Accuracy: 0.71


In [23]:
X_train

Unnamed: 0,Price ($),Customer rating (Out of 5),Food Product,Main Ingredient,Sweetener,Fat/Oil,Seasoning,Allergens
3,17.48,4.7,13.223333,19.650000,12.386201,12.069298,11.801000,19.650000
18,12.97,4.5,19.830000,9.797500,12.312747,12.152500,19.830000,12.045976
377,15.56,3.1,8.580000,10.363333,12.393082,13.066292,11.272500,13.032817
248,11.52,2.0,12.770000,10.470000,12.407563,12.173860,12.770000,12.170000
177,15.56,3.9,12.402356,12.030000,12.393082,12.102982,12.402356,12.142517
...,...,...,...,...,...,...,...,...
71,16.93,4.6,12.402356,10.505000,12.388172,11.406538,12.402356,11.997683
106,5.96,4.2,15.930000,13.594000,12.427491,11.951667,11.990000,9.985000
270,17.80,1.3,12.790000,11.722500,12.385054,11.823333,14.585000,12.127279
348,13.56,3.3,17.680000,17.306667,14.255000,12.431190,12.402356,13.060986


In [24]:
                                                            # Accuracy using Adaboost
from sklearn.model_selection import train_test_split
from sklearn.ensemble import AdaBoostClassifier
from sklearn.metrics import accuracy_score

# Define a function to train, test, and get accuracy scores
def get_accuracy(X, y, test_size):
    # Split data
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=42)
    
    # Initialize the AdaBoost model
    model = AdaBoostClassifier(random_state=42)
    
    # Train the model
    model.fit(X_train, y_train)
    
    # Predict on training and testing data
    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)
    
    # Calculate accuracy
    train_accuracy = accuracy_score(y_train, y_train_pred)
    test_accuracy = accuracy_score(y_test, y_test_pred)
    
    return train_accuracy, test_accuracy

print("Training and testing accuracy using Adaboost")
# # Get accuracies for 70:30 split
# train_accuracy_70, test_accuracy_70 = get_accuracy(X, y, test_size=0.3)
# print(f"70:30 Split - Training Accuracy: {train_accuracy_70:.2f}, Testing Accuracy: {test_accuracy_70:.2f}")

# Get accuracies for 80:20 split
train_accuracy_80, test_accuracy_80 = get_accuracy(X, y, test_size=0.2)
print(f"80:20 Split - Training Accuracy: {train_accuracy_80:.2f}, Testing Accuracy: {test_accuracy_80:.2f}")


Training and testing accuracy using Adaboost
80:20 Split - Training Accuracy: 1.00, Testing Accuracy: 0.99




In [25]:
                                                            # Accuracy using XGBoost
# pip install xgboost
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score

# Define a function to train, test, and get accuracy scores
def get_accuracy(X, y, test_size):
    # Split data
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=42)
    
    # Initialize the XGBoost model
    model = XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42)
    
    # Train the model
    model.fit(X_train, y_train)
    
    # Predict on training and testing data
    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)
    
    # Calculate accuracy
    train_accuracy = accuracy_score(y_train, y_train_pred)
    test_accuracy = accuracy_score(y_test, y_test_pred)
    
    return train_accuracy, test_accuracy

print("Training and testing accuracy using XGBoost")
# Get accuracies for 70:30 split
train_accuracy_70, test_accuracy_70 = get_accuracy(X, y, test_size=0.3)
print(f"70:30 Split - Training Accuracy: {train_accuracy_70:.2f}, Testing Accuracy: {test_accuracy_70:.2f}")

# Get accuracies for 80:20 split
train_accuracy_80, test_accuracy_80 = get_accuracy(X, y, test_size=0.2)
print(f"80:20 Split - Training Accuracy: {train_accuracy_80:.2f}, Testing Accuracy: {test_accuracy_80:.2f}")


Training and testing accuracy using XGBoost
70:30 Split - Training Accuracy: 1.00, Testing Accuracy: 0.98
80:20 Split - Training Accuracy: 1.00, Testing Accuracy: 0.99


Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.



In [26]:
X_test

Unnamed: 0,Price ($),Customer rating (Out of 5),Food Product,Main Ingredient,Sweetener,Fat/Oil,Seasoning,Allergens
198,6.26,3.0,12.402356,11.914833,12.426416,13.170787,12.402356,12.205782
349,11.87,1.3,5.980000,12.875455,12.324835,12.451310,10.663333,13.084789
33,8.67,4.9,13.930000,14.760000,12.417778,12.489405,12.402356,12.098415
208,5.65,3.9,17.430000,13.085000,12.428602,13.177640,12.440000,12.209932
93,13.79,2.2,12.402356,12.402356,12.399427,12.134035,12.402356,13.470000
...,...,...,...,...,...,...,...,...
249,7.87,4.3,17.290000,11.645000,12.420645,13.152697,17.290000,12.194830
225,6.28,3.1,12.402356,11.914500,12.426344,13.170562,11.444000,12.205646
368,5.16,1.3,10.905000,11.531538,12.430358,12.285439,13.872500,13.179296
175,10.32,3.8,12.402356,13.016364,12.402356,12.194912,12.892632,12.178163


In [27]:

# # Define the models and their results
# models = ['Logistic Regression', 'Random Forest', 'Decision Tree', 'KNN', 'AdaBoost', 'XGBoost']
# split_70_30_train = [log_reg_train_70, rf_train_70, dt_train_70, knn_train_70, ada_train_70, xgb_train_70]
# split_70_30_test = [log_reg_test_70, rf_test_70, dt_test_70, knn_test_70, ada_test_70, xgb_test_70]
# split_80_20_train = [log_reg_train_80, rf_train_80, dt_train_80, knn_train_80, ada_train_80, xgb_train_80]
# split_80_20_test = [log_reg_test_80, rf_test_80, dt_test_80, knn_test_80, ada_test_80, xgb_test_80]

# # Replace log_reg_train_70, rf_train_70, dt_train_70, etc., with the actual accuracy values you obtained from training and testing each model.

# # Create a DataFrame to display results
# results_df = pd.DataFrame({
#     'Model': models,
#     '70:30 Split Train Accuracy': split_70_30_train,
#     '70:30 Split Test Accuracy': split_70_30_test,
#     '80:20 Split Train Accuracy': split_80_20_train,
#     '80:20 Split Test Accuracy': split_80_20_test
# })

# # Display the results
# print(results_df)


In [28]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Example models (use your actual models here)
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import AdaBoostClassifier
import xgboost as xgb


# Dictionary to hold models and their names
models = {
    'Logistic Regression': LogisticRegression(),
    'Random Forest': RandomForestClassifier(),
    'Decision Tree': DecisionTreeClassifier(),
    'KNN': KNeighborsClassifier(),
    'AdaBoost': AdaBoostClassifier(),
    'XGBoost': xgb.XGBClassifier(),
}

# Example data (replace with your actual dataset)
# Assuming you already have X (features) and y (target)
# X, y = your_data

# Split the data for 70:30 and 80:20 splits
# X_train_70, X_test_70, y_train_70, y_test_70 = train_test_split(X, y, test_size=0.3, random_state=42)
X_train_80, X_test_80, y_train_80, y_test_80 = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize lists to hold the accuracy results
split_70_30_train = []
split_70_30_test = []
split_80_20_train = []
split_80_20_test = []

# Loop through the models and fit them, then collect accuracy
for model_name, model in models.items():
    # 70:30 split results
#     model.fit(X_train_70, y_train_70)
#     split_70_30_train.append(model.score(X_train_70, y_train_70))  # Training accuracy
#     split_70_30_test.append(model.score(X_test_70, y_test_70))  # Testing accuracy
    
    # 80:20 split results
    model.fit(X_train_80, y_train_80)
    split_80_20_train.append(model.score(X_train_80, y_train_80))  # Training accuracy
    split_80_20_test.append(model.score(X_test_80, y_test_80))  # Testing accuracy

# Create a DataFrame to display results
results_df = pd.DataFrame({
    'Model': list(models.keys()),
#     '70:30 Split Train Accuracy': split_70_30_train,
#     '70:30 Split Test Accuracy': split_70_30_test,
    '80:20 Split Train Accuracy': split_80_20_train,
    '80:20 Split Test Accuracy': split_80_20_test
})

# Display the results
results_df




Unnamed: 0,Model,80:20 Split Train Accuracy,80:20 Split Test Accuracy
0,Logistic Regression,0.589342,0.65
1,Random Forest,1.0,0.9875
2,Decision Tree,1.0,0.95
3,KNN,0.711599,0.7125
4,AdaBoost,1.0,0.9875
5,XGBoost,1.0,0.9875


In [29]:
X_train

Unnamed: 0,Price ($),Customer rating (Out of 5),Food Product,Main Ingredient,Sweetener,Fat/Oil,Seasoning,Allergens
3,17.48,4.7,13.223333,19.650000,12.386201,12.069298,11.801000,19.650000
18,12.97,4.5,19.830000,9.797500,12.312747,12.152500,19.830000,12.045976
377,15.56,3.1,8.580000,10.363333,12.393082,13.066292,11.272500,13.032817
248,11.52,2.0,12.770000,10.470000,12.407563,12.173860,12.770000,12.170000
177,15.56,3.9,12.402356,12.030000,12.393082,12.102982,12.402356,12.142517
...,...,...,...,...,...,...,...,...
71,16.93,4.6,12.402356,10.505000,12.388172,11.406538,12.402356,11.997683
106,5.96,4.2,15.930000,13.594000,12.427491,11.951667,11.990000,9.985000
270,17.80,1.3,12.790000,11.722500,12.385054,11.823333,14.585000,12.127279
348,13.56,3.3,17.680000,17.306667,14.255000,12.431190,12.402356,13.060986


In [30]:
                                                    #Hyper-parameter tuning
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.datasets import make_classification

# Generate synthetic data for demonstration
# X, y = make_classification(n_samples=1000, n_features=20, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Initializing the XGBoost classifier
xgb = XGBClassifier(random_state=42)

# Defining the parameter grid
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [3, 5, 7],
    'learning_rate': [0.01, 0.1, 0.2],
    'subsample': [0.8, 1.0],
    'colsample_bytree': [0.8, 1.0],
    'reg_alpha': [0, 0.1, 1],
    'reg_lambda': [1, 10, 100]
}

# Setting up GridSearchCV
grid_search = GridSearchCV(estimator=xgb, param_grid=param_grid, scoring='accuracy', cv=5, verbose=1, n_jobs=-1)

# Running the grid search
grid_search.fit(X_train, y_train)

# Best parameters and model
print("Best Parameters:", grid_search.best_params_)

# Train the best model
best_xgb = grid_search.best_estimator_

# Evaluate on training and testing sets
train_accuracy = accuracy_score(y_train, best_xgb.predict(X_train))
test_accuracy = accuracy_score(y_test, best_xgb.predict(X_test))

print("Best XGBoost Model Training Accuracy:", train_accuracy)
print("Best XGBoost Model Testing Accuracy:", test_accuracy)


Fitting 5 folds for each of 972 candidates, totalling 4860 fits
Best Parameters: {'colsample_bytree': 0.8, 'learning_rate': 0.01, 'max_depth': 5, 'n_estimators': 200, 'reg_alpha': 0, 'reg_lambda': 1, 'subsample': 0.8}
Best XGBoost Model Training Accuracy: 0.978494623655914
Best XGBoost Model Testing Accuracy: 0.9833333333333333


In [31]:
                                                #Retraining it with the best parameter
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.datasets import make_classification

# Generate synthetic data for demonstration
# X, y = make_classification(n_samples=1000, n_features=20, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Best parameters from GridSearchCV
best_params = {'colsample_bytree': 0.8, 'learning_rate': 0.2, 'max_depth': 7, 'n_estimators': 50, 
               'reg_alpha': 1, 'reg_lambda': 1, 'subsample': 1.0}

print("Using Best Parameters for Retraining:", best_params)

# Retrain the model with the best parameters
final_model = XGBClassifier(**best_params, random_state=42, use_label_encoder=False)

# Fit the model
final_model.fit(X_train, y_train)

# Evaluate the final model
train_accuracy = accuracy_score(y_train, final_model.predict(X_train))
test_accuracy = accuracy_score(y_test, final_model.predict(X_test))

print(f"Final Model Training Accuracy: {train_accuracy:.4f}")
print(f"Final Model Testing Accuracy: {test_accuracy:.4f}")

# Classification Report
print("\nClassification Report:")
print(classification_report(y_test, final_model.predict(X_test)))


Using Best Parameters for Retraining: {'colsample_bytree': 0.8, 'learning_rate': 0.2, 'max_depth': 7, 'n_estimators': 50, 'reg_alpha': 1, 'reg_lambda': 1, 'subsample': 1.0}
Final Model Training Accuracy: 0.9928
Final Model Testing Accuracy: 0.9833

Classification Report:
              precision    recall  f1-score   support

           0       0.94      1.00      0.97        33
           1       1.00      0.98      0.99        87

    accuracy                           0.98       120
   macro avg       0.97      0.99      0.98       120
weighted avg       0.98      0.98      0.98       120



Parameters: { "use_label_encoder" } are not used.



In [32]:
X_train_80

Unnamed: 0,Price ($),Customer rating (Out of 5),Food Product,Main Ingredient,Sweetener,Fat/Oil,Seasoning,Allergens
3,17.48,4.7,13.223333,19.650000,12.386201,12.069298,11.801000,19.650000
18,12.97,4.5,19.830000,9.797500,12.312747,12.152500,19.830000,12.045976
377,15.56,3.1,8.580000,10.363333,12.393082,13.066292,11.272500,13.032817
248,11.52,2.0,12.770000,10.470000,12.407563,12.173860,12.770000,12.170000
177,15.56,3.9,12.402356,12.030000,12.393082,12.102982,12.402356,12.142517
...,...,...,...,...,...,...,...,...
71,16.93,4.6,12.402356,10.505000,12.388172,11.406538,12.402356,11.997683
106,5.96,4.2,15.930000,13.594000,12.427491,11.951667,11.990000,9.985000
270,17.80,1.3,12.790000,11.722500,12.385054,11.823333,14.585000,12.127279
348,13.56,3.3,17.680000,17.306667,14.255000,12.431190,12.402356,13.060986


In [33]:
import pickle

# Save the retrained model to a pickle file
with open('final_xgb_model.pkl', 'wb') as model_file:
    pickle.dump(final_model, model_file)
print("Model saved as 'final_xgb_model.pkl'.")

# Load the model from the pickle file
with open('final_xgb_model.pkl', 'rb') as model_file:
    loaded_model = pickle.load(model_file)

# Verify the loaded model by checking its performance on the test set
loaded_model_accuracy = accuracy_score(y_test, loaded_model.predict(X_test))
print("Loaded Model Testing Accuracy:", loaded_model_accuracy)


Model saved as 'final_xgb_model.pkl'.
Loaded Model Testing Accuracy: 0.9833333333333333


In [34]:
X_train

Unnamed: 0,Price ($),Customer rating (Out of 5),Food Product,Main Ingredient,Sweetener,Fat/Oil,Seasoning,Allergens
157,6.34,3.1,12.402356,11.913500,12.426129,13.169888,12.402356,12.205238
109,11.70,4.5,9.550000,12.446000,12.406918,13.109663,10.360000,9.028333
17,17.33,3.1,14.990000,14.385000,12.386738,13.046404,12.595000,11.992805
253,7.47,2.4,18.760000,15.433636,12.422079,12.244912,18.760000,12.197551
24,11.07,4.4,15.675000,11.834667,12.409176,12.402356,12.402356,12.069146
...,...,...,...,...,...,...,...,...
71,16.93,4.6,12.402356,10.505000,12.388172,11.406538,12.402356,11.997683
106,5.96,4.2,15.930000,13.594000,12.427491,11.951667,11.990000,9.985000
270,17.80,1.3,12.790000,11.722500,12.385054,11.823333,14.585000,12.127279
348,13.56,3.3,17.680000,17.306667,14.255000,12.431190,12.402356,13.060986


In [35]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import accuracy_score

# Define the Random Forest model
rf = RandomForestClassifier(random_state=42)

# Define the parameter grid for RandomizedSearch
param_dist = {
    'n_estimators': [50, 100, 200, 500],
    'max_depth': [None, 10, 20, 30, 50],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['sqrt', 'log2', None]
}

# Perform Randomized Search
random_search = RandomizedSearchCV(
    estimator=rf,
    param_distributions=param_dist,
    n_iter=50,  # Number of parameter combinations to try
    cv=5,  # 5-fold cross-validation
    scoring='accuracy',
    random_state=42,
    n_jobs=-1
)

# Fit the RandomizedSearchCV (Assumes X_train, y_train are already defined)
random_search.fit(X_train, y_train)

# Get the best parameters and best score
best_rf = random_search.best_estimator_
best_params = random_search.best_params_
best_score = random_search.best_score_

# Evaluate the model on the test set
y_pred_test = best_rf.predict(X_test)
test_accuracy = accuracy_score(y_test, y_pred_test)


# Predict on the training data using the best model
y_pred_train = best_rf.predict(X_train)

# Calculate the training accuracy
train_accuracy = accuracy_score(y_train, y_pred_train)

# Print results
print("Best Parameters:", best_params)
print("Best Cross-Validation Accuracy:", best_score)
print("Training Accuracy:", train_accuracy)
print("Test Accuracy:", test_accuracy)


Best Parameters: {'n_estimators': 200, 'min_samples_split': 2, 'min_samples_leaf': 2, 'max_features': None, 'max_depth': None}
Best Cross-Validation Accuracy: 0.9677272727272728
Training Accuracy: 0.978494623655914
Test Accuracy: 0.9916666666666667


In [69]:
from sklearn.ensemble import RandomForestClassifier

# Retrain the model using the best parameters
best_params = {
    'n_estimators': 50,
    'min_samples_split': 5,
    'min_samples_leaf': 2,
    'max_features': None,
    'max_depth': 10
}

# Initialize and train the Random Forest model
final_model = RandomForestClassifier(**best_params, random_state=42)
final_model.fit(X_train, y_train)

print("Model retrained with the best parameters.")


Model retrained with the best parameters.


In [36]:
import joblib

# Save the retrained model to a file
model_filename = "best_random_forest_model.pkl"
joblib.dump(final_model, model_filename)

joblib.dump(encoder, "leave_one_out_encoder.pkl")

# print(f"Model saved as {model_filename}")
print("Model and encoder have been saved successfully")


Model and encoder have been saved successfully
