In [179]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from xgboost import XGBRegressor
import random


In [180]:
# Set a fixed seed for reproducibility
seed_value = 42
random.seed(seed_value)
np.random.seed(seed_value)

In [181]:
# Load dataset from a CSV file (replace 'your_dataset.csv' with your actual file path)
data = pd.read_csv('data/data2.csv')

In [182]:
# Check the first few rows of the dataset to ensure it's loaded correctly
print(data.head())

   sugar_level  cholesterol_level  blood_pressure        bmi        age  \
0   118.712487         253.645731      106.515809  29.434545  44.882622   
1   193.282236         230.564779      139.135057  28.416377  24.235973   
2   164.398159         196.328976      168.928348  18.607971  51.982496   
3   147.493978         271.970109      156.382575  25.423227  71.995713   
4    90.032340         252.574609      162.634960  26.401218  44.956109   

   heart_rate  sugar_in_product  salt_in_product  saturated_fat_in_product  \
0   73.492264         24.729827         4.756017                  9.752166   
1   66.612844         11.479532         3.890672                  2.341860   
2   66.875104          2.286215         3.058197                 13.669780   
3   98.047706         20.834790         4.300510                 18.888536   
4   77.725953         15.029225         3.349412                  2.715371   

   carbohydrates_in_product  consumption_score  
0                 98.883446    

In [183]:
# Ensure that the dataset has the expected columns
expected_columns = ['sugar_level', 'cholesterol_level', 'blood_pressure', 'bmi', 'age', 'heart_rate', 
                    'sugar_in_product', 'salt_in_product', 'saturated_fat_in_product', 
                    'carbohydrates_in_product', 'consumption_score']

In [184]:
# Split dataset into features (X) and target (y)
X = data.drop(columns=['consumption_score'])
y = data['consumption_score']


In [185]:
# You can use this to check if any column is missing
missing_columns = [col for col in expected_columns if col not in data.columns]
if missing_columns:
    print(f"Missing columns in dataset: {missing_columns}")
else:
    print("All required columns are present.")

All required columns are present.


In [186]:
# Split dataset into features (X) and target (y)
X = data.drop(columns=['consumption_score'])
y = data['consumption_score']

In [187]:
# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=seed_value)

In [188]:
# Train XGBoost Regressor
model = XGBRegressor(random_state=seed_value)
model.fit(X_train, y_train)

AttributeError: 'super' object has no attribute '__sklearn_tags__'

AttributeError: 'super' object has no attribute '__sklearn_tags__'

XGBRegressor(base_score=None, booster=None, callbacks=None,
             colsample_bylevel=None, colsample_bynode=None,
             colsample_bytree=None, device=None, early_stopping_rounds=None,
             enable_categorical=False, eval_metric=None, feature_types=None,
             gamma=None, grow_policy=None, importance_type=None,
             interaction_constraints=None, learning_rate=None, max_bin=None,
             max_cat_threshold=None, max_cat_to_onehot=None,
             max_delta_step=None, max_depth=None, max_leaves=None,
             min_child_weight=None, missing=nan, monotone_constraints=None,
             multi_strategy=None, n_estimators=None, n_jobs=None,
             num_parallel_tree=None, random_state=42, ...)

In [189]:
# Predictions
y_pred = model.predict(X_test)

In [190]:
# Evaluation (MSE and R²)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print(f"Mean Squared Error: {mse:.2f}")
print(f"R² Score: {r2:.2f}")

Mean Squared Error: 6.40
R² Score: 0.99


In [191]:
# Example User Input Prediction Function
def predict_safety_score(user_input):
    """
    Predicts a consumption score for a product using the trained XGBoost model.
    Ensures output is an integer between 0 and 100 and is deterministic.
    """
    user_df = pd.DataFrame([user_input])
    score = model.predict(user_df)[0]
    score = np.clip(score, 0, 100)  # Clip to range [0, 100]
    return int(round(score))  # Return as an integer

In [192]:
# Example User Input
user_input = {
    'sugar_level': 150,
    'cholesterol_level': 220,
    'blood_pressure': 130,
    'bmi': 25,
    'age': 45,
    'heart_rate': 75,
    'sugar_in_product': 30,
    'salt_in_product': 3,
    'saturated_fat_in_product': 15,
    'carbohydrates_in_product': 60
}

In [193]:
# Prediction
score = predict_safety_score(user_input)
print(f"Predicted Consumption Score: {score}")

Predicted Consumption Score: 38


In [194]:
import joblib

# Save the model
joblib.dump(model, 'xgboost_model.pkl')




['xgboost_model.pkl']