In [3]:
import pandas as pd
import numpy as np
import ot
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OrdinalEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
# Location of the Open Food Facts TSV export.
# The default is the path used when this notebook was written; set the
# OPENFOODFACTS_TSV environment variable to point at a copy elsewhere without
# editing this cell.
import os  # only needed for the path override below

filename = os.environ.get(
    "OPENFOODFACTS_TSV",
    "/home/fnkamsug/PycharmProjects/JupyterProject/HealthStarDataset/Data/Openfoodfacts.tsv",
)

# Read the TSV file (tab-separated).
# low_memory=False has pandas infer each column's dtype from the whole file
# instead of chunk by chunk, which avoids mixed-dtype columns.
df = pd.read_csv(filename, sep='\t', low_memory=False)

In [4]:
# Grade column plus the per-100g nutrient columns kept for the analysis.
columns_of_interest = [
    'nutrition_grade_fr',
    'energy_100g', 'fat_100g', 'saturated-fat_100g',
    'carbohydrates_100g', 'sugars_100g', 'fiber_100g',
    'proteins_100g', 'sodium_100g',
]

# Narrow the raw frame to those columns ...
df_small = df.loc[:, columns_of_interest]

# ... and discard every row that is missing at least one of them.
df_clean = df_small.dropna(axis=0, how='any')

In [5]:
# Nutrient features (per 100 g) and the grade column to predict.
covariate_columns = [
    'energy_100g', 'fat_100g', 'saturated-fat_100g', 'carbohydrates_100g',
    'sugars_100g', 'fiber_100g', 'proteins_100g', 'sodium_100g',
]
outcome_column = 'nutrition_grade_fr'

# Complete rows only; the features become a float matrix.
df_subset = df_clean[[*covariate_columns, outcome_column]].dropna()
X = df_subset[covariate_columns].values.astype(float)

# Encode Nutri-Score as numeric (A=0,...,E=4). Grades are upper-cased first so
# they match the encoder's category list; any other value makes the encoder raise.
encoder_y = OrdinalEncoder(categories=[list('ABCDE')])
y_raw = df_subset[outcome_column].astype(str).str.upper().values.reshape(-1, 1)
y = encoder_y.fit_transform(y_raw).ravel()

# -------------------------------
# 2️⃣ Train/test split
# -------------------------------
# 80 % train / 20 % test with a fixed seed so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Scale features: statistics are fitted on the training split only and then
# reused unchanged for the test split.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [6]:
# Three independent, initially empty registries. Judging by their names they
# are meant to hold per-model loss, accuracy and within-one-grade accuracy —
# TODO confirm; nothing in this section writes to them.
loss_dict, acc_dict, acc_by_one_dict = {}, {}, {}

In [7]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.neural_network import MLPRegressor
from sklearn.metrics import mean_squared_error, r2_score

from xgboost import XGBRegressor

# Clamp predictions
def clamp(pred, low=0, high=4):
    """Clip predictions elementwise to the closed interval ``[low, high]``.

    Regressors can produce values outside the range of the encoded target;
    this bounds them before metrics are computed.

    Parameters
    ----------
    pred : array-like or scalar
        Raw model predictions.
    low, high : int or float, optional
        Inclusive lower and upper bounds. The defaults (0 and 4) match the
        A=0,...,E=4 grade encoding used in this notebook.

    Returns
    -------
    numpy.ndarray or numpy scalar
        ``pred`` with values below ``low`` raised to ``low`` and values above
        ``high`` lowered to ``high``.

    Raises
    ------
    ValueError
        If ``low`` is greater than ``high``.
    """
    # np.clip does not reject inverted bounds, so fail loudly here instead.
    if low > high:
        raise ValueError(f"low ({low}) must not exceed high ({high})")
    return np.clip(pred, low, high)


# Storage for results: maps a model's display name to its metric dict.
results = {}


In [8]:
# Baseline: plain linear regression on the scaled features.
name = "Linear Regression"
model = LinearRegression()

print(f"Training: {name}")
model.fit(X_train, y_train)
y_pred = clamp(model.predict(X_test))

# Absolute error per test sample, shared by the tolerance-based accuracies.
abs_err = np.abs(y_test - y_pred)

# "Acc ±t" is the share of test samples whose clamped prediction lies within
# t of the encoded grade.
results[name] = {
    "MSE": mean_squared_error(y_test, y_pred),
    "R2": r2_score(y_test, y_pred),
    "Acc ±0.5": np.mean(abs_err <= 0.5),
    "Acc ±1": np.mean(abs_err <= 1),
    "Acc ±1.5": np.mean(abs_err <= 1.5),
}

results[name]


Training: Linear Regression


{'MSE': 0.7375608384968296,
 'R2': 0.6039617058402499,
 'Acc ±0.5': np.float64(0.43446946642320955),
 'Acc ±1': np.float64(0.727023737066342),
 'Acc ±1.5': np.float64(0.9467944816392777)}

In [9]:
# Ridge regression with regularization strength alpha=1.0.
name = "Ridge Regression"
model = Ridge(alpha=1.0)

print(f"Training: {name}")
model.fit(X_train, y_train)
y_pred = clamp(model.predict(X_test))

# Absolute error per test sample, shared by the tolerance-based accuracies.
abs_err = np.abs(y_test - y_pred)

# "Acc ±t" is the share of test samples whose clamped prediction lies within
# t of the encoded grade.
results[name] = {
    "MSE": mean_squared_error(y_test, y_pred),
    "R2": r2_score(y_test, y_pred),
    "Acc ±0.5": np.mean(abs_err <= 0.5),
    "Acc ±1": np.mean(abs_err <= 1),
    "Acc ±1.5": np.mean(abs_err <= 1.5),
}

results[name]


Training: Ridge Regression


{'MSE': 0.7375619710651169,
 'R2': 0.6039610977000274,
 'Acc ±0.5': np.float64(0.43446946642320955),
 'Acc ±1': np.float64(0.727023737066342),
 'Acc ±1.5': np.float64(0.9467944816392777)}

In [10]:
# Lasso regression with alpha=0.01; max_iter is raised to 10000 iterations.
name = "Lasso Regression"
model = Lasso(alpha=0.01, max_iter=10000)

print(f"Training: {name}")
model.fit(X_train, y_train)
y_pred = clamp(model.predict(X_test))

# Absolute error per test sample, shared by the tolerance-based accuracies.
abs_err = np.abs(y_test - y_pred)

# "Acc ±t" is the share of test samples whose clamped prediction lies within
# t of the encoded grade.
results[name] = {
    "MSE": mean_squared_error(y_test, y_pred),
    "R2": r2_score(y_test, y_pred),
    "Acc ±0.5": np.mean(abs_err <= 0.5),
    "Acc ±1": np.mean(abs_err <= 1),
    "Acc ±1.5": np.mean(abs_err <= 1.5),
}

results[name]


Training: Lasso Regression


{'MSE': 0.7443551494989146,
 'R2': 0.6003134544705859,
 'Acc ±0.5': np.float64(0.4321616960843985),
 'Acc ±1': np.float64(0.7214191519578008),
 'Acc ±1.5': np.float64(0.9467691215256644)}

In [11]:
# Single regression tree, depth capped at 5, seeded for repeatability.
name = "Decision Tree"
model = DecisionTreeRegressor(max_depth=5, random_state=42)

print(f"Training: {name}")
model.fit(X_train, y_train)
y_pred = clamp(model.predict(X_test))

# Absolute error per test sample, shared by the tolerance-based accuracies.
abs_err = np.abs(y_test - y_pred)

# "Acc ±t" is the share of test samples whose clamped prediction lies within
# t of the encoded grade.
results[name] = {
    "MSE": mean_squared_error(y_test, y_pred),
    "R2": r2_score(y_test, y_pred),
    "Acc ±0.5": np.mean(abs_err <= 0.5),
    "Acc ±1": np.mean(abs_err <= 1),
    "Acc ±1.5": np.mean(abs_err <= 1.5),
}

results[name]


Training: Decision Tree


{'MSE': 0.35795990838874064,
 'R2': 0.8077909996078695,
 'Acc ±0.5': np.float64(0.6681629133698519),
 'Acc ±1': np.float64(0.9124315276932441),
 'Acc ±1.5': np.float64(0.9774548589977683)}

In [12]:
# k-nearest-neighbours regressor using the 5 closest training samples.
name = "K-Nearest Neighbors"
model = KNeighborsRegressor(n_neighbors=5)

print(f"Training: {name}")
model.fit(X_train, y_train)
y_pred = clamp(model.predict(X_test))

# Absolute error per test sample, shared by the tolerance-based accuracies.
abs_err = np.abs(y_test - y_pred)

# "Acc ±t" is the share of test samples whose clamped prediction lies within
# t of the encoded grade.
results[name] = {
    "MSE": mean_squared_error(y_test, y_pred),
    "R2": r2_score(y_test, y_pred),
    "Acc ±0.5": np.mean(abs_err <= 0.5),
    "Acc ±1": np.mean(abs_err <= 1),
    "Acc ±1.5": np.mean(abs_err <= 1.5),
}

results[name]


Training: K-Nearest Neighbors


{'MSE': 0.2154585108541286,
 'R2': 0.8843080914182296,
 'Acc ±0.5': np.float64(0.8137806857374721),
 'Acc ±1': np.float64(0.9628474335565024),
 'Acc ±1.5': np.float64(0.9815124771758977)}

In [13]:
# Untuned forest: 100 trees, each limited to depth 5, seeded for repeatability.
name = "Random Forest"
model = RandomForestRegressor(
    n_estimators=100,
    max_depth=5,
    random_state=42,
)

print(f"Training: {name}")
model.fit(X_train, y_train)
y_pred = clamp(model.predict(X_test))

# Absolute error per test sample, shared by the tolerance-based accuracies.
abs_err = np.abs(y_test - y_pred)

# "Acc ±t" is the share of test samples whose clamped prediction lies within
# t of the encoded grade.
results[name] = {
    "MSE": mean_squared_error(y_test, y_pred),
    "R2": r2_score(y_test, y_pred),
    "Acc ±0.5": np.mean(abs_err <= 0.5),
    "Acc ±1": np.mean(abs_err <= 1),
    "Acc ±1.5": np.mean(abs_err <= 1.5),
}

results[name]


Training: Random Forest


{'MSE': 0.3334854543027599,
 'R2': 0.8209327237081582,
 'Acc ±0.5': np.float64(0.6827196185838913),
 'Acc ±1': np.float64(0.9244268614323392),
 'Acc ±1.5': np.float64(0.9799147900182593)}

In [14]:
# Multi-layer perceptron with two hidden layers of 64 units each.
name = "Neural Network (MLP)"
model = MLPRegressor(
    hidden_layer_sizes=(64, 64),
    activation="relu",
    solver="adam",
    max_iter=500,
    random_state=42,
)

print(f"Training: {name}")
model.fit(X_train, y_train)
y_pred = clamp(model.predict(X_test))

# Absolute error per test sample, shared by the tolerance-based accuracies.
abs_err = np.abs(y_test - y_pred)

# "Acc ±t" is the share of test samples whose clamped prediction lies within
# t of the encoded grade.
results[name] = {
    "MSE": mean_squared_error(y_test, y_pred),
    "R2": r2_score(y_test, y_pred),
    "Acc ±0.5": np.mean(abs_err <= 0.5),
    "Acc ±1": np.mean(abs_err <= 1),
    "Acc ±1.5": np.mean(abs_err <= 1.5),
}

results[name]


Training: Neural Network (MLP)


{'MSE': 0.13165003885167248,
 'R2': 0.9293096188252878,
 'Acc ±0.5': np.float64(0.8918391154392372),
 'Acc ±1': np.float64(0.9756035707039967),
 'Acc ±1.5': np.float64(0.9886133089876242)}

In [15]:
# Gradient-boosted trees with a squared-error objective; each tree is built on
# 80 % of the rows (subsample) and 80 % of the columns (colsample_bytree).
name = "XGBoost"
model = XGBRegressor(
    n_estimators=300,
    learning_rate=0.05,
    max_depth=6,
    subsample=0.8,
    colsample_bytree=0.8,
    objective="reg:squarederror",
    random_state=42,
)

print(f"Training: {name}")
model.fit(X_train, y_train)
y_pred = clamp(model.predict(X_test))

# Absolute error per test sample, shared by the tolerance-based accuracies.
abs_err = np.abs(y_test - y_pred)

# "Acc ±t" is the share of test samples whose clamped prediction lies within
# t of the encoded grade.
results[name] = {
    "MSE": mean_squared_error(y_test, y_pred),
    "R2": r2_score(y_test, y_pred),
    "Acc ±0.5": np.mean(abs_err <= 0.5),
    "Acc ±1": np.mean(abs_err <= 1),
    "Acc ±1.5": np.mean(abs_err <= 1.5),
}

results[name]


Training: XGBoost


{'MSE': 0.1232012320306492,
 'R2': 0.9338462629452532,
 'Acc ±0.5': np.float64(0.9097433556502333),
 'Acc ±1': np.float64(0.9808531142219518),
 'Acc ±1.5': np.float64(0.9914536417123149)}

In [16]:
# Random forest with hyper-parameters chosen by an exhaustive grid search.
name = "Random Forest (Tuned)"

# 2 x 3 x 2 = 12 candidate settings; max_depth=None leaves tree depth uncapped.
params = {
    "n_estimators": [100, 300],
    "max_depth": [5, 10, None],
    "min_samples_split": [2, 5]
}

# 3-fold cross-validation on the training split, ranked by negative MSE.
# n_jobs=-1 runs the search on all available cores.
model = GridSearchCV(
    RandomForestRegressor(random_state=42),
    params,
    cv=3,
    scoring="neg_mean_squared_error",
    n_jobs=-1,
)

print(f"Training: {name}")
model.fit(X_train, y_train)

# Score the held-out test split with the best estimator found by the search.
best_model = model.best_estimator_
y_pred = clamp(best_model.predict(X_test))

# Absolute error per test sample, shared by the tolerance-based accuracies.
abs_err = np.abs(y_test - y_pred)

# "Acc ±t" is the share of test samples whose clamped prediction lies within
# t of the encoded grade.
results[name] = {
    "MSE": mean_squared_error(y_test, y_pred),
    "R2": r2_score(y_test, y_pred),
    "Acc ±0.5": np.mean(abs_err <= 0.5),
    "Acc ±1": np.mean(abs_err <= 1),
    "Acc ±1.5": np.mean(abs_err <= 1.5),
}

results[name]


Training: Random Forest (Tuned)


{'MSE': 0.08371338923814274,
 'R2': 0.9550495279280643,
 'Acc ±0.5': np.float64(0.9375126800568067),
 'Acc ±1': np.float64(0.9850882531953743),
 'Acc ±1.5': np.float64(0.9917579630756745)}

In [17]:
# Summary table: one row per model, metric columns in a fixed reporting order.
metric_order = ["MSE", "R2", "Acc ±0.5", "Acc ±1", "Acc ±1.5"]
results_df = pd.DataFrame(results).transpose().loc[:, metric_order]

results_df


Unnamed: 0,MSE,R2,Acc ±0.5,Acc ±1,Acc ±1.5
Linear Regression,0.737561,0.603962,0.434469,0.727024,0.946794
Ridge Regression,0.737562,0.603961,0.434469,0.727024,0.946794
Lasso Regression,0.744355,0.600313,0.432162,0.721419,0.946769
Decision Tree,0.35796,0.807791,0.668163,0.912432,0.977455
K-Nearest Neighbors,0.215459,0.884308,0.813781,0.962847,0.981512
Random Forest,0.333485,0.820933,0.68272,0.924427,0.979915
Neural Network (MLP),0.13165,0.92931,0.891839,0.975604,0.988613
XGBoost,0.123201,0.933846,0.909743,0.980853,0.991454
Random Forest (Tuned),0.083713,0.95505,0.937513,0.985088,0.991758
