In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.metrics import accuracy_score, mean_squared_error
from imblearn.over_sampling import SMOTE
import joblib

# ---------------------------------------------------------------------------
# Data loading and label construction
# ---------------------------------------------------------------------------
# Read the raw ingredient table from the working directory.
df = pd.read_csv("./Food_updated_1.csv")

# Lowercase ingredient names so that case differences do not produce
# separate labels when the names are encoded.
df['Ingredient_Name'] = df['Ingredient_Name'].str.lower()

# Coerce the risk score to numeric; values that cannot be parsed (and values
# that were already missing) become 0.
risk_score = pd.to_numeric(df['Health_Risk_Score'], errors='coerce')
df['Health_Risk_Score'] = risk_score.fillna(0)

# Binary flags derived from the cleaned score: 1 when the score is on the low
# side of the cutoff, 0 otherwise.
# NOTE(review): the two flags use different cutoffs (> 3 vs >= 4), so they
# disagree for scores strictly between 3 and 4 - confirm this is intended.
df['Recommended_Daily_Intake'] = (df['Health_Risk_Score'] <= 3).astype('int64')
df['target'] = (df['Health_Risk_Score'] < 4).astype('int64')

# Encode the ingredient names as integer ids (one id per distinct name)
label_encoder = LabelEncoder()
df['Ingredient_Name_Encoded'] = label_encoder.fit_transform(df['Ingredient_Name'])

# Vectorize the description with TF-IDF.
# pandas reads empty CSV cells as NaN, and TfidfVectorizer rejects NaN
# documents. Substitute an empty string for vectorization only, so that
# df['Description'] itself is left untouched; rows without a description
# simply get an all-zero TF-IDF vector.
vectorizer = TfidfVectorizer()
description_vectors = vectorizer.fit_transform(df['Description'].fillna('')).toarray()

# Define features and targets:
# column 0 is the encoded ingredient id, the remaining columns are the
# dense TF-IDF weights of the description.
X = np.hstack((df['Ingredient_Name_Encoded'].values.reshape(-1, 1), description_vectors))
y_health_risk_score = df['Health_Risk_Score']
y_target = df['target']

# Standardize the features
# NOTE(review): the vectorizer and the scaler are fitted on every row, i.e.
# before the train/test split further down, so test-set statistics influence
# the training features. Confirm this is acceptable for the reported metrics.
scaler = StandardScaler()
X = scaler.fit_transform(X)

# Split data for Health_Risk_Score and target models.
# Both calls share the same X, test_size and random_state, so the regression
# and classification models are trained and tested on the same rows.
X_train_health, X_test_health, y_health_train, y_health_test = train_test_split(X, y_health_risk_score, test_size=0.2, random_state=2)
X_train_target, X_test_target, y_target_train, y_target_test = train_test_split(X, y_target, test_size=0.2, random_state=2)

# Handle class imbalance for target prediction.
# SMOTE interpolates between a sample and its nearest neighbours of the same
# class, so k_neighbors must be limited by the number of samples in the
# smallest class (minus the sample itself), not by the number of classes.
_, class_counts = np.unique(y_target_train, return_counts=True)
if len(class_counts) < 2 or class_counts.min() < 2:
    raise ValueError(
        "SMOTE needs at least two classes with two or more training samples each; "
        f"got class counts {class_counts.tolist()}"
    )
minority_count = int(class_counts.min())
smote = SMOTE(k_neighbors=min(5, minority_count - 1), random_state=2)
X_train_target, y_target_train = smote.fit_resample(X_train_target, y_target_train)

# Train the regression model for Health_Risk_Score with hyperparameter tuning.
# The forest is seeded (same seed as the split and SMOTE steps) so that the
# selected hyperparameters, the saved model and the reported metrics are
# reproducible from run to run.
regressor = RandomForestRegressor(random_state=2)
param_grid_regressor = {
    'n_estimators': [100, 200, 300],
    'max_depth': [10, 20, None],
    'min_samples_split': [2, 5, 10]
}
# Exhaustive search over the grid with 5-fold cross-validation; GridSearchCV
# refits the best combination on the full training split.
grid_search_regressor = GridSearchCV(estimator=regressor, param_grid=param_grid_regressor, cv=5)
grid_search_regressor.fit(X_train_health, y_health_train)
best_regressor = grid_search_regressor.best_estimator_

# Train the classification model for target with hyperparameter tuning.
# Seeded for the same reproducibility reason as the regressor.
# NOTE(review): the training data passed in here has already been oversampled
# with SMOTE, so synthetic samples can appear in the validation folds of the
# grid search. Confirm this is acceptable for model selection.
classifier = RandomForestClassifier(random_state=2)
param_grid_classifier = {
    'n_estimators': [100, 200],
    'max_depth': [10, 20, None],
    'min_samples_split': [2, 5, 10]
}
grid_search_classifier = GridSearchCV(estimator=classifier, param_grid=param_grid_classifier, cv=3)
grid_search_classifier.fit(X_train_target, y_target_train)
best_classifier = grid_search_classifier.best_estimator_

# ---------------------------------------------------------------------------
# Evaluation
# ---------------------------------------------------------------------------
# Predictions of the tuned models on the held-out test split.
health_pred = best_regressor.predict(X_test_health)
target_pred = best_classifier.predict(X_test_target)

# 5-fold cross-validation of the tuned regressor on the training split.
# The scorer returns negated MSE values, so the sign of their mean is flipped
# before taking the square root.
cv_health_scores = cross_val_score(
    best_regressor,
    X_train_health,
    y_health_train,
    cv=5,
    scoring='neg_mean_squared_error',
)
cv_health_rmse = np.sqrt(-np.mean(cv_health_scores))

# Test-split metrics: RMSE for the regressor, accuracy for the classifier.
health_rmse = np.sqrt(mean_squared_error(y_health_test, health_pred))
target_accuracy = accuracy_score(y_target_test, target_pred)

# Report the three metrics, one per line.
for report_line in (
    f"Health Risk Score RMSE (CV): {cv_health_rmse}",
    f"Health Risk Score RMSE: {health_rmse}",
    f"Target Prediction Accuracy: {target_accuracy}",
):
    print(report_line)

# Persist the tuned models and the fitted preprocessing objects with joblib.
# The mapping is file name -> object; entries are written in the order listed.
artifacts = {
    'health_risk_score_model.pkl': best_regressor,
    'target_model.pkl': best_classifier,
    'label_encoder.pkl': label_encoder,
    'vectorizer.pkl': vectorizer,
    'scaler.pkl': scaler,
    # Ingredient names and descriptions
    'ingredient_data.pkl': df[['Ingredient_Name', 'Description']],
}
for artifact_path, artifact in artifacts.items():
    joblib.dump(artifact, artifact_path)

print("Models and objects are saved")


Health Risk Score RMSE (CV): 0.3963410608474183
Health Risk Score RMSE: 0.4310247515741225
Target Prediction Accuracy: 0.9919354838709677
Models and objects are saved
