<a href="https://colab.research.google.com/github/BRV12G/Final_year_Project/blob/main/fitness_random.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Importing necessary libraries
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split

# Read the diet/exercise dataset from the /content directory.
df = pd.read_csv('/content/updated_cleaned_diet_split_exercises.csv')

# --------- Preprocessing Categorical Inputs ---------

# Input columns whose values are replaced in place by integer codes.
categorical_inputs = [
    'Gender',
    'Hypertension',
    'Diabetes',
    'BMI_Class',
    'Fitness_Goal',
    'Fitness_Type',
]

# One fitted LabelEncoder per column is kept so the integer codes can be
# decoded back to the original labels later.
label_encoders = {name: LabelEncoder().fit(df[name]) for name in categorical_inputs}
for name, encoder in label_encoders.items():
    df[name] = encoder.transform(df[name])

# --------- Preprocessing Numeric Inputs ---------

# Derived feature: BMI = weight / height^2. Height is divided by 100 first;
# per the original comment this converts centimetres to metres
# (assumes Weight is in kilograms -- TODO confirm against the dataset).
height_m = df['Height'] / 100
df['BMI'] = df['Weight'] / height_m ** 2

# Standardize the numeric input columns in place; the fitted scaler is kept
# in `scaler` for later reuse.
numeric_inputs = ['Age', 'Height', 'Weight', 'BMI', 'Duration_of_Workout']
scaler = StandardScaler().fit(df[numeric_inputs])
df[numeric_inputs] = scaler.transform(df[numeric_inputs])

# --------- Preprocessing Categorical Outputs ---------

# Target columns that are predicted as classes.
categorical_outputs = [
    'Exercise1',
    'Exercise2',
    'Exercise3',
    'Equipment1',
    'Equipment2',
    'd_vegetables',
    'd_juice',
    'd_proteinintake',
]

# Fit one encoder per target and keep it, so predicted integer codes can be
# translated back to the original labels.
output_label_encoders = {
    target: LabelEncoder().fit(df[target]) for target in categorical_outputs
}
for target, encoder in output_label_encoders.items():
    df[target] = encoder.transform(df[target])

# --------- Preprocessing Numeric Outputs ---------

# Regression targets, standardized in place.
numeric_outputs = ['Calories_Burnt', 'Water_Intake(Litres)']

# Use a dedicated scaler for the targets. Refitting the shared `scaler` here
# would overwrite the statistics learned from the input features, so neither
# new inputs could be transformed nor predictions inverse-transformed back to
# their original units afterwards.
output_scaler = StandardScaler()
df[numeric_outputs] = output_scaler.fit_transform(df[numeric_outputs])

# --------- Splitting the Dataset ---------

# Column groups for the model inputs and for the two kinds of targets.
input_columns = categorical_inputs + numeric_inputs
output_columns = categorical_outputs + numeric_outputs

X = df[input_columns]
y_categorical = df[categorical_outputs]
y_numeric = df[numeric_outputs]

# Split features and both target frames in ONE call so that the rows of every
# partition are aligned by construction, instead of relying on two separate
# calls happening to use identical arguments.
(
    X_train, X_test,
    y_train_categorical, y_test_categorical,
    y_train_numeric, y_test_numeric,
) = train_test_split(
    X, y_categorical, y_numeric, test_size=0.2, random_state=42
)

# Persist every processed partition as CSV (without the index column) in the
# current working directory.
processed_partitions = {
    'X_train.csv': X_train,
    'X_test.csv': X_test,
    'y_train_categorical.csv': y_train_categorical,
    'y_test_categorical.csv': y_test_categorical,
    'y_train_numeric.csv': y_train_numeric,
    'y_test_numeric.csv': y_test_numeric,
}
for csv_name, partition in processed_partitions.items():
    partition.to_csv(csv_name, index=False)

print("\nPreprocessing completed!")


    # Replace `y_train_numeric` with relevant data





from sklearn.multioutput import MultiOutputClassifier, MultiOutputRegressor
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor

# Base random forests, seeded for reproducible results.
forest_classifier = RandomForestClassifier(random_state=42)
forest_regressor = RandomForestRegressor(random_state=42)

# The multi-output wrappers train on all target columns at once:
# `classifier` handles the encoded categorical targets, `regressor` the
# scaled numeric ones. `fit` returns the fitted wrapper itself.
classifier = MultiOutputClassifier(forest_classifier).fit(X_train, y_train_categorical)
regressor = MultiOutputRegressor(forest_regressor).fit(X_train, y_train_numeric)





from sklearn.metrics import accuracy_score, classification_report

# Predicted class codes for the test set; column i of the result belongs to
# the i-th categorical target.
y_pred_categorical = classifier.predict(X_test)

# Print one precision/recall/F1 report per target, pairing each true column
# with its prediction column (columns are walked via the transposed array).
for col_name, col_predictions in zip(y_test_categorical.columns, y_pred_categorical.T):
    print(f"Classification Report for {col_name}:\n")
    print(classification_report(y_test_categorical[col_name], col_predictions))
    print("-" * 50)


from sklearn.metrics import accuracy_score

# Accuracy of each categorical target, in column order.
accuracies = [
    accuracy_score(y_test_categorical[name], y_pred_categorical[:, position])
    for position, name in enumerate(y_test_categorical.columns)
]
for name, score in zip(y_test_categorical.columns, accuracies):
    print(f"Accuracy for {name}: {score:.2f}")

# Unweighted mean of the per-target accuracies.
average_accuracy = np.mean(accuracies)
print(f"\nAverage Accuracy Across Outputs: {average_accuracy:.2f}")


from sklearn.metrics import mean_squared_error, r2_score

# Predict the numeric targets for the test set.
y_pred_numeric = regressor.predict(X_test)

# Both metrics are computed on the standardized targets (the numeric outputs
# were scaled during preprocessing), so the MSE is not in original units.
mse = mean_squared_error(y_test_numeric, y_pred_numeric)
r2 = r2_score(y_test_numeric, y_pred_numeric)

for metric_label, metric_value in (("Mean Squared Error:", mse), ("R² Score:", r2)):
    print(metric_label, metric_value)




Preprocessing completed!
Classification Report for Exercise1:

              precision    recall  f1-score   support

           0       0.18      0.22      0.20       500
           1       0.19      0.19      0.19       510
           2       0.19      0.17      0.18       479
           3       0.17      0.15      0.16       493
           4       0.15      0.15      0.15       463
           5       0.13      0.12      0.12       473

    accuracy                           0.17      2918
   macro avg       0.17      0.17      0.17      2918
weighted avg       0.17      0.17      0.17      2918

--------------------------------------------------
Classification Report for Exercise2:

              precision    recall  f1-score   support

           0       0.15      0.17      0.16       463
           1       0.18      0.22      0.20       500
           2       0.17      0.15      0.16       493
           3       0.13      0.13      0.13       473
           4       0.19      0.16

In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.multioutput import MultiOutputClassifier, MultiOutputRegressor
from sklearn.metrics import accuracy_score, classification_report, mean_squared_error, r2_score
from sklearn.model_selection import cross_val_score

# Read the diet/exercise dataset from the /content directory.
df = pd.read_csv('/content/updated_cleaned_diet_split_exercises.csv')

# --------- Preprocessing Categorical Inputs ---------
# Input columns whose values are replaced in place by integer codes.
categorical_inputs = [
    'Gender',
    'Hypertension',
    'Diabetes',
    'BMI_Class',
    'Fitness_Goal',
    'Fitness_Type',
]

# One fitted LabelEncoder per column is kept so the codes can be decoded later.
label_encoders = {name: LabelEncoder().fit(df[name]) for name in categorical_inputs}
for name, encoder in label_encoders.items():
    df[name] = encoder.transform(df[name])

# --------- Preprocessing Numeric Inputs ---------
# BMI = weight / height^2; Height is divided by 100 first (cm -> m according
# to the original comment; assumes Weight is in kilograms -- TODO confirm).
height_m = df['Height'] / 100
df['BMI'] = df['Weight'] / height_m ** 2

# Standardize the numeric inputs in place and keep the fitted scaler.
numeric_inputs = ['Age', 'Height', 'Weight', 'BMI', 'Duration_of_Workout']
scaler = StandardScaler().fit(df[numeric_inputs])
df[numeric_inputs] = scaler.transform(df[numeric_inputs])

# --------- Preprocessing Categorical Outputs ---------
# Target columns that are predicted as classes.
categorical_outputs = [
    'Exercise1', 'Exercise2', 'Exercise3',
    'Equipment1', 'Equipment2',
    'd_vegetables', 'd_juice', 'd_proteinintake'
]

# Keep one fitted encoder per target so predicted codes can be decoded.
output_label_encoders = {}
for col in categorical_outputs:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    output_label_encoders[col] = le

# --------- Preprocessing Numeric Outputs ---------
numeric_outputs = ['Calories_Burnt', 'Water_Intake(Litres)']

# Use a dedicated scaler for the regression targets. Refitting the shared
# `scaler` here would overwrite the statistics learned from the input
# features, so neither new inputs could be transformed nor predictions
# inverse-transformed back to their original units afterwards.
output_scaler = StandardScaler()
df[numeric_outputs] = output_scaler.fit_transform(df[numeric_outputs])

# --------- Splitting the Dataset ---------
# Column groups for the model inputs and for the two kinds of targets.
input_columns = categorical_inputs + numeric_inputs
output_columns = categorical_outputs + numeric_outputs

X = df[input_columns]
y_categorical = df[categorical_outputs]
y_numeric = df[numeric_outputs]

# Split features and both target frames in ONE call so that the rows of every
# partition are aligned by construction, instead of relying on two separate
# calls happening to use identical arguments.
(
    X_train, X_test,
    y_train_categorical, y_test_categorical,
    y_train_numeric, y_test_numeric,
) = train_test_split(
    X, y_categorical, y_numeric, test_size=0.2, random_state=42
)

# --------- Model Definitions ---------
# Starting hyperparameters shared by both forests; the grid search that
# follows explores alternatives for the last three of them.
forest_settings = dict(
    random_state=42,
    n_estimators=100,
    max_depth=10,
    min_samples_split=4,
)

# Multi-output wrappers around a random forest for the categorical and the
# numeric targets respectively.
classifier = MultiOutputClassifier(RandomForestClassifier(**forest_settings))
regressor = MultiOutputRegressor(RandomForestRegressor(**forest_settings))

# --------- Hyperparameter Tuning Using GridSearchCV ---------
def mean_output_accuracy(estimator, X_eval, y_eval):
    """Score a multi-output classifier by its accuracy averaged over targets.

    scikit-learn's built-in ``'accuracy'`` scorer rejects
    multiclass-multioutput targets, which made every grid-search candidate
    score NaN. Because all target columns have the same number of rows, the
    fraction of matching cells equals the mean of the per-column accuracies.

    Parameters: a fitted estimator, the evaluation features and the true
    targets (one column per output). Returns a float in [0, 1].
    """
    predicted = np.asarray(estimator.predict(X_eval))
    expected = np.asarray(y_eval)
    return float(np.mean(predicted == expected))


# For the classifier. The `estimator__` prefix routes each parameter to the
# random forest wrapped by the multi-output estimator.
param_grid_classifier = {
    'estimator__n_estimators': [100, 200, 300],
    'estimator__max_depth': [10, 20, None],
    'estimator__min_samples_split': [2, 4, 6]
}

grid_search_classifier = GridSearchCV(
    classifier, param_grid_classifier, cv=3, n_jobs=-1, scoring=mean_output_accuracy
)
grid_search_classifier.fit(X_train, y_train_categorical)

# For the regressor (same search space, scored by negated MSE so that
# "greater is better" holds).
param_grid_regressor = {
    'estimator__n_estimators': [100, 200, 300],
    'estimator__max_depth': [10, 20, None],
    'estimator__min_samples_split': [2, 4, 6]
}

grid_search_regressor = GridSearchCV(
    regressor, param_grid_regressor, cv=3, n_jobs=-1, scoring='neg_mean_squared_error'
)
grid_search_regressor.fit(X_train, y_train_numeric)

# --------- Model Evaluation ---------
# Best estimators found by the two grid searches (refit on the training set).
classifier_best = grid_search_classifier.best_estimator_
regressor_best = grid_search_regressor.best_estimator_

# Predictions on the held-out test set; column i of the categorical
# predictions belongs to the i-th categorical target.
y_pred_categorical = classifier_best.predict(X_test)
y_pred_numeric = regressor_best.predict(X_test)

# --------- Classification Evaluation ---------
for col_idx, col_name in enumerate(y_test_categorical.columns):
    print(f"Classification Report for {col_name}:\n")
    # zero_division=0 reports 0 for classes that were never predicted, which
    # is the value the default already uses, but without emitting an
    # undefined-metric warning for every such class.
    print(classification_report(
        y_test_categorical[col_name],
        y_pred_categorical[:, col_idx],
        zero_division=0
    ))
    print("-" * 50)

# Calculate accuracy for each column
accuracies = []
for col_idx, col_name in enumerate(y_test_categorical.columns):
    acc = accuracy_score(y_test_categorical[col_name], y_pred_categorical[:, col_idx])
    accuracies.append(acc)
    print(f"Accuracy for {col_name}: {acc:.2f}")

# Unweighted mean of the per-target accuracies.
average_accuracy = np.mean(accuracies)
print(f"\nAverage Accuracy Across Outputs: {average_accuracy:.2f}")

# --------- Regression Evaluation ---------
# Computed on the standardized targets (the numeric outputs were scaled
# during preprocessing), so the MSE is not in original units.
mse = mean_squared_error(y_test_numeric, y_pred_numeric)
r2 = r2_score(y_test_numeric, y_pred_numeric)

print("\nMean Squared Error:", mse)
print("R² Score:", r2)

# --------- Cross-validation ---------
def _cv_mean_output_accuracy(estimator, X_eval, y_eval):
    """Return the fraction of correctly predicted cells over all targets.

    The built-in ``'accuracy'`` scorer does not support multiclass-multioutput
    targets and yields NaN for every fold; with equally sized columns this
    value equals the mean of the per-target accuracies.
    """
    return float(np.mean(np.asarray(estimator.predict(X_eval)) == np.asarray(y_eval)))


cross_val_score_classifier = cross_val_score(
    classifier_best, X, y_categorical, cv=3, scoring=_cv_mean_output_accuracy
)
cross_val_score_regressor = cross_val_score(
    regressor_best, X, y_numeric, cv=3, scoring='neg_mean_squared_error'
)

print(f"Cross-Validation Accuracy (Classifier): {np.mean(cross_val_score_classifier):.2f}")
# The regressor is scored with *negated* MSE, not R²; flip the sign and label
# the number for what it is.
print(f"Cross-Validation MSE (Regressor): {-np.mean(cross_val_score_regressor):.2f}")

# --------- Save processed data ---------
# Write every partition as CSV (without the index column) in the current
# working directory.
processed_partitions = {
    'X_train.csv': X_train,
    'X_test.csv': X_test,
    'y_train_categorical.csv': y_train_categorical,
    'y_test_categorical.csv': y_test_categorical,
    'y_train_numeric.csv': y_train_numeric,
    'y_test_numeric.csv': y_test_numeric,
}
for csv_name, partition in processed_partitions.items():
    partition.to_csv(csv_name, index=False)

print("\nPreprocessing and model training completed!")


 nan nan nan nan nan nan nan nan nan]


Classification Report for Exercise1:

              precision    recall  f1-score   support

           0       0.17      0.30      0.22       500
           1       0.16      0.16      0.16       510
           2       0.17      0.11      0.14       479
           3       0.15      0.10      0.12       493
           4       0.15      0.12      0.13       463
           5       0.16      0.18      0.17       473

    accuracy                           0.16      2918
   macro avg       0.16      0.16      0.16      2918
weighted avg       0.16      0.16      0.16      2918

--------------------------------------------------
Classification Report for Exercise2:

              precision    recall  f1-score   support

           0       0.15      0.12      0.13       463
           1       0.17      0.30      0.22       500
           2       0.15      0.10      0.12       493
           3       0.16      0.18      0.17       473
           4       0.17      0.11      0.14       479
     

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/sklearn/metrics/_scorer.py", line 140, in __call__
    score = scorer._score(
  File "/usr/local/lib/python3.10/dist-packages/sklearn/metrics/_scorer.py", line 388, in _score
    return self._sign * self._score_func(y_true, y_pred, **scoring_kwargs)
  File "/usr/local/lib/python3.10/dist-packages/sklearn/utils/_param_validation.py", line 216, in wrapper
    return func(*args, **kwargs)
  File "/usr/local/lib/python3.10/dist-packages/sklearn/metrics/_classification.py", line 

Cross-Validation Accuracy (Classifier): nan
Cross-Validation R² (Regressor): -0.64

Preprocessing and model training completed!


In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score, mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.utils import class_weight
from imblearn.over_sampling import SMOTE

# Load the dataset (adjust path as necessary)
df = pd.read_csv('/content/updated_cleaned_diet_split_exercises.csv')

# Columns predicted as classes; every other column is used as a feature.
target_columns = [
    'Exercise1', 'Exercise2', 'Exercise3', 'Equipment1', 'Equipment2',
    'd_vegetables', 'd_juice', 'd_proteinintake'
]

# Median imputation is only defined for numbers; running it over the whole
# frame fails on text columns ("could not convert string to float: 'Male'").
# Impute numeric columns with the median and the rest with their mode.
numeric_columns = df.select_dtypes(include='number').columns.tolist()
text_columns = [col for col in df.columns if col not in numeric_columns]

df_imputed = df.copy()
imputer = SimpleImputer(strategy='median')
if numeric_columns:
    df_imputed[numeric_columns] = imputer.fit_transform(df[numeric_columns])
for col in text_columns:
    most_common = df[col].mode(dropna=True)
    if not most_common.empty:
        df_imputed[col] = df[col].fillna(most_common.iloc[0])

# Scaling and the random forest need numeric data: replace text features and
# all targets by integer category codes (assigned in sorted label order).
for col in dict.fromkeys(text_columns + target_columns):
    df_imputed[col] = df_imputed[col].astype('category').cat.codes

# Define features and target variables
# NOTE(review): X keeps 'Calories_Burnt' and 'Water_Intake(Litres)', which
# the earlier cells treat as outputs -- confirm they should be inputs here.
X = df_imputed.drop(columns=target_columns)
y = df_imputed[target_columns]

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# SMOTE resamples against a single 1-D target and cannot be applied to the
# eight-column target frame, so no resampling is done. The per-class supports
# printed by the earlier cells (roughly 460-510 each) are already balanced.
# The *_smote names are kept because the following steps refer to them.
X_train_smote, y_train_smote = X_train, y_train

# Scale the features (statistics are learned from the training split only)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_smote)
X_test_scaled = scaler.transform(X_test)

# Initialize the RandomForestClassifier (handles multi-output targets natively)
rf = RandomForestClassifier(random_state=42)

# Define parameter grid for RandomizedSearchCV.
# 'auto' was dropped from max_features: for classifiers it was an alias of
# 'sqrt' and it is no longer accepted by current scikit-learn releases.
param_dist = {
    'n_estimators': [100, 200, 300],
    'max_features': ['sqrt', 'log2'],
    'max_depth': [10, 20, 30, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'bootstrap': [True, False]
}


def multioutput_mean_accuracy(estimator, X_eval, y_eval):
    """Return the fraction of correctly predicted cells over all targets.

    The estimator's default score relies on ``accuracy_score``, which does
    not support multiclass-multioutput targets, so every candidate would be
    scored NaN. With equally sized columns this value equals the mean of the
    per-target accuracies.
    """
    return float(np.mean(np.asarray(estimator.predict(X_eval)) == np.asarray(y_eval)))


# Perform RandomizedSearchCV
random_search = RandomizedSearchCV(
    estimator=rf,
    param_distributions=param_dist,
    n_iter=100,
    cv=3,
    verbose=2,
    random_state=42,
    n_jobs=-1,
    scoring=multioutput_mean_accuracy,
)
random_search.fit(X_train_scaled, y_train_smote)

# Get the best model (refit on the full training data by the search)
best_rf = random_search.best_estimator_

# Predict on the test set; column i belongs to the i-th target column
y_pred = best_rf.predict(X_test_scaled)

# Classification Report for each output (Exercise1, Exercise2, Exercise3, etc.)
for position, column in enumerate(y.columns):
    print(f"Classification Report for {column}:\n")
    print(classification_report(y_test[column], y_pred[:, position]))
    print('-' * 50)

# Accuracy for each output
per_output_accuracy = []
for position, column in enumerate(y.columns):
    accuracy = accuracy_score(y_test[column], y_pred[:, position])
    per_output_accuracy.append(accuracy)
    print(f"Accuracy for {column}: {accuracy:.2f}")

# Overall model performance.
# accuracy_score(y_test, y_pred) raises for multiclass-multioutput targets, so
# the average is taken over the per-output accuracies instead. MSE and R² are
# not reported: the targets are nominal classes, and distances between their
# codes carry no meaning. The exact-match rate (all outputs of a row correct)
# is shown as the strict overall measure.
avg_accuracy = float(np.mean(per_output_accuracy))
exact_match = float(np.mean(np.all(np.asarray(y_test) == np.asarray(y_pred), axis=1)))

print(f"\nAverage Accuracy Across Outputs: {avg_accuracy:.2f}")
print(f"Exact-Match Accuracy (all outputs correct): {exact_match:.2f}")

# Optionally, handle class weights for RandomForestClassifier if the imbalance is still present
# class_weights = class_weight.compute_class_weight('balanced', classes=np.unique(y_train_smote), y=y_train_smote)
# rf = RandomForestClassifier(class_weight='balanced', random_state=42)


ValueError: Cannot use median strategy with non-numeric data:
could not convert string to float: 'Male'