In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, KFold
from sklearn.preprocessing import StandardScaler
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from tensorflow.keras.regularizers import l2

# Load the dataset
data = pd.read_csv('bmiprediction.csv')

# Handling missing values: Fill missing values with the mean of the column
data.fillna(data.mean(), inplace=True)

# Separate features and target variable
X = data.drop(columns=['BMI'])
y = data['BMI']

# Log transform the target variable to reduce skewness
y_log = np.log1p(y)

# Identify numerical columns (all columns in this case)
numerical_features = X.columns.tolist()

# Preprocess the data: StandardScaling for numerical features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Ensure the preprocessed data is dense
X_scaled_dense = X_scaled if isinstance(X_scaled, np.ndarray) else X_scaled.toarray()

# Split the data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X_scaled_dense, y_log, test_size=0.2, random_state=42)

# Define KFold cross-validation
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Convert y_train to a numpy array
y_train = y_train.to_numpy()

# Create a function to build the model
# def build_model():
#     model = Sequential([
#         Dense(256, activation='relu', input_shape=(X_train.shape[1],), kernel_regularizer=l2(0.01)),
#         Dropout(0.3),
#         BatchNormalization(),
#         Dense(128, activation='tanh', kernel_regularizer=l2(0.01)),
#         Dropout(0.3),
#         BatchNormalization(),
#         Dense(64, activation='relu', kernel_regularizer=l2(0.01)),
#         Dropout(0.3),
#         Dense(1)
#     ])
#     optimizer = Adam(learning_rate=0.001)
#     model.compile(optimizer=optimizer, loss='mean_squared_error', metrics=['mean_absolute_error'])
#     return model

# # Train and evaluate the model using cross-validation
# mse_scores = []
# mae_scores = []

# for train_index, val_index in kf.split(X_train):
#     X_train_fold, X_val_fold = X_train[train_index], X_train[val_index]
#     y_train_fold, y_val_fold = y_train[train_index], y_train[val_index]
    
#     model = build_model()
    
#     early_stopping = EarlyStopping(patience=20, restore_best_weights=True)
#     reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.2, patience=10, min_lr=0.00001)
    
#     history = model.fit(X_train_fold, y_train_fold, epochs=200, validation_data=(X_val_fold, y_val_fold), callbacks=[early_stopping, reduce_lr], verbose=0)
    
#     val_mse, val_mae = model.evaluate(X_val_fold, y_val_fold, verbose=0)
#     mse_scores.append(val_mse)
#     mae_scores.append(val_mae)

# # Average scores
# avg_mse = np.mean(mse_scores)
# avg_mae = np.mean(mae_scores)

# print(f"Average Mean Squared Error: {avg_mse}")
# print(f"Average Mean Absolute Error: {avg_mae}")

# # Train the final model on the entire training set
# model = build_model()
# history = model.fit(X_train, y_train, epochs=1000, validation_data=(X_test, y_test), callbacks=[early_stopping, reduce_lr], verbose=0)

# # Evaluate the model on the test data
# test_mse, test_mae = model.evaluate(X_test, y_test, verbose=0)

# print("Test Mean Squared Error:", test_mse)
# print("Test Mean Absolute Error:", test_mae)

# # Predict individually for each row in the dataset
# predictions_exp = []

# for i in range(X.shape[0]):
#     sample_data = X.iloc[i:i+1, :]  # Take one row at a time
#     sample_data_scaled = scaler.transform(sample_data)
#     sample_prediction = model.predict(sample_data_scaled)
#     sample_prediction_exp = np.expm1(sample_prediction)
#     predictions_exp.append(sample_prediction_exp[0][0])

# # Inverse log transform the actual values
# y_exp = np.expm1(y_log)

# # Create a DataFrame for actual vs predicted values
# comparison_df = pd.DataFrame({'Actual': y_exp, 'Predicted': predictions_exp})

# # Define the function to plot and generate table
# def plot_and_table():
#     plt.figure(figsize=(10, 6))
#     plt.plot(comparison_df['Actual'], label='Actual BMPI')
#     plt.plot(comparison_df['Predicted'], label='Predicted BMPI')
#     plt.xlabel(" Dataset Index")
#     plt.ylabel("BMPI")
#     plt.title("Actual vs. Predicted BMPI")
#     plt.legend()
#     plt.show()
    
#     print(comparison_df)

# # Call the function to plot and print the table
# plot_and_table()

array([[-1.34258533e+00, -2.54148127e-01, -2.36183364e-01,
        -1.99336874e-01],
       [-1.94073581e-02,  4.57935878e-02, -1.04808843e-01,
        -4.41592873e-01],
       [-7.26149301e-01, -2.33159541e-01, -5.86607909e-01,
        -4.33925667e-01],
       [ 1.43333997e+00, -1.35154104e-01,  1.13100291e-01,
        -5.86168025e-01],
       [-2.54988006e-01, -1.47859833e-01, -2.84452908e-01,
        -5.10010739e-01],
       [ 7.26598026e-01, -4.86227236e-02, -8.47086749e-02,
        -4.24272925e-01],
       [-3.33514888e-01, -1.80980112e-01, -1.69287750e-01,
        -3.96840455e-01],
       [ 1.11923244e+00, -2.19530894e-01, -9.99785946e-02,
        -5.32664539e-01],
       [ 9.85736738e-01, -1.70343145e-01, -1.32450533e-01,
         2.39636893e+00],
       [-3.31753642e+00,  6.12868478e+00,  6.04377431e+00,
        -1.12446971e+00],
       [-1.27976382e+00, -2.57543307e-01, -4.06964161e-01,
         3.45850922e-01],
       [ 5.69544261e-01, -2.19303508e-01, -2.01322129e-02,
      

In [2]:
model.predict(X_test)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 147ms/step


array([[-0.22118059],
       [ 0.00630469],
       [-0.10254487],
       [-0.05461688],
       [-0.22526953],
       [-0.11517841],
       [-0.05912982],
       [-0.09314869]], dtype=float32)

In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, KFold
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from tensorflow.keras.regularizers import l2

# Load the dataset
file_path = "bmiprediction.csv"
data = pd.read_csv(file_path)

# Handling missing values: Fill missing values with the mean of the column
data.fillna(data.mean(), inplace=True)

# Separate features and target variable
X = data.drop(columns=['BMI'])
y = data['BMI']

# Log transform the target variable to reduce skewness
y = np.log1p(y)

# Identify numerical columns (all columns in this case)
numerical_features = X.columns.tolist()

# Preprocess the data: StandardScaling for numerical features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Ensure the preprocessed data is dense
X_scaled_dense = X_scaled if isinstance(X_scaled, np.ndarray) else X_scaled.toarray()

# Split the data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X_scaled_dense, y, test_size=0.2, random_state=42)

# Define KFold cross-validation
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Convert y_train to a numpy array
# y_train = y_train.to_numpy()

# Create a function to build the model
# def build_model():
#     model = Sequential([
#     Dense(256, activation='relu', input_shape=(X_train.shape[1],), kernel_regularizer=l2(0.01)),
#      Dropout(0.3),
#      BatchNormalization(),
#      Dense(128, activation='tanh', kernel_regularizer=l2(0.01)),
#      Dropout(0.3),
#      BatchNormalization(),
#      Dense(64, activation='relu', kernel_regularizer=l2(0.01)),
#      Dropout(0.3),
#       Dense(1)
#   ])
#     optimizer = Adam(learning_rate=0.001)
#     model.compile(optimizer=optimizer, loss='mean_squared_error', metrics=['mean_absolute_error'])
#     return model

# # Train and evaluate the model using cross-validation
# mse_scores = []
# mae_scores = []

# for train_index, val_index in kf.split(X_train):
#     X_train_fold, X_val_fold = X_train[train_index], X_train[val_index]
#     y_train_fold, y_val_fold = y_train[train_index], y_train[val_index]

#     model = build_model()

#     early_stopping = EarlyStopping(patience=20, restore_best_weights=True)
#     reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.2, patience=10, min_lr=0.00001)

#     history = model.fit(X_train_fold, y_train_fold, epochs=1000, validation_data=(X_val_fold, y_val_fold), callbacks=[early_stopping, reduce_lr], verbose=0)

#     val_mse, val_mae = model.evaluate(X_val_fold, y_val_fold, verbose=0)
#     mse_scores.append(val_mse)
#     mae_scores.append(val_mae)

# # Average scores
# avg_mse = np.mean(mse_scores)
# avg_mae = np.mean(mae_scores)

# print(f"Average Mean Squared Error: {avg_mse}")
# print(f"Average Mean Absolute Error: {avg_mae}")

# Evaluate the model on the test data
# model = build_model()
# history = model.fit(X_train, y_train, epochs=1000, validation_data=(X_test, y_test), callbacks=[early_stopping, reduce_lr], verbose=0)
# test_mse, test_mae = model.evaluate(X_test, y_test, verbose=0)

# print("Test Mean Squared Error:", test_mse)
# print("Test Mean Absolute Error:", test_mae)

# # Predict on train and test data
# y_train_pred = model.predict(X_train).flatten()
# y_test_pred = model.predict(X_test).flatten()

# Inverse log transform the predictions and the actual values
# y_train_exp = np.expm1(y_train)
# y_train_pred_exp = np.expm1(y_train_pred)
# y_test_exp = np.expm1(y_test)
# y_test_pred_exp = np.expm1(y_test_pred)

# # Create a DataFrame for actual vs predicted values for training set
# train_comparison_df = pd.DataFrame({'Actual': y_train_exp, 'Predicted': y_train_pred_exp})

# # Create a DataFrame for actual vs predicted values for test set
# test_comparison_df = pd.DataFrame({'Actual': y_test_exp, 'Predicted': y_test_pred_exp})

# # Concatenate train and test comparison DataFrames
# comparison_df = pd.concat([train_comparison_df, test_comparison_df], keys=['Train', 'Test'])

# # Calculate accuracy for each prediction (1 - absolute percentage error)
# accuracy_train = 1 - np.abs((y_train_exp - y_train_pred_exp) / y_train_exp)
# accuracy_test = 1 - np.abs((y_test_exp - y_test_pred_exp) / y_test_exp)

# # Average accuracy
# average_accuracy_train = np.mean(accuracy_train)
# average_accuracy_test = np.mean(accuracy_test)
# average_accuracy = np.mean([average_accuracy_train, average_accuracy_test])

# print("Average Accuracy (Training):", average_accuracy_train)
# print("Average Accuracy (Testing):", average_accuracy_test)
# print("Overall Average Accuracy:", average_accuracy)

# # Visualize actual vs. predicted values
# plt.figure(figsize=(10, 6))
# plt.scatter(y_test_exp, y_test_pred_exp, alpha=0.5)
# plt.xlabel("Actual BMI")
# plt.ylabel("Predicted BMI")
# plt.title("Actual vs. Predicted BMI")
# plt.plot([y_test_exp.min(), y_test_exp.max()], [y_test_exp.min(), y_test_exp.max()], 'k--', lw=2)
# plt.show()

# # Plot training history
# history_dict = history.history

# # Check if 'loss' and 'val_loss' are in the history dictionary
# if 'loss' in history_dict and 'val_loss' in history_dict:
#     loss_values = history_dict['loss']
#     val_loss_values = history_dict['val_loss']
#     epochs = range(1, len(loss_values) + 1)

#     plt.figure(figsize=(10, 6))
#     plt.plot(epochs, loss_values, 'bo', label='Training loss')
#     plt.plot(epochs, val_loss_values, 'b', label='Validation loss')
#     plt.title('Training and validation loss')
#     plt.xlabel('Epochs')
#     plt.ylabel('Loss')
#     plt.legend()
#     plt.show()
# else:
#     print("No loss data found in history.")

# # Print comparison table
# print(comparison_df)

# # Save the comparison table to a CSV file
# comparison_df.to_csv("actual_vs_predicted_bmi.csv")

# # Test the model with a sample data
# sample_data = np.array([[2.5, 500, 300, 1200000]])
# sample_data_scaled = scaler.transform(sample_data)
# sample_prediction = model.predict(sample_data_scaled)
# sample_prediction_exp = np.expm1(sample_prediction)
# print("Sample Data Prediction (after inverse transform):", sample_prediction_exp)
y



0     4.304065
1     4.369448
2     4.398146
3     4.421247
4     4.409155
5     4.460144
6     4.513055
7     4.535820
8     4.535820
9     4.623010
10    4.614130
11    4.637637
12    4.623992
13    4.627910
14    4.641502
15    4.638605
16    4.638605
17    4.662495
18    4.641502
19    4.696837
20    4.694096
21    4.653008
22    4.643429
23    4.624973
24    4.615121
25    4.497585
26    4.422449
27    4.675442
28    4.556610
29    4.542976
30    4.585885
31    4.628789
32    4.644295
33    4.565597
34    4.354013
35    4.568207
36    4.568207
37    4.568207
38    4.568207
Name: BMI, dtype: float64

In [47]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LassoCV
from sklearn.model_selection import cross_val_score
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
import xgboost as xgb


kNeighbors = KNeighborsRegressor()
svm = SVR()
LinearRegression = LassoCV()
randomForest = RandomForestRegressor()
xgboost = xgb.XGBRegressor(learning_rate=0.199999999999, n_estimators=184)

models = [{'name': "kNeighbors", "model": kNeighbors},
          {"name": "support vector machine", "model": svm},

          {"name": "LassoCV regression", "model": LinearRegression},
          {"name": "random forest", "model": randomForest
           }, {"name": "xgboost", "model": xgboost}]




randomForest.fit(X_train,y_train)
y_preds = randomForest.predict(X_test)

# pd.DataFrame({'y_pred': np.expm1(y_preds), 'y_test': np.expm1(y_test)})

result_dict = []
sum = 0
avg = 0
for i in range(0,5):
    
    score = randomForest.score(X_test, y_test)
    sum+= score
    result_dict.append(score)
    avg = sum/ len(result_dict)

print(avg)    

0.55918546649421


In [48]:
import pickle

with open('model1.pkl', 'wb') as f:
    pickle.dump(randomForest, f)
with open('scaler.pkl', 'wb') as f:
    pickle.dump(scaler, f)

In [4]:
X_test

array([[ 6.79481896e-01, -1.14901177e-01, -1.68083513e-01,
         2.24937496e+00],
       [ 4.35911765e-17,  1.26719381e-16, -1.80044481e-16,
         0.00000000e+00],
       [ 7.65861467e-01, -1.55003532e-01,  4.14415938e-01,
        -8.99529023e-01],
       [ 7.26598026e-01, -1.96522588e-01, -2.77747573e-01,
        -5.06669711e-01],
       [-2.51061662e-01, -2.38503104e-01, -2.44269992e-01,
         1.42925560e+00],
       [-9.10687475e-01, -3.11580834e-01, -4.16347196e-01,
         4.86981033e-01],
       [ 6.48071143e-01, -8.76929259e-02,  9.94932440e-02,
        -6.81081575e-01],
       [-8.94982098e-01, -2.94699680e-01, -1.16692838e-01,
         6.21604652e-01]])