In [1]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.preprocessing import StandardScaler, OrdinalEncoder
from sklearn.metrics import r2_score, mean_absolute_error
from sklearn.model_selection import cross_val_score, KFold, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn import set_config

import pickle


In [2]:
def read_csv(file_path):
    """Read a CSV source into a DataFrame.

    Parameters
    ----------
    file_path
        Forwarded unchanged to ``pandas.read_csv`` as its first argument.

    Returns
    -------
    pandas.DataFrame
        The parsed table, using pandas' default parsing options.
    """
    frame = pd.read_csv(file_path)
    return frame

def dataset_info_statistics(data):
    """Print a structural summary and descriptive statistics for ``data``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to summarise.

    Returns
    -------
    None
        Everything is written to stdout.

    Notes
    -----
    ``DataFrame.info()`` writes its report to stdout itself and returns
    ``None``, so it is called directly. Wrapping it in ``print`` would emit a
    stray ``None`` line right after the report.
    """
    print("Dataset Information:")
    data.info()
    print("\n")
    print("Basic Statistics for Numerical Columns:")
    print(data.describe())
    print("\n")

def check_null(data):
    """Count missing values in each column of ``data``.

    A header line is printed as a side effect. The counts themselves are
    returned, not printed, so the caller chooses how to display them.

    Returns
    -------
    pandas.Series
        Number of null entries per column, indexed by column name.
    """
    missing_per_column = data.isnull().sum()
    print("Null Values in the Dataset:")
    return missing_per_column

def check_duplicates(data):
    """Report whether any row of ``data`` repeats an earlier row.

    Returns
    -------
    bool-like
        Truthy when ``data.duplicated()`` flags at least one row.
    """
    duplicate_mask = data.duplicated()
    return duplicate_mask.any()

def plot_graph(data):
    """Show one distribution plot per column of ``data``.

    Numeric columns get a histogram with a KDE overlay; ``object``-dtype
    columns get a count plot. Each figure is shown immediately with
    ``plt.show()``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame whose columns are plotted.

    Returns
    -------
    None
    """
    numerical_columns = data.select_dtypes(include=np.number).columns
    for column in numerical_columns:
        plt.figure(figsize=(5, 3))
        # ``sns.distplot`` is deprecated; ``histplot(..., kde=True)`` is the
        # documented replacement for a histogram with a density overlay.
        sns.histplot(x=data[column], kde=True)
        plt.title(f"Histogram for {column}")
        plt.xlabel(column)
        plt.ylabel("Frequency")
        plt.show()

    categorical_columns = data.select_dtypes(include='object').columns
    for column in categorical_columns:
        plt.figure(figsize=(5, 3))
        # Pass the column as ``x`` explicitly: countplot's only positional
        # parameter is ``data``, so a bare Series is not read as the category axis.
        sns.countplot(x=data[column])
        plt.title(f'Countplot for {column}')
        plt.xlabel(column)
        plt.ylabel('Count')
        plt.xticks(rotation=45)
        plt.show()

def seperate_features_target(data, target_column):
    """Split ``data`` into a feature frame and a target series.

    Parameters
    ----------
    data : pandas.DataFrame
        Full table, including the target column.
    target_column
        Label of the column to predict.

    Returns
    -------
    tuple
        ``(X, y)`` where ``X`` is ``data`` without ``target_column`` and ``y``
        is that column on its own.
    """
    # ``columns=`` already selects the axis, so no separate ``axis`` is needed.
    features = data.drop(columns=[target_column])
    target = data[target_column]
    return features, target

def perform_train_test_split(X, y, test_size=0.20, random_state=42):
    """Split features and target into train and test partitions.

    Thin wrapper around ``train_test_split`` that forwards ``test_size`` and
    ``random_state`` as keyword arguments.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)``.
    """
    split_options = {"test_size": test_size, "random_state": random_state}
    features_train, features_test, target_train, target_test = train_test_split(
        X, y, **split_options
    )
    return features_train, features_test, target_train, target_test


In [3]:
# Load both source tables and join them row-wise on the shared User_ID key.
calories = read_csv('calories.csv')
exercise = read_csv('exercise.csv')
data = pd.merge(calories, exercise, on='User_ID')

dataset_info_statistics(data)
# check_null() prints only a header and *returns* the per-column counts. A bare
# call in the middle of a cell discards that return value, so print it here.
print(check_null(data))

# 'Calories' is the regression target; everything else starts as a feature.
X, y = seperate_features_target(data, 'Calories')
# Drop the join key so it is not fed to the models as a feature.
X = X.drop(columns=['User_ID'])

X_train, X_test, y_train, y_test = perform_train_test_split(X, y, test_size=0.20, random_state=42)


Dataset Information:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15000 entries, 0 to 14999
Data columns (total 9 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   User_ID     15000 non-null  int64  
 1   Calories    15000 non-null  float64
 2   Gender      15000 non-null  object 
 3   Age         15000 non-null  int64  
 4   Height      15000 non-null  float64
 5   Weight      15000 non-null  float64
 6   Duration    15000 non-null  float64
 7   Heart_Rate  15000 non-null  float64
 8   Body_Temp   15000 non-null  float64
dtypes: float64(6), int64(2), object(1)
memory usage: 1.0+ MB
None


Basic Statistics for Numerical Columns:
            User_ID      Calories           Age        Height        Weight  \
count  1.500000e+04  15000.000000  15000.000000  15000.000000  15000.000000   
mean   1.497736e+07     89.539533     42.789800    174.465133     74.966867   
std    2.872851e+06     62.456978     16.980264     14.258114     15.03565

In [4]:
# Numeric feature columns that are standardised before modelling.
NUMERIC_FEATURES = ['Age', 'Height', 'Weight', 'Duration', 'Heart_Rate', 'Body_Temp']


def build_preprocessor():
    """Return a new, unfitted ColumnTransformer for the feature columns.

    'Gender' is ordinal-encoded and the numeric columns are standardised; any
    other column is passed through unchanged. Each call returns an independent
    instance, so pipelines never share fitted preprocessing state.
    """
    return ColumnTransformer(transformers=[
        ('ordinal', OrdinalEncoder(), ['Gender']),
        ('num', StandardScaler(), list(NUMERIC_FEATURES)),
    ], remainder='passthrough')


# Kept under its original name; this instance belongs to the linear-regression
# pipeline only.
preprocessor = build_preprocessor()

# Linear Regression Model
pipeline_lr = Pipeline([
    ('preprocessor', preprocessor),
    ('model', LinearRegression())
])

# Random Forest Regressor Model
# It gets its own preprocessor, so fitting it cannot refit the transformer used
# by pipeline_lr. A fixed random_state makes the forest, its scores and the
# pickled model reproducible across Restart & Run All.
pipeline_rf = Pipeline([
    ('preprocessor', build_preprocessor()),
    ('model', RandomForestRegressor(random_state=42))
])


In [5]:
# Shared 5-fold splitter used for the cross-validated R^2 of both models.
kfold = KFold(n_splits=5, shuffle=True, random_state=42)


def _fit_and_score(pipeline, cv):
    """Fit ``pipeline`` on the training split and score it.

    Returns ``(test_predictions, test_r2, test_mae, cv_r2_scores)``. The first
    three come from the held-out test split; the last is the per-fold R^2 from
    cross-validating over the full ``X``/``y``.
    """
    pipeline.fit(X_train, y_train)
    predictions = pipeline.predict(X_test)
    r2 = r2_score(y_test, predictions)
    mae = mean_absolute_error(y_test, predictions)
    fold_scores = cross_val_score(pipeline, X, y, cv=cv, scoring='r2')
    return predictions, r2, mae, fold_scores


# Linear Regression
y_pred_lr, r2_lr, mae_lr, cv_results_lr = _fit_and_score(pipeline_lr, kfold)
cv_mean_lr = cv_results_lr.mean()

print(f"Linear Regression - R^2: {r2_lr}, MAE: {mae_lr}, CV Mean R^2: {cv_mean_lr}")

# Random Forest Regressor
y_pred_rf, r2_rf, mae_rf, cv_results_rf = _fit_and_score(pipeline_rf, kfold)
cv_mean_rf = cv_results_rf.mean()

print(f"Random Forest Regressor - R^2: {r2_rf}, MAE: {mae_rf}, CV Mean R^2: {cv_mean_rf}")


Linear Regression - R^2: 0.9672937151257295, MAE: 8.441513553849706, CV Mean R^2: 0.9671402283675841
Random Forest Regressor - R^2: 0.9982405761530181, MAE: 1.6982566666666667, CV Mean R^2: 0.9979377270390046


In [6]:
# One hand-written observation, keyed by the feature columns the pipelines
# were fitted on.
sample_values = {
    'Gender': 'male',
    'Age': 68,
    'Height': 190.0,
    'Weight': 94.0,
    'Duration': 29.0,
    'Heart_Rate': 105.0,
    'Body_Temp': 40.8,
}
# The values are scalars, so an explicit single-row index is required.
sample = pd.DataFrame(sample_values, index=[0])

# Predict with Random Forest Regressor
result_rf = pipeline_rf.predict(sample)
print(f"Prediction (Random Forest): {result_rf}")


Prediction (Random Forest): [229.97]


In [7]:
# File the fitted random-forest pipeline is written to and read back from.
MODEL_FILE = 'pipeline_rf.pkl'

# Persist the fitted pipeline (preprocessing + model) as a single pickle.
with open(MODEL_FILE, 'wb') as sink:
    pickle.dump(pipeline_rf, sink)

# Load it straight back to check that the round-trip works.
# NOTE: only unpickle files you created yourself; pickle can execute code on load.
with open(MODEL_FILE, 'rb') as source:
    pipeline_rf_saved = pickle.load(source)

# Re-run the earlier sample through the reloaded pipeline and print the result.
result_saved_rf = pipeline_rf_saved.predict(sample)
print(f"Saved Model Prediction (Random Forest): {result_saved_rf}")


Saved Model Prediction (Random Forest): [229.97]


In [8]:
import tkinter as tk
from tkinter import ttk
import pandas as pd
import pickle

# Load the pickled pipeline that the GUI uses for predictions.
# NOTE: pickle.load can execute arbitrary code; only load a file you produced yourself.
with open('pipeline_rf.pkl', 'rb') as model_file:
    model = pickle.load(model_file)

# Function to make predictions
def make_prediction():
    """Read the form fields, run the model, and show the result in the GUI.

    Reads the module-level Tk variables (``gender_var``, ``age_var``,
    ``height_var``, ``weight_var``, ``duration_var``, ``heart_rate_var``,
    ``body_temp_var``), builds a one-row DataFrame from them, and writes either
    the prediction or an error message to ``result_var``.

    Gender text is stripped and lower-cased before prediction, matching the
    "male/female" spelling shown on the form.

    Any ``ValueError`` is reported in the result label and does not escape the
    Tk callback. That covers blank or non-numeric fields, and a ``ValueError``
    raised by the model itself.
    NOTE(review): the pickled pipeline presumably raises ValueError for an
    unknown gender category; confirm against the saved model.
    """
    try:
        gender = gender_var.get().strip().lower()
        age = int(age_var.get())
        height = float(height_var.get())
        weight = float(weight_var.get())
        duration = float(duration_var.get())
        heart_rate = float(heart_rate_var.get())
        body_temp = float(body_temp_var.get())

        sample = pd.DataFrame({
            'Gender': [gender],
            'Age': [age],
            'Height': [height],
            'Weight': [weight],
            'Duration': [duration],
            'Heart_Rate': [heart_rate],
            'Body_Temp': [body_temp]
        })

        prediction = model.predict(sample)[0]
    except ValueError as exc:
        # Show the problem to the user rather than failing silently.
        result_var.set(f'Invalid input: {exc}')
        return

    result_var.set(f'Predicted Calories: {prediction:.2f}')

# Create the main window
root = tk.Tk()
root.title("Calorie Prediction")


def _add_input_row(row, caption):
    """Put a caption label and a text entry on grid row ``row``.

    Returns the ``StringVar`` bound to the entry so the caller can read the
    typed text later.
    """
    tk.Label(root, text=caption).grid(row=row, column=0)
    text_var = tk.StringVar()
    tk.Entry(root, textvariable=text_var).grid(row=row, column=1)
    return text_var


# Create input fields, one grid row each; make_prediction() reads these
# module-level variables by name.
gender_var = _add_input_row(0, "Gender (male/female):")
age_var = _add_input_row(1, "Age:")
height_var = _add_input_row(2, "Height (cm):")
weight_var = _add_input_row(3, "Weight (kg):")
duration_var = _add_input_row(4, "Duration (mins):")
heart_rate_var = _add_input_row(5, "Heart Rate:")
body_temp_var = _add_input_row(6, "Body Temperature (°C):")

# Button to make predictions
predict_button = tk.Button(root, text="Predict", command=make_prediction)
predict_button.grid(row=7, column=0, columnspan=2)

# Label to display the result
result_var = tk.StringVar()
result_label = tk.Label(root, textvariable=result_var)
result_label.grid(row=8, column=0, columnspan=2)

# Run the GUI event loop (blocks until the window is closed)
root.mainloop()
