In [12]:
import pandas as pd 
from sklearn.linear_model import LinearRegression
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

In [13]:
# load the dataset
df = pd.read_csv('calories_dataset_50.csv')

# first 5 rows
print(df.head())

   pushups  squats  pullups  weight_lifts  jogging_km  calories_burned
0       45      44       12            16         3.0              359
1       31      42        3            24         5.1              406
2       56      44        5            44         4.7              461
3       82      39       15            37         1.3              374
4       78      46        2            45         0.9              296


In [14]:
# check correlation
print(df.corr())

                  pushups    squats  ...  jogging_km  calories_burned
pushups          1.000000 -0.196383  ...   -0.152274         0.153785
squats          -0.196383  1.000000  ...   -0.002076         0.249696
pullups          0.060066 -0.163426  ...    0.001267         0.121882
weight_lifts    -0.028553  0.024491  ...   -0.183780         0.155287
jogging_km      -0.152274 -0.002076  ...    1.000000         0.799469
calories_burned  0.153785  0.249696  ...    0.799469         1.000000

[6 rows x 6 columns]


In [15]:
# x_train and y_train
X = df[["pushups", "squats", "pullups", "weight_lifts", "jogging_km"]]
y = df["calories_burned"]


print(X)
print(y)

# check the shape of the data
print(X.shape)
print(y.shape)


    pushups  squats  pullups  weight_lifts  jogging_km
0        45      44       12            16         3.0
1        31      42        3            24         5.1
2        56      44        5            44         4.7
3        82      39       15            37         1.3
4        78      46        2            45         0.9
5        77     115       19            28         2.1
6        30      56       13            49         5.6
7        21      72       10            13         4.4
8        58      57       13            15         2.4
9        87      61       19            36         3.6
10       23     106       16            14         3.2
11       61     106        7            57         4.7
12       57      93        3            23         5.7
13       98      33        7            31         3.2
14       53      77       14            18         2.3
15       69      97       15            21         3.7
16       34      71       14            36         3.8
17       3

In [19]:
# split the data into training and validation sets
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=42)
print(X_train.shape)
print(y_train.shape)
print(X_valid.shape)
print(y_valid.shape)

(40, 5)
(40,)
(10, 5)
(10,)


In [20]:
# train the model
model_calories = LinearRegression()
model_calories.fit(X_train, y_train)

In [None]:
# parameters of the model
print(model_calories.intercept_) # bias
print(model_calories.coef_) # weights

-21.50132615965964
[ 1.51482636  1.28911348  3.11698439  2.06934221 52.40693307]


In [22]:
# predict the calories burned
y_pred = model_calories.predict(X_valid)
print(y_pred)

[423.16308671 335.78974542 346.91248055 327.28829437 361.98056094
 263.49345214 381.50099514 429.02784753 622.52583525 513.72338291]


In [23]:
# calculate mean absolute error
from sklearn.metrics import mean_absolute_error
mae = mean_absolute_error(y_valid, y_pred)
print("Mean Absolute Error:", mae)

Mean Absolute Error: 25.397216297331276


In [28]:
# new predictions
new_data = [[20, 30, 10, 15, 5]]
columns = df.columns[:-1]
print(columns)
# create a DataFrame for new data to avoid the warning
new_data = pd.DataFrame(new_data, columns=columns)
new_prediction = model_calories.predict(new_data)
print("New prediction:", new_prediction)

Index(['pushups', 'squats', 'pullups', 'weight_lifts', 'jogging_km'], dtype='object')
New prediction: [371.71324786]
