In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from xgboost import XGBRegressor
from sklearn.metrics import accuracy_score
from sklearn import metrics

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [2]:
calories = pd.read_csv('calories.csv')

In [3]:
calories.head()

Unnamed: 0,User_ID,Calories
0,14733363,231.0
1,14861698,66.0
2,11179863,26.0
3,16180408,71.0
4,17771927,35.0


In [4]:
exercise = pd.read_csv('exercise.csv')

In [5]:
exercise.head()

Unnamed: 0,User_ID,Gender,Age,Height,Weight,Duration,Heart_Rate,Body_Temp
0,14733363,male,68,190.0,94.0,29.0,105.0,40.8
1,14861698,female,20,166.0,60.0,14.0,94.0,40.3
2,11179863,male,69,179.0,79.0,5.0,88.0,38.7
3,16180408,female,34,179.0,71.0,13.0,100.0,40.5
4,17771927,female,27,154.0,58.0,10.0,81.0,39.8


Combining the two data frames

In [6]:
df = pd.concat([exercise,calories['Calories']],axis=1)

In [7]:
df.head()

Unnamed: 0,User_ID,Gender,Age,Height,Weight,Duration,Heart_Rate,Body_Temp,Calories
0,14733363,male,68,190.0,94.0,29.0,105.0,40.8,231.0
1,14861698,female,20,166.0,60.0,14.0,94.0,40.3,66.0
2,11179863,male,69,179.0,79.0,5.0,88.0,38.7,26.0
3,16180408,female,34,179.0,71.0,13.0,100.0,40.5,71.0
4,17771927,female,27,154.0,58.0,10.0,81.0,39.8,35.0


In [8]:
df.isnull().sum()

User_ID       0
Gender        0
Age           0
Height        0
Weight        0
Duration      0
Heart_Rate    0
Body_Temp     0
Calories      0
dtype: int64

In [9]:
df.describe()

Unnamed: 0,User_ID,Age,Height,Weight,Duration,Heart_Rate,Body_Temp,Calories
count,15000.0,15000.0,15000.0,15000.0,15000.0,15000.0,15000.0,15000.0
mean,14977360.0,42.7898,174.465133,74.966867,15.5306,95.518533,40.025453,89.539533
std,2872851.0,16.980264,14.258114,15.035657,8.319203,9.583328,0.77923,62.456978
min,10001160.0,20.0,123.0,36.0,1.0,67.0,37.1,1.0
25%,12474190.0,28.0,164.0,63.0,8.0,88.0,39.6,35.0
50%,14997280.0,39.0,175.0,74.0,16.0,96.0,40.2,79.0
75%,17449280.0,56.0,185.0,87.0,23.0,103.0,40.6,138.0
max,19999650.0,79.0,222.0,132.0,30.0,128.0,41.5,314.0


In [10]:
label_encode = LabelEncoder()

In [11]:
label_gender = label_encode.fit_transform(df.Gender)

In [12]:
df['Gender'] = label_gender

In [13]:
df

Unnamed: 0,User_ID,Gender,Age,Height,Weight,Duration,Heart_Rate,Body_Temp,Calories
0,14733363,1,68,190.0,94.0,29.0,105.0,40.8,231.0
1,14861698,0,20,166.0,60.0,14.0,94.0,40.3,66.0
2,11179863,1,69,179.0,79.0,5.0,88.0,38.7,26.0
3,16180408,0,34,179.0,71.0,13.0,100.0,40.5,71.0
4,17771927,0,27,154.0,58.0,10.0,81.0,39.8,35.0
...,...,...,...,...,...,...,...,...,...
14995,15644082,0,20,193.0,86.0,11.0,92.0,40.4,45.0
14996,17212577,0,27,165.0,65.0,6.0,85.0,39.2,23.0
14997,17271188,0,43,159.0,58.0,16.0,90.0,40.1,75.0
14998,18643037,1,78,193.0,97.0,2.0,84.0,38.3,11.0


In [14]:
X = df.drop(columns=['User_ID','Calories'],axis=1)
Y = df['Calories']

In [15]:
X

Unnamed: 0,Gender,Age,Height,Weight,Duration,Heart_Rate,Body_Temp
0,1,68,190.0,94.0,29.0,105.0,40.8
1,0,20,166.0,60.0,14.0,94.0,40.3
2,1,69,179.0,79.0,5.0,88.0,38.7
3,0,34,179.0,71.0,13.0,100.0,40.5
4,0,27,154.0,58.0,10.0,81.0,39.8
...,...,...,...,...,...,...,...
14995,0,20,193.0,86.0,11.0,92.0,40.4
14996,0,27,165.0,65.0,6.0,85.0,39.2
14997,0,43,159.0,58.0,16.0,90.0,40.1
14998,1,78,193.0,97.0,2.0,84.0,38.3


In [16]:
Y

0        231.0
1         66.0
2         26.0
3         71.0
4         35.0
         ...  
14995     45.0
14996     23.0
14997     75.0
14998     11.0
14999     98.0
Name: Calories, Length: 15000, dtype: float64

In [17]:
sns.set()

In [18]:
X_train,X_test,Y_train,Y_test = train_test_split(X,Y,test_size=0.2,random_state=2)

In [19]:
X_train.shape , X_test.shape

((12000, 7), (3000, 7))

In [20]:
model = RandomForestRegressor()

In [21]:
model.fit(X_train,Y_train)

In [22]:
test_data_prediction = model.predict(X_test)

In [23]:
test_data_prediction

array([129.18, 222.21,  36.93, ..., 146.17,  24.06,  89.59])

In [24]:
score = metrics.r2_score(test_data_prediction,Y_test)

score

0.9981862613597989

In [25]:
mae = metrics.mean_absolute_error(Y_test,test_data_prediction)
mae

1.6834299999999998

Predictive system

In [26]:
import pickle 
pickle.dump(model,open('model_1.pkl','wb'))