# Loading Libraries

This section imports the libraries used throughout the notebook: pandas for data manipulation, matplotlib and seaborn for data visualization, scikit-learn (`sklearn`) for machine learning tasks, and NumPy for numerical operations.

In [None]:
import os
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np

# Loading First Dataset

In [None]:
# Folder expected to contain the CSV files read by this notebook. Set the
# CALORIES_DATA_DIR environment variable to point at your own copy of the data;
# when it is unset, the original author's project folder is tried.
data_dir = os.environ.get(
    'CALORIES_DATA_DIR',
    r'C:\Users\guilh\OneDrive\Área de Trabalho\FinalProject\FinalProject',
)
# Only switch the working directory when that folder actually exists. On any other
# machine the notebook then falls back to the current working directory instead of
# stopping with FileNotFoundError before any data is loaded.
if os.path.isdir(data_dir):
    os.chdir(data_dir)

# Load the first dataset from the CSV file 'calories.csv' into a pandas DataFrame
data = pd.read_csv('calories.csv')
data.head()  # Display the first few rows of the loaded dataset

# Loading Second Dataset

In [None]:
# Read the second dataset, 'exercise.csv', into its own pandas DataFrame
exercise_data = pd.read_csv(filepath_or_buffer='exercise.csv')
exercise_data.head(n=5)  # preview the first five rows

# Merging the Datasets

In [None]:
# Combine 'data' and 'exercise_data' by matching rows on the shared 'User_ID' column.
# how='inner' is the pandas default; it is spelled out here for the reader.
merged_data = data.merge(exercise_data, how='inner', on='User_ID')
merged_data.head(n=5)  # preview the first five merged rows

***

# Checking Missing Values

This cell checks for missing values in the merged dataset. The `isnull()` method returns a boolean mask with the same shape as the original DataFrame, where `True` marks a missing value and `False` marks a present value.   
The `sum()` method then counts the `True` values in each column, which gives the number of missing values per column:   
`missing_values = merged_data.isnull().sum()`

In [None]:
# Count, column by column, how many entries of the merged dataset are missing
missing_values = merged_data.isna().sum(axis=0)
missing_values  # show the per-column counts

***

# Histogram of the Calories Column

The histogram displays the distribution of calories burned.

- `plt.figure(figsize=(10, 6))` sets the figure size to 10 inches wide and 6 inches tall.
- `sns.histplot(merged_data['Calories'], bins=30, kde=True)` draws the histogram:
  - `bins=30` specifies that the histogram should be divided into 30 bins.
  - `kde=True` adds a kernel density estimate (KDE) to the histogram, which is a smoothed curve that estimates the underlying distribution of the data.

In [None]:
# Histogram of the 'Calories' column
plt.figure(figsize=(10, 6))  # 10 x 6 inch figure
sns.histplot(
    data=merged_data,
    x='Calories',
    bins=30,   # divide the histogram into 30 bins
    kde=True,  # overlay a kernel density estimate curve
)
# Title and axis labels are applied to the current axes in one call
plt.gca().set(
    title='Distribution of Calories Burned',
    xlabel='Calories',
    ylabel='Frequency',
)
plt.show()

***

# Scatter Plot to Explore the Relationship between Duration and Calories Burned

This code creates a scatter plot of the `Duration` and `Calories` columns of the `merged_data` DataFrame using seaborn's `sns.scatterplot` function.   
The scatter plot shows the relationship between the duration of exercise and the number of calories burned, with the x-axis representing the duration of exercise in minutes and the y-axis representing the number of calories burned.   
The `hue` parameter colors the points by the `Gender` column, and the `style` parameter changes the marker style by the same `Gender` column. The `alpha` parameter sets the transparency of the points.

In [None]:
# Scatter plot of exercise duration against calories burned, split by gender
plt.figure(figsize=(10, 6))  # 10 x 6 inch figure
sns.scatterplot(
    data=merged_data,
    x='Duration',
    y='Calories',
    hue='Gender',    # point colour follows the 'Gender' column
    style='Gender',  # marker shape follows the 'Gender' column as well
    alpha=0.6,       # semi-transparent points
)
# Title and axis labels are applied to the current axes in one call
plt.gca().set(
    title='Relationship Between Exercise Duration and Calories Burned by Gender',
    xlabel='Duration (minutes)',
    ylabel='Calories Burned',
)
plt.legend(title='Gender')
plt.grid(True)
plt.show()

***

# Boxplot for Comparing the Heart Rate Across Different Genders During Exercise

This code creates a boxplot of the `Heart_Rate` column of the `merged_data` DataFrame using seaborn's `sns.boxplot` function.    
The boxplot shows the distribution of heart rate during exercise for each gender, with the x-axis representing the gender of the individual and the y-axis representing the heart rate in beats per minute (bpm).

In [None]:
# Boxplot of heart rate during exercise, one box per gender
plt.figure(figsize=(10, 6))  # 10 x 6 inch figure
sns.boxplot(
    data=merged_data,
    x='Gender',
    y='Heart_Rate',
)
# Title and axis labels are applied to the current axes in one call
plt.gca().set(
    title='Heart Rate Distribution by Gender During Exercise',
    xlabel='Gender',
    ylabel='Heart Rate (bpm)',
)
plt.show()

***