In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Enumerate every file beneath the Kaggle input directory and print its full
# path, in the same order os.walk yields them (directory by directory).
input_paths = (
    os.path.join(root, name)
    for root, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

#### Step 1: Load the Data

* Import the necessary libraries (pandas, numpy, matplotlib, seaborn)

* Load the dataset into a pandas DataFrame using pd.read_csv() or from the provided file

* Display the first few rows using .head() to understand the structure

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings


# Location of the dataset CSV; replace with your own file name if needed.
csv_path = '/kaggle/input/fitness-tracker-dataset/gym_members_exercise_tracking_synthetic_data.csv'
df = pd.read_csv(csv_path)

# Preview the first few rows to understand the structure of the data.
print(df.head())



#### Step 2: Inspect the Data
1. Shape: Use .shape to check the number of rows and columns.
2. Columns: Use .columns to list column names.
3. Info: Use .info() to examine data types and non-null counts for each column.
4. Description: Use .describe() to summarize numeric columns (mean, min, max, etc.).

In [None]:
# Quick structural overview of the loaded frame.
print(df.shape)  # Outputs (rows, columns)
print(df.columns)  # Lists all column names
# info() writes its report to stdout itself and returns None, so wrapping it
# in print() would emit a stray "None" after the report.
df.info()  # Shows data types and non-null counts
print(df.describe())  # Summarizes numeric columns


#### Step 3: Identify and Handle Missing Values
1. Use .isnull().sum() to check the number of missing values per column.
2. Visualize missing data using a heatmap (sns.heatmap) to identify patterns.
3. Handle missing values:
* Age, Weight (kg), Height (m), and Numeric Columns:
   * Impute missing values using the mean or median.
* Gender and Workout_Type:
   * Impute missing values using the mode (most frequent value).
* Document and justify your imputation strategy.

In [None]:
# Report and visualise where values are missing before imputing anything.
print(df.isnull().sum())  # Count missing values per column
sns.heatmap(df.isnull(), cbar=False, cmap='viridis')
plt.title('Missing Data Heatmap')
plt.show()

# Imputation strategy:
#   * Age and Height (m) are filled with the column mean.
#   * Weight (kg) is filled with the median, which is less sensitive to
#     extreme values than the mean.
#   * Every other column that pandas already parsed as numeric is filled with
#     its median, so no numeric column is left with gaps.
#   * Gender and Workout_Type are categorical and are filled with the mode
#     (most frequent value).
df['Age'] = df['Age'].fillna(df['Age'].mean())
df['Weight (kg)'] = df['Weight (kg)'].fillna(df['Weight (kg)'].median())
df['Height (m)'] = df['Height (m)'].fillna(df['Height (m)'].mean())

# Columns holding non-numeric junk are stored as object dtype and are
# therefore skipped here; they must be coerced to numeric first.
for numeric_col in df.select_dtypes(include='number').columns:
    if df[numeric_col].isnull().any():
        df[numeric_col] = df[numeric_col].fillna(df[numeric_col].median())

for categorical_col in ('Gender', 'Workout_Type'):
    modes = df[categorical_col].mode()
    # mode() is empty when the column is entirely NaN; indexing it would raise.
    if not modes.empty:
        df[categorical_col] = df[categorical_col].fillna(modes.iloc[0])


#### Step 4: Check for Duplicates
1. Use .duplicated().sum() to check for duplicate rows.
2. Remove duplicates, if any, using .drop_duplicates().

In [None]:
# Report how many rows are exact duplicates of an earlier row, then keep only
# the first occurrence of each.
duplicate_count = df.duplicated().sum()
print(duplicate_count)
df = df.drop_duplicates()


#### Step 5: Validate Data
1. Numeric Columns:
    * Check for invalid entries (e.g., special characters like ? in Max_BPM).
    * Convert columns like Max_BPM to numeric using pd.to_numeric() with errors='coerce'.
    * Replace invalid values with NaN and impute as needed.
2. Categorical Columns:
    * Use .unique() to check for inconsistencies in Gender, Workout_Type, etc.
    * Standardize inconsistent values (e.g., Male, M → Male).

In [None]:
# Coerce Max_BPM to numeric: entries that cannot be parsed become NaN and are
# then replaced by the mean of the successfully parsed values.
max_bpm = pd.to_numeric(df['Max_BPM'], errors='coerce')
df['Max_BPM'] = max_bpm.fillna(max_bpm.mean())

# Inspect the Gender labels, then collapse the abbreviations into full names.
print(df['Gender'].unique())
gender_aliases = {'M': 'Male', 'F': 'Female'}
df['Gender'] = df['Gender'].replace(gender_aliases)

# Inspect the Workout_Type labels.
print(df['Workout_Type'].unique())
# Add replacements if inconsistencies are found


#### Step 6: Create New Features
1. BMI Validation:
* Verify if the BMI column is consistent with the formula:
$$\text{BMI} = \frac{\text{Weight (kg)}}{\text{Height (m)}^2}$$
* Recalculate BMI where inconsistencies are found.
2. Workout Intensity:
* Create a new column: Workout_Intensity = Avg_BPM / Max_BPM.

In [None]:
# Recompute BMI from its definition: weight (kg) / height (m) squared.
bmi_calculated = df['Weight (kg)'] / (df['Height (m)'] ** 2)

# Replace the stored BMI where it is missing or differs from the recomputed
# value by more than 0.1. The missing case must be tested explicitly: any
# comparison involving NaN is False, so a NaN BMI would otherwise never be
# flagged and would stay NaN.
bmi_needs_fix = df['BMI'].isna() | ((df['BMI'] - bmi_calculated).abs() > 0.1)
df['BMI'] = df['BMI'].where(~bmi_needs_fix, bmi_calculated)

# Workout intensity: average heart rate as a fraction of maximum heart rate.
df['Workout_Intensity'] = df['Avg_BPM'] / df['Max_BPM']


#### Step 7: Explore Data Distributions
1. Plot histograms for numeric columns (Age, Calories_Burned, etc.) to understand their distributions.
2. Use box plots to check for outliers in columns like Age, BMI, and Calories_Burned.
3. Examine the distribution of categorical columns (Gender, Workout_Type) using bar plots.

In [None]:
# Histograms of Age and Calories_Burned, one panel per column.
df.hist(column=['Age', 'Calories_Burned'], bins=20, figsize=(10, 5))
plt.show()

# Box plot of BMI to spot outliers.
bmi_ax = sns.boxplot(data=df, x='BMI')
bmi_ax.set_title('Boxplot for BMI')
plt.show()

# Number of rows per Gender value.
gender_ax = sns.countplot(data=df, x='Gender')
gender_ax.set_title('Gender Distribution')
plt.show()


#### Step 8: Handle Outliers
1. Use the IQR method to detect outliers in numeric columns.
2. Decide whether to:
    * Remove outliers.
    * Transform them (e.g., log transformation).
    * Cap them (e.g., set to a specific threshold).

In [None]:
# IQR rule on BMI: values more than 1.5 * IQR below the first quartile or
# above the third quartile are treated as outliers.
bmi = df['BMI']
Q1, Q3 = bmi.quantile(0.25), bmi.quantile(0.75)
IQR = Q3 - Q1
lower_bound, upper_bound = Q1 - 1.5 * IQR, Q3 + 1.5 * IQR

# Build the mask once. Rows with a NaN BMI compare False on both sides, so
# they are not flagged and remain in df.
is_bmi_outlier = (bmi < lower_bound) | (bmi > upper_bound)
outliers = df[is_bmi_outlier]  # kept for inspection
df = df[~is_bmi_outlier]


#### Step 9: Analyze Relationships
1. Correlation:
    * Use .corr() and visualize with a heatmap to find relationships between numeric columns (e.g., Calories_Burned, Session_Duration).
2. Categorical vs. Numeric:
    * Compare Calories_Burned and Workout_Type using a bar plot.
    * Analyze differences in BMI across Gender using a box plot.
3. Multi-Variable:
    * Use pair plots (sns.pairplot) to analyze relationships between key metrics (e.g., BMI, Calories_Burned, Workout_Frequency).

In [None]:


# Suppress FutureWarnings so library deprecation notices do not clutter the plots
warnings.simplefilter(action='ignore', category=FutureWarning)

# Replace inf values with NaN (e.g. produced by a division by zero) so the
# plots below do not have to deal with infinite values
df = df.replace([np.inf, -np.inf], np.nan)

# Bar plot for Calories Burned by Workout Type (bar height = mean per category)
plt.figure(figsize=(8, 6))
sns.barplot(x='Workout_Type', y='Calories_Burned', data=df, errorbar=None)  # errorbar=None replaces the older ci=None
plt.title('Calories Burned by Workout Type')
plt.xlabel('Workout Type')
plt.ylabel('Calories Burned')
plt.xticks(rotation=45)
plt.show()

# Box plot for BMI by Gender
plt.figure(figsize=(8, 6))
sns.boxplot(x='Gender', y='BMI', data=df)
plt.title('BMI Distribution by Gender')
plt.xlabel('Gender')
plt.ylabel('BMI')
plt.show()

# Pair plot for selected variables.
# sns.pairplot is a figure-level function that creates its own figure, so
# calling plt.figure() beforehand only produces an extra empty figure. The
# title is set on the grid's own figure instead of relying on pyplot's
# "current figure" state.
pair_grid = sns.pairplot(df, vars=['BMI', 'Calories_Burned'], diag_kind='kde', hue='Gender')
pair_grid.figure.suptitle('Pair Plot of Key Metrics', y=1.02)
plt.show()



#### Step 10: Encode Categorical Variables
1. Convert Gender and Workout_Type to numeric formats:
    * Use one-hot encoding (pd.get_dummies()) or label encoding.

In [None]:
# Descriptive statistics for numerical columns.
# Only the last expression of a cell is displayed automatically, so a bare
# df.describe() in the middle of the cell would be computed and discarded.
print(df.describe())

# Alternatively, you can get specific statistics
mean_bmi = df['BMI'].mean()
std_bmi = df['BMI'].std()
mean_calories_burned = df['Calories_Burned'].mean()
std_calories_burned = df['Calories_Burned'].std()

print(f"Mean BMI: {mean_bmi:.2f}")
print(f"Standard Deviation of BMI: {std_bmi:.2f}")
print(f"Mean Calories Burned: {mean_calories_burned:.2f}")
print(f"Standard Deviation of Calories Burned: {std_calories_burned:.2f}")

# One-hot encode the categorical columns into 0/1 indicator columns.
# The result goes into a separate frame so that df keeps its original Gender
# and Workout_Type columns for the cells that still group or colour by them.
df_encoded = pd.get_dummies(df, columns=['Gender', 'Workout_Type'], dtype=int)
print(df_encoded.head())


#### Step 11: Normalize Numeric Columns
1. Normalize or standardize columns with large ranges (Calories_Burned, Session_Duration, etc.) if
needed for further analysis or machine learning.

In [None]:
# List the available columns so the selection below can be checked against them
print(df.columns)

# Pairwise correlations between the key variables
# (adjust the list if the columns differ)
key_columns = ['BMI', 'Calories_Burned', 'Age']
corr_matrix = df[key_columns].corr()

# Render the correlation matrix as an annotated heatmap
plt.figure(figsize=(8, 6))
heatmap_ax = sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', linewidths=0.5)
heatmap_ax.set_title('Correlation Matrix of Key Variables')
plt.show()



#### Step 12: Summarize Findings
1. Highlight key insights from the data:
    * Trends in Calories_Burned based on Workout_Type and Gender.
    * Correlation between Session_Duration and Calories_Burned.
    * Any notable differences in BMI across Workout_Frequency.

In [None]:
# One histogram (with KDE overlay) per column; each entry is
# (column, bar color, plot title, x-axis label).
histogram_specs = [
    ('BMI', 'skyblue', 'Distribution of BMI', 'BMI'),
    ('Calories_Burned', 'orange', 'Distribution of Calories Burned', 'Calories Burned'),
]

for column, color, title, xlabel in histogram_specs:
    plt.figure(figsize=(8, 6))
    sns.histplot(df[column], bins=20, kde=True, color=color)
    plt.title(title)
    plt.xlabel(xlabel)
    plt.ylabel('Frequency')
    plt.show()
