In [3]:
# Exploratory Data Analysis - Calorie Burn Rate Prediction
# Import necessary libraries

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats

# Exploratory data analysis pipeline: download the two CSV files, merge them
# on User_ID, derive BMI / age-group features, draw the diagnostic plots and
# save the processed table.

# Download data directly from GitHub
# Note: Use your own GitHub username and repository name
calories_url = "https://raw.githubusercontent.com/BoraDemirkol/Bora-Demirkol-DSA-project/main/data/raw/calories.csv"
exercise_url = "https://raw.githubusercontent.com/BoraDemirkol/Bora-Demirkol-DSA-project/main/data/raw/exercise.csv"

# Plot styles to try, in order of preference. 'seaborn-v0_8' is the name used
# by newer Matplotlib releases, 'seaborn' the older spelling, and 'ggplot' a
# fallback.
PREFERRED_STYLES = ('seaborn-v0_8', 'seaborn', 'ggplot')


def select_plot_style(preferred=PREFERRED_STYLES):
    """Activate the first style in *preferred* that Matplotlib provides.

    Args:
        preferred: Style names to try, most preferred first.

    Returns:
        The name of the style that was activated, or ``None`` when none of
        the names is available (the current style is then left unchanged).
    """
    for style_name in preferred:
        if style_name in plt.style.available:
            plt.style.use(style_name)
            return style_name
    return None


def add_derived_features(frame):
    """Return a copy of *frame* with BMI, BMI_Category and Age_Group added.

    ``BMI`` and ``BMI_Category`` are added only when both ``Weight`` and
    ``Height`` columns exist; ``Age_Group`` only when ``Age`` exists.

    Args:
        frame: DataFrame to extend. It is not modified.

    Returns:
        A new DataFrame containing the original columns plus whichever
        derived columns could be computed.
    """
    result = frame.copy()

    if 'Weight' in result.columns and 'Height' in result.columns:
        # Height is divided by 100, i.e. treated as centimetres (with Weight
        # presumably in kilograms) - confirm against the data set.
        result['BMI'] = result['Weight'] / ((result['Height'] / 100) ** 2)

        # right=False makes each bin include its lower edge, so a BMI of
        # exactly 18.5 is 'Normal' and exactly 30 is 'Obese'. The open upper
        # edge keeps extreme values in 'Obese' instead of turning them to NaN.
        result['BMI_Category'] = pd.cut(
            result['BMI'],
            bins=[0, 18.5, 25, 30, np.inf],
            labels=['Underweight', 'Normal', 'Overweight', 'Obese'],
            right=False,
        )

    if 'Age' in result.columns:
        # Left-closed bins so the labels are truthful: age 25 belongs to
        # '25-35', not '<25', and age 45 belongs to '45+'.
        result['Age_Group'] = pd.cut(
            result['Age'],
            bins=[0, 25, 35, 45, np.inf],
            labels=['<25', '25-35', '35-45', '45+'],
            right=False,
        )

    return result


def save_and_show(filename):
    """Save the current figure to *filename*, display it, then close it."""
    plt.savefig(filename)
    plt.show()
    # Release the figure so repeated runs do not accumulate open figures.
    plt.close()


# The guard is true both for a notebook cell and for `python script.py`, so
# the analysis runs as before while the helpers above stay importable.
if __name__ == "__main__":
    select_plot_style()

    # Only the downloads are wrapped in try/except: an unreachable or
    # malformed source is an expected failure, whereas an error in the
    # analysis itself is a bug that should surface with a full traceback.
    try:
        calories_df = pd.read_csv(calories_url)
        exercise_df = pd.read_csv(exercise_url)
    except (OSError, pd.errors.EmptyDataError, pd.errors.ParserError) as e:
        # HTTP and URL errors raised by read_csv are OSError subclasses.
        print(f"An error occurred: {e}")
        print("Could not load the datasets; check that these URLs are reachable:")
        print(f"  {calories_url}")
        print(f"  {exercise_url}")
    else:
        print("First 5 rows of the Calories dataset:")
        print(calories_df.head())

        print("\nFirst 5 rows of the Exercise dataset:")
        print(exercise_df.head())

        # Merge the two datasets (based on User_ID)
        df = pd.merge(exercise_df, calories_df, on='User_ID')

        print("\nFirst 5 rows of the merged dataset:")
        print(df.head())

        # Data overview. info() prints its report itself and returns None,
        # so it must not be wrapped in print().
        print("\nDataset information:")
        df.info()

        # Descriptive statistics
        print("\nDescriptive statistics:")
        print(df.describe().round(2))

        # Check for missing values
        print("\nMissing values:")
        print(df.isnull().sum())

        # BMI, BMI category and age group (each only if its inputs exist)
        df = add_derived_features(df)

        # Numeric feature columns. The User_ID merge key is an identifier,
        # not a measurement, so it is kept out of the correlations and plots.
        numerical_cols = (
            df.select_dtypes(include=[np.number])
            .columns
            .drop('User_ID', errors='ignore')
        )
        has_calories = 'Calories' in numerical_cols

        # Correlation matrix
        correlation_matrix = df[numerical_cols].corr()

        plt.figure(figsize=(12, 10))
        sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt='.2f')
        plt.title('Correlation Matrix')
        save_and_show('correlation_matrix.png')

        # Identify highest correlations with Calories
        if has_calories:
            calorie_correlations = correlation_matrix['Calories'].sort_values(ascending=False)
            print("Correlations with Calories:")
            print(calorie_correlations)

        # Distribution of numerical variables (2x3 grid, so at most 6 columns)
        plt.figure(figsize=(15, 10))
        for position, col in enumerate(numerical_cols[:6], start=1):
            plt.subplot(2, 3, position)
            sns.histplot(df[col], kde=True)
            plt.title(f'Distribution of {col}')
        plt.tight_layout()
        save_and_show('distributions.png')

        if has_calories:
            # Scatter plots for relationship with Calories (first 5 features)
            feature_cols = [c for c in numerical_cols if c != 'Calories'][:5]
            plt.figure(figsize=(15, 10))
            for position, col in enumerate(feature_cols, start=1):
                plt.subplot(2, 3, position)
                sns.scatterplot(x=df[col], y=df['Calories'])
                plt.title(f'{col} vs Calories')
            plt.tight_layout()
            save_and_show('scatter_plots.png')

            # Calories broken down by each categorical variable that exists:
            # (column, figure size, title, output file)
            box_plots = [
                ('Gender', (8, 6), 'Calories by Gender', 'calories_by_gender.png'),
                ('Age_Group', (10, 6), 'Calories by Age Group', 'calories_by_age.png'),
                ('BMI_Category', (10, 6), 'Calories by BMI Category', 'calories_by_bmi.png'),
            ]
            for column, figsize, title, filename in box_plots:
                if column not in df.columns:
                    continue
                plt.figure(figsize=figsize)
                sns.boxplot(x=column, y='Calories', data=df)
                plt.title(title)
                save_and_show(filename)

            # Multi-variable analysis
            if 'Heart_Rate' in df.columns and 'Duration' in df.columns:
                plt.figure(figsize=(10, 8))
                sns.scatterplot(data=df, x='Heart_Rate', y='Duration',
                                size='Calories', hue='Calories',
                                sizes=(20, 200), palette='viridis')
                plt.title('Heart Rate vs Duration (size indicates Calories)')
                save_and_show('heart_rate_duration_calories.png')

        # Save processed dataset
        df.to_csv('processed_fitness_data.csv', index=False)
        print("Processed data saved to 'processed_fitness_data.csv'")

An error occurred: HTTP Error 404: Not Found
