In [7]:
import pandas as pd
import numpy as np
import pickle
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.metrics import mean_squared_error, classification_report

class FitnessDataProcessor:
    def __init__(self, exercise_file='exercise.csv', calories_file='calories.csv'):
        self.exercise_file = exercise_file
        self.calories_file = calories_file
        
    def load_exercise_dataset(self):
        """
        Load exercise dataset with specified columns
        """
        try:
            # Read the dataset
            df = pd.read_csv(self.exercise_file)
            
            # Validate columns
            required_columns = [
                'User_ID', 'Gender', 'Age', 'Height', 
                'Weight', 'Duration', 'Heart_Rate', 'Body_Temp'
            ]
            
            # Check if all required columns exist
            missing_columns = [col for col in required_columns if col not in df.columns]
            if missing_columns:
                raise ValueError(f"Missing columns: {missing_columns}")
            
            # Select only required columns
            df = df[required_columns]
            
            # Basic data cleaning
            numeric_columns = ['Age', 'Height', 'Weight', 'Duration', 'Heart_Rate', 'Body_Temp']
            for col in numeric_columns:
                df[col] = pd.to_numeric(df[col], errors='coerce')
            
            # Handle missing values
            df.dropna(inplace=True)
            
            return df
        
        except Exception as e:
            print(f"Error loading exercise dataset: {e}")
            return self.create_sample_exercise_dataset()

    def load_calories_dataset(self):
        """
        Load calories dataset with specified columns
        """
        try:
            # Read the dataset
            df = pd.read_csv(self.calories_file)
            
            # Validate columns
            required_columns = ['User_ID', 'Calories']
            
            # Check if all required columns exist
            missing_columns = [col for col in required_columns if col not in df.columns]
            if missing_columns:
                raise ValueError(f"Missing columns: {missing_columns}")
            
            # Select only required columns
            df = df[required_columns]
            
            # Convert Calories to numeric
            df['Calories'] = pd.to_numeric(df['Calories'], errors='coerce')
            
            # Handle missing values
            df.dropna(inplace=True)
            
            return df
        
        except Exception as e:
            print(f"Error loading calories dataset: {e}")
            return self.create_sample_calories_dataset()

    def create_sample_exercise_dataset(self):
        """
        Create a sample exercise dataset when actual data is not available
        """
        return pd.DataFrame({
            'User_ID': range(1, 101),
            'Gender': np.random.choice(['Male', 'Female'], 100),
            'Age': np.random.randint(18, 65, 100),
            'Height': np.random.uniform(150, 200, 100),
            'Weight': np.random.uniform(50, 100, 100),
            'Duration': np.random.uniform(30, 120, 100),
            'Heart_Rate': np.random.randint(60, 180, 100),
            'Body_Temp': np.random.uniform(36.5, 38.5, 100)
        })

    def create_sample_calories_dataset(self):
        """
        Create a sample calories dataset when actual data is not available
        """
        return pd.DataFrame({
            'User_ID': range(1, 101),
            'Calories': np.random.randint(200, 800, 100)
        })

    def merge_datasets(self, exercise_df, calories_df):
        """
        Merge exercise and calories datasets
        """
        return pd.merge(exercise_df, calories_df, on='User_ID')

    def preprocess_data(self, merged_df):
        """
        Preprocess merged dataset for machine learning
        """
        # Create a copy of the dataframe
        df_encoded = merged_df.copy()
        
        # One-hot encode Gender using pandas get_dummies
        df_encoded = pd.get_dummies(df_encoded, columns=['Gender'], prefix='Gender')
        
        # Ensure all required columns exist
        required_features = [
            'Age', 'Height', 'Weight', 'Duration', 
            'Heart_Rate', 'Body_Temp', 
            'Gender_Male', 'Gender_Female'
        ]
        
        # Add missing columns with default 0 if not present
        for col in required_features:
            if col not in df_encoded.columns:
                df_encoded[col] = 0
        
        # Select features and targets
        X = df_encoded[required_features]
        y_calories = df_encoded['Calories']
        y_heart_rate = df_encoded['Heart_Rate']
        
        # Split data
        X_train_calories, X_test_calories, y_train_calories, y_test_calories = train_test_split(
            X, y_calories, test_size=0.2, random_state=42
        )
        
        X_train_hr, X_test_hr, y_train_hr, y_test_hr = train_test_split(
            X, y_heart_rate, test_size=0.2, random_state=42
        )
        
        # Scale features
        scaler = StandardScaler()
        X_train_scaled_calories = scaler.fit_transform(X_train_calories)
        X_test_scaled_calories = scaler.transform(X_test_calories)
        
        X_train_scaled_hr = scaler.fit_transform(X_train_hr)
        X_test_scaled_hr = scaler.transform(X_test_hr)
        
        return {
            'X_train_calories': X_train_scaled_calories,
            'X_test_calories': X_test_scaled_calories,
            'y_train_calories': y_train_calories,
            'y_test_calories': y_test_calories,
            'X_train_hr': X_train_scaled_hr,
            'X_test_hr': X_test_scaled_hr,
            'y_train_hr': y_train_hr,
            'y_test_hr': y_test_hr,
            'scaler': scaler
        }

    def train_and_save_models(self, prepared_data):
        """
        Train machine learning models and save them
        """
        # Calories Prediction Model (Regression)
        calories_model = RandomForestRegressor(n_estimators=100, random_state=42)
        calories_model.fit(
            prepared_data['X_train_calories'], 
            prepared_data['y_train_calories']
        )
        
        # Heart Rate Classification Model
        hr_model = RandomForestClassifier(n_estimators=100, random_state=42)
        hr_model.fit(
            prepared_data['X_train_hr'], 
            prepared_data['y_train_hr']
        )
        
        # Evaluate Models
        calories_pred = calories_model.predict(prepared_data['X_test_calories'])
        hr_pred = hr_model.predict(prepared_data['X_test_hr'])
        
        print("Calories Prediction MSE:", 
              mean_squared_error(prepared_data['y_test_calories'], calories_pred))
        print("\nHeart Rate Classification Report:")
        print(classification_report(
            prepared_data['y_test_hr'], 
            hr_pred
        ))
        
        # Save models using pickle
        with open('calories_model.pkl', 'wb') as f:
            pickle.dump(calories_model, f)
        
        with open('heart_rate_model.pkl', 'wb') as f:
            pickle.dump(hr_model, f)
        
        # Save scaler
        with open('scaler.pkl', 'wb') as f:
            pickle.dump(prepared_data['scaler'], f)
        
        return calories_model, hr_model

    def process_datasets(self):
        """
        Main method to process datasets
        """
        # Load datasets
        exercise_df = self.load_exercise_dataset()
        calories_df = self.load_calories_dataset()
        
        # Merge datasets
        merged_df = self.merge_datasets(exercise_df, calories_df)
        
        # Preprocess data
        prepared_data = self.preprocess_data(merged_df)
        
        # Train and save models
        self.train_and_save_models(prepared_data)
        
        return merged_df, prepared_data

# Main execution
if __name__ == '__main__':
    processor = FitnessDataProcessor()
    merged_df, prepared_data = processor.process_datasets()
    print("Datasets processed and models saved successfully!")

Calories Prediction MSE: 18.8510306

Heart Rate Classification Report:
              precision    recall  f1-score   support

        67.0       0.00      0.00      0.00         1
        70.0       0.00      0.00      0.00         2
        71.0       0.00      0.00      0.00         2
        72.0       0.40      0.40      0.40         5
        73.0       0.00      0.00      0.00         7
        74.0       0.00      0.00      0.00         7
        75.0       0.08      0.09      0.09        11
        76.0       0.19      0.17      0.18        18
        77.0       0.40      0.27      0.32        22
        78.0       0.62      0.54      0.58        28
        79.0       0.52      0.68      0.59        38
        80.0       0.66      0.69      0.68        36
        81.0       0.83      0.83      0.83        41
        82.0       0.90      0.78      0.83        67
        83.0       0.86      0.94      0.89        63
        84.0       0.89      0.97      0.93        69
        85

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Datasets processed and models saved successfully!
