## Libraries and Setup

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.pipeline import Pipeline

from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

import warnings
# NOTE(review): with no category or module filter, this call suppresses every
# warning raised after this point (e.g. deprecation notices from the libraries
# imported above). Consider narrowing the filter so real problems stay visible.
warnings.filterwarnings('ignore')

%matplotlib inline

## Data Loading

In [None]:
# Name of the CSV file, resolved relative to the current working directory
file_name = 'aw_fb_data.csv'

# Load the data. Every later cell needs `data`, so a missing file must stop
# the run here instead of surfacing later as an unrelated NameError.
try:
    data = pd.read_csv(file_name)
except FileNotFoundError:
    print(f"File {file_name} not found in the current directory.")
    raise

print(f"Data loaded successfully. Shape: {data.shape}")

## Data Preprocessing

### Handling Missing Values

In [None]:
# Report how many missing entries each column has before imputation
missing_values = data.isna().sum()
print("Missing Values:\n", missing_values)

# Column groups that receive imputation
numerical_cols = ['age', 'height', 'weight', 'steps', 'heart_rate', 'calories', 'distance']
categorical_cols = ['gender']

# Median for the numerical group, most frequent value for the categorical group
numerical_imputer = SimpleImputer(strategy='median')
categorical_imputer = SimpleImputer(strategy='most_frequent')

# Fit each imputer on its own column group and write the filled values back
for cols, imputer in (
    (numerical_cols, numerical_imputer),
    (categorical_cols, categorical_imputer),
):
    data[cols] = imputer.fit_transform(data[cols])

# Report the per-column missing counts again after imputation
print("\nMissing Values after Imputation:\n", data.isna().sum())

### Handling Outliers

In [None]:
# Function to remove outliers
def remove_outliers(df, column, factor=1.5):
    """Return the rows of ``df`` whose ``column`` value lies within the IQR fences.

    The fences are ``Q1 - factor * IQR`` and ``Q3 + factor * IQR``, where Q1 and
    Q3 are the 25th and 75th percentiles of ``df[column]``. Both fences are
    inclusive. Rows where the value is NaN fail both comparisons and are dropped.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is not modified.
    column : str
        Name of the numeric column used to compute the fences.
    factor : float, default 1.5
        Multiplier applied to the IQR. Larger values keep more rows.

    Returns
    -------
    pandas.DataFrame
        An independent copy of the retained rows, with the original index labels.
    """
    Q1 = df[column].quantile(0.25)
    Q3 = df[column].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - factor * IQR
    upper_bound = Q3 + factor * IQR

    # `between` is inclusive on both ends, matching `>= lower` and `<= upper`
    in_range = df[column].between(lower_bound, upper_bound)

    # Explicit copy so that assigning columns on the result later does not
    # trigger pandas' chained-assignment warning.
    return df.loc[in_range].copy()

# Filter the frame one numerical column at a time. Each pass works on the rows
# that survived the previous pass, so the fences are recomputed every time.
# NOTE(review): 'heart_rate' is in numerical_cols, and the target defined later
# in this notebook is built from high heart_rate values — confirm that trimming
# those rows here is intended.
for col in numerical_cols:
    before = len(data)
    data = data.pipe(remove_outliers, col)
    after = len(data)
    print(f"Removed {before - after} outliers from {col}")

print(f"\nData shape after outlier removal: {data.shape}")

### Encoding Categorical Variables

In [None]:
# Learn the distinct 'gender' labels, then replace them with integer codes
label_encoder = LabelEncoder().fit(data['gender'])
data['gender'] = label_encoder.transform(data['gender'])

# Show which codes are present after encoding
encoded_values = data['gender'].unique()
print("Encoded 'gender' values:", encoded_values)

### Feature Scaling

In [None]:
# Standardize the numerical columns (zero mean, unit variance per column).
# NOTE(review): the scaler is fitted on the full frame, before the
# train/validation/test split performed later — confirm this is acceptable.
scaler = StandardScaler().fit(data[numerical_cols])
data[numerical_cols] = scaler.transform(data[numerical_cols])

print("\nData after scaling:")
data.head()

## Exploratory Data Analysis (EDA)

### Descriptive Statistics

In [None]:
# Summary statistics for the frame as it stands at this point in the notebook
summary_stats = data.describe()
print(summary_stats)

### Correlation Analysis

In [None]:
# Correlation matrix.
# Restrict to numeric/bool columns first: recent pandas versions raise when
# DataFrame.corr() meets a non-numeric column instead of silently skipping it.
numeric_data = data.select_dtypes(include=['number', 'bool'])
corr_matrix = numeric_data.corr()

# Plotting the heatmap
plt.figure(figsize=(10,8))
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm')
plt.title('Correlation Matrix')
plt.show()

### Visualizations

In [None]:
# Histogram (with KDE overlay) of the steps column
fig, ax = plt.subplots(figsize=(8, 6))
sns.histplot(data['steps'], bins=30, kde=True, ax=ax)
ax.set_title('Distribution of Steps')
plt.show()

# Heart rate against steps, coloured by the gender column
fig, ax = plt.subplots(figsize=(8, 6))
sns.scatterplot(data=data, x='steps', y='heart_rate', hue='gender', ax=ax)
ax.set_title('Heart Rate vs Steps by Gender')
plt.show()

# Calories grouped by the gender column
fig, ax = plt.subplots(figsize=(8, 6))
sns.boxplot(data=data, x='gender', y='calories', ax=ax)
ax.set_title('Calories Burned by Gender')
plt.show()

## Feature Engineering

In [None]:
# BMI = weight (kg) / (height (m))^2
# At this point 'height' and 'weight' hold standardized z-scores, so dividing
# them directly is meaningless and blows up for heights near the mean (z ~ 0).
# Undo the scaling with the fitted scaler and compute BMI in original units.
# NOTE(review): assumes the raw data stores height in cm and weight in kg —
# confirm against the dataset documentation.
original_units = pd.DataFrame(
    scaler.inverse_transform(data[numerical_cols]),
    columns=numerical_cols,
    index=data.index,
)
height_m = original_units['height'] / 100
data['BMI'] = original_units['weight'] / (height_m ** 2)

# Display the new feature
# NOTE(review): BMI is left unscaled while the other numerical columns are
# standardized; scale it too if a scale-sensitive model (e.g. SVC) uses it.
print(data[['BMI']].head())

## Model Training

### Defining Target Variables

In [None]:
# Define target variable: heart rate above ABNORMAL_HR_BPM counts as abnormal.
# 'heart_rate' is standardized at this point, so the threshold is converted to
# the scaled space using the statistics the scaler actually learned, not a
# guessed mean and standard deviation.
# NOTE(review): assumes the raw heart_rate column is in beats per minute.
ABNORMAL_HR_BPM = 100

# scaler.mean_ / scaler.scale_ follow the column order of numerical_cols
hr_pos = numerical_cols.index('heart_rate')
hr_threshold_scaled = (ABNORMAL_HR_BPM - scaler.mean_[hr_pos]) / scaler.scale_[hr_pos]

data['abnormal_heart_rate'] = (data['heart_rate'] > hr_threshold_scaled).astype(int)

print(data['abnormal_heart_rate'].value_counts())

# A single-class target makes the stratified split and the classifiers below
# meaningless, so flag it as soon as it happens.
if data['abnormal_heart_rate'].nunique() < 2:
    print("Warning: 'abnormal_heart_rate' contains a single class; "
          "check the threshold and the outlier removal applied to heart_rate.")

### Splitting the Data

In [None]:
# Features and target.
# The target is a deterministic function of 'heart_rate', so that column must
# not be a feature: leaving it in lets any model recover the label trivially
# (target leakage) and makes every reported score meaningless.
TARGET_COL = 'abnormal_heart_rate'
LEAKY_COLS = ['heart_rate']

X = data.drop(columns=[TARGET_COL] + LEAKY_COLS)
y = data[TARGET_COL]

# Splitting into train (70%), validation (15%), and test (15%).
# Both splits are stratified so each subset keeps the class proportions of y.
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42, stratify=y_temp)

print(f"Training set shape: {X_train.shape}, {y_train.shape}")
print(f"Validation set shape: {X_val.shape}, {y_val.shape}")
print(f"Test set shape: {X_test.shape}, {y_test.shape}")

### Model Selection

In [None]:
# Candidate classifiers, all seeded identically
models = {
    'Random Forest': RandomForestClassifier(random_state=42),
    'Support Vector Machine': SVC(random_state=42, probability=True),
    'Decision Tree': DecisionTreeClassifier(random_state=42)
}

# Fit each candidate on the training split and score it on the validation split
for name, model in models.items():
    y_pred = model.fit(X_train, y_train).predict(X_val)
    acc = accuracy_score(y_val, y_pred)
    print(f"{name} Validation Accuracy: {acc:.4f}")

### Hyperparameter Tuning

In [None]:
# Random Forest is taken as the model to tune (assumed best, not read from the
# validation scores printed above)
rf = RandomForestClassifier(random_state=42)

# Search space: every combination of these values is evaluated
param_grid = {
    'n_estimators': [100, 200],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5],
}

# 5-fold cross-validated grid search on the training split, using all cores;
# fit() returns the fitted search object itself
grid_search = GridSearchCV(
    rf,
    param_grid,
    scoring='accuracy',
    cv=5,
    n_jobs=-1,
).fit(X_train, y_train)

print(f"Best Parameters: {grid_search.best_params_}")
print(f"Best CV Score: {grid_search.best_score_:.4f}")

### Model Evaluation

In [None]:
# Estimator refitted by the grid search with the best parameter combination
best_rf = grid_search.best_estimator_

# Predict the held-out test split
y_test_pred = best_rf.predict(X_test)

# Per-class precision / recall / F1
print("Classification Report:")
print(classification_report(y_test, y_test_pred))

# Confusion matrix drawn as an annotated heatmap
print("Confusion Matrix:")
cm = confusion_matrix(y_test, y_test_pred)
fig, ax = plt.subplots()
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=ax)
ax.set_xlabel('Predicted')
ax.set_ylabel('Actual')
plt.show()

print(f"Test Accuracy: {accuracy_score(y_test, y_test_pred):.4f}")