# Phase 3: Feature Extraction

## Objective
To extract interpretable time-domain, rhythm-based, and frequency-domain
features from preprocessed PPG signals for machine learning and comparative
analysis.

In [None]:
import numpy as np
import pandas as pd
from scipy.signal import find_peaks

In [None]:
# Load the preprocessed signals and their class labels from disk.
# Later cells index `signals` along axis 1, i.e. they treat it as a 2-D array.
signals = np.load("processed_signals.npy")
# NOTE(review): allow_pickle=True lets np.load unpickle arbitrary objects;
# only use it on label files from a trusted source.
labels = np.load("labels.npy", allow_pickle=True)

In [None]:
# Amplitude statistics, one value per signal (reduction along axis 1).
mean_vals = signals.mean(axis=1)
std_vals = signals.std(axis=1)

In [None]:
fs = 125  # Sampling frequency

# Number of detected peaks in each signal. Neighbouring peaks must be at
# least fs*0.4 samples apart.
peak_counts = np.array([
    len(find_peaks(signal, distance=fs*0.4)[0])
    for signal in signals
])


In [None]:
# Length of one signal in seconds: samples per signal divided by the
# sampling frequency.
duration_sec = signals.shape[1] / fs

# Peaks per second, scaled by 60 to express the rate per minute.
peaks_per_second = peak_counts / duration_sec
heart_rates = 60 * peaks_per_second


In [None]:
# Collect the per-signal features and the class label into one table,
# one row per signal.
feature_columns = {
    "mean": mean_vals,
    "std": std_vals,
    "peak_count": peak_counts,
    "heart_rate": heart_rates,
    "label": labels,
}
features_df = pd.DataFrame(feature_columns)

features_df.head()


Unnamed: 0,mean,std,peak_count,heart_rate,label
0,0.014267,0.707625,21,78.75,MI
1,0.013232,0.711414,19,71.25,MI
2,0.010877,0.71087,20,75.0,MI
3,0.012534,0.712748,19,71.25,MI
4,0.014048,0.708126,19,71.25,MI


In [None]:
# Average of every feature column within each class label.
features_df.groupby("label").mean()


Unnamed: 0_level_0,mean,std,peak_count,heart_rate
label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
MI,0.013207,0.714354,18.576507,69.661901
Normal,0.013104,0.767372,10.546802,39.550507


## Time-Domain Feature Observations

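- The mean captures each signal's baseline level, and the standard deviation captures its amplitude variability.
- Peak count and heart rate reflect rhythmic properties.
- Class-wise feature averages differ noticeably: for example, the mean estimated heart rate is about 69.7 bpm for MI versus about 39.6 bpm for Normal.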


In [None]:
# Inter-peak interval (IPI) statistics: mean and standard deviation of the
# time between consecutive detected peaks, one pair of values per signal.
ipi_mean = []
ipi_std = []

for signal in signals:
    peaks, _ = find_peaks(signal, distance=fs*0.4)

    # Fewer than two peaks means there is no interval to measure;
    # record 0 for both statistics.
    if len(peaks) <= 1:
        ipi_mean.append(0)
        ipi_std.append(0)
        continue

    # Gaps between peak sample indices, converted to seconds.
    intervals = np.diff(peaks) / fs
    ipi_mean.append(intervals.mean())
    ipi_std.append(intervals.std())

ipi_mean = np.array(ipi_mean)
ipi_std = np.array(ipi_std)


In [None]:
# Dominant frequency per signal: the frequency of the largest-magnitude bin
# in the one-sided FFT spectrum. Bin 0 is not excluded from the search.
dominant_freqs = []

for signal in signals:
    # Frequency of each rfft bin for a signal of this length.
    freqs = np.fft.rfftfreq(len(signal), d=1/fs)
    fft_vals = np.abs(np.fft.rfft(signal))

    strongest_bin = np.argmax(fft_vals)
    dominant_freqs.append(freqs[strongest_bin])

dominant_freqs = np.array(dominant_freqs)


In [None]:
# Spectral energy per signal: sum of the squared one-sided FFT magnitudes.
# Computed from `signals` directly (FFT along axis 1) so that every signal
# gets its own value. Reusing the `fft_vals` variable left over from the
# dominant-frequency loop would only describe the last signal processed.
spectral_energy = np.sum(np.abs(np.fft.rfft(signals, axis=1)) ** 2, axis=1)


In [None]:
# Append the rhythm and frequency features to the existing feature table.
extra_columns = {
    "ipi_mean": ipi_mean,
    "ipi_std": ipi_std,
    "dominant_freq": dominant_freqs,
}
for column_name, column_values in extra_columns.items():
    features_df[column_name] = column_values

features_df.head()


Unnamed: 0,mean,std,peak_count,heart_rate,label,ipi_mean,ipi_std,dominant_freq
0,0.014267,0.707625,21,78.75,MI,0.7648,0.272886,0.75
1,0.013232,0.711414,19,71.25,MI,0.845333,0.324896,0.75
2,0.010877,0.71087,20,75.0,MI,0.798737,0.24923,0.75
3,0.012534,0.712748,19,71.25,MI,0.847556,0.29193,0.75
4,0.014048,0.708126,19,71.25,MI,0.843111,0.271216,0.75


## Phase 3 Summary

- Time-domain, rhythm-based, and frequency-domain features were extracted.
- Features capture amplitude, regularity, and spectral characteristics.
- These features provide interpretable representations for ML modeling.


In [None]:
%pip install scikit-learn
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix


ModuleNotFoundError: No module named 'sklearn'