In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# 2. Preprocesamiento

In [2]:
df = pd.read_csv('dataframes/df_eda.csv')

## 2.1 Segmentación en ventanas de tiempo

In [5]:
def segment_by_window(group, window_size):
    """
    Segments data into windows of specified size.

    Args:
        group (pd.DataFrame): Subject's data.
        window_size (int): Size of each time window.

    Returns:
        list: List of DataFrames, each representing a window of data.
    """
    windows = []
    for start in range(0, len(group), window_size):
        end = start + window_size
        if end <= len(group):
            window = group.iloc[start:end].copy()
            windows.append(window)
    return windows

def add_window_metadata(windows, subject_id):
    """
    Adds metadata like window number, window ID, and majority label.

    Args:
        windows (list): List of window DataFrames.
        subject_id (int): Subject identifier.

    Returns:
        list: List of windows with added metadata.
    """
    for i, window in enumerate(windows):
        window['subject_id'] = subject_id
        window['window_number'] = i
        window['window_id'] = f"{subject_id}_{window['window_number'].iloc[0]}"

        majority_label = get_majority_label(window)
        window['labels_in_window'] = [majority_label] * len(window)
    return windows

def get_majority_label(window):
    """
    Gets the majority label for a window.

    Args:
        window (pd.DataFrame): Window data.

    Returns:
        str: Majority label, or None if no majority.
    """
    label_counts = window['activityID'].value_counts()
    majority_label = label_counts.idxmax() if label_counts.max() / len(window) > 0.5 else window['label'].iloc[0]
    return majority_label

def process_subject_data(mhealth_df, window_size):
    """
    Processes all subject data by segmenting and adding metadata.

    Args:
        mhealth_df (pd.DataFrame): Data of multiple subjects.
        window_size (int): Size of each time window.

    Returns:
        list: List of windows with metadata.
    """
    all_windows = []
    for subject_id, group in mhealth_df.groupby('subject_id'):
        group = group.reset_index(drop=True)

        windows = segment_by_window(group, window_size)
        windows = add_window_metadata(windows, subject_id)
        all_windows.extend(windows)
    
    return all_windows

def concatenate_windows(windows):
    """
    Concatenates all windows into a single DataFrame.

    Args:
        windows (list): List of window DataFrames.

    Returns:
        pd.DataFrame: Concatenated DataFrame of all windows.
    """
    return pd.concat(windows).reset_index(drop=True)

def display_basic_info(ventanas_df, windows):
    """
    Displays basic info about windows and their majority labels.

    Args:
        ventanas_df (pd.DataFrame): DataFrame of windows with metadata.
        windows (list): List of processed windows.
    """
    print(f"Ventanas generadas: {len(windows)}")
    print(f"Tamaño del DataFrame de ventanas: {ventanas_df.shape}")

    for window_id, group in ventanas_df.groupby('window_id'):
        majority_label = group['labels_in_window'].iloc[0]
        print(f"Window ID: {window_id} - Majority Label: {majority_label}")

# Parameters
window_size = 150  # 3 seconds * 50 Hz

print(df.shape)
windows = process_subject_data(df, window_size)

# Concatenate all windows into a single DataFrame
pamap_windowed = concatenate_windows(windows)

display_basic_info(pamap_windowed, windows)

(1942872, 55)
Ventanas generadas: 12948
Tamaño del DataFrame de ventanas: (1942200, 58)
Window ID: subject101_0 - Majority Label: 1
Window ID: subject101_1 - Majority Label: 1
Window ID: subject101_10 - Majority Label: 1
Window ID: subject101_100 - Majority Label: 1
Window ID: subject101_1000 - Majority Label: 4
Window ID: subject101_1001 - Majority Label: 4
Window ID: subject101_1002 - Majority Label: 4
Window ID: subject101_1003 - Majority Label: 4
Window ID: subject101_1004 - Majority Label: 4
Window ID: subject101_1005 - Majority Label: 4
Window ID: subject101_1006 - Majority Label: 4
Window ID: subject101_1007 - Majority Label: 4
Window ID: subject101_1008 - Majority Label: 4
Window ID: subject101_1009 - Majority Label: 4
Window ID: subject101_101 - Majority Label: 1
Window ID: subject101_1010 - Majority Label: 4
Window ID: subject101_1011 - Majority Label: 4
Window ID: subject101_1012 - Majority Label: 4
Window ID: subject101_1013 - Majority Label: 4
Window ID: subject101_1014 - 

In [8]:
windows_per_subject = pamap_windowed.groupby('subject_id').size()/150
print(f'Promedio de ventanas por cada sujeto: {windows_per_subject.mean()}')

windows_per_activity = pamap_windowed.groupby('activityID').size()/150
print(f'Promedio de ventanas por cada actividad: {windows_per_activity.mean()}')

Promedio de ventanas por cada sujeto: 1438.6666666666667
Promedio de ventanas por cada actividad: 1079.0


## 2.2 Extracción de características

Ahora bien, en el estudio de Lara (2011), dieron a relucir que la información provista por datos de signos vitales no es necesaria si las actividades a reconocer son caminando o bajando, pero, para actividades como corriendo, estar sentado o asendiendo pueden ser de utilidad. Sin embargo, en el artículo de Rehman (2024), hacen uso solamente de los sensores del acelerómetro giroscopio y magnetométro, debido a que otro estudió de igual forma lo trabajó así. Por ello nosotros tomamos el mismo enfoque.



In [9]:
class InertialSensor:
    def __init__(self, x, y, z):
        """
        Inicializa el acelerómetro con sus 3 señales.
        
        Parámetros:
          x: array-like, señal del eje x.
          y: array-like, señal del eje y.
          z: array-like, señal del eje z.
        """
        self.x = np.array(x)
        self.y = np.array(y)
        self.z = np.array(z)
        
    def compute_magnitude(self):
        return np.sqrt(self.x**2 + self.y**2 + self.z**2)
    
    def extract_features(self):
        # 1. Medias de cada eje
        mean_x = np.mean(self.x)
        mean_y = np.mean(self.y)
        mean_z = np.mean(self.z)
        
        # 2. Desviaciones estándar
        std_x = np.std(self.x)
        std_y = np.std(self.y)
        std_z = np.std(self.z)
        
        # 3. Valores máximos
        max_x = np.max(self.x)
        max_y = np.max(self.y)
        max_z = np.max(self.z)
        
        # 4. Correlaciones entre cada par de ejes
        corr_xy = np.corrcoef(self.x, self.y)[0, 1]
        corr_xz = np.corrcoef(self.x, self.z)[0, 1]
        corr_yz = np.corrcoef(self.y, self.z)[0, 1]
        
        # 5,6,7,8. Features derivadas de la magnitud
        magnitude = self.compute_magnitude()
        mean_magnitude = np.mean(magnitude)
        std_magnitude = np.std(magnitude)
        auc = np.sum(magnitude)
        mean_diff = np.mean(np.diff(magnitude)) if len(magnitude) > 1 else 0.0
        
        # Vector de features
        feature_vector = np.array([
            mean_x, mean_y, mean_z,
            std_x, std_y, std_z,
            max_x, max_y, max_z,
            corr_xy, corr_xz, corr_yz,
            mean_magnitude, std_magnitude, auc, mean_diff
        ])
        
        return feature_vector

In [10]:
import numpy as np
df_to_extract_features = pamap_windowed
numeric_features_list = []
labels = []

for seg, group in df_to_extract_features.groupby("window_id"):
    #Features obtenidos de los sensores en la mano
    H_acc_features = InertialSensor(
        group.IMU_Hand_acceleration_16g_x,
        group.IMU_Hand_acceleration_16g_y,
        group.IMU_Hand_acceleration_16g_z
    ).extract_features()
    H_gyro_features = InertialSensor(
        group.IMU_Hand_gyroscope_x,
        group.IMU_Hand_gyroscope_y,
        group.IMU_Hand_gyroscope_z,
    ).extract_features()
    H_mag_features = InertialSensor(
        group.IMU_Hand_magnetometer_x,
        group.IMU_Hand_magnetometer_y,
        group.IMU_Hand_magnetometer_z,
    ).extract_features()
        #Features obtenidos de los sensores en el pecho
    C_acc_features = InertialSensor(
        group.IMU_Chest_acceleration_16g_x,
        group.IMU_Chest_acceleration_16g_y,
        group.IMU_Chest_acceleration_16g_z
    ).extract_features()
    C_gyro_features = InertialSensor(
        group.IMU_Chest_gyroscope_x,
        group.IMU_Chest_gyroscope_y,
        group.IMU_Chest_gyroscope_z,
    ).extract_features()
    C_mag_features = InertialSensor(
        group.IMU_Chest_magnetometer_x,
        group.IMU_Chest_magnetometer_y,
        group.IMU_Chest_magnetometer_z,
    ).extract_features()
    #Features obtenidos de los sensores en el tobillo
    A_acc_features = InertialSensor(
        group.IMU_Ankle_acceleration_16g_x,
        group.IMU_Ankle_acceleration_16g_y,
        group.IMU_Ankle_acceleration_16g_z
    ).extract_features()
    A_gyro_features = InertialSensor(
        group.IMU_Ankle_gyroscope_x,
        group.IMU_Ankle_gyroscope_y,
        group.IMU_Ankle_gyroscope_z,
    ).extract_features()
    A_mag_features = InertialSensor(
        group.IMU_Ankle_magnetometer_x,
        group.IMU_Ankle_magnetometer_y,
        group.IMU_Ankle_magnetometer_z,
    ).extract_features()
    
    feature_vector = np.concatenate([
        H_acc_features, H_gyro_features, H_mag_features,
        C_acc_features, C_gyro_features, C_mag_features,
        A_acc_features, A_gyro_features, A_mag_features
    ]).astype(float)
    
    numeric_features_list.append(feature_vector)
    labels.append(group["labels_in_window"].iloc[0])

sensor_locations = ["hand"]
sensor_types = ["acc", "gyro", "mag"]
feature_names = [
    "mean_x", "mean_y", "mean_z", "std_x", "std_y", "std_z",
    "max_x", "max_y", "max_z", "corr_xy", "corr_xz", "corr_yz",
    "mean_mag", "std_mag", "auc", "mean_diff"
]

feature_columns = [
    f"{loc}_{typ}_{feat}"
    for loc in sensor_locations
    for typ in sensor_types
    for feat in feature_names
]
        
sensor_locations = ["chest"]
sensor_types = ["acc", "gyro", "mag"]
feature_columns.extend([
    f"{loc}_{typ}_{feat}"
    for loc in sensor_locations
    for typ in sensor_types
    for feat in feature_names
])

sensor_locations = ["ankle"]
sensor_types = ["acc", "gyro", "mgnt"]
feature_columns.extend([
    f"{loc}_{typ}_{feat}"
    for loc in sensor_locations
    for typ in sensor_types
    for feat in feature_names
])

# Create DataFrame for numeric features and convert to float
df_numeric = pd.DataFrame(np.array(numeric_features_list), columns=feature_columns)

# Create DataFrame for labels
df_labels = pd.DataFrame(labels, columns=["activity"])

# Concatenate numeric features and labels
pamap_features = pd.concat([df_numeric, df_labels], axis=1)

# Optionally, check the data types to confirm
pamap_features.shape
pamap_features

Unnamed: 0,hand_acc_mean_x,hand_acc_mean_y,hand_acc_mean_z,hand_acc_std_x,hand_acc_std_y,hand_acc_std_z,hand_acc_max_x,hand_acc_max_y,hand_acc_max_z,hand_acc_corr_xy,...,ankle_mgnt_max_y,ankle_mgnt_max_z,ankle_mgnt_corr_xy,ankle_mgnt_corr_xz,ankle_mgnt_corr_yz,ankle_mgnt_mean_mag,ankle_mgnt_std_mag,ankle_mgnt_auc,ankle_mgnt_mean_diff,activity
0,1.476462,7.183104,6.263181,1.493137,0.891178,0.773317,3.184930,9.38736,8.69773,-0.062893,...,-35.18610,-57.11420,-0.429460,-0.151932,0.104818,92.160149,0.398534,13824.022413,0.002502,1
1,-1.580164,9.098362,3.046503,0.698351,1.264988,1.029115,1.103640,12.01160,5.14512,-0.448855,...,-35.23700,-57.83440,-0.925066,0.112084,-0.289666,92.330535,0.427368,13849.580232,-0.001155,1
2,0.318060,5.719075,7.957812,0.087042,0.177270,0.135302,0.613322,6.04065,8.26579,-0.198240,...,44.94040,-8.61565,-0.190123,0.078203,-0.055490,47.353755,0.479386,7103.063286,0.004484,1
3,3.793341,-3.944923,-8.212290,0.060836,0.072421,0.094537,3.958310,-3.77111,-7.97182,0.242557,...,21.67320,40.93500,-0.087881,-0.082397,0.076284,49.727292,0.432894,7459.093851,-0.004624,1
4,-2.701973,6.519174,6.585910,1.101292,1.324894,0.875959,0.135733,10.32980,9.59792,-0.256611,...,-34.46280,3.79722,-0.736550,0.023367,-0.025351,62.663169,0.406113,9399.475384,-0.003671,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12943,0.538799,11.090765,-1.331692,4.820491,12.603037,5.292219,12.310300,34.85990,10.50140,-0.208746,...,-17.27500,23.45170,-0.956532,0.873342,-0.758037,46.338818,0.594737,6950.822626,-0.004561,24
12944,0.161949,7.176982,-2.381768,5.696166,15.915406,7.964756,13.190700,36.25800,30.55700,0.087518,...,,,,,,,,,,24
12945,-2.865762,11.344199,1.629505,2.861371,5.903191,3.707454,8.405530,28.46640,18.23930,-0.093056,...,23.81200,16.76660,-0.267715,0.438702,-0.880136,46.609012,0.742160,6991.351828,0.006192,24
12946,-2.440181,7.321234,-0.454268,5.156909,10.600009,7.625614,9.904520,28.85750,12.95720,-0.129110,...,-10.25700,15.78230,-0.981514,0.140041,-0.052437,46.481226,0.730255,6972.183902,-0.000203,24


In [11]:
pamap_features.shape

(12948, 145)

Ahora con estas features podemos hacer las siguientes fases

## 2.3 Train Test Splitting, Class imabalnce & Normalization


A continuación, se presenta la pipeline que representa los pasos para el entrenamiento de los modelos. Para un correcto entrenamiento se deben tomar los siguientes pasos:
1. Oversampling: debido al gran desbalance de clases, se opta por técnicas de sobremuestreo que han sido utilizadas en la literatura, entre las que han demostrado mayor efectividad se encuentra SMOTE.
2. Escalado: Debido a que en el EDA se puede apreciar que algunas features están en diferentes escalas, hay que aplicar alguna técnica de escalado que evite que las features con mayor escala representen mayor importancia, es por eso que se opta por un MinMaxScaler que igualmente ha demostrado buenos resultados en la literatura.
3. Entrenamiento con el modelo

In [None]:
from sklearn.model_selection import cross_val_score
from imblearn.pipeline import make_pipeline
from sklearn.preprocessing import MinMaxScaler
from imblearn.over_sampling import SMOTE  


X = pamap_features.drop(columns=['activity']).values  
y = pamap_features['activity'].values 
 
scaler = MinMaxScaler()

def execute_pipeline(model, X, y):
    pipeline = make_pipeline(
        SMOTE(sampling_strategy='all', random_state=42), 
        scaler,  
        model
    )
    pipeline.fit(X, y)
    scorespipe = cross_val_score(pipeline, X, y, cv=10, scoring="f1_macro")
    return scorespipe