In [3]:
import eda_prep as ep
import os 
import pandas as pd
import matplotlib.pyplot as plt
#import seaborn as sns

1. how many categories are in each folder
2. how many recordings in each category
3. Min/Max/Avg per category
4. Missing values
5. Find outliers
   1. per category
   2. visualizen

Feature Engineering
- Peak value
- Peak-to-peak value
- Amplitude
- Energy
- Frequency
- Peak-to-peak
- ARV: Average rectified value
- Weighted-ARV: 
- FFT: Frequency spektrum, different frequency parts (maybe)
   
Clustering

Data Augmentation

# Merge to df

In [4]:
def merge_geophone_falls(folder_path):
    """
    Merges all 'fall' data from CSV files in the given geophone folder.
    
    Parameters:
    - folder_path (str): Path to the geophone folder containing CSV files.
    
    Returns:
    - pd.DataFrame: Merged DataFrame with labeled columns.
    """
    all_data = []

    for filename in os.listdir(folder_path):
        if filename.endswith(".csv"):
            parts = filename.split("_")
            
            if len(parts) < 5:
                print(f"Skipping file with unexpected format: {filename}")
                continue  # Skip files that don't match expected format
            
            fall_type = parts[0]  # First part of the filename
            fall_binary = parts[1]  # Check for fall keyword
            distance = parts[2]  # Third part of the filename
            person = parts[3]  # AW, AD, or 0
            floor_type = parts[-1].replace(".csv", "")  # Last part without ".csv"
            
            # Load CSV
            file_path = os.path.join(folder_path, filename)
            df = pd.read_csv(file_path, header=None)
            
            # Take only the first 500 values
            # values = df.iloc[0, :500].tolist() if df.shape[1] >= 500 else df.iloc[0, :].tolist() + [None] * (500 - df.shape[1])
            for _, row in df.iterrows():
                # Ensure only 500 values are added
                values = row[:500].tolist() if len(row) >= 500 else row.tolist() + [None] * (500 - len(row))
                all_data.append([fall_type, fall_binary, distance, person, floor_type] + values)

    # Create DataFrame
    column_names = ["activity", "fall_binary", "distance_m", "person_binary", "floor"] + [f"value_{i}" for i in range(1, 501)]
    merged_df = pd.DataFrame(all_data, columns=column_names)

    return merged_df

# Example usage
df_geophone = merge_geophone_falls("../geophone")

# Add Weight Column

In [5]:
def add_weight_column(df):
    """
    Adds a new column 'weight' to the dataframe based on the person's identity and activity type.

    - 'AW' (Anna) -> 60kg
    - 'AD' (Adrian/David) -> 75kg
    - 'FOB' (Bag) -> 6kg
    - 'FOL' (Blackroll) -> 0.2kg

    Parameters:
    df (pd.DataFrame): Input dataframe containing 'person' and 'activity' columns.

    Returns:
    pd.DataFrame: Dataframe with the new 'weight' column.
    """
    # Mapping persons to their respective weights
    person_weight_map = {'AW': 60, 'AD': 75, '0': 0}

    # Insert weight column based on person mapping
    df.insert(4, 'weight', df['person_binary'].map(person_weight_map))

    # Update weight based on activity type
    df['weight'] = df.apply(lambda row: 
                            6 if row['activity'] == 'FOB' 
                            else (0.2 if row['activity'] == 'FOL' 
                                  else row['weight']), axis=1)
    
    return df

# Example usage:
df_geophone = add_weight_column(df_geophone)

# Map Person Column

In [6]:
def map_person_column(df, column_name='person_binary'):
    """
    Maps the 'person' column values to their numerical values.

    Parameters:
    - df (pd.DataFrame): Input dataframe containing 'person_binary' column.
    - column_name (str): Name of the column to be mapped.

    Returns:
    - pd.DataFrame: Dataframe with the mapped 'person' column.
    """
    df[column_name] = df[column_name].map({'AW': 1, 'AD': 1}).fillna(0).astype(int)
    return df

# Example usage:
df_geophone = map_person_column(df_geophone)

In [7]:
df_geophone.to_csv("merged_geophone_falls_raw.csv", index=False)

# Missing Values

In [16]:
df_geophone.isnull().sum().sum()

np.int64(0)

# Group By Activity

In [17]:
# group df_geophone by activity and count
df_geophone.groupby('activity').size()

activity
CD      20
FCS     60
FOB     60
FOL     60
KD      10
MA      45
RBS    120
S       45
SC      60
WBS     90
dtype: int64

# Group By Fall Binary

In [10]:
df_geophone.groupby('fall_binary').size()

fall_binary
0    510
1     60
dtype: int64

# Add Feature Columns

In [11]:
def add_feature_columns(df,sampling_interval=4.5/500):
    """
    Adds features of signal data for each row.

    Parameters:
    - df (pd.DataFrame): The DataFrame containing signal data.

    Returns:
    - pd.DataFrame: The DataFrame with the new features.
    """
    # Signal data starts from column index 6
    signal_columns = df.columns[6:]

    # Compute values
    df['max'] = df[signal_columns].max(axis=1)
    df['min'] = df[signal_columns].min(axis=1)
    df['mean'] = df[signal_columns].mean(axis=1)
    df['std'] = df[signal_columns].std(axis=1)
    df['median'] = df[signal_columns].median(axis=1)
    df['peak'] = df[signal_columns].abs().max(axis=1)
    df['p2p'] = df['max'] - df['min']
    df['energy'] = df['peak'] * sampling_interval # Compute energy using numerical integration (Riemann sum approximation)

    return df

# Example usage:
df_geophone = add_feature_columns(df_geophone)
df_geophone.head()

Unnamed: 0,activity,fall_binary,distance_m,person_binary,weight,floor,value_1,value_2,value_3,value_4,...,value_499,value_500,max,min,mean,std,median,peak,p2p,energy
0,FCS,1,1,1,75.0,H,59,-38,10,43,...,-12,-6,3335,-2904,-12.574,332.233428,0.0,3335,6239,30.015
1,FCS,1,1,1,75.0,H,104,-49,137,-46,...,-4,-17,1794,-5425,-15.622,332.036962,-1.0,5425,7219,48.825
2,FCS,1,1,1,75.0,H,-11,48,-13,8,...,63,76,8664,-4533,22.256,556.031918,2.0,8664,13197,77.976
3,FCS,1,1,1,75.0,H,16,-11,22,4,...,4,-9,8603,-1911,19.398,467.826539,0.0,8603,10514,77.427
4,FCS,1,1,1,75.0,H,5,-11,-4,2,...,-11,-11,6241,-13943,-18.06,782.252039,1.0,13943,20184,125.487


# Features by Activity

# Get Features

In [12]:
def get_features(df: pd.DataFrame, groupby_col: str = 'activity'):
    """
    Extracts features from the given DataFrame.

    Parameters:
    - df (pd.DataFrame): DataFrame containing the data.
    - groupby_col (str): Column to group by.

    Returns:
    - pd.DataFrame: DataFrame with extracted features.
    """
    return_df = pd.DataFrame()
    # Extract features
    return_df["recording"] = df.iloc[:, :6].apply(lambda row: '_'.join(row.values.astype(str)), axis=1)
    return_df["mean"] = df.iloc[:, 6:].mean(axis=1)
    return_df["std"] = df.iloc[:, 6:].std(axis=1)
    return_df["min"] = df.iloc[:, 6:].min(axis=1)
    return_df["max"] = df.iloc[:, 6:].max(axis=1)
    return_df["median"] = df.iloc[:, 6:].median(axis=1)
    return_df["range"] = return_df["max"] - return_df["min"]

    return return_df

df_geophone_features = get_features(df_geophone)

print(df_geophone_features)

print(df_geophone.iloc[0, 6:])

            recording       mean          std      min      max  median  \
0    FCS_1_1_1_75.0_H   8.007233   496.218150  -2904.0   6239.0     0.0   
1    FCS_1_1_1_75.0_H   3.083150   578.097510  -5425.0   7219.0    -1.0   
2    FCS_1_1_1_75.0_H  74.366661   989.958696  -4533.0  13197.0     2.0   
3    FCS_1_1_1_75.0_H  71.009157   853.445499  -1911.0  10514.0     0.0   
4    FCS_1_1_1_75.0_H  35.995431  1499.926702 -13943.0  20184.0     1.0   
..                ...        ...          ...      ...      ...     ...   
565   FOB_0_1_0_6.0_H   9.818275   172.975644   -909.0   2295.0     0.0   
566   FOB_0_1_0_6.0_H  -4.050815   540.441294  -5766.0   6304.0    -0.5   
567   FOB_0_1_0_6.0_H   0.813869    28.165156   -219.0    301.0     0.0   
568   FOB_0_1_0_6.0_H  35.839606   567.292133  -1626.0   7388.0     0.0   
569   FOB_0_1_0_6.0_H   9.889884   153.832023   -927.0   2084.0     1.0   

       range  
0     9143.0  
1    12644.0  
2    17730.0  
3    12425.0  
4    34127.0  
..       

# Scatterplot

In [13]:
def plot_scatter(df, labels = False):
    """
    Creates a scatter plot with mean on the x-axis, range on the y-axis,
    and colors based on the recording category. Adds tooltips for each point.
    
    Parameters:
    - df (pd.DataFrame): DataFrame containing 'mean', 'range', and 'recording' columns.
    """
    plt.figure(figsize=(10, 6))
    scatter = sns.scatterplot(data=df, x="mean", y="range", hue="recording", palette="tab10", alpha=0.8, edgecolor="black")
    
    # Labels and title
    plt.xlabel("std")
    plt.ylabel("Range")
    plt.title("Scatter Plot of Mean vs Range Colored by Recording")
    
    # Add annotations
    if labels:
        for i in range(df.shape[0]):
            plt.annotate(df['recording'][i], (df['std'][i], df['range'][i]), fontsize=8, alpha=0.7)
    
    # Legend adjustments
    plt.legend(title="Recording", bbox_to_anchor=(1.05, 1), loc="upper left")
    
    plt.show()


In [14]:
plot_scatter(df_geophone_features, labels=True)

NameError: name 'sns' is not defined

<Figure size 1000x600 with 0 Axes>