In [1]:
# Import necessary libraries
import pandas as pd
import numpy as np
import os
from scipy.signal import welch
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
import seaborn as sns
import read_data as rd  # Importing the read_data.py module
from scipy.fft import fft
from scipy.stats import skew, kurtosis

In [2]:
# Function to normalize the data using min-max scaling
def normalize_data(data):
    min_val = np.min(data, axis=0)
    max_val = np.max(data, axis=0)
    normalized = (data - min_val) / (max_val - min_val)
    return normalized

def extract_time_domain_features(signal):
    features = []
    for i in range(signal.shape[1]):
        sig = signal[:, i]
        features.extend([
            np.mean(sig),               # Mean
            np.std(sig),                # Standard Deviation
            skew(sig),                  # Skewness
            kurtosis(sig),              # Kurtosis
            np.max(sig),                # Maximum
            np.min(sig),                # Minimum
            np.ptp(sig),                # Peak-to-Peak
            np.sqrt(np.mean(sig**2)),   # RMS
            np.sum(np.abs(np.diff(np.sign(sig)))) / 2,  # Zero Crossing Rate
            np.sum(sig**2)              # Energy
        ])
    return features

def extract_fft_features(signal, fs, top_n=50):
    N = len(signal)
    T = 1.0 / fs
    yf = fft(signal)
    xf = np.fft.fftfreq(N, T)[:N//2]
    yf = 2.0/N * np.abs(yf[:N//2])
    
    # Get indices of the top_n highest frequencies
    top_indices = np.argsort(yf)[-top_n:]
    
    # Extract the top_n highest FFT features
    fft_features = yf[top_indices]
    return fft_features

def extract_frequency_domain_features(signal, fs):
    features = []
    for i in range(signal.shape[1]):
        sig = signal[:, i]
        freqs, psd = welch(sig, fs)
        
        features.extend([
            np.mean(psd),                   # Mean Power Spectral Density
            np.sum(psd),                    # Total Power
            np.argmax(psd),                 # Peak Frequency
            np.mean(freqs * psd) / np.mean(psd),  # Spectral Centroid
            np.sqrt(np.mean((freqs - np.mean(freqs))**2 * psd)) / np.mean(psd),  # Spectral Bandwidth
            np.percentile(psd, 75) - np.percentile(psd, 25),  # Spectral Contrast
            np.max(freqs[np.cumsum(psd) / np.sum(psd) <= 0.85])  # Spectral Roll-off
        ])
        
    return features

In [3]:
# Load data
dataset_dir = '/home/ecappiell/datasets/full'
data_arrays, labels, class_ids = rd.process_mafaulda_data(dataset_dir)

In [4]:
# Original sampling rate (in Hz)
original_sampling_rate = 50 * 10**3  # 50 kHz

# Target sampling rate (in Hz)
target_sampling_rate = 2 * 10**3  # 2 kHz

# Downsample the data
downsampled_data = rd.downsample_data(data_arrays, original_sampling_rate, target_sampling_rate)

In [5]:
# Normalize the downsampled d ata
normalized_data = np.array([normalize_data(signal) for signal in downsampled_data])

In [6]:
# Extract features for each signal
X = []
for signal in normalized_data:
    time_features = extract_time_domain_features(signal)
    fft_features = [extract_fft_features(signal[:, i], target_sampling_rate) for i in range(signal.shape[1])]
    freq_features = extract_frequency_domain_features(signal, target_sampling_rate)
    signal_features = np.concatenate((time_features, np.hstack(fft_features), freq_features))
    X.append(signal_features)
X = np.array(X)

In [7]:
# Assuming labels and class_ids are prepared for classification
y = np.array(class_ids)

In [8]:
# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)

In [9]:
# Normalize the feature matrix using StandardScaler
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [10]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.metrics import classification_report

classifiers = [
    KNeighborsClassifier(3),
    SVC(kernel="linear", C=0.025,probability=True),
    SVC(gamma=2, C=1, probability=True),
    GaussianProcessClassifier(),
    DecisionTreeClassifier(max_depth=5),
    RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1),
    AdaBoostClassifier(),
    GaussianNB(),
    QuadraticDiscriminantAnalysis()]

classifier_names = [
    "Nearest Neighbors",
    "Linear SVM",
    "RBF SVM",
    "Gaussian Process",
    "Decision Tree",
    "Random Forest",
    "AdaBoost",
    "Naive Bayes",
    "QDA"]

# Train and evaluate each classifier
for name, clf in zip(classifier_names, classifiers):
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    print(f"Classifier: {name}")
    print(classification_report(y_test, y_pred))
    print("Confusion Matrix:")
    print(confusion_matrix(y_test, y_pred))
    print("="*50)

Classifier: Nearest Neighbors
              precision    recall  f1-score   support

           0       0.69      0.92      0.79        59
           1       0.92      0.95      0.94       100
           2       0.90      0.60      0.72        15
           3       0.95      0.94      0.94       154
           4       0.99      0.91      0.95       168
           5       0.92      0.91      0.92        90

    accuracy                           0.92       586
   macro avg       0.90      0.87      0.88       586
weighted avg       0.93      0.92      0.92       586

Confusion Matrix:
[[ 54   1   0   1   1   2]
 [  2  95   0   1   0   2]
 [  2   1   9   0   0   3]
 [  9   0   0 144   1   0]
 [  5   6   0   4 153   0]
 [  6   0   1   1   0  82]]
Classifier: Linear SVM
              precision    recall  f1-score   support

           0       0.95      1.00      0.98        59
           1       0.96      0.98      0.97       100
           2       0.86      0.80      0.83        15
      

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Classifier: Gaussian Process
              precision    recall  f1-score   support

           0       1.00      0.02      0.03        59
           1       0.00      0.00      0.00       100
           2       0.00      0.00      0.00        15
           3       1.00      0.01      0.01       154
           4       1.00      0.01      0.02       168
           5       0.15      1.00      0.27        90

    accuracy                           0.16       586
   macro avg       0.53      0.17      0.06       586
weighted avg       0.67      0.16      0.05       586

Confusion Matrix:
[[  1   0   0   0   0  58]
 [  0   0   0   0   0 100]
 [  0   0   0   0   0  15]
 [  0   0   0   1   0 153]
 [  0   0   0   0   2 166]
 [  0   0   0   0   0  90]]


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Classifier: Decision Tree
              precision    recall  f1-score   support

           0       0.66      0.86      0.75        59
           1       0.94      0.84      0.89       100
           2       0.50      0.13      0.21        15
           3       0.85      0.78      0.81       154
           4       0.75      0.86      0.80       168
           5       0.84      0.76      0.80        90

    accuracy                           0.80       586
   macro avg       0.76      0.71      0.71       586
weighted avg       0.81      0.80      0.80       586

Confusion Matrix:
[[ 51   0   0   4   1   3]
 [  0  84   0   0  12   4]
 [  1   0   2   0   9   3]
 [  6   4   1 120  22   1]
 [ 11   1   0   9 145   2]
 [  8   0   1   9   4  68]]
Classifier: Random Forest
              precision    recall  f1-score   support

           0       0.67      0.10      0.18        59
           1       0.69      0.67      0.68       100
           2       0.00      0.00      0.00        15
       

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Classifier: AdaBoost
              precision    recall  f1-score   support

           0       0.62      0.68      0.65        59
           1       0.50      0.89      0.64       100
           2       0.38      0.20      0.26        15
           3       0.71      0.55      0.62       154
           4       0.66      0.51      0.58       168
           5       0.76      0.73      0.75        90

    accuracy                           0.63       586
   macro avg       0.60      0.59      0.58       586
weighted avg       0.65      0.63      0.62       586

Confusion Matrix:
[[40  0  0  1  5 13]
 [ 0 89  2  6  3  0]
 [ 0  1  3  2  8  1]
 [ 1 41  1 84 21  6]
 [ 9 46  2 24 86  1]
 [14  0  0  2  8 66]]
Classifier: Naive Bayes
              precision    recall  f1-score   support

           0       0.35      0.69      0.46        59
           1       0.83      0.86      0.85       100
           2       0.35      0.73      0.48        15
           3       0.82      0.63      0.71       

