In [41]:
# import libraries
import os
import time as T
import pandas as pd
import numpy as np
import tsfel

# sklearn
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, precision_score, recall_score, accuracy_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import LabelEncoder  
from sklearn.preprocessing import StandardScaler 

# groq
from groq import Groq
client = Groq(
    api_key=os.environ.get("GROQ_API_KEY_1"), # stored API key in virtual environment (not going to GitHub)
)

# Question 1

## Decision Tree on TSFEL features

### MakeDataset

In [42]:
time = 10 # Setting the time window for each sample
offset = 100 # Skipping the first 100 rows to remove noise
folders = ["LAYING","SITTING","STANDING","WALKING","WALKING_DOWNSTAIRS","WALKING_UPSTAIRS"]
classes = {"WALKING":1,"WALKING_UPSTAIRS":2,"WALKING_DOWNSTAIRS":3,"SITTING":4,"STANDING":5,"LAYING":6}

combined_dir = os.path.join("Combined")

X_train=[]
y_train=[]
dataset_dir = os.path.join(combined_dir,"Train")

for folder in folders:
    files = os.listdir(os.path.join(dataset_dir,folder))

    for file in files:

        df = pd.read_csv(os.path.join(dataset_dir,folder,file),sep=",",header=0)
        df = df[offset:offset+time*50]
        X_train.append(df.values)
        y_train.append(classes[folder])

X_train = np.array(X_train)
y_train = np.array(y_train)

X_test=[]
y_test=[]
dataset_dir = os.path.join(combined_dir,"Test")

for folder in folders:
    files = os.listdir(os.path.join(dataset_dir,folder))
    for file in files:

        df = pd.read_csv(os.path.join(dataset_dir,folder,file),sep=",",header=0)
        df = df[offset:offset+time*50]
        X_test.append(df.values)
        y_test.append(classes[folder])

X_test = np.array(X_test)
y_test = np.array(y_test)

# USE THE BELOW GIVEN DATA FOR TRAINING and TESTING purposes

# concatenate the training and testing data
X = np.concatenate((X_train,X_test))
y = np.concatenate((y_train,y_test))

# split the data into training and testing sets. Change the seed value to obtain different random splits.
seed = 4
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.3,random_state=seed,stratify=y)

print("Training data shape: ", X_train.shape) # (number of samples, number of time steps (x (=10) sec * 50Hz), number of features)
print("Testing data shape: ", X_test.shape)
print("Training labels shape: ", y_train.shape)
print("Testing labels shape: ", y_test.shape)

Training data shape:  (126, 500, 3)
Testing data shape:  (54, 500, 3)
Training labels shape:  (126,)
Testing labels shape:  (54,)


### TSFEL feature extraction and processing

In [None]:
# making a list of of pandas dataframes corresponding to each sample
X_train_dfs = [pd.DataFrame(sample, columns=['accx', 'accy', 'accz']) for sample in X_train]
X_test_dfs = [pd.DataFrame(sample, columns=['accx', 'accy', 'accz']) for sample in X_test]

X_train_dfs = [df.apply(lambda x: np.sqrt(x['accx']**2 + x['accy']**2 + x['accz']**2), axis=1) for df in X_train_dfs]
X_test_dfs = [df.apply(lambda x: np.sqrt(x['accx']**2 + x['accy']**2 + x['accz']**2), axis=1) for df in X_test_dfs]

# consider all features
cfg_file = tsfel.get_features_by_domain()  

# get list of feature vectors for each dataframe (or sample)           
# choosing `fs=50` because the data was sampled at 50Hz                              
X_train_tsfel_dfs = [tsfel.time_series_features_extractor(cfg_file, df, fs=50) for df in X_train_dfs]
X_train_tsfel = pd.concat(X_train_tsfel_dfs, axis=0).fillna(0).values

X_test_tsfel_dfs = [tsfel.time_series_features_extractor(cfg_file, df, fs=50) for df in X_test_dfs]
X_test_tsfel = pd.concat(X_test_tsfel_dfs, axis=0).fillna(0).values

# we leave out removing constant columns and highly correlated features as the custom data may not have the same features as the provided dataset

### Testing

In [44]:
dtc_tsfel = DecisionTreeClassifier()
dtc_tsfel.fit(X_train_tsfel, y_train)
y_pred_tsfel = dtc_tsfel.predict(X_test_tsfel)
acc_tsfel = accuracy_score(y_test, y_pred_tsfel)

# Since, this model is using the provided dataset which is balanced, we will use macro average for precision and recall
prec_tsfel = precision_score(y_test, y_pred_tsfel, average='macro')
rec_tsfel = recall_score(y_test, y_pred_tsfel, average='macro')

conf_mx_tsfel = confusion_matrix(y_test, y_pred_tsfel)

print("TSFEL Accuracy:", acc_tsfel)
print("TSFEL Precision:", prec_tsfel)
print("TSFEL Recall:", rec_tsfel)
print()
print("TSFEL Confusion Matrix:")
print(conf_mx_tsfel)

TSFEL Accuracy: 0.6666666666666666
TSFEL Precision: 0.6554843304843304
TSFEL Recall: 0.6666666666666666

TSFEL Confusion Matrix:
[[7 0 1 0 1 0]
 [2 4 2 0 1 0]
 [0 2 7 0 0 0]
 [0 0 0 2 1 6]
 [0 0 0 0 9 0]
 [0 0 0 2 0 7]]


## Decision Tree on Our Data

### Our Data Preprocessing

In [45]:
offset = 125

custom_data_dir = os.path.join("Krustykrabs")

X_custom_data=[]
y_custom_data=[]

for folder in folders:
    files = os.listdir(os.path.join(custom_data_dir,folder))

    for file in files:
        if file == ".DS_Store":
            continue

        df = pd.read_csv(os.path.join(custom_data_dir,folder,file),sep=",",header=0)
        df.drop(["time", "TgF"], axis=1, inplace=True)

        df = df[offset:offset+time*50]

        for col in df.columns:
            df[col] = (2 * (df[col] - df[col].min()) / (df[col].max() - df[col].min())) - 1

        X_custom_data.append(df.values)
        y_custom_data.append(classes[folder])


seed = 4
X_train_custom_data, X_test_custom_data, y_train_custom_data, y_test_custom_data = train_test_split(X_custom_data,y_custom_data,test_size=0.3,random_state=seed,stratify=y_custom_data)

In [None]:
X_train_dfs_custom_data = [pd.DataFrame(sample, columns=['accx', 'accy', 'accz']) for sample in X_train_custom_data]
X_test_dfs_custom_data = [pd.DataFrame(sample, columns=['accx', 'accy', 'accz']) for sample in X_test_custom_data]

X_train_dfs_custom_data = [df.apply(lambda x: np.sqrt(x['accx']**2 + x['accy']**2 + x['accz']**2), axis=1) for df in X_train_dfs_custom_data]
X_test_dfs_custom_data = [df.apply(lambda x: np.sqrt(x['accx']**2 + x['accy']**2 + x['accz']**2), axis=1) for df in X_test_dfs_custom_data]

tsfel_config_custom_data = tsfel.get_features_by_domain()

X_train_tsfel_dfs_custom_data = [tsfel.time_series_features_extractor(tsfel_config_custom_data, df, fs=50) for df in X_train_dfs_custom_data]
X_train_tsfel_custom_data = pd.concat(X_train_tsfel_dfs_custom_data, axis=0).fillna(0).values

X_test_tsfel_dfs_custom_data = [tsfel.time_series_features_extractor(tsfel_config_custom_data, df, fs=50) for df in X_test_dfs_custom_data]
X_test_tsfel_custom_data = pd.concat(X_test_tsfel_dfs_custom_data, axis=0).fillna(0).values

# we leave out removing constant columns and highly correlated features as the custom data may not have the same features as the provided dataset

### Previously Trained Decision Tree on Our Data

In [47]:
y_pred_tsfel_custom_data = dtc_tsfel.predict(X_test_tsfel_custom_data)
acc_tsfel_custom_data = accuracy_score(y_test_custom_data, y_pred_tsfel_custom_data)

# Since, this model is using the provided dataset which is balanced, we will use macro average for precision and recall
prec_tsfel_custom_data = precision_score(y_test_custom_data, y_pred_tsfel_custom_data, average='macro')
rec_tsfel_custom_data = recall_score(y_test_custom_data, y_pred_tsfel_custom_data, average='macro')

conf_mx_tsfel_custom_data = confusion_matrix(y_test_custom_data, y_pred_tsfel_custom_data)

print("TSFEL Accuracy:", acc_tsfel_custom_data)
print("TSFEL Precision:", prec_tsfel_custom_data)
print("TSFEL Recall:", rec_tsfel_custom_data)
print()
print("TSFEL Confusion Matrix:")
print(conf_mx_tsfel_custom_data)

TSFEL Accuracy: 0.2222222222222222
TSFEL Precision: 0.047619047619047616
TSFEL Recall: 0.16666666666666666

TSFEL Confusion Matrix:
[[0 0 2 0 0 0]
 [0 0 2 0 0 0]
 [0 0 2 0 0 0]
 [1 0 0 0 0 0]
 [0 0 1 0 0 0]
 [0 1 0 0 0 0]]


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


The sensor signals from the UCI-HAR were pre-processed by applying noise filters. The gravitational force is assumed to have only low frequency components, therefore a filter with 0.3 Hz cutoff frequency was used. But for our data we are not using any sort of filtering. 

Also, the walking speed of people in the west is generally higher than us. The model trained on the UCI-HAR dataset may not work well on our data because of such differences in gait speed.

# Question 2

## Data Preprocessing and Feautrization

In [None]:
# Create a list to store the dataframes
dataframes = []

time = 10
offset = 125

custom_data_dir = os.path.join("Krustykrabs")

subject_name = {1: "Tejas", 2: "Devansh", 3: "Mohit", 4: "Devansh", 5: "Mohit"}

cfg_file = tsfel.get_features_by_domain() 

for folder in folders:
    files = os.listdir(os.path.join(custom_data_dir,folder))

    for file in files:
        if file == ".DS_Store":
            continue
        df = pd.read_csv(os.path.join(custom_data_dir,folder,file),sep=",",header=0)
        df.drop(["time", "TgF"], axis=1, inplace=True)
        df = df[offset:offset+time*50]
        for col in df.columns:
            df[col] = (2 * (df[col] - df[col].min()) / (df[col].max() - df[col].min())) - 1
        df=tsfel.time_series_features_extractor(cfg_file, df, fs=50)
        df["subject_id"] = subject_name[int(file[8])]
        df["activity"] = classes[folder]
        dataframes.append(df)

# Concatenate all DataFrames into a single DataFrame
data = pd.concat(dataframes, ignore_index=True)

# remove columns where the feature is constant throughout all samples
for col in data.columns:
    if len(data[col].unique()) == 1:
        data.drop(col, axis=1, inplace=True)

# remove highly correlated features (columns)
corr = data.iloc[:, [data.columns.get_loc(col) for col in data.columns if col not in ['subject_id', 'activity']]].corr()
mask = np.triu(np.ones_like(corr, dtype=bool))
tri_df = corr.mask(mask)
to_drop = [c for c in tri_df.columns if any(tri_df[c] > 0.9)] # threshold = 0.9
data.drop(columns=to_drop, inplace=True)

# shuffle data
data = data.sample(frac=1, random_state=42).reset_index(drop=True)

## Leave One Subject Out Cross Validation

In [49]:
all_subjects = data['subject_id'].unique()
all_subjects

array(['Devansh', 'Mohit', 'Tejas'], dtype=object)

In [50]:
#create an empty list to store accuracy in  
accuracy_list = []

Iterate over all subject IDs in the all_subjects list and follow this procedure:

- For each iteration, select only the observations containing the subject ID as the test dataset, and select all other observations as the training set. The labels for the test/train split are selected in the same way. For the data, the subject_ID, and activity must be dropped.
- Fit the model and calculate the accuracy.
- Store the accuracy in the empty list created above

In [81]:
#loop over all conditions  
for idx, subject_id in enumerate(all_subjects):
    dtc_losocv = DecisionTreeClassifier()

    # assign testing and training data  
    x_train_losocv = data.loc[data['subject_id'] != subject_id].drop(['subject_id', 'activity'], axis=1)
    x_test_losocv = data.loc[data['subject_id'] == subject_id].drop(['subject_id', 'activity'], axis=1)
    y_train_losocv = data.loc[data['subject_id'] != subject_id]['activity']
    y_test_losocv = data.loc[data['subject_id'] == subject_id]['activity']
      
    # fit model 
    dtc_losocv.fit(x_train_losocv, y_train_losocv)  
    y_pred_losocv = dtc_losocv.predict(x_test_losocv)
    # append max accuracy over all epochs  
    accuracy_list.append(accuracy_score(y_test_losocv, y_pred_losocv))
    print(accuracy_list[-1])
      
# get average of accuracies over all cross folds  
mean_accuracy = np.mean(np.array(accuracy_list)) 

print('Average Accuracy: ', mean_accuracy)

0.25
0.6666666666666666
0.5
Average Accuracy:  0.4290123456790123


# Question 3

## Checking Data

In [52]:
X_train_tsfel.shape, X_test_tsfel.shape, X_train_tsfel_custom_data.shape, X_test_tsfel_custom_data.shape

((126, 384), (54, 384), (21, 384), (9, 384))

In [53]:
len(y_train), len(y_test), len(y_train_custom_data), len(y_test_custom_data)

(126, 54, 21, 9)

## Few Shot
Examples are taken from UCI-HAR and the LLM is asked to predict on our data.

In [54]:
label = {1: "WALKING", 2: "WALKING_UPSTAIRS", 3: "WALKING_DOWNSTAIRS", 4: "SITTING", 5: "STANDING", 6: "LAYING"}

In [55]:
y_few_shot_pred_tup = []
for i in range(9):
    query = f"*Your task is to classify the activity performed by the user based on the provided featurized accelerometer data. \n* You will be given a TSFEL feature vector of size 384. \n* There are six possible activities - 1: WALKING, 2: WALKING_UPSTAIRS, 3: WALKING_DOWNSTAIRS, 4: SITTING, 5: STANDING, 6: LAYING.\n* Please provide the most likely activity as a single integer corresponding to the activity.\n Here are few examples with feature vectors and what activity the correspond to:"
    # giving it random 10 examples from the UCI-HAR training data
    for j in np.random.randint(0, 126, 10):
        query+=f"{j+1}.\n"
        query+=f"Feature vector = {[float(x) for x in X_train_tsfel[j]]}\n"
        query+=f"Activity = {y_train[j]}: {label[y_train[j]]}\n"
    # asking about an example from the custom data
    query+=f"\nWhat is this activity: {[float(x) for x in X_test_tsfel_custom_data[i]]}?"
    chat_completion = client.chat.completions.create(
        messages=[
            # Set an optional system message. This sets the behavior of the
            # assistant and can be used to provide specific instructions for
            # how it should behave throughout the conversation.
            {
                "role": "system",
                "content": f"You are an activity classification model. You'll be given a TSFEL feature vector of size 384. Keep responses in the following format: 1: WALKING, 2: WALKING_UPSTAIRS, 3: WALKING_DOWNSTAIRS, 4: SITTING, 5: STANDING, 6: LAYING. You should output a single integer corresponding to the activity label."
            },
            # Set a user message for the assistant to respond to.
            {
                "role": "user",
                "content": query,
            }
        ],
        # The language model which will generate the completion.
        model="llama-3.1-70b-versatile",
        # optional parameters

        # Controls randomness: lowering results in less random completions.
        temperature=0,
    )
    T.sleep(2)
    # append the completion returned by the LLM to y_pred
    y_few_shot_pred_tup.append((i,chat_completion.choices[0].message.content))
    print(y_few_shot_pred_tup[-1])

(0, '4: SITTING')
(1, '3: WALKING_DOWNSTAIRS')
(2, '1: WALKING')
(3, '1: WALKING')
(4, '1: WALKING')
(5, '4: SITTING')
(6, '1: WALKING')
(7, '1: WALKING')
(8, '1: WALKING')


In [56]:
y_few_shot_pred = [int(x[1][0]) for x in y_few_shot_pred_tup]

In [57]:
accuracy_score(y_few_shot_pred, y_test_custom_data)

0.1111111111111111

Very bad performance since this is an LLM model and our data is very different from the UCI-HAR dataset. Also we're not telling the LLM the names of the feature so as to give it an hint of what the numbers mean.

# Question 4

## Data preprocessing

In [58]:
len(X_train_tsfel_dfs_custom_data[0].columns)

384

In [59]:
# merge the list of dataframes into a single dataframe
col = X_train_tsfel_dfs_custom_data[0].columns
X_train_tsfel_df_custom_data = pd.DataFrame(X_train_tsfel_custom_data, columns = col)
X_test_tsfel_df_custom_data = pd.DataFrame(X_test_tsfel_custom_data, columns = col)

# do the following for the training data and then choose remaining columns from the test data
# remove columns where the feature is constant throughout all samples
for col in X_train_tsfel_df_custom_data.columns:
    if len(X_train_tsfel_df_custom_data[col].unique()) == 1:
        X_train_tsfel_df_custom_data.drop(col, axis=1, inplace=True)

# remove highly correlated features (columns) from the training data
corr = X_train_tsfel_df_custom_data.corr()
mask = np.triu(np.ones_like(corr, dtype=bool))
tri_df = corr.mask(mask)
to_drop = [c for c in tri_df.columns if any(tri_df[c] > 0.9)] # threshold = 0.9
X_train_tsfel_df_custom_data.drop(columns=to_drop, inplace=True)

# remove the same columns from the test data
X_test_tsfel_df_custom_data = X_test_tsfel_df_custom_data[X_train_tsfel_df_custom_data.columns] 

# Filter columns starting with '0_FFT mean coefficient_'
filtered_cols = [col for col in X_train_tsfel_df_custom_data.columns if col.startswith('0_FFT mean coefficient_')]
# Calculate the sum of squares for the filtered columns to get energy
X_train_tsfel_df_custom_data['0_Energy'] = X_train_tsfel_df_custom_data[filtered_cols].pow(2).sum(axis=1)
X_test_tsfel_df_custom_data['0_Energy'] = X_test_tsfel_df_custom_data[filtered_cols].pow(2).sum(axis=1)
# Drop the filtered columns
X_train_tsfel_df_custom_data.drop(filtered_cols, axis=1, inplace=True)
X_test_tsfel_df_custom_data.drop(filtered_cols, axis=1, inplace=True)

# feature selection
features = ['0_Entropy', '0_Fundamental frequency',
       '0_Human range energy', '0_Kurtosis', '0_Max',
       '0_Max power spectrum', '0_Mean diff', '0_Median diff', '0_Min',
       '0_Neighbourhood peaks', '0_Peak to peak distance',
       '0_Positive turning points', '0_Power bandwidth', '0_Root mean square',
       '0_Skewness', '0_Slope', '0_Spectral distance',
       '0_Spectral entropy', '0_Spectral positive turning points',
       '0_Wavelet absolute mean_8', '0_Wavelet entropy', '0_Wavelet variance_8', '0_Energy']

X_train_tsfel_df_custom_data = X_train_tsfel_df_custom_data[features]
X_test_tsfel_df_custom_data = X_test_tsfel_df_custom_data[features]

## Few Shot
Examples are taken from our data and the LLM is asked to predict on our data.

In [60]:
X_train_tsfel_df_custom_data.shape, X_test_tsfel_df_custom_data.shape, len(y_train_custom_data), len(y_test_custom_data)

((21, 23), (9, 23), 21, 9)

In [61]:
y_few_shot_pred_tup = []
for i in range(9):
    query = f"*Your task is to classify the activity performed by the user based on the provided featurized accelerometer data.\n* The features are: {features}. \n* You are given values corresponding to the features in order. \n* There are six possible activities - 1: WALKING, 2: WALKING_UPSTAIRS, 3: WALKING_DOWNSTAIRS, 4: SITTING, 5: STANDING, 6: LAYING.\n* Please provide the most likely activity as a single integer corresponding to the activity.\n Here are few examples with values corresponding to the given features:"
    # giving it random 10 examples from the training data
    for j in np.random.randint(0, 21, 10):
        query+=f"{j+1}.\n"
        query+=f"Feature vector = {list(X_train_tsfel_df_custom_data.loc[j].to_dict().values())}\n"
        query+=f"Activity = {y_train[j]}: {label[y_train[j]]}\n"
    query+=f"\nWhat is this activity: {list(X_test_tsfel_df_custom_data.loc[i].to_dict().values())}?"
    chat_completion = client.chat.completions.create(
        messages=[
            # Set an optional system message. This sets the behavior of the
            # assistant and can be used to provide specific instructions for
            # how it should behave throughout the conversation.
            {
                "role": "system",
                "content": f"You are an activity classification model. Keep responses in the following format: 1: WALKING, 2: WALKING_UPSTAIRS, 3: WALKING_DOWNSTAIRS, 4: SITTING, 5: STANDING, 6: LAYING. You should output a single integer corresponding to the activity label."
            },
            # Set a user message for the assistant to respond to.
            {
                "role": "user",
                "content": query,
            }
        ],
        # The language model which will generate the completion.
        model="llama-3.1-70b-versatile",
        # optional parameters

        # Controls randomness: lowering results in less random completions.
        temperature=0,
    )
    T.sleep(2)
    # append the completion returned by the LLM to y_pred
    y_few_shot_pred_tup.append((i,chat_completion.choices[0].message.content))
    print(y_few_shot_pred_tup[-1])

(0, '1: WALKING')
(1, '3: WALKING_DOWNSTAIRS')
(2, '4: SITTING')
(3, '1: WALKING')
(4, '4: SITTING')
(5, '1: WALKING')
(6, '1: WALKING')
(7, '1: WALKING')
(8, '1: WALKING')


In [62]:
y_few_shot_pred = [int(x[1][0]) for x in y_few_shot_pred_tup]

In [63]:
y_few_shot_pred

[1, 3, 4, 1, 4, 1, 1, 1, 1]

In [64]:
accuracy_score(y_few_shot_pred, y_test_custom_data)

0.2222222222222222

The model performance is better than random.