In [1]:
# Install necessary packages
%pip install edge-ml

# Import necessary libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import pickle
from sklearn.model_selection import cross_val_score, LeaveOneGroupOut
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import RFECV
from edgeml import DatasetReceiver

Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com
Note: you may need to restart the kernel to use updated packages.


In [None]:
# Load the dataset using DatasetReceiver
dataset_url = "https://beta.edge-ml.org"
project_id = "6b110fec3d9b24ab616f9cd1fde80fa7"
project = DatasetReceiver(dataset_url, project_id)
project.loadData()

# Save the loaded project to a pickle file
with open("../exercises/project.pkl", "wb") as f:
    pickle.dump(project, f)

print("Data loaded and saved to pickle file.")

In [2]:
# Load the project from the pickle file
with open("../exercises/project.pkl", "rb") as f:
    project = pickle.load(f)

print("Data loaded from pickle file.")

# Concatenate datasets into a single dataframe
data_frames = []
for d in project.datasets:
    if d.name == "devicemotion" and d.data.shape[0] > 0 and (d.timeSeries[0].end - d.timeSeries[0].start) > 10000:
        df = d.data.iloc[40:-40].dropna().assign(**d.metaData)
        data_frames.append(df)
concanated_data = pd.concat(data_frames)
concanated_data

Data loaded from pickle file.


Unnamed: 0,time,acceleration.x,acceleration.y,acceleration.z,accelerationIncludingGravity.x,accelerationIncludingGravity.y,accelerationIncludingGravity.z,rotationRate.alpha,rotationRate.beta,rotationRate.gamma,participantId,activity,mobile,browser
40,2024-05-04 06:37:51.934,-0.3,-0.7,-2.6,-0.5,4.0,6.3,12.5,7.5,41.599998,111f7,testing,UnknownPhone,Chrome
41,2024-05-04 06:37:51.950,-0.2,-0.2,-3.0,-0.1,4.3,6.7,9.6,-0.1,36.099998,111f7,testing,UnknownPhone,Chrome
42,2024-05-04 06:37:51.967,0.3,0.3,-2.7,0.1,4.3,8.2,12.7,-4.5,28.400000,111f7,testing,UnknownPhone,Chrome
43,2024-05-04 06:37:51.984,0.5,0.2,-1.0,0.1,4.3,8.1,25.4,1.8,13.600000,111f7,testing,UnknownPhone,Chrome
44,2024-05-04 06:37:52.001,0.5,0.2,-1.0,-0.1,4.4,8.0,25.4,1.8,13.600000,111f7,testing,UnknownPhone,Chrome
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
84,2024-05-07 12:03:57.696,0.1,0.0,0.1,0.3,4.8,8.6,-3.3,3.5,-2.000000,uksco,sitting,UnknownPhone,Chrome
85,2024-05-07 12:03:57.712,0.1,0.0,0.1,0.3,4.8,8.6,-0.8,1.3,-1.400000,uksco,sitting,UnknownPhone,Chrome
86,2024-05-07 12:03:57.729,0.0,0.0,0.0,0.3,4.8,8.6,0.4,-1.3,-0.900000,uksco,sitting,UnknownPhone,Chrome
87,2024-05-07 12:03:57.746,0.0,0.0,0.0,0.3,4.7,8.5,0.2,-1.9,-0.800000,uksco,sitting,UnknownPhone,Chrome


In [22]:
# Remove testing activities
filtered_data = concanated_data[concanated_data.activity != "testing"]

# Convert categorical columns to dummies
filtered_data = pd.get_dummies(filtered_data, columns=['browser']).drop(columns=["mobile"])

# Set the index to "time" and drop the "time" column
filtered_data.set_index("time", inplace=True)

# Factorize the "activity" column and store labels
filtered_data["activity"], labels = pd.factorize(filtered_data["activity"])
filtered_data

Unnamed: 0_level_0,acceleration.x,acceleration.y,acceleration.z,accelerationIncludingGravity.x,accelerationIncludingGravity.y,accelerationIncludingGravity.z,rotationRate.alpha,rotationRate.beta,rotationRate.gamma,participantId,activity,browser_Chrome,browser_Firefox
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
2024-05-04 06:39:09.013,-7.1,3.5,2.4,-0.5,-3.8,3.0,55.900002,-128.300003,-27.900000,12417,0,False,False
2024-05-04 06:39:09.030,-7.9,2.7,4.1,-1.3,-4.5,3.8,69.099998,-55.400002,0.500000,12417,0,False,False
2024-05-04 06:39:09.046,-8.6,2.0,4.3,-1.7,-4.7,3.2,59.000000,-8.300000,20.500000,12417,0,False,False
2024-05-04 06:39:09.063,-9.4,2.0,2.0,-2.8,-5.0,1.5,48.700001,15.600000,45.200001,12417,0,False,False
2024-05-04 06:39:09.080,-9.4,2.0,2.0,-2.8,-5.0,1.5,43.599998,24.799999,75.099998,12417,0,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2024-05-07 12:03:57.696,0.1,0.0,0.1,0.3,4.8,8.6,-3.300000,3.500000,-2.000000,uksco,2,True,False
2024-05-07 12:03:57.712,0.1,0.0,0.1,0.3,4.8,8.6,-0.800000,1.300000,-1.400000,uksco,2,True,False
2024-05-07 12:03:57.729,0.0,0.0,0.0,0.3,4.8,8.6,0.400000,-1.300000,-0.900000,uksco,2,True,False
2024-05-07 12:03:57.746,0.0,0.0,0.0,0.3,4.7,8.5,0.200000,-1.900000,-0.800000,uksco,2,True,False


In [79]:
# Define mode function
def mode(x): 
    return (x.mode()[0])

# Filter out non-numeric columns for rolling calculations
numeric_cols = filtered_data.select_dtypes(include=[np.float32]).columns
non_numeric_cols = filtered_data.select_dtypes(exclude=[np.float32]).columns
print(non_numeric_cols)

# Initialize an empty DataFrame for windowed data
windowed_data = pd.DataFrame()

# Apply rolling window for each participant separately
for participant, data in filtered_data.groupby('participantId'):
    rolled_means = data[numeric_cols].rolling('1s').mean().add_suffix('_mean')
    rolled_mins = data[numeric_cols].rolling('1s').min().add_suffix('_min')
    rolled_maxs = data[numeric_cols].rolling('1s').max().add_suffix('_max')
    rolled_vars = data[numeric_cols].rolling('1s').var().add_suffix('_var')
    rolled_medians = data[numeric_cols].rolling('1s').median().add_suffix('_median')
    rolled_chrome = data['browser_Chrome'].rolling('1s').apply(mode)
    rolled_firefox = data['browser_Firefox'].rolling('1s').apply(mode)
    rolled_activity = data['activity'].rolling('1s').apply(mode)
    
    
    # Combine mean, min, max, var, median results
    rolled_data = pd.concat([rolled_means, rolled_mins, rolled_maxs, rolled_vars, rolled_medians, rolled_chrome, rolled_firefox,rolled_activity], axis=1)
    
    # Add participantId to rolled_data
    rolled_data['participantId'] = participant
    
    # Append to windowed_data DataFrame
    windowed_data = pd.concat([windowed_data, rolled_data])

windowed_data 

Index(['participantId', 'activity', 'browser_Chrome', 'browser_Firefox'], dtype='object')


Unnamed: 0_level_0,acceleration.x_mean,acceleration.y_mean,acceleration.z_mean,accelerationIncludingGravity.x_mean,accelerationIncludingGravity.y_mean,accelerationIncludingGravity.z_mean,rotationRate.alpha_mean,rotationRate.beta_mean,rotationRate.gamma_mean,acceleration.x_min,...,accelerationIncludingGravity.x_median,accelerationIncludingGravity.y_median,accelerationIncludingGravity.z_median,rotationRate.alpha_median,rotationRate.beta_median,rotationRate.gamma_median,browser_Chrome,browser_Firefox,activity,participantId
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2024-05-06 16:05:25.388,0.000000,0.020000,0.000000,0.030000,0.030000,9.840000,0.000000,-0.030000,-0.060000,0.0,...,0.03,0.03,9.84,0.00,-0.03,-0.06,0.0,1.0,2.0,10280
2024-05-06 16:05:25.388,0.000000,0.020000,0.000000,0.030000,0.030000,9.840000,0.000000,-0.030000,-0.060000,0.0,...,0.03,0.03,9.84,0.00,-0.03,-0.06,0.0,1.0,2.0,10280
2024-05-06 16:05:25.388,0.000000,0.020000,0.000000,0.030000,0.030000,9.840000,0.000000,-0.030000,-0.060000,0.0,...,0.03,0.03,9.84,0.00,-0.03,-0.06,0.0,1.0,2.0,10280
2024-05-06 16:05:25.388,0.000000,0.020000,0.000000,0.030000,0.030000,9.842500,0.000000,-0.030000,-0.060000,0.0,...,0.03,0.03,9.84,0.00,-0.03,-0.06,0.0,1.0,2.0,10280
2024-05-06 16:05:25.388,0.000000,0.020000,0.000000,0.030000,0.030000,9.844000,0.000000,-0.030000,-0.060000,0.0,...,0.03,0.03,9.84,0.00,-0.03,-0.06,0.0,1.0,2.0,10280
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2024-05-07 12:03:57.696,-0.020000,-0.006667,0.031111,0.264444,4.971111,8.353333,-5.457778,0.900000,-0.275556,-0.2,...,0.30,5.00,8.40,-4.40,1.10,-0.80,1.0,0.0,2.0,uksco
2024-05-07 12:03:57.712,-0.017391,-0.006522,0.032609,0.265217,4.967391,8.358696,-5.356522,0.908696,-0.300000,-0.2,...,0.30,5.00,8.40,-4.25,1.10,-0.85,1.0,0.0,2.0,uksco
2024-05-07 12:03:57.729,-0.017021,-0.006383,0.031915,0.265957,4.963830,8.363830,-5.234043,0.861702,-0.312766,-0.2,...,0.30,5.00,8.40,-4.10,1.10,-0.90,1.0,0.0,2.0,uksco
2024-05-07 12:03:57.746,-0.016667,-0.006250,0.031250,0.266667,4.958333,8.366667,-5.120833,0.804167,-0.322917,-0.2,...,0.30,5.00,8.45,-4.05,0.95,-0.85,1.0,0.0,2.0,uksco


In [102]:
# Group windowed_data by participantId
grouped_data = windowed_data.groupby('participantId', group_keys=True).apply(lambda x:x, include_groups=False)

In [103]:
grouped_data

Unnamed: 0_level_0,Unnamed: 1_level_0,acceleration.x_mean,acceleration.y_mean,acceleration.z_mean,accelerationIncludingGravity.x_mean,accelerationIncludingGravity.y_mean,accelerationIncludingGravity.z_mean,rotationRate.alpha_mean,rotationRate.beta_mean,rotationRate.gamma_mean,acceleration.x_min,...,acceleration.z_median,accelerationIncludingGravity.x_median,accelerationIncludingGravity.y_median,accelerationIncludingGravity.z_median,rotationRate.alpha_median,rotationRate.beta_median,rotationRate.gamma_median,browser_Chrome,browser_Firefox,activity
participantId,time,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
10280,2024-05-06 16:05:25.388,0.000000,0.020000,0.000000,0.030000,0.030000,9.840000,0.000000,-0.030000,-0.060000,0.0,...,0.0,0.03,0.03,9.84,0.00,-0.03,-0.06,0.0,1.0,2.0
10280,2024-05-06 16:05:25.388,0.000000,0.020000,0.000000,0.030000,0.030000,9.840000,0.000000,-0.030000,-0.060000,0.0,...,0.0,0.03,0.03,9.84,0.00,-0.03,-0.06,0.0,1.0,2.0
10280,2024-05-06 16:05:25.388,0.000000,0.020000,0.000000,0.030000,0.030000,9.840000,0.000000,-0.030000,-0.060000,0.0,...,0.0,0.03,0.03,9.84,0.00,-0.03,-0.06,0.0,1.0,2.0
10280,2024-05-06 16:05:25.388,0.000000,0.020000,0.000000,0.030000,0.030000,9.842500,0.000000,-0.030000,-0.060000,0.0,...,0.0,0.03,0.03,9.84,0.00,-0.03,-0.06,0.0,1.0,2.0
10280,2024-05-06 16:05:25.388,0.000000,0.020000,0.000000,0.030000,0.030000,9.844000,0.000000,-0.030000,-0.060000,0.0,...,0.0,0.03,0.03,9.84,0.00,-0.03,-0.06,0.0,1.0,2.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
uksco,2024-05-07 12:03:57.696,-0.020000,-0.006667,0.031111,0.264444,4.971111,8.353333,-5.457778,0.900000,-0.275556,-0.2,...,0.0,0.30,5.00,8.40,-4.40,1.10,-0.80,1.0,0.0,2.0
uksco,2024-05-07 12:03:57.712,-0.017391,-0.006522,0.032609,0.265217,4.967391,8.358696,-5.356522,0.908696,-0.300000,-0.2,...,0.0,0.30,5.00,8.40,-4.25,1.10,-0.85,1.0,0.0,2.0
uksco,2024-05-07 12:03:57.729,-0.017021,-0.006383,0.031915,0.265957,4.963830,8.363830,-5.234043,0.861702,-0.312766,-0.2,...,0.0,0.30,5.00,8.40,-4.10,1.10,-0.90,1.0,0.0,2.0
uksco,2024-05-07 12:03:57.746,-0.016667,-0.006250,0.031250,0.266667,4.958333,8.366667,-5.120833,0.804167,-0.322917,-0.2,...,0.0,0.30,5.00,8.45,-4.05,0.95,-0.85,1.0,0.0,2.0


In [106]:
# Prepare the dataframe for modeling
grouped_data_copy=grouped_data.copy()
grouped_data_copy.index=grouped_data.index.droplevel(1)
grouped_data_copy=grouped_data_copy.reset_index()
grouped_data_copy.dropna(inplace=True)
grouped_data_copy

Unnamed: 0,participantId,acceleration.x_mean,acceleration.y_mean,acceleration.z_mean,accelerationIncludingGravity.x_mean,accelerationIncludingGravity.y_mean,accelerationIncludingGravity.z_mean,rotationRate.alpha_mean,rotationRate.beta_mean,rotationRate.gamma_mean,...,acceleration.z_median,accelerationIncludingGravity.x_median,accelerationIncludingGravity.y_median,accelerationIncludingGravity.z_median,rotationRate.alpha_median,rotationRate.beta_median,rotationRate.gamma_median,browser_Chrome,browser_Firefox,activity
1,10280,0.000000,0.020000,0.000000,0.030000,0.030000,9.840000,0.000000,-0.030000,-0.060000,...,0.0,0.03,0.03,9.840,0.00,-0.03,-0.06,0.0,1.0,2.0
2,10280,0.000000,0.020000,0.000000,0.030000,0.030000,9.840000,0.000000,-0.030000,-0.060000,...,0.0,0.03,0.03,9.840,0.00,-0.03,-0.06,0.0,1.0,2.0
3,10280,0.000000,0.020000,0.000000,0.030000,0.030000,9.842500,0.000000,-0.030000,-0.060000,...,0.0,0.03,0.03,9.840,0.00,-0.03,-0.06,0.0,1.0,2.0
4,10280,0.000000,0.020000,0.000000,0.030000,0.030000,9.844000,0.000000,-0.030000,-0.060000,...,0.0,0.03,0.03,9.840,0.00,-0.03,-0.06,0.0,1.0,2.0
5,10280,0.000000,0.020000,0.000000,0.030000,0.030000,9.845000,0.000000,-0.030000,-0.060000,...,0.0,0.03,0.03,9.845,0.00,-0.03,-0.06,0.0,1.0,2.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
121798,uksco,-0.020000,-0.006667,0.031111,0.264444,4.971111,8.353333,-5.457778,0.900000,-0.275556,...,0.0,0.30,5.00,8.400,-4.40,1.10,-0.80,1.0,0.0,2.0
121799,uksco,-0.017391,-0.006522,0.032609,0.265217,4.967391,8.358696,-5.356522,0.908696,-0.300000,...,0.0,0.30,5.00,8.400,-4.25,1.10,-0.85,1.0,0.0,2.0
121800,uksco,-0.017021,-0.006383,0.031915,0.265957,4.963830,8.363830,-5.234043,0.861702,-0.312766,...,0.0,0.30,5.00,8.400,-4.10,1.10,-0.90,1.0,0.0,2.0
121801,uksco,-0.016667,-0.006250,0.031250,0.266667,4.958333,8.366667,-5.120833,0.804167,-0.322917,...,0.0,0.30,5.00,8.450,-4.05,0.95,-0.85,1.0,0.0,2.0


In [113]:
# Sample data for visualization
sampled_data = grouped_data_copy.sample(1000)
sns.pairplot(sampled_data, hue='activity')
plt.show()

KeyboardInterrupt: 

Error in callback <function flush_figures at 0x7fb95e94e840> (for post_execute), with arguments args (),kwargs {}:


KeyboardInterrupt: 

In [108]:
# Define the classifier
clf = RandomForestClassifier(random_state=0)

# Prepare features and labels for cross-validation and RFECV
X = sampled_data.drop(['activity', 'participantId'], axis=1)
y = sampled_data['activity']
groups = sampled_data['participantId']

# Perform cross-validation using Leave-One-Group-Out
scores = cross_val_score(clf, X, y, cv=LeaveOneGroupOut(), groups=groups, scoring="f1_macro")
print("Mean F1 Score from cross-validation: ", scores.mean())

# Perform Recursive Feature Elimination with Cross-Validation (RFECV)
rfecv = RFECV(estimator=clf, step=1, scoring='f1_macro')
rfecv = rfecv.fit(X, y)

Mean F1 Score from cross-validation:  0.547670506488007


In [None]:
# Plot RFECV results
cv_results = pd.DataFrame(rfecv.cv_results_)
plt.figure()
plt.xlabel("Number of features selected")
plt.ylabel("Mean test accuracy")
plt.errorbar(
    x=cv_results.index,
    y=cv_results["mean_test_score"],
    yerr=cv_results["std_test_score"],
)
plt.title("Recursive Feature Elimination")
plt.show()

# Prepare the final result dataframe with selected features
selected_features = rfecv.get_feature_names_out(X.columns)
result_data = pd.concat([
    grouped_data_copy[['participantId']],
    pd.DataFrame(labels[grouped_data_copy.activity.astype(int)], columns=["activity"]),
    grouped_data_copy[selected_features]
], axis=1).dropna()

# Plot the activity distribution
result_data.activity.hist()
plt.title("Activity Distribution")
plt.xlabel("Activity")
plt.ylabel("Frequency")
plt.show()

# Save the final features to a pickle file
with open("../exercises/features.pkl", "wb") as f:
    pickle.dump(result_data, f)

print("Features saved to pickle file.")

In [112]:
result_data

Unnamed: 0,participantId,activity,acceleration.y_mean,accelerationIncludingGravity.x_mean,accelerationIncludingGravity.y_mean,accelerationIncludingGravity.z_mean,rotationRate.alpha_mean,acceleration.y_min,accelerationIncludingGravity.x_min,accelerationIncludingGravity.y_min,...,accelerationIncludingGravity.y_var,accelerationIncludingGravity.z_var,rotationRate.alpha_var,rotationRate.beta_var,rotationRate.gamma_var,acceleration.y_median,accelerationIncludingGravity.x_median,accelerationIncludingGravity.y_median,accelerationIncludingGravity.z_median,rotationRate.alpha_median
1,10280,sitting,0.020000,0.030000,0.030000,9.840000,0.000000,0.02,0.03,0.03,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.02,0.030000,0.03,9.840,0.000000
2,10280,sitting,0.020000,0.030000,0.030000,9.840000,0.000000,0.02,0.03,0.03,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.02,0.030000,0.03,9.840,0.000000
3,10280,sitting,0.020000,0.030000,0.030000,9.842500,0.000000,0.02,0.03,0.03,...,0.000000,0.000025,0.000000,0.000000,0.000000,0.02,0.030000,0.03,9.840,0.000000
4,10280,sitting,0.020000,0.030000,0.030000,9.844000,0.000000,0.02,0.03,0.03,...,0.000000,0.000030,0.000000,0.000000,0.000000,0.02,0.030000,0.03,9.840,0.000000
5,10280,sitting,0.020000,0.030000,0.030000,9.845000,0.000000,0.02,0.03,0.03,...,0.000000,0.000030,0.000000,0.000000,0.000000,0.02,0.030000,0.03,9.845,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
121717,uksco,sitting,1.140000,17.680000,-1.930000,-4.710000,147.420000,-2.70,12.80,-3.30,...,3.013445,99.938769,1655.137330,8818.709404,8896.485613,0.90,18.900001,-2.30,-1.700,144.099998
121718,uksco,sitting,1.118182,17.263637,-1.863636,-5.372727,115.300001,-2.70,12.80,-3.30,...,2.760546,94.776174,12838.261634,9344.820570,9853.118897,0.90,18.600000,-2.30,-2.100,142.500000
121719,uksco,sitting,1.150000,16.675000,-1.908333,-4.991667,94.100000,-2.70,10.20,-3.30,...,2.533561,87.902644,17064.427240,13181.429436,10292.051910,0.95,17.150000,-2.30,-1.700,130.650002
121720,uksco,sitting,1.046154,15.984616,-2.061538,-4.407692,81.446154,-2.70,7.70,-3.90,...,2.627564,85.010762,17723.949377,14116.102395,10373.772592,0.90,15.700000,-2.30,-1.300,118.800003
