In [4]:
%pip install edge-ml

Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com
Collecting edge-ml
  Downloading edge_ml-0.3.3-py3-none-any.whl.metadata (6.7 kB)
Downloading edge_ml-0.3.3-py3-none-any.whl (7.6 kB)
Installing collected packages: edge-ml
Successfully installed edge-ml-0.3.3
Note: you may need to restart the kernel to use updated packages.


In [5]:
# Import necessary libraries
from edgeml import DatasetReceiver
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import pickle
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import LeaveOneGroupOut
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import RFECV

In [3]:
# Load the dataset using DatasetReceiver
project=DatasetReceiver("https://beta.edge-ml.org","6b110fec3d9b24ab616f9cd1fde80fa7")
project.loadData()

In [6]:
# Save the loaded project to a pickle file
pickle.dump(project, open("../exercises/project.pkl","wb"))
print("DONE DUMPING")

NameError: name 'project' is not defined

In [7]:
# Load the project from the pickle file
project=pickle.load(open("../exercises/project.pkl","rb"))
print("DONE LOADING")

DONE LOADING


In [8]:
# Concatenate the datasets into a single dataframe
concanated_data=pd.concat([d.data.iloc[40:-40].dropna().assign(**d.metaData) for d  in project.datasets if (d.name == "devicemotion" and d.data.shape[0]>0) and d.timeSeries[0].end-d.timeSeries[0].start>10000] )
concanated_data


Unnamed: 0,time,acceleration.x,acceleration.y,acceleration.z,accelerationIncludingGravity.x,accelerationIncludingGravity.y,accelerationIncludingGravity.z,rotationRate.alpha,rotationRate.beta,rotationRate.gamma,participantId,activity,mobile,browser
40,2024-05-04 06:37:51.934,-0.3,-0.7,-2.6,-0.5,4.0,6.3,12.5,7.5,41.599998,111f7,testing,UnknownPhone,Chrome
41,2024-05-04 06:37:51.950,-0.2,-0.2,-3.0,-0.1,4.3,6.7,9.6,-0.1,36.099998,111f7,testing,UnknownPhone,Chrome
42,2024-05-04 06:37:51.967,0.3,0.3,-2.7,0.1,4.3,8.2,12.7,-4.5,28.400000,111f7,testing,UnknownPhone,Chrome
43,2024-05-04 06:37:51.984,0.5,0.2,-1.0,0.1,4.3,8.1,25.4,1.8,13.600000,111f7,testing,UnknownPhone,Chrome
44,2024-05-04 06:37:52.001,0.5,0.2,-1.0,-0.1,4.4,8.0,25.4,1.8,13.600000,111f7,testing,UnknownPhone,Chrome
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
84,2024-05-07 12:03:57.696,0.1,0.0,0.1,0.3,4.8,8.6,-3.3,3.5,-2.000000,uksco,sitting,UnknownPhone,Chrome
85,2024-05-07 12:03:57.712,0.1,0.0,0.1,0.3,4.8,8.6,-0.8,1.3,-1.400000,uksco,sitting,UnknownPhone,Chrome
86,2024-05-07 12:03:57.729,0.0,0.0,0.0,0.3,4.8,8.6,0.4,-1.3,-0.900000,uksco,sitting,UnknownPhone,Chrome
87,2024-05-07 12:03:57.746,0.0,0.0,0.0,0.3,4.7,8.5,0.2,-1.9,-0.800000,uksco,sitting,UnknownPhone,Chrome


In [9]:
# Filter out rows where activity is "testing"
filtered_data=concanated_data[concanated_data.activity!="testing"]

# Drop "mobile" and "browser" columns and convert "browser" to dummy variables
filtered_data=pd.concat([filtered_data.drop(columns=["mobile","browser"]),pd.get_dummies(filtered_data[["browser"]])], axis=1)

# Set the index to "time" and drop the "time" column
filtered_data.index=filtered_data.time
filtered_data=filtered_data.drop(columns="time")

# Factorize the "activity" column and store the labels
filtered_data["activity"],labels=filtered_data["activity"].factorize()
filtered_data

Unnamed: 0_level_0,acceleration.x,acceleration.y,acceleration.z,accelerationIncludingGravity.x,accelerationIncludingGravity.y,accelerationIncludingGravity.z,rotationRate.alpha,rotationRate.beta,rotationRate.gamma,participantId,activity,browser_Chrome,browser_Firefox
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
2024-05-04 06:39:09.013,-7.1,3.5,2.4,-0.5,-3.8,3.0,55.900002,-128.300003,-27.900000,12417,0,False,False
2024-05-04 06:39:09.030,-7.9,2.7,4.1,-1.3,-4.5,3.8,69.099998,-55.400002,0.500000,12417,0,False,False
2024-05-04 06:39:09.046,-8.6,2.0,4.3,-1.7,-4.7,3.2,59.000000,-8.300000,20.500000,12417,0,False,False
2024-05-04 06:39:09.063,-9.4,2.0,2.0,-2.8,-5.0,1.5,48.700001,15.600000,45.200001,12417,0,False,False
2024-05-04 06:39:09.080,-9.4,2.0,2.0,-2.8,-5.0,1.5,43.599998,24.799999,75.099998,12417,0,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2024-05-07 12:03:57.696,0.1,0.0,0.1,0.3,4.8,8.6,-3.300000,3.500000,-2.000000,uksco,2,True,False
2024-05-07 12:03:57.712,0.1,0.0,0.1,0.3,4.8,8.6,-0.800000,1.300000,-1.400000,uksco,2,True,False
2024-05-07 12:03:57.729,0.0,0.0,0.0,0.3,4.8,8.6,0.400000,-1.300000,-0.900000,uksco,2,True,False
2024-05-07 12:03:57.746,0.0,0.0,0.0,0.3,4.7,8.5,0.200000,-1.900000,-0.800000,uksco,2,True,False


In [10]:
# Define a function to compute the mode
def mode(x): 
    return (x.mode()[0])

# Define aggregation functions for different column
aggs={ **dict.fromkeys(filtered_data.select_dtypes(exclude=[np.float32]).columns, [ lambda x : x.mode()[0]  ]),
      **dict.fromkeys(filtered_data.select_dtypes(include=[np.float32]).columns, ["mean","var","min","max","median"])}
del(aggs["participantId"])

In [None]:
# Apply rolling window aggregation grouped by "participantId"
windowed_data=filtered_data.groupby("participantId").rolling("1s").agg(aggs)
windowed_data.columns = [col[0] + "." + col[1] if col [1] != "<lambda>" else col[0] for col in windowed_data.columns]
windowed_data

In [10]:
# Prepare the dataframe for modeling
windowed_data_copy=windowed_data.copy()
windowed_data_copy.index=windowed_data.index.droplevel(1)
windowed_data_copy=windowed_data_copy.reset_index()
windowed_data_copy.dropna(inplace=True)
windowed_data_copy

Unnamed: 0,participantId,activity,browser_Chrome,browser_Firefox,acceleration.x.mean,acceleration.x.var,acceleration.x.min,acceleration.x.max,acceleration.x.median,acceleration.y.mean,...,rotationRate.beta.mean,rotationRate.beta.var,rotationRate.beta.min,rotationRate.beta.max,rotationRate.beta.median,rotationRate.gamma.mean,rotationRate.gamma.var,rotationRate.gamma.min,rotationRate.gamma.max,rotationRate.gamma.median
1,10280,2.0,0.0,1.0,0.000000,0.000000,0.0,0.0,0.0,0.020000,...,-0.030000,0.000000,-0.03,-0.03,-0.03,-0.060000,0.000000,-0.06,-0.06,-0.06
2,10280,2.0,0.0,1.0,0.000000,0.000000,0.0,0.0,0.0,0.020000,...,-0.030000,0.000000,-0.03,-0.03,-0.03,-0.060000,0.000000,-0.06,-0.06,-0.06
3,10280,2.0,0.0,1.0,0.000000,0.000000,0.0,0.0,0.0,0.020000,...,-0.030000,0.000000,-0.03,-0.03,-0.03,-0.060000,0.000000,-0.06,-0.06,-0.06
4,10280,2.0,0.0,1.0,0.000000,0.000000,0.0,0.0,0.0,0.020000,...,-0.030000,0.000000,-0.03,-0.03,-0.03,-0.060000,0.000000,-0.06,-0.06,-0.06
5,10280,2.0,0.0,1.0,0.000000,0.000000,0.0,0.0,0.0,0.020000,...,-0.030000,0.000000,-0.03,-0.03,-0.03,-0.060000,0.000000,-0.06,-0.06,-0.06
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
121798,uksco,2.0,1.0,0.0,-0.020000,0.008909,-0.2,0.1,0.0,-0.006667,...,0.900000,15.907273,-9.80,9.00,1.10,-0.275556,4.601889,-3.80,5.20,-0.80
121799,uksco,2.0,1.0,0.0,-0.017391,0.009024,-0.2,0.1,0.0,-0.006522,...,0.908696,15.557256,-9.80,9.00,1.10,-0.300000,4.527111,-3.80,5.20,-0.85
121800,uksco,2.0,1.0,0.0,-0.017021,0.008834,-0.2,0.1,0.0,-0.006383,...,0.861702,15.322850,-9.80,9.00,1.10,-0.312766,4.436355,-3.80,5.20,-0.90
121801,uksco,2.0,1.0,0.0,-0.016667,0.008652,-0.2,0.1,0.0,-0.006250,...,0.804167,15.155727,-9.80,9.00,0.95,-0.322917,4.346910,-3.80,5.20,-0.85


In [1]:
# Plot a sample of the data using seaborn pairplot
sampled_data=windowed_data_copy.sample(1000)
sns.pairplot(sampled_data, hue='activity')
plt.show()

NameError: name 'windowed_data_copy' is not defined

In [12]:
# Define the classifier
clf = RandomForestClassifier(random_state=0)

# Prepare features and labels for cross-validation and RFECV
X = sampled_data.drop(['activity','participantId'], axis=1) # Features
y = sampled_data['activity'].ravel() # Labels
groups = sampled_data['participantId'] # Group identifiers

# Perform cross-validation using Leave-One-Group-Out
scores=cross_val_score(clf, X, y, cv=LeaveOneGroupOut(), groups=groups, scoring="f1_macro")
scores.mean()

  y = sampled_data['activity'].ravel() # Labels


0.5785533384652329

In [13]:
# Perform Recursive Feature Elimination with Cross-Validation (RFECV)
rfecv = RFECV(estimator=clf, step=1, scoring='f1_macro')
rfecv = rfecv.fit(X, y) 

In [2]:
# Plot RFECV results
cv_results = pd.DataFrame(rfecv.cv_results_)
plt.figure()
plt.xlabel("Number of features selected")
plt.ylabel("Mean test accuracy")
plt.errorbar(
    x=cv_results.index,
    y=cv_results["mean_test_score"],
    yerr=cv_results["std_test_score"],
)
plt.title("Recursive Feature Elimination")
plt.show()

NameError: name 'pd' is not defined

In [15]:
windowed_data_copy

Unnamed: 0,participantId,activity,browser_Chrome,browser_Firefox,acceleration.x.mean,acceleration.x.var,acceleration.x.min,acceleration.x.max,acceleration.x.median,acceleration.y.mean,...,rotationRate.beta.mean,rotationRate.beta.var,rotationRate.beta.min,rotationRate.beta.max,rotationRate.beta.median,rotationRate.gamma.mean,rotationRate.gamma.var,rotationRate.gamma.min,rotationRate.gamma.max,rotationRate.gamma.median
1,10280,2.0,0.0,1.0,0.000000,0.000000,0.0,0.0,0.0,0.020000,...,-0.030000,0.000000,-0.03,-0.03,-0.03,-0.060000,0.000000,-0.06,-0.06,-0.06
2,10280,2.0,0.0,1.0,0.000000,0.000000,0.0,0.0,0.0,0.020000,...,-0.030000,0.000000,-0.03,-0.03,-0.03,-0.060000,0.000000,-0.06,-0.06,-0.06
3,10280,2.0,0.0,1.0,0.000000,0.000000,0.0,0.0,0.0,0.020000,...,-0.030000,0.000000,-0.03,-0.03,-0.03,-0.060000,0.000000,-0.06,-0.06,-0.06
4,10280,2.0,0.0,1.0,0.000000,0.000000,0.0,0.0,0.0,0.020000,...,-0.030000,0.000000,-0.03,-0.03,-0.03,-0.060000,0.000000,-0.06,-0.06,-0.06
5,10280,2.0,0.0,1.0,0.000000,0.000000,0.0,0.0,0.0,0.020000,...,-0.030000,0.000000,-0.03,-0.03,-0.03,-0.060000,0.000000,-0.06,-0.06,-0.06
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
121798,uksco,2.0,1.0,0.0,-0.020000,0.008909,-0.2,0.1,0.0,-0.006667,...,0.900000,15.907273,-9.80,9.00,1.10,-0.275556,4.601889,-3.80,5.20,-0.80
121799,uksco,2.0,1.0,0.0,-0.017391,0.009024,-0.2,0.1,0.0,-0.006522,...,0.908696,15.557256,-9.80,9.00,1.10,-0.300000,4.527111,-3.80,5.20,-0.85
121800,uksco,2.0,1.0,0.0,-0.017021,0.008834,-0.2,0.1,0.0,-0.006383,...,0.861702,15.322850,-9.80,9.00,1.10,-0.312766,4.436355,-3.80,5.20,-0.90
121801,uksco,2.0,1.0,0.0,-0.016667,0.008652,-0.2,0.1,0.0,-0.006250,...,0.804167,15.155727,-9.80,9.00,0.95,-0.322917,4.346910,-3.80,5.20,-0.85


In [18]:
# Prepare the final result dataframe with selected features
result_data=pd.concat([windowed_data_copy.participantId,pd.DataFrame(labels[windowed_data_copy.activity.astype(int)],columns=["activity"]),windowed_data_copy[rfecv.get_feature_names_out(X.columns)]],axis=1).dropna()
df6

Unnamed: 0,participantId,activity,acceleration.y.var,acceleration.y.min,acceleration.z.var,accelerationIncludingGravity.x.mean,accelerationIncludingGravity.x.median,accelerationIncludingGravity.y.mean,accelerationIncludingGravity.y.var,accelerationIncludingGravity.y.min,...,accelerationIncludingGravity.y.median,accelerationIncludingGravity.z.var,accelerationIncludingGravity.z.min,accelerationIncludingGravity.z.max,accelerationIncludingGravity.z.median,rotationRate.alpha.var,rotationRate.alpha.min,rotationRate.beta.var,rotationRate.beta.max,rotationRate.gamma.var
1,10280,sitting,0.000000,0.02,0.000000,0.030000,0.030000,0.030000,0.000000,0.03,...,0.03,0.000000,9.840000,9.84,9.840,0.000000,0.000000,0.000000,-0.030000,0.000000
2,10280,sitting,0.000000,0.02,0.000000,0.030000,0.030000,0.030000,0.000000,0.03,...,0.03,0.000000,9.840000,9.84,9.840,0.000000,0.000000,0.000000,-0.030000,0.000000
3,10280,sitting,0.000000,0.02,0.000000,0.030000,0.030000,0.030000,0.000000,0.03,...,0.03,0.000025,9.840000,9.85,9.840,0.000000,0.000000,0.000000,-0.030000,0.000000
4,10280,sitting,0.000000,0.02,0.000000,0.030000,0.030000,0.030000,0.000000,0.03,...,0.03,0.000030,9.840000,9.85,9.840,0.000000,0.000000,0.000000,-0.030000,0.000000
5,10280,sitting,0.000000,0.02,0.000000,0.030000,0.030000,0.030000,0.000000,0.03,...,0.03,0.000030,9.840000,9.85,9.845,0.000000,0.000000,0.000000,-0.030000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
121717,uksco,sitting,9.122666,-2.70,156.284988,17.680000,18.900001,-1.930000,3.013445,-3.30,...,-2.30,99.938769,-32.099998,1.50,-1.700,1655.137330,90.300003,8818.709404,103.500000,8896.485613
121718,uksco,sitting,8.215636,-2.70,140.694899,17.263637,18.600000,-1.863636,2.760546,-3.30,...,-2.30,94.776174,-32.099998,1.50,-2.100,12838.261634,-205.899994,9344.820570,103.500000,9853.118897
121719,uksco,sitting,7.480909,-2.70,129.451809,16.675000,17.150000,-1.908333,2.533561,-3.30,...,-2.30,87.902644,-32.099998,1.50,-1.700,17064.427240,-205.899994,13181.429436,215.300003,10292.051910
121720,uksco,sitting,6.997692,-2.70,122.819735,15.984616,15.700000,-2.061538,2.627564,-3.90,...,-2.30,85.010762,-32.099998,2.60,-1.300,17723.949377,-205.899994,14116.102395,215.300003,10373.772592


In [3]:
# Plot the activity distribution
result_data.activity.hist()

NameError: name 'result_data' is not defined

In [20]:
# Save the final features to a pickle file
with open("../exercises/features.pkl", "wb") as f:
    pickle.dump(result_data, f)

In [21]:
result_data

Unnamed: 0,participantId,activity,acceleration.y.var,acceleration.y.min,acceleration.z.var,accelerationIncludingGravity.x.mean,accelerationIncludingGravity.x.median,accelerationIncludingGravity.y.mean,accelerationIncludingGravity.y.var,accelerationIncludingGravity.y.min,...,accelerationIncludingGravity.y.median,accelerationIncludingGravity.z.var,accelerationIncludingGravity.z.min,accelerationIncludingGravity.z.max,accelerationIncludingGravity.z.median,rotationRate.alpha.var,rotationRate.alpha.min,rotationRate.beta.var,rotationRate.beta.max,rotationRate.gamma.var
1,10280,sitting,0.000000,0.02,0.000000,0.030000,0.030000,0.030000,0.000000,0.03,...,0.03,0.000000,9.840000,9.84,9.840,0.000000,0.000000,0.000000,-0.030000,0.000000
2,10280,sitting,0.000000,0.02,0.000000,0.030000,0.030000,0.030000,0.000000,0.03,...,0.03,0.000000,9.840000,9.84,9.840,0.000000,0.000000,0.000000,-0.030000,0.000000
3,10280,sitting,0.000000,0.02,0.000000,0.030000,0.030000,0.030000,0.000000,0.03,...,0.03,0.000025,9.840000,9.85,9.840,0.000000,0.000000,0.000000,-0.030000,0.000000
4,10280,sitting,0.000000,0.02,0.000000,0.030000,0.030000,0.030000,0.000000,0.03,...,0.03,0.000030,9.840000,9.85,9.840,0.000000,0.000000,0.000000,-0.030000,0.000000
5,10280,sitting,0.000000,0.02,0.000000,0.030000,0.030000,0.030000,0.000000,0.03,...,0.03,0.000030,9.840000,9.85,9.845,0.000000,0.000000,0.000000,-0.030000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
121717,uksco,sitting,9.122666,-2.70,156.284988,17.680000,18.900001,-1.930000,3.013445,-3.30,...,-2.30,99.938769,-32.099998,1.50,-1.700,1655.137330,90.300003,8818.709404,103.500000,8896.485613
121718,uksco,sitting,8.215636,-2.70,140.694899,17.263637,18.600000,-1.863636,2.760546,-3.30,...,-2.30,94.776174,-32.099998,1.50,-2.100,12838.261634,-205.899994,9344.820570,103.500000,9853.118897
121719,uksco,sitting,7.480909,-2.70,129.451809,16.675000,17.150000,-1.908333,2.533561,-3.30,...,-2.30,87.902644,-32.099998,1.50,-1.700,17064.427240,-205.899994,13181.429436,215.300003,10292.051910
121720,uksco,sitting,6.997692,-2.70,122.819735,15.984616,15.700000,-2.061538,2.627564,-3.90,...,-2.30,85.010762,-32.099998,2.60,-1.300,17723.949377,-205.899994,14116.102395,215.300003,10373.772592
