In [1]:
import numpy as np # linear algebra
import pandas as pd
import torch
import os
import matplotlib.pyplot as plt
from torch.utils import data
import gc
from sklearn.preprocessing import LabelEncoder
import pickle
import shutil
from time import time

# pickle funcs

In [2]:
def load_pkl(file_path):
    with open(file_path + '.pkl', 'rb') as f:
        x = pickle.load(f)
    return x

def save_pkl(obj, file_name):
    with open(file_name + '.pkl', "wb") as f:
        pickle.dump(obj, f)

# load data friles from google drive

In [3]:
from google.colab import drive
drive.mount('/content/drive')

!cp /content/drive/MyDrive/DL_workshop/smooth/type1.pkl /content/
!cp /content/drive/MyDrive/DL_workshop/smooth/type2.pkl /content/
!cp /content/drive/MyDrive/DL_workshop/metadata.parquet /content/
!cp /content/drive/MyDrive/DL_workshop/LE.pkl /content/

Mounted at /content/drive


# Load smoothed data here

In [None]:
from google.colab import drive
drive.mount('/content/drive')

!cp /content/drive/MyDrive/DL_workshop/smooth/type1.pkl /content/
!cp /content/drive/MyDrive/DL_workshop/smooth/type2.pkl /content/
!cp /content/drive/MyDrive/DL_workshop/metadata.parquet /content/
!cp /content/drive/MyDrive/DL_workshop/LE.pkl /content/

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# Some processing

In [4]:
type1 = load_pkl('type1')
type2 = load_pkl('type2')
metadata = pd.read_parquet('metadata.parquet')
LE = load_pkl('LE')

## Normalize data

In [None]:
# type1['seq'].values[254].shape

global_mean = torch.tensor([0.0, 0.0, 0.0])
global_std = torch.tensor([0.0, 0.0, 0.0])

def norma_f(r):

    global global_mean
    global global_std

    global_mean += r.mean(axis=0)
    global_std += r.std(axis = 0)

type1['seq'].apply(func=(norma_f))
type1_norm = (global_mean / len(type1)), (global_std / len(type1))

global_mean = torch.tensor([0.0, 0.0, 0.0])
global_std = torch.tensor([0.0, 0.0, 0.0])

type2['seq'].apply(func=(norma_f))
type2_norm = (global_mean / len(type2)), (global_std / len(type2))

In [None]:
def standardize_seqs(data, m, s):
    '''
    data: dataframe
    m: mean
    s: std
    '''
    eps = 0.0000000000001
    def stand_scaler(r):
        r = (r - m )/ (s+ eps)
        return r

    data.loc['seq'] = data['seq'].apply(stand_scaler)
    return data

type1 =  standardize_seqs(type1, type1_norm[0], type1_norm[1])
type2 =  standardize_seqs(type2, type2_norm[0], type2_norm[1])

  data.loc['seq'] = data['seq'].apply(stand_scaler)
  data.loc['seq'] = data['seq'].apply(stand_scaler)


In [5]:
type1['seq'] = type1['seqs']
type2['seq'] = type2['seqs']

type1.drop(['seqs'], axis=1, inplace=True)
type2.drop(['seqs'], axis=1, inplace=True)

In [6]:
processed = pd.concat([type1, type2])
processed.dropna(inplace=True)
processed.shape

(50248, 4)

In [7]:
# processed.drop(['Unnamed: 0', 'userid', 'sensor', 'body_part', 'side', 'sequence_length'], axis=1, inplace=True)

processed.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 50248 entries, 0 to 50246
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   activity  50248 non-null  int64 
 1   id        50248 non-null  int64 
 2   type      50248 non-null  bool  
 3   seq       50248 non-null  object
dtypes: bool(1), int64(2), object(1)
memory usage: 1.6+ MB


In [9]:
processed.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 50248 entries, 0 to 50246
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   activity  50248 non-null  int64 
 1   id        50248 non-null  int64 
 2   type      50248 non-null  bool  
 3   seqs      50248 non-null  object
dtypes: bool(1), int64(2), object(1)
memory usage: 1.6+ MB


In [20]:
def check_seqs_nulls(r):
    if torch.isnan(r['seq']).sum().item() >0:
        return True
    return False


n = processed.apply(check_seqs_nulls, axis=1).sum()

f"there are {n} nulls"

'there are 0 nulls'

# Dataset class

In [8]:
class FE_SensorDataset(torch.utils.data.Dataset):
    def __init__(self, data, transform=None):
        self.metadata = data
        # self.seq_data_dir = seq_data_dir
        self.transform = transform

    def __len__(self):
        return len(self.metadata)


    def extract_features(self, seqs):
        '''
        Extract features from the given sequences
        :data: Tensor, shape: (seq_size, 3), sequences of x,y,z stored in a tensor
        '''
        features = []
        n_segments = 1
        # if isinstance(seqs, float):
        #     print(data)
        n_samples = seqs.shape[0]

        curr_segment = 0
        step = (n_samples // n_segments) + 1

        for idx in range(0, n_samples, step):
            data = seqs[idx:idx + (step)]

            means = data.mean(axis=0)
            x_mean = means[0].item()
            y_mean = means[1].item()
            z_mean = means[2].item()

            stds = data.std(axis=0)
            x_std = stds[0].item()
            y_std = stds[1].item()
            z_std = stds[2].item()

            maxs, _ = data.max(axis=0)
            # print(maxs)
            x_max = maxs[0].item()
            y_max = maxs[1].item()
            z_max = maxs[2].item()

            mins, _ = data.min(axis=0)
            x_min = mins[0].item()
            y_min = mins[1].item()
            z_min = mins[2].item()

            mids, _ = data.median(axis=0)
            x_mid = mids[0].item()
            y_mid = mids[1].item()
            z_mid = mids[2].item()
            features.extend([x_mean, y_mean, z_mean, x_std, y_std, z_std, x_max, y_max, z_max, x_min, y_min, z_min, x_mid, y_mid, z_mid])

        features =  torch.tensor(features)
        # noise = torch.randn(len(features))
        noise = torch.randn_like(features) * 0.09
        noise = torch.zeros(len(features))
        return features + noise

        # return torch.tensor(list(features.values

    def __getitem__(self, idx):
        if torch.is_tensor(idx):
            idx = idx.tolist()

        sample_meta = self.metadata.iloc[idx]
        # sample_id = sample_meta['id']

        # file_path = os.path.join(self.seq_data_dir, f"{sample_id}.csv")
        # raw_data = pd.read_csv(file_path)

        # if 'measurement type' in raw_data.columns:
        #     #TODO : we might consider using another measurements
        #     raw_data = raw_data[raw_data['measurement type'] == 'acceleration [m/s/s]']
        #     sensor_data = torch.tensor(raw_data[['x', 'y', 'z']].values.astype(np.float32))
        # else:
        #     sensor_data = torch.tensor(raw_data[['x [m]', 'y [m]', 'z [m]']].values.astype(np.float32))

        label = sample_meta['activity']

        if self.transform:
            sensor_data = self.transform(sensor_data)


        features = self.extract_features(sample_meta['seq'])

        return features, label

# Define train/test loaders

In [9]:

sensor_dataset = FE_SensorDataset(processed)

train_set, test_set = torch.utils.data.random_split(sensor_dataset, [0.8, 0.2])

batch_size = 32
train_loader = data.DataLoader(train_set, batch_size=batch_size, shuffle=True)
# valid_loader = data.DataLoader(val_set, batch_size=batch_size, shuffle=True)
test_loader = data.DataLoader(test_set, batch_size=batch_size, shuffle=True)


len(train_set), len(test_set)


(40199, 10049)

# Build & Train classical ML models

### Define test func

In [73]:
print("Testing")
def test_model(model):
    global test_loader
    global batch_size

    n_corrects = 0
    n_samples = len(test_loader) * batch_size

    for i, batch in enumerate(test_loader):

        (features, labels) = batch
    #     print(features.numpy().shape)
    #     break
        x_batch = features.cpu().numpy()
        y_batch = np.array(labels)

        y_pred = model.predict(x_batch)

        n_corrects += (y_pred == y_batch).sum()

    acc = n_corrects / n_samples
    print(f"Test acc: {acc}")
    return acc

Testing


## Naive Bayes

In [74]:
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score


n_batches = len(train_loader)
# x_train = [0] * (n_batches)
# y_train = [0]* (n_batches)
nb_classifier = GaussianNB()
n_samples = len(train_loader) * batch_size
# classes = np.unique(train_df['activity'])
classes = list(LE.values())
n_corrects = 0
total_losses = 0
print("Training ...")
since = time()
for i, batch in enumerate(train_loader):

    print(f"batch {i}/{n_batches}")
    (features, labels) = batch
    # print(features.numpy().shape)
    # break
    x_batch = features.cpu().numpy()
    y_batch = np.array(labels)

    nb_classifier.partial_fit(x_batch, y_batch, classes=classes)

    y_pred = nb_classifier.predict(x_batch)
    n_corrects += (y_pred == y_batch).sum()


    # del x_batch
    # del y_batch
    # gc.collect()
elapsed = time() - since
print('training time: ',elapsed)
print(f'train acc: {n_corrects / n_samples}')
test_model(nb_classifier)
print("DONE")

Training ...
batch 0/1257
batch 1/1257
batch 2/1257
batch 3/1257
batch 4/1257
batch 5/1257
batch 6/1257


  jointi = np.log(self.class_prior_[i])
  jointi = np.log(self.class_prior_[i])


batch 7/1257
batch 8/1257
batch 9/1257
batch 10/1257
batch 11/1257
batch 12/1257
batch 13/1257
batch 14/1257
batch 15/1257
batch 16/1257
batch 17/1257
batch 18/1257
batch 19/1257
batch 20/1257
batch 21/1257
batch 22/1257
batch 23/1257
batch 24/1257
batch 25/1257
batch 26/1257
batch 27/1257
batch 28/1257
batch 29/1257
batch 30/1257
batch 31/1257
batch 32/1257
batch 33/1257
batch 34/1257
batch 35/1257
batch 36/1257
batch 37/1257
batch 38/1257
batch 39/1257
batch 40/1257
batch 41/1257
batch 42/1257
batch 43/1257
batch 44/1257
batch 45/1257
batch 46/1257
batch 47/1257
batch 48/1257
batch 49/1257
batch 50/1257
batch 51/1257
batch 52/1257
batch 53/1257
batch 54/1257
batch 55/1257
batch 56/1257
batch 57/1257
batch 58/1257
batch 59/1257
batch 60/1257
batch 61/1257
batch 62/1257
batch 63/1257
batch 64/1257
batch 65/1257
batch 66/1257
batch 67/1257
batch 68/1257
batch 69/1257
batch 70/1257
batch 71/1257
batch 72/1257
batch 73/1257
batch 74/1257
batch 75/1257
batch 76/1257
batch 77/1257
batch 78/

## XGboos

In [10]:
def xgb_test_model(model):
    global test_loader
    global batch_size

    n_corrects = 0
    n_samples = 0

    for i, batch in enumerate(test_loader):

        (features, labels) = batch
    #     print(features.numpy().shape)
    #     break
        x_batch = features.cpu().numpy()
        y_batch = np.array(labels)

        dtest = xgb.DMatrix(x_batch)
        y_pred = (model.predict(dtest))
        y_pred = np.argmax(y_pred, axis=1)
        n_corrects += (y_pred == y_batch).sum()
        n_samples += len(y_batch)

    acc = n_corrects / n_samples
    print(f"Test acc: {acc}")
    return acc

In [18]:
import xgboost as xgb
from sklearn.metrics import log_loss


n_batches = len(train_loader)
# x_train = [0] * (n_batches)
# y_train = [0]* (n_batches)
# nb_classifier = GaussianNB()
# classes = np.unique(train_df['activity'])
classes = list(LE.values())
n_samples = 0
params = {
    'objective': 'multi:softprob',  # for binary classification
    'num_class': len(classes)         # evaluation metric
}

print("Training ...")

# model = None
r_corrects = 0
since = time()
total_loss = 0
n = 200

all_preds = []

for i, batch in enumerate(train_loader):
    print(f"batch {i}/{n_batches}")
    (features, labels) = batch
#     print(features.numpy().shape)
#     break
    x_batch = features.cpu().numpy()
    y_batch = np.array(labels)

    dtrain = xgb.DMatrix(x_batch, label=y_batch)
    model = xgb.train(params, dtrain, xgb_model=model)

    # d_test = xgb.DMatrix(X_test)
    dtrain = xgb.DMatrix(x_batch)
    y_pred = model.predict(dtrain)
    # calculate cross entropy loss

    loss =log_loss(y_batch, y_pred, labels = list(range(18)))
    total_loss += loss
    # print(f"loss: {loss}")
    y_pred = np.argmax(y_pred, axis=1)
    r_corrects += (y_pred == y_batch).sum()
    n_samples += len(y_batch)

    if n == 0:
        break
    n -= 1

    # del x_batch
    # del y_batch
    # gc.collect()
elapsed = time() - since
print('training time: ',elapsed)
print(f'train acc: {r_corrects / n_samples}')
print(f"Train loss: {total_loss / n_batches}")

xgb_test_model(model)

Training ...
batch 0/1257
batch 1/1257
batch 2/1257
batch 3/1257
batch 4/1257
batch 5/1257
batch 6/1257
batch 7/1257
batch 8/1257
batch 9/1257
batch 10/1257
batch 11/1257
batch 12/1257
batch 13/1257
batch 14/1257
batch 15/1257
batch 16/1257
batch 17/1257
batch 18/1257
batch 19/1257
batch 20/1257
batch 21/1257
batch 22/1257
batch 23/1257
batch 24/1257
batch 25/1257
batch 26/1257
batch 27/1257
batch 28/1257
batch 29/1257
batch 30/1257
batch 31/1257
batch 32/1257
batch 33/1257
batch 34/1257
batch 35/1257
batch 36/1257
batch 37/1257
batch 38/1257
batch 39/1257
batch 40/1257
batch 41/1257
batch 42/1257
batch 43/1257
batch 44/1257
batch 45/1257
batch 46/1257
batch 47/1257
batch 48/1257
batch 49/1257
batch 50/1257
batch 51/1257
batch 52/1257
batch 53/1257
batch 54/1257
batch 55/1257
batch 56/1257
batch 57/1257
batch 58/1257
batch 59/1257
batch 60/1257
batch 61/1257
batch 62/1257
batch 63/1257
batch 64/1257
batch 65/1257
batch 66/1257
batch 67/1257
batch 68/1257
batch 69/1257
batch 70/1257
bat

0.3708826748930242

In [None]:
x_train = torch.cat([batch[0].squeeze(0) for batch in train_loader], dim=0)
y_train = np.array([i.item()  for batch in train_loader for i in batch[1]], dtype=int)

x_test = torch.cat([batch[0].squeeze(0) for batch in test_loader], dim=0)
y_test = np.array([i.item()  for batch in test_loader for i in batch[1]], dtype=int)

In [None]:
from sklearn.ensemble import RandomForestClassifier
# from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Load the Iris dataset
# iris = load_iris()
# X, y = iris.data, iris.target

# Split the dataset into training and testing sets
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize the Random Forest classifier
random_forest = RandomForestClassifier(n_estimators=20,
                                       random_state=358,
                                       max_depth=10,
                                       min_samples_split=2,
                                       criterion="log_loss")

# Train the Random Forest classifier
random_forest.fit(x_train, y_train)

# Predict on the test set
y_pred_test = random_forest.predict(x_test)
y_pred_train = random_forest.predict(x_train)
# Calculate accuracy
train_accaccuracy = accuracy_score(y_train, y_pred_train)
test_accaccuracy = accuracy_score(y_test, y_pred_test)
print("Train Accuracy:", train_accaccuracy)
print("Test Accuracy:", test_accaccuracy)

Train Accuracy: 0.31824174730714694
Test Accuracy: 0.09075529903472983


In [None]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.datasets import make_classification
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
# X, y = make_classification(n_samples=1000, n_features=4,
#                            n_informative=2, n_redundant=0,
#                            random_state=0, shuffle=False)
knn = LogisticRegression(multi_class='auto', solver='sag', max_iter=5000, penalty=None)
knn.fit(x_train, y_train)

y_pred_test = knn.predict(x_test)
y_pred_train = knn.predict(x_train)

train_accaccuracy = accuracy_score(y_train, y_pred_train)
test_accaccuracy = accuracy_score(y_test, y_pred_test)
print("Train Accuracy:", train_accaccuracy)
print("Test Accuracy:", test_accaccuracy)

Train Accuracy: 0.10997786014577476
Test Accuracy: 0.08856602647029556


In [None]:
ML_models = {'NB': nb_classifier,
          "XGB": model}

save_pkl(ML_models, 'ML_models_App1')