# Competition Project: Terrain identification from accelerometer and gyroscope data using a CNN-LSTM neural network

In [None]:
import os
import pandas as pd
import numpy as np
import torch
#from google.colab import drive
#drive.mount('/content/drive')


In [None]:
# Detect whether a CUDA device is available and report which device is used.
flag_cuda = torch.cuda.is_available()
print('Using GPU' if flag_cuda else 'Using CPU')

In [None]:
# Move into the directory holding the project files so that the relative
# './TrainingData/...' paths used by the loading cell resolve correctly.
%cd /kaggle/input/comp-proj/ECE542_fa2021_Project_TerrainRecognition/ 


# Loading in the Training Data

In [None]:
# Number of recorded sessions for each of the eight subjects.
sub1ses = 8
sub2ses = 5
sub3ses = 3
sub4ses = 2
sub5ses = 3
sub6ses = 3
sub7ses = 4
sub8ses = 1
# Kept from the original notebook; not used by the loading code below.
Training = []
fileID = 0
column_list = ['ax', 'ay', 'az', 'gx', 'gy', 'gz', 'subject_id', 'time', 'label']
# Stays an empty frame with the expected columns if no loading method runs.
Session_Data = pd.DataFrame([], columns=column_list)

# Sessions per subject, indexed by subject number. Index 0 is a placeholder so
# that subject k sits at position k.
num_ses_for_sub = [0, sub1ses, sub2ses, sub3ses, sub4ses, sub5ses, sub6ses, sub7ses, sub8ses]
# Selector left in place in case another loading method is added later.
method_load = 0
#perTrain= 0.8


def _load_session(sub_id, sess_id, data_dir='./TrainingData'):
    """Read one recording session and attach a label to every sensor sample.

    Four CSV files are read for the session: sensor samples (``__x``), labels
    (``__y``) and one timestamp file for each (``__x_time``, ``__y_time``).
    The labels are upsampled to the sensor rate with a zero-order hold: every
    sensor sample receives the label of the latest label timestamp that is not
    later than the sample's own timestamp. Samples that precede the first
    label timestamp receive the first label.

    NOTE(review): assumes both timestamp files are sorted in ascending order
    (required by ``np.searchsorted``) -- confirm against the data set.

    Args:
        sub_id: Subject number used in the file name (zero-padded to 3 digits).
        sess_id: Session number used in the file name (zero-padded to 2 digits).
        data_dir: Directory that contains the session CSV files.

    Returns:
        DataFrame with columns ax, ay, az, gx, gy, gz, subject_id, time, label.
    """
    prefix = '{}/subject_{:03}_{:02}__'.format(data_dir, sub_id, sess_id)
    print('Reading:' + prefix + 'x.csv')
    X = pd.read_csv(prefix + 'x.csv', names=('ax', 'ay', 'az', 'gx', 'gy', 'gz'))
    Y = pd.read_csv(prefix + 'y.csv', names=('label',))
    x_t = pd.read_csv(prefix + 'x_time.csv', names=('time',))
    y_t = pd.read_csv(prefix + 'y_time.csv', names=('time',))

    X['subject_id'] = sub_id
    X['time'] = x_t['time'].to_numpy()

    # Index of the last label whose timestamp is <= each sample timestamp.
    # Doing this in one vectorised step replaces the per-sample chained
    # assignment, which is slow and silently writes to a temporary copy when
    # pandas copy-on-write is enabled.
    label_idx = np.searchsorted(y_t['time'].to_numpy(), X['time'].to_numpy(), side='right') - 1
    label_idx = np.clip(label_idx, 0, len(Y) - 1)
    X['label'] = Y['label'].to_numpy()[label_idx]
    return X


if method_load == 0:
    # Collect every session first and concatenate once: concatenating inside
    # the loop re-copies all previously loaded rows on every iteration.
    session_frames = [
        _load_session(subject, session)
        for subject in range(1, len(num_ses_for_sub))
        for session in range(1, num_ses_for_sub[subject] + 1)
    ]
    Session_Data = pd.concat(session_frames, ignore_index=True)

# Importing modules for the preprocessing of the data

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import glob 
import numpy as np
import random
import sklearn.metrics
from collections import Counter
from sklearn.preprocessing import RobustScaler
from scipy import stats
from sklearn import preprocessing
from sklearn.preprocessing import OneHotEncoder
import matplotlib.pyplot as plt
from imblearn.under_sampling import RandomUnderSampler
from statsmodels import robust
from scipy import  stats
from scipy.stats import entropy
from sklearn.model_selection import train_test_split

In [None]:
# Preview the first five rows of the combined training frame.
Session_Data.head(n=5)

In [None]:
# Select the six sensor channels (the first six columns) and preview them.
x_data = Session_Data.loc[:, Session_Data.columns[:6]]
x_data.head(n=5)

In [None]:
# Fit a RobustScaler on the six sensor channels and overwrite those columns
# of Session_Data with their scaled values.
x_data = Session_Data[Session_Data.columns[:6]]
scaler = RobustScaler().fit(x_data)
Session_Data.loc[:, Session_Data.columns[:6]] = scaler.transform(x_data.to_numpy())

In [None]:
# Preview the first five rows again, now with the scaled sensor columns.
Session_Data.head(n=5)

In [None]:
def extract_windows(X, Y, Window_Size=40, stride=1):
    """Cut a sequence into fixed-length sliding windows with one label each.

    Args:
        X: Samples ordered in time, one row per sample (DataFrame or 2-D
            array-like).
        Y: One label per row of ``X`` (Series or array-like).
        Window_Size: Number of consecutive rows in each window.
        stride: Step, in rows, between the starts of consecutive windows.

    Returns:
        Tuple ``(X_windows, Y_windows)``. ``X_windows`` has shape
        ``(n_windows, Window_Size, n_columns)`` and ``Y_windows`` has shape
        ``(n_windows, 1)``, holding the most frequent label of each window
        (the smallest label wins a tie). When the input is shorter than one
        window both arrays are empty.
    """
    # Convert once up front instead of slicing the DataFrame on every step.
    samples = np.asarray(X)
    labels = np.asarray(Y).reshape(-1)

    X_windows, Y_windows = [], []
    # "+ 1" so the final full window (the one ending on the last row) is kept.
    for start in range(0, len(samples) - Window_Size + 1, stride):
        stop = start + Window_Size
        X_windows.append(samples[start:stop])
        # np.unique returns sorted values, so argmax picks the smallest of the
        # most frequent labels. This does not rely on the return shape of
        # scipy.stats.mode, which differs between SciPy versions.
        values, counts = np.unique(labels[start:stop], return_counts=True)
        Y_windows.append(values[np.argmax(counts)])
    return np.array(X_windows), np.array(Y_windows).reshape(-1, 1)

In [None]:
# Window length and step (both in samples) used to slice the training data.
Window_Size, stride = 40, 1
# Sensor channels are the first six columns; labels come from the 'label' column.
x_data = Session_Data[Session_Data.columns[:6]]
y_data = Session_Data['label']
X_data, Y_data = extract_windows(x_data, y_data, Window_Size=Window_Size, stride=stride)

In [None]:
# Balance the classes by random undersampling. The sampler is fitted on a 2-D
# slice (first channel only) purely to obtain the kept row indices, which are
# then applied to the full window array and the labels.
rus = RandomUnderSampler(random_state=1, sampling_strategy='not minority')
rus.fit_resample(X_data[..., 0], Y_data)
X_data, Y_data = X_data[rus.sample_indices_], Y_data[rus.sample_indices_]

In [None]:
# Insert a singleton channel axis at position 1 for the convolution layers.
X_data = X_data[:, np.newaxis]

In [None]:
# Show the shapes of the window array and of the label array, one per line.
print(X_data.shape, Y_data.shape, sep='\n')

# Importing modules for the model creation and training

In [None]:
!pip install pyfiglet 
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from scipy.stats import mode
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score
import torch
from torch import nn
from torch.nn import functional as F
from torch.utils.data import TensorDataset, DataLoader
from multiprocessing import cpu_count
from pathlib import Path
from tqdm.autonotebook import tqdm
import os
from pyfiglet import Figlet


# Splitting the data into training and validation sets

In [None]:
val_size = 0.2  # fraction held out for validation (80:20 split)
# Encode the class labels as consecutive integers starting at 0.
enc = LabelEncoder()
# Y_data is an (n, 1) column as returned by extract_windows. LabelEncoder
# expects a 1-D array and raises a DataConversionWarning for a column vector,
# so flatten it first.
Y_data = enc.fit_transform(np.ravel(Y_data))
# NOTE(review): the windows were cut with stride 1, so neighbouring windows
# share most of their samples. A random split puts near-duplicates in both
# sets, which makes the validation score optimistic -- consider splitting by
# session or subject instead.
X_train, X_val, y_train, y_val = train_test_split(X_data, Y_data, test_size=val_size)

In [None]:
def create_data_loaders(train_ds, valid_ds, bs=64, jobs=0):
    """Build the training and validation ``DataLoader`` objects.

    Args:
        train_ds: Dataset used for training.
        valid_ds: Dataset used for validation.
        bs: Batch size for both loaders.
        jobs: Number of worker processes for both loaders.

    Returns:
        Tuple ``(train_dl, valid_dl)``. The training loader shuffles and drops
        the final incomplete batch. The validation loader keeps input order
        and keeps every sample.
    """
    train_dl = DataLoader(train_ds, bs, shuffle=True, num_workers=jobs, drop_last=True)
    # Keep the last partial batch: dropping it would silently exclude samples
    # from the validation metrics and would yield no batches at all when the
    # validation set is smaller than one batch.
    valid_dl = DataLoader(valid_ds, bs, shuffle=False, num_workers=jobs, drop_last=False)
    return train_dl, valid_dl


In [None]:
# Convert the NumPy splits to tensors: float32 inputs and int64 class indices.
X_train = torch.tensor(X_train, dtype=torch.float32)
X_val = torch.tensor(X_val, dtype=torch.float32)
y_train = torch.tensor(y_train, dtype=torch.long)
y_val = torch.tensor(y_val, dtype=torch.long)
# Pair inputs with their targets so the loaders yield (input, target) batches.
train_ds, valid_ds = TensorDataset(X_train, y_train), TensorDataset(X_val, y_val)

# Splitting the data into batches 

In [None]:
# Batch size used for both loaders; one loader worker per available CPU core.
bs = 512
trn_dl, val_dl = create_data_loaders(train_ds=train_ds, valid_ds=valid_ds, bs=bs, jobs=cpu_count())

# Model Definition 
Four 2D convolution layers followed by an LSTM layer and then a fully connected layer

In [None]:
class CNNAndLstm(nn.Module):
    """Four temporal convolutions followed by an LSTM and a linear classifier.

    The input has shape ``(batch, 1, window, n_features)``. Each convolution
    uses a ``(5, 1)`` kernel without padding, so it shortens the time axis by
    four samples and never mixes the feature columns. The LSTM reads the
    resulting sequence and the classifier is applied to its last time step.

    Args:
        n_features: Number of sensor channels per sample.
        n_filters: Number of filters in every convolution layer.
        hidden_size: Size of the LSTM hidden state.
        n_classes: Number of output classes.
    """

    def __init__(self, n_features=6, n_filters=120, hidden_size=128, n_classes=4):
        super().__init__()
        self.hidden_size = hidden_size
        self.conv2d1 = nn.Conv2d(in_channels=1, out_channels=n_filters, kernel_size=(5, 1))
        self.conv2d2 = nn.Conv2d(in_channels=n_filters, out_channels=n_filters, kernel_size=(5, 1))
        self.conv2d3 = nn.Conv2d(in_channels=n_filters, out_channels=n_filters, kernel_size=(5, 1))
        self.conv2d4 = nn.Conv2d(in_channels=n_filters, out_channels=n_filters, kernel_size=(5, 1))
        self.lstm1 = nn.LSTM(n_features * n_filters, hidden_size, batch_first=True)
        self.fc = nn.Linear(hidden_size, n_classes)

    def forward(self, x):
        """Return class scores of shape ``(batch, n_classes)``.

        Args:
            x: Tensor of shape ``(batch, 1, window, n_features)``. The window
                must be long enough to survive four unpadded kernel-5
                convolutions (at least 17 samples).
        """
        batch_size = x.size(0)
        x = F.relu(self.conv2d1(x))
        x = F.relu(self.conv2d2(x))
        x = F.relu(self.conv2d3(x))
        x = F.relu(self.conv2d4(x))

        # The convolution output is (batch, filters, steps, features). Move the
        # time axis in front of the filter axis BEFORE flattening, otherwise the
        # reshape interleaves filters and time steps and the LSTM does not see
        # a real time sequence.
        steps = x.size(2)
        x = x.permute(0, 2, 1, 3).reshape(batch_size, steps, -1)

        if self.training:
            # Random initial state, as in the original training recipe. It is
            # created on the input's device instead of relying on a global flag.
            h0 = torch.randn(1, batch_size, self.hidden_size, device=x.device, dtype=x.dtype)
            c0 = torch.randn(1, batch_size, self.hidden_size, device=x.device, dtype=x.dtype)
            lstm_out, _ = self.lstm1(x, (h0, c0))
        else:
            # In eval mode use the default zero state so that predictions are
            # deterministic.
            lstm_out, _ = self.lstm1(x)

        return self.fc(lstm_out[:, -1, :])
    

# making a directory to save the model files

In [None]:
# Delete any model directory left over from a previous run
# (-r: recursive, -v: list what is removed).
!rm /kaggle/working/model -rv

In [None]:
# Create the directory that the training cell saves the best model into.
!mkdir /kaggle/working/model


# Training the model and saving the best ones as it trains 

In [None]:
lr = 0.0001
n_epochs = 100
best_accuracy = 0
output_dim = 4
model_dir = '/kaggle/working/model'

# A single device object keeps the training and validation paths consistent
# and lets the cell run on CPU-only machines.
device = torch.device('cuda' if flag_cuda else 'cpu')
model = CNNAndLstm().to(device)
criterion = nn.CrossEntropyLoss()
opt = torch.optim.Adam(model.parameters(), lr=lr)
sched = None

print(':::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::\n')
print('-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_\n')

f = Figlet(font='slant')
print(f.renderText('Training Start!'))


# Mean per-sample loss of every epoch, used by the plotting cell.
train_loss_list = []
valid_loss_list = []
for epoch in tqdm(range(1, n_epochs + 1)):

    # ---- training pass ----
    train_loss, n_train = 0.0, 0
    model.train()
    for data, target in trn_dl:
        data, target = data.to(device), target.to(device)
        # Clear the gradients of all optimized variables
        opt.zero_grad()
        # Forward pass: compute predicted outputs
        output = model(data)
        # Batch loss
        loss = criterion(output, target)
        # Backward pass: gradient of the loss with respect to the parameters
        loss.backward()
        # Single optimization step (parameter update)
        opt.step()
        # criterion returns the batch mean, so weight it by the batch size
        train_loss += loss.item() * data.size(0)
        n_train += data.size(0)
    # Normalise by the number of samples seen so that the training and
    # validation losses are on the same scale despite different set sizes.
    train_loss /= max(n_train, 1)
    train_loss_list.append(train_loss)

    # ---- validation pass ----
    model.eval()
    valid_loss, correct, total = 0.0, 0, 0
    # No gradients are needed for evaluation. Distinct loop names are used so
    # the X_val / y_val tensors created earlier are not overwritten.
    with torch.no_grad():
        for val_inputs, val_targets in val_dl:
            val_inputs, val_targets = val_inputs.to(device), val_targets.to(device)
            out = model(val_inputs)
            # argmax of the raw scores equals argmax of their log-softmax
            prediction = out.argmax(dim=1)
            total += val_targets.size(0)
            correct += (prediction == val_targets).sum().item()

            loss = criterion(out, val_targets)
            valid_loss += loss.item() * val_inputs.size(0)
    valid_loss /= max(total, 1)
    valid_loss_list.append(valid_loss)

    # Guard against an empty validation loader.
    acc = correct / total if total else 0.0

    print('Epoch: {} \tTraining Loss: {:.5f} \tValidation Loss: {:.5f}'.format(epoch, train_loss, valid_loss))

    if acc > best_accuracy:
        best_accuracy = acc
        # Keep only the best model: remove earlier files before saving. The
        # inference cell loads the first '*.pth' file it finds in this folder.
        os.makedirs(model_dir, exist_ok=True)
        for stale_file in glob.glob(os.path.join(model_dir, '*')):
            if os.path.isfile(stale_file):
                os.remove(stale_file)
        torch.save(model.state_dict(), f'{model_dir}/bestcnn_{best_accuracy:2.3%}.pth')
        print(f'Saving New Best Model With an accuracy of: {best_accuracy:2.3%}')
    else:
        print(f'Did not get a better model at epoch: {epoch}')
print(':::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::\n')
print('-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_\n')
f = Figlet(font='slant')
print(f.renderText('Training End!'))

# Plotting the training and validation loss 

In [None]:
# Plot the per-epoch training and validation loss on a single figure.
epochs_list = np.arange(1, n_epochs + 1)
plt.plot(epochs_list, train_loss_list, label='Training')
plt.plot(epochs_list, valid_loss_list, label='Validation')
plt.xlabel("Epoch")
plt.ylabel("Loss")
plt.title("Performance of Baseline Model")
plt.legend()
plt.show()

# Generating Final predictions for the leaderboard

In [None]:


#function to filter the data from the predictions using a sliding window
def filter_out(pred_df, window=20):
    """Smooth a label sequence by removing short-lived label spikes.

    Every element that has a full window ahead of it is replaced by the most
    frequent label of the ``window`` elements starting at that element (the
    smallest label wins a tie). The last ``window`` elements are returned
    unchanged, and an input no longer than ``window`` is returned as is.

    Args:
        pred_df: Label sequence (DataFrame, Series, list or array). It is
            flattened to one dimension and is not modified.
        window: Number of elements looked at for each position.

    Returns:
        1-D NumPy array with the filtered labels.
    """
    # Work on a flat copy so the caller's data is never written to.
    arr = np.array(pred_df).reshape(-1)
    for start in range(len(arr) - window):
        # Each window only looks forward, so overwriting arr[start] cannot
        # influence the windows still to come.
        values, counts = np.unique(arr[start:start + window], return_counts=True)
        arr[start] = values[np.argmax(counts)]
    return arr



In [None]:
WINDOW = 40  # Window size; same value as the one used to build the training windows
STRIDE = 1
output_dim = 4
device = torch.device('cuda' if flag_cuda else 'cpu')
model = CNNAndLstm()
scale_columns = Session_Data.columns[:6]
model_path = '/kaggle/working/model/'
model_files = glob.glob(model_path + '*.pth')
if not model_files:
    raise FileNotFoundError(f'No saved model (*.pth) found in {model_path}')
model_file = model_files[0]  # path to the best model
# map_location loads the weights onto whichever device is in use.
model.load_state_dict(torch.load(model_file, map_location=device))
model = model.to(device)
model.eval()
dir_path = '/kaggle/input/comp-proj/ECE542_fa2021_Project_TerrainRecognition/TestData/'
column_list = ['ax', 'ay', 'az', 'gx', 'gy', 'gz']
df_test_data = pd.DataFrame([], columns=column_list)
label_files = []


def _window_mode(values):
    """Return the most frequent value of ``values`` (smallest one on ties)."""
    uniq, counts = np.unique(values, return_counts=True)
    return uniq[np.argmax(counts)]


## Reading Test files
# Only the '<subject>__x.csv' sensor files are processed; the accompanying
# '__x_time' / '__y_time' files are not needed here.
for x_file in sorted(glob.glob(dir_path + '*__x.csv')):
    subject_name = os.path.basename(x_file)[:-len('__x.csv')]
    df_x = pd.read_csv(x_file, names=column_list)

    # Apply the scaler that was fitted on the training data.
    df_x.loc[:, column_list] = scaler.transform(df_x[column_list].to_numpy())
    # Placeholder labels: extract_windows needs a label column, but its
    # output is ignored for the test data.
    df_x.insert(df_x.shape[1], 'label', -1)
    ## Converting windows for test data
    x_test, y_test = extract_windows(df_x[column_list], df_x.label, WINDOW, STRIDE)

    # Inference batch size. It does not have to match the training batch size.
    bs = 128
    x_test = torch.tensor(np.expand_dims(x_test, 1), dtype=torch.float32)
    y_test = torch.tensor(y_test, dtype=torch.long)

    test_ds = TensorDataset(x_test, y_test)
    # Keep the last partial batch so that every window gets a prediction.
    test_dl = DataLoader(test_ds, bs, shuffle=False, num_workers=cpu_count(), drop_last=False)

    batch_predictions = []
    with torch.no_grad():
        for x_batch, _ in test_dl:
            out = model(x_batch.to(device))
            # Bring predictions back to the CPU before converting to NumPy:
            # CUDA tensors cannot be converted directly.
            batch_predictions.append(out.argmax(dim=1).cpu().numpy())
    if batch_predictions:
        window_labels = np.concatenate(batch_predictions)
    else:
        window_labels = np.empty(0, dtype=np.int64)

    # The first WINDOW-1 samples have no complete window behind them; they
    # are given label 0.
    padded_labels = np.concatenate([np.zeros(WINDOW - 1, dtype=window_labels.dtype), window_labels])
    predictions_final = pd.DataFrame({'label': padded_labels})

    # Downsample by 4: keep every fourth entry of the rolling mode over the
    # four most recent predictions.
    predictions_final_downsampled = (
        predictions_final['label']
        .rolling(window=4, min_periods=1)
        .apply(_window_mode, raw=True)[::4]
    )

    predictions_final_downsampled = pd.DataFrame(predictions_final_downsampled.values)
    # Filter the labels to get a cleaner output by removing short label spikes.
    filtered_arr = filter_out(predictions_final_downsampled)
    predictions_final_downsampled = pd.DataFrame(filtered_arr)
    print("Saving file:", subject_name, "__y.csv")
    predictions_final_downsampled.to_csv('/kaggle/working/' + subject_name + '__y.csv', index=False, columns=None)

In [None]:
# Plot the filtered, downsampled labels of the last test file processed above.
# The data comes from `predictions_final_downsampled`, and the x-axis length is
# taken from the data instead of a hard-coded sample count.
lablesarr = np.asarray(predictions_final_downsampled).reshape(-1)
print(len(lablesarr))
print(lablesarr)
timelist = np.arange(len(lablesarr))
plt.figure(figsize=(30, 10))
plt.plot(timelist, lablesarr)
plt.xlabel("time")
plt.ylabel("label")
plt.legend(['Prediction'])
plt.title("Labels")
plt.show()