In [None]:
import numpy as np
import glob
import re
import shutil
import random
import pandas as pd
import tensorflow as tf
from scipy.interpolate import interp1d
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA 
from sklearn.decomposition import KernelPCA

In [None]:
!unzip /content/DataBase.zip

Archive:  /content/DataBase.zip
   creating: DataBase/SETA/
  inflating: DataBase/SETA/healthy_open1.csv  
  inflating: DataBase/SETA/healthy_open10.csv  
  inflating: DataBase/SETA/healthy_open11.csv  
  inflating: DataBase/SETA/healthy_open12.csv  
  inflating: DataBase/SETA/healthy_open2.csv  
  inflating: DataBase/SETA/healthy_open3.csv  
  inflating: DataBase/SETA/healthy_open4.csv  
  inflating: DataBase/SETA/healthy_open5.csv  
  inflating: DataBase/SETA/healthy_open6.csv  
  inflating: DataBase/SETA/healthy_open7.csv  
  inflating: DataBase/SETA/healthy_open8.csv  
  inflating: DataBase/SETA/healthy_open9.csv  
   creating: DataBase/SETB/
  inflating: DataBase/SETB/healthy_closed1.csv  
  inflating: DataBase/SETB/healthy_closed10.csv  
  inflating: DataBase/SETB/healthy_closed11.csv  
  inflating: DataBase/SETB/healthy_closed12.csv  
  inflating: DataBase/SETB/healthy_closed2.csv  
  inflating: DataBase/SETB/healthy_closed3.csv  
  inflating: DataBase/SETB/healthy_closed4.csv  

In [None]:
# Collect every path exactly one level below a sub-directory of DataBase
# (pattern DataBase/<dir>/<file>).
files = glob.glob("/content/DataBase/*/*")

In [None]:
# Sanity check: how many paths the glob matched.
len(files)

48

Clean up the files that contain values which are neither int nor float: each such value is replaced by the value from the previous time step or, when there is no previous value, by the value from the next time step.

In [None]:
def clean(path, max_rows=1024):
    """Repair non-numeric cells in a CSV and truncate it, overwriting the file.

    Every column that pandas did not parse as numeric is coerced with
    ``pd.to_numeric(..., errors='coerce')``; the resulting gaps are filled
    from the previous row and, for gaps at the very start, from the next row.

    Parameters
    ----------
    path : str
        CSV file to clean. The file is rewritten in place.
    max_rows : int, default 1024
        Only the first ``max_rows`` rows are kept.
    """
    df = pd.read_csv(path)
    for column in df.columns:
        # is_numeric_dtype also catches string columns that are not stored as
        # dtype 'object' (e.g. pandas' dedicated string dtype).
        if not pd.api.types.is_numeric_dtype(df[column]):
            print("Sample : ",path," feature : ",column," is uncleaned")
            numeric = pd.to_numeric(df[column], errors='coerce')
            # .ffill()/.bfill() replace the deprecated fillna(method=...) form.
            df[column] = numeric.ffill().bfill()
    df = df.iloc[:max_rows,:]
    df.to_csv(path, index=False)

In [None]:
# Clean every discovered file; clean() rewrites each CSV in place.
for i in files:
    clean(i)

Sample :  /content/DataBase/SETC/alzeimer_open8.csv  feature :  0  is uncleaned
Sample :  /content/DataBase/SETC/alzeimer_open10.csv  feature :  0  is uncleaned
Sample :  /content/DataBase/SETC/alzeimer_open5.csv  feature :  0  is uncleaned
Sample :  /content/DataBase/SETC/alzeimer_open6.csv  feature :  0  is uncleaned
Sample :  /content/DataBase/SETD/alzeimer_closed12.csv  feature :  18  is uncleaned
Sample :  /content/DataBase/SETB/healthy_closed2.csv  feature :  16  is uncleaned
Sample :  /content/DataBase/SETB/healthy_closed6.csv  feature :  14  is uncleaned
Sample :  /content/DataBase/SETA/healthy_open11.csv  feature :  14  is uncleaned
Sample :  /content/DataBase/SETA/healthy_open2.csv  feature :  16  is uncleaned


In [None]:
# Re-list the files after cleaning. This list is captured before any
# augmentation cell runs; return_splits() later reads it as a global to tell
# the pre-augmentation files apart from the generated ones.
files = glob.glob("/content/DataBase/*/*")
len(files)

48

In [None]:
# Snapshot the dataset before augmentation. NOTE: clean() has already
# overwritten the CSVs at this point, so "DataBaseOriginal" holds the cleaned,
# truncated files rather than the raw archive contents.
# copytree raises FileExistsError if the destination already exists, so this
# cell fails when re-run without deleting /content/DataBaseOriginal first.
shutil.copytree("/content/DataBase","/content/DataBaseOriginal")

# Data Augmentation Techniques : 
  1.) **Shifting and Noising :** Shifting in time series data refers to a transformation of the data where the entire series is moved forward or backward in time by a certain number of periods. This can be useful in analyzing time series data, as it can help to identify patterns or trends that may not be immediately apparent when looking at the original data. Noising adds random noise to the shifted data so that the learning model learns to ignore noisy information and to filter out the relevant information from the data.

 2.) **Rolling Mean :** Rolling mean is a technique used to smooth out the data by taking the average of a fixed window of data points. This technique can be useful for reducing noise in the data and identifying trends that are relevant.


1.) Shifting 

In [None]:
def _shift_and_noise(path, periods, replace, value):
    """Load a CSV, shift all rows by ``periods``, fill the gap and add noise."""
    df = pd.read_csv(path).shift(periods)
    if replace:
        # Fill the rows vacated by the shift with each column's mean.
        df = df.fillna(df.mean())
    df = df.fillna(value)
    # Per-column spread of the shifted data; NaN (e.g. a single-row file) -> 0.
    deviation = df.std().fillna(0).to_numpy()
    # Draw an independent sample for every cell. A single draw per column would
    # only add a constant offset to that column, which is not noise and is
    # removed again by any later per-column standardisation.
    noise = np.random.normal(0.0, deviation, size=df.shape)
    return df + noise / 10


def positive_shift(path,replace = False,value = 0):
    """Return the CSV at ``path`` shifted forward in time with added noise.

    The shift is a random 3..15 rows. Rows vacated at the start are filled
    with the column mean when ``replace`` is true, otherwise with ``value``.
    Gaussian noise with one tenth of each column's standard deviation is then
    added to every cell.

    Parameters
    ----------
    path : str
        CSV file to read; the file itself is not modified.
    replace : bool, default False
        Fill vacated rows with the column mean instead of ``value``.
    value : scalar, default 0
        Fill value for vacated rows (and any other missing cells).

    Returns
    -------
    pandas.DataFrame
        Same shape and columns as the input file.
    """
    return _shift_and_noise(path, random.randint(3,15), replace, value)


def negative_shift(path,replace = False,value = 0):
    """Return the CSV at ``path`` shifted backward in time with added noise.

    Identical to ``positive_shift`` except that the random shift is -15..-3
    rows, so the vacated rows are at the end of the series.
    """
    return _shift_and_noise(path, random.randint(-15,-3), replace, value)

def shift(path,label,file_no,target_dir):
    """Write one randomly shifted copy of ``path`` into ``target_dir``.

    A coin flip picks between ``positive_shift`` and ``negative_shift``; the
    result is saved as ``<target_dir>/<label><file_no>.csv`` and the mapping
    from source to destination is printed.
    """
    go_forward = random.randint(0,1) == 1
    file_name = target_dir+"/"+(label+str(file_no)+".csv")
    if go_forward:
        shifted = positive_shift(path)
        print(path," -> ",file_name," Positive Shift")
    else:
        shifted = negative_shift(path)
        print(path," -> ",file_name," Negative Shift")
    shifted.to_csv(file_name,index = False)


def add_shift_per_sample(path,label):
    """Create one shifted copy for every file currently in directory ``path``.

    The file list is taken once, before anything is written, and the copies
    are numbered consecutively starting at 25 (``<label>25.csv``, ...).
    """
    sources = glob.glob(path+"/*")
    for file_no, source in enumerate(sources, start=25):
        shift(source,label,file_no,path)
   

In [None]:
# Write one randomly shifted copy of every file currently in SETA,
# numbered from healthy_open25.csv onward.
add_shift_per_sample("/content/DataBase/SETA",'healthy_open')

/content/DataBase/SETA/healthy_open12.csv  ->  /content/DataBase/SETA/healthy_open25.csv  Positive Shift
/content/DataBase/SETA/healthy_open6.csv  ->  /content/DataBase/SETA/healthy_open26.csv  Negative Shift
/content/DataBase/SETA/healthy_open4.csv  ->  /content/DataBase/SETA/healthy_open27.csv  Positive Shift
/content/DataBase/SETA/healthy_open7.csv  ->  /content/DataBase/SETA/healthy_open28.csv  Negative Shift
/content/DataBase/SETA/healthy_open11.csv  ->  /content/DataBase/SETA/healthy_open29.csv  Negative Shift
/content/DataBase/SETA/healthy_open8.csv  ->  /content/DataBase/SETA/healthy_open30.csv  Negative Shift
/content/DataBase/SETA/healthy_open2.csv  ->  /content/DataBase/SETA/healthy_open31.csv  Negative Shift
/content/DataBase/SETA/healthy_open10.csv  ->  /content/DataBase/SETA/healthy_open32.csv  Negative Shift
/content/DataBase/SETA/healthy_open9.csv  ->  /content/DataBase/SETA/healthy_open33.csv  Negative Shift
/content/DataBase/SETA/healthy_open1.csv  ->  /content/DataBa

In [None]:
# Write one randomly shifted copy of every file currently in SETB,
# numbered from healthy_closed25.csv onward.
add_shift_per_sample("/content/DataBase/SETB",'healthy_closed')

/content/DataBase/SETB/healthy_closed12.csv  ->  /content/DataBase/SETB/healthy_closed25.csv  Positive Shift
/content/DataBase/SETB/healthy_closed2.csv  ->  /content/DataBase/SETB/healthy_closed26.csv  Positive Shift
/content/DataBase/SETB/healthy_closed1.csv  ->  /content/DataBase/SETB/healthy_closed27.csv  Positive Shift
/content/DataBase/SETB/healthy_closed3.csv  ->  /content/DataBase/SETB/healthy_closed28.csv  Negative Shift
/content/DataBase/SETB/healthy_closed5.csv  ->  /content/DataBase/SETB/healthy_closed29.csv  Negative Shift
/content/DataBase/SETB/healthy_closed11.csv  ->  /content/DataBase/SETB/healthy_closed30.csv  Negative Shift
/content/DataBase/SETB/healthy_closed9.csv  ->  /content/DataBase/SETB/healthy_closed31.csv  Positive Shift
/content/DataBase/SETB/healthy_closed6.csv  ->  /content/DataBase/SETB/healthy_closed32.csv  Negative Shift
/content/DataBase/SETB/healthy_closed4.csv  ->  /content/DataBase/SETB/healthy_closed33.csv  Positive Shift
/content/DataBase/SETB/hea

In [None]:
# Write one randomly shifted copy of every file currently in SETC,
# numbered from alzeimer_open25.csv onward.
add_shift_per_sample("/content/DataBase/SETC",'alzeimer_open')

/content/DataBase/SETC/alzeimer_open1.csv  ->  /content/DataBase/SETC/alzeimer_open25.csv  Positive Shift
/content/DataBase/SETC/alzeimer_open8.csv  ->  /content/DataBase/SETC/alzeimer_open26.csv  Positive Shift
/content/DataBase/SETC/alzeimer_open3.csv  ->  /content/DataBase/SETC/alzeimer_open27.csv  Negative Shift
/content/DataBase/SETC/alzeimer_open2.csv  ->  /content/DataBase/SETC/alzeimer_open28.csv  Positive Shift
/content/DataBase/SETC/alzeimer_open12.csv  ->  /content/DataBase/SETC/alzeimer_open29.csv  Negative Shift
/content/DataBase/SETC/alzeimer_open11.csv  ->  /content/DataBase/SETC/alzeimer_open30.csv  Negative Shift
/content/DataBase/SETC/alzeimer_open10.csv  ->  /content/DataBase/SETC/alzeimer_open31.csv  Positive Shift
/content/DataBase/SETC/alzeimer_open5.csv  ->  /content/DataBase/SETC/alzeimer_open32.csv  Positive Shift
/content/DataBase/SETC/alzeimer_open4.csv  ->  /content/DataBase/SETC/alzeimer_open33.csv  Negative Shift
/content/DataBase/SETC/alzeimer_open7.csv  

In [None]:
# Write one randomly shifted copy of every file currently in SETD,
# numbered from alzeimer_closed25.csv onward.
add_shift_per_sample("/content/DataBase/SETD",'alzeimer_closed')

/content/DataBase/SETD/alzeimer_closed11.csv  ->  /content/DataBase/SETD/alzeimer_closed25.csv  Negative Shift
/content/DataBase/SETD/alzeimer_closed1.csv  ->  /content/DataBase/SETD/alzeimer_closed26.csv  Negative Shift
/content/DataBase/SETD/alzeimer_closed2.csv  ->  /content/DataBase/SETD/alzeimer_closed27.csv  Positive Shift
/content/DataBase/SETD/alzeimer_closed12.csv  ->  /content/DataBase/SETD/alzeimer_closed28.csv  Negative Shift
/content/DataBase/SETD/alzeimer_closed3.csv  ->  /content/DataBase/SETD/alzeimer_closed29.csv  Positive Shift
/content/DataBase/SETD/alzeimer_closed6.csv  ->  /content/DataBase/SETD/alzeimer_closed30.csv  Negative Shift
/content/DataBase/SETD/alzeimer_closed10.csv  ->  /content/DataBase/SETD/alzeimer_closed31.csv  Positive Shift
/content/DataBase/SETD/alzeimer_closed8.csv  ->  /content/DataBase/SETD/alzeimer_closed32.csv  Negative Shift
/content/DataBase/SETD/alzeimer_closed7.csv  ->  /content/DataBase/SETD/alzeimer_closed33.csv  Negative Shift
/conten

2.) Rolling Mean

In [None]:
def mean_roll(path,label,file_no,target_dir):
    """Write a rolling-mean smoothed copy of the CSV at ``path``.

    A centred rolling mean with a randomly chosen window of 3..8 rows is
    applied to every column. Rows at the edges, where the window is
    incomplete, are set to 0. The result is saved as
    ``<target_dir>/<label><file_no>.csv`` and the mapping is printed.

    Parameters
    ----------
    path : str
        Source CSV; not modified.
    label : str
        File-name prefix of the generated file.
    file_no : int
        Number appended to ``label`` in the file name.
    target_dir : str
        Directory the generated file is written to.
    """
    df = pd.read_csv(path)
    # Use the randomly drawn window; previously it was drawn but a fixed
    # window of 5 was passed to rolling(), so every copy was smoothed alike.
    window = random.randint(3,8)
    smoothed = df.rolling(window = window,center = True).mean().fillna(0)
    file_name = target_dir+"/"+(label+str(file_no)+".csv")
    print(path," -> ",file_name)
    # No further filling is needed: fillna(0) above leaves no missing cells.
    smoothed.to_csv(file_name,index = False)

def add_rolling_per_sample(path,label):
    """Create one rolling-mean copy for every file currently in ``path``.

    The file list is taken once, before anything is written, and the copies
    are numbered consecutively starting at 49 (``<label>49.csv``, ...).
    """
    sources = glob.glob(path+"/*")
    for file_no, source in enumerate(sources, start=49):
        mean_roll(source,label,file_no,path)

In [None]:
# Write a rolling-mean copy of every file currently in SETA (the shifted
# copies created earlier are inputs too), numbered from healthy_open49.csv.
add_rolling_per_sample("/content/DataBase/SETA",'healthy_open')

/content/DataBase/SETA/healthy_open25.csv  ->  /content/DataBase/SETA/healthy_open49.csv
/content/DataBase/SETA/healthy_open32.csv  ->  /content/DataBase/SETA/healthy_open50.csv
/content/DataBase/SETA/healthy_open28.csv  ->  /content/DataBase/SETA/healthy_open51.csv
/content/DataBase/SETA/healthy_open12.csv  ->  /content/DataBase/SETA/healthy_open52.csv
/content/DataBase/SETA/healthy_open6.csv  ->  /content/DataBase/SETA/healthy_open53.csv
/content/DataBase/SETA/healthy_open4.csv  ->  /content/DataBase/SETA/healthy_open54.csv
/content/DataBase/SETA/healthy_open34.csv  ->  /content/DataBase/SETA/healthy_open55.csv
/content/DataBase/SETA/healthy_open36.csv  ->  /content/DataBase/SETA/healthy_open56.csv
/content/DataBase/SETA/healthy_open31.csv  ->  /content/DataBase/SETA/healthy_open57.csv
/content/DataBase/SETA/healthy_open7.csv  ->  /content/DataBase/SETA/healthy_open58.csv
/content/DataBase/SETA/healthy_open11.csv  ->  /content/DataBase/SETA/healthy_open59.csv
/content/DataBase/SETA/h

In [None]:
# Write a rolling-mean copy of every file currently in SETB (the shifted
# copies created earlier are inputs too), numbered from healthy_closed49.csv.
add_rolling_per_sample("/content/DataBase/SETB",'healthy_closed')

/content/DataBase/SETB/healthy_closed12.csv  ->  /content/DataBase/SETB/healthy_closed49.csv
/content/DataBase/SETB/healthy_closed32.csv  ->  /content/DataBase/SETB/healthy_closed50.csv
/content/DataBase/SETB/healthy_closed2.csv  ->  /content/DataBase/SETB/healthy_closed51.csv
/content/DataBase/SETB/healthy_closed1.csv  ->  /content/DataBase/SETB/healthy_closed52.csv
/content/DataBase/SETB/healthy_closed35.csv  ->  /content/DataBase/SETB/healthy_closed53.csv
/content/DataBase/SETB/healthy_closed26.csv  ->  /content/DataBase/SETB/healthy_closed54.csv
/content/DataBase/SETB/healthy_closed3.csv  ->  /content/DataBase/SETB/healthy_closed55.csv
/content/DataBase/SETB/healthy_closed5.csv  ->  /content/DataBase/SETB/healthy_closed56.csv
/content/DataBase/SETB/healthy_closed11.csv  ->  /content/DataBase/SETB/healthy_closed57.csv
/content/DataBase/SETB/healthy_closed9.csv  ->  /content/DataBase/SETB/healthy_closed58.csv
/content/DataBase/SETB/healthy_closed6.csv  ->  /content/DataBase/SETB/heal

In [None]:
# Write a rolling-mean copy of every file currently in SETC (the shifted
# copies created earlier are inputs too), numbered from alzeimer_open49.csv.
add_rolling_per_sample("/content/DataBase/SETC",'alzeimer_open')

/content/DataBase/SETC/alzeimer_open27.csv  ->  /content/DataBase/SETC/alzeimer_open49.csv
/content/DataBase/SETC/alzeimer_open1.csv  ->  /content/DataBase/SETC/alzeimer_open50.csv
/content/DataBase/SETC/alzeimer_open8.csv  ->  /content/DataBase/SETC/alzeimer_open51.csv
/content/DataBase/SETC/alzeimer_open29.csv  ->  /content/DataBase/SETC/alzeimer_open52.csv
/content/DataBase/SETC/alzeimer_open3.csv  ->  /content/DataBase/SETC/alzeimer_open53.csv
/content/DataBase/SETC/alzeimer_open30.csv  ->  /content/DataBase/SETC/alzeimer_open54.csv
/content/DataBase/SETC/alzeimer_open26.csv  ->  /content/DataBase/SETC/alzeimer_open55.csv
/content/DataBase/SETC/alzeimer_open35.csv  ->  /content/DataBase/SETC/alzeimer_open56.csv
/content/DataBase/SETC/alzeimer_open34.csv  ->  /content/DataBase/SETC/alzeimer_open57.csv
/content/DataBase/SETC/alzeimer_open2.csv  ->  /content/DataBase/SETC/alzeimer_open58.csv
/content/DataBase/SETC/alzeimer_open12.csv  ->  /content/DataBase/SETC/alzeimer_open59.csv
/co

In [None]:
# Write a rolling-mean copy of every file currently in SETD (the shifted
# copies created earlier are inputs too), numbered from alzeimer_closed49.csv.
add_rolling_per_sample("/content/DataBase/SETD",'alzeimer_closed')

/content/DataBase/SETD/alzeimer_closed27.csv  ->  /content/DataBase/SETD/alzeimer_closed49.csv
/content/DataBase/SETD/alzeimer_closed35.csv  ->  /content/DataBase/SETD/alzeimer_closed50.csv
/content/DataBase/SETD/alzeimer_closed11.csv  ->  /content/DataBase/SETD/alzeimer_closed51.csv
/content/DataBase/SETD/alzeimer_closed36.csv  ->  /content/DataBase/SETD/alzeimer_closed52.csv
/content/DataBase/SETD/alzeimer_closed30.csv  ->  /content/DataBase/SETD/alzeimer_closed53.csv
/content/DataBase/SETD/alzeimer_closed1.csv  ->  /content/DataBase/SETD/alzeimer_closed54.csv
/content/DataBase/SETD/alzeimer_closed2.csv  ->  /content/DataBase/SETD/alzeimer_closed55.csv
/content/DataBase/SETD/alzeimer_closed12.csv  ->  /content/DataBase/SETD/alzeimer_closed56.csv
/content/DataBase/SETD/alzeimer_closed29.csv  ->  /content/DataBase/SETD/alzeimer_closed57.csv
/content/DataBase/SETD/alzeimer_closed3.csv  ->  /content/DataBase/SETD/alzeimer_closed58.csv
/content/DataBase/SETD/alzeimer_closed25.csv  ->  /co

In [None]:
# Zip the whole DataBase directory (original plus generated files) into
# Augmented.zip; the path of the created archive is the cell's output.
shutil.make_archive('Augmented', 'zip','/content/DataBase')

'/content/Augmented.zip'

**Splitting data to train and test**

In [None]:
def return_splits(original_files=None, group_size=12, train_per_group=4,
                  pattern="/content/DataBase/*/*"):
    """Split the dataset files into a training list and a test list.

    Every file matched by ``pattern`` that is not in ``original_files`` (the
    generated, augmented files) goes to the training list. The original files
    are walked in consecutive groups of ``group_size``; the first
    ``train_per_group`` of each group are added to training and the rest form
    the test list.

    Both lists keep the order of their inputs, so the result is reproducible
    for a given file listing (set arithmetic would reorder them from run to
    run).

    NOTE(review): augmented files derived from held-out originals still end up
    in the training list. The generated file names do not record their source
    file, so that leakage cannot be filtered out here.

    Parameters
    ----------
    original_files : sequence of str, optional
        Pre-augmentation file paths. Defaults to the module-level ``files``.
    group_size : int, default 12
        Number of consecutive original files treated as one group.
    train_per_group : int, default 4
        How many files from the start of each group go to training.
    pattern : str
        Glob pattern listing all files, original and augmented.

    Returns
    -------
    (list of str, list of str)
        ``(train_files, test_files)``.
    """
    if original_files is None:
        original_files = files
    originals = list(original_files)
    known_originals = set(originals)

    train_files = [f for f in glob.glob(pattern) if f not in known_originals]
    for start in range(0, len(originals), group_size):
        train_files += originals[start:start + train_per_group]

    in_train = set(train_files)
    test_files = [f for f in originals if f not in in_train]
    return train_files, test_files
# Build the split once; return_splits() reads the module-level `files` list
# that was captured before the augmentation cells ran.
train,test = return_splits()

In [None]:
# Number of files in the training and test lists.
len(train),len(test)

(160, 32)

# Compressing Time Series Data 
PCA can be used to compress time series data of many time steps into a vector of much shorter length by identifying the most important patterns or features in the data and representing it using a smaller set of variables or dimensions, called principal components. The implementation below uses kernel PCA with an RBF kernel, a non-linear variant of PCA.
Suppose we have a time series dataset with 10,000 time steps and each time step has multiple variables. We can apply PCA to this data to identify the most important patterns or features across all the variables. PCA will generate a set of principal components, with each component representing a linear combination of the original variables that captures the most variance in the data.


In [None]:
def decompose(wave_data):
    """Compress a 2-D recording into one number per input column.

    The input is transposed so that each input column becomes one sample for
    an RBF ``KernelPCA`` (5 components, gamma 0.1); the 5 component scores of
    each sample are then averaged.

    Parameters
    ----------
    wave_data : 2-D array
        Presumably rows are time steps and columns are channels, as passed in
        by create_ds -- TODO confirm.

    Returns
    -------
    list of float
        One value per column of ``wave_data``.
    """
    reducer = KernelPCA(n_components=5, kernel='rbf', gamma=0.1)
    component_scores = reducer.fit_transform(wave_data.T)
    per_column_mean = component_scores.mean(axis = 1)
    return per_column_mean.tolist()

In [None]:
def handle_nan(df_f):
    """Load a CSV and return it with every missing value filled.

    Missing cells are filled from the previous row, then from the next row.
    A column with no valid value at all cannot be filled that way and is set
    to 0 (repeating the fill in a loop would never terminate for such a
    column). When the file contained missing values, its path and the number
    of missing cells are printed.

    Parameters
    ----------
    df_f : str
        Path of the CSV file to read; the file is not modified.

    Returns
    -------
    pandas.DataFrame
        The file contents without any missing values.
    """
    df = pd.read_csv(df_f)
    # Report the actual number of missing cells, not the number of fill passes.
    count = int(df.isnull().sum().sum())
    if count != 0:
        print(df_f,end = " ")
        # .ffill()/.bfill() replace the deprecated fillna(method=...) form.
        df = df.ffill().bfill()
        # Only reached by columns that are entirely missing.
        df = df.fillna(0)
        print("   " , count, "null values found and taken care of","\n")
    return df

In [None]:
def create_ds(all_files):
    """Build a feature table with one row per file in ``all_files``.

    Each file is loaded through ``handle_nan``, standardised per column with a
    freshly fitted ``StandardScaler`` and compressed with ``decompose``. Two
    labels derived from substrings of the file path are appended to the
    feature list: the eye state ("closed" / "open") and the diagnosis
    ("healthy" / "alzeimer"). A label is only appended when its substring is
    found in the path.

    Returns
    -------
    pandas.DataFrame
        One row per file: compressed features followed by the labels.
    """
    rows = []
    for csv_path in all_files:
        frame = handle_nan(csv_path)
        standardized = StandardScaler().fit_transform(frame)
        row = decompose(standardized)

        # "closed" is tested first, exactly like the diagnosis check below.
        for candidates in (("closed", "open"), ("healthy", "alzeimer")):
            matched = next((tag for tag in candidates if tag in csv_path), None)
            if matched is not None:
                row.append(matched)

        rows.append(row)
    return pd.DataFrame(rows)

In [None]:
# Build the training table: one row per training file, holding the compressed
# features followed by the eye-state and diagnosis labels.
train_df = create_ds(train)

/content/DataBase/SETC/alzeimer_open8.csv     1 null values found and taken care of 



In [None]:
# Save the training table to train.csv without the index column.
train_df.to_csv("train.csv",index= False)

In [None]:
# Build the test table the same way, from the held-out original files.
test_df = create_ds(test)

/content/DataBase/SETC/alzeimer_open5.csv     1 null values found and taken care of 

/content/DataBase/SETC/alzeimer_open6.csv     1 null values found and taken care of 

/content/DataBase/SETC/alzeimer_open10.csv     1 null values found and taken care of 



In [None]:
# Save the test table to test.csv without the index column.
test_df.to_csv("test.csv",index= False)