#Connect to drive

In [None]:
# Mount Google Drive at /gdrive and move into the shared challenge folder.
from google.colab import drive
drive.mount('/gdrive')
# Later cells open files by relative name, so they depend on this working directory.
%cd /gdrive/My Drive/[2023-2024] AN2DL/Challenge2

Drive already mounted at /gdrive; to attempt to forcibly remount, call drive.mount("/gdrive", force_remount=True).
/gdrive/.shortcut-targets-by-id/1LRDqoHBsKCFWi3z-Hv1rH_faI3hVcyc4/Challenge2


#Import libraries

In [None]:
# Fix randomness and hide warnings
# A single seed is shared by every random number generator seeded in this cell.
seed = 42

import os
# Silence TensorFlow's native (C++) log output.
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
# NOTE(review): PYTHONHASHSEED is read at interpreter start-up; setting it here
# presumably only affects child processes, not this kernel - confirm.
os.environ['PYTHONHASHSEED'] = str(seed)
# Keep Matplotlib's config directory under the current working directory.
os.environ['MPLCONFIGDIR'] = os.getcwd()+'/configs/'

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
# Warning is the base class of every warning category, so this filter hides ALL
# warnings (including NumPy deprecation warnings) and makes the line above redundant.
warnings.simplefilter(action='ignore', category=Warning)

import numpy as np
# Seeds NumPy's legacy global generator, which the np.random.* calls below rely on.
np.random.seed(seed)

import logging

import random
random.seed(seed)

# Import tensorflow
import tensorflow as tf
from tensorflow import keras as tfk
from tensorflow.keras import layers as tfkl
# Reduce TensorFlow / AutoGraph logging to errors only.
tf.autograph.set_verbosity(0)
tf.get_logger().setLevel(logging.ERROR)
tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.ERROR)
tf.random.set_seed(seed)
tf.compat.v1.set_random_seed(seed)
# Record the TensorFlow version the notebook was executed with.
print(tf.__version__)

import pandas as pd
import seaborn as sns
from datetime import datetime
import matplotlib.pyplot as plt
# Larger default font for every figure drawn later.
plt.rc('font', size=16)
from sklearn.preprocessing import MinMaxScaler

2.15.0


#Load data

##unzip data

In [None]:
import zipfile

# One-off step: switch this flag to True only when the archive still has to be
# unpacked; leave it False on later runs so the extraction is skipped.
unzip = False

if unzip:
    extracted_dir = 'training_dataset'
    # Archive and target folder are both relative to the current working directory.
    with zipfile.ZipFile('training_dataset.zip', 'r') as archive:
        archive.extractall(path=extracted_dir)
    print(f"Successfully extracted contents to {extracted_dir}")

##load

In [None]:
# Move into the dataset folder so the .npy files below can be loaded by bare file name.
%cd /gdrive/My Drive/[2023-2024] AN2DL/Challenge2/training_dataset

/gdrive/.shortcut-targets-by-id/1LRDqoHBsKCFWi3z-Hv1rH_faI3hVcyc4/Challenge2/training_dataset


###categories

In [None]:
# Class label of every time series (one label per series).
categories=np.load('categories.npy')

In [None]:
# Number of labels; presumably equal to the number of rows in training_data - verify.
categories.shape

(48000,)

In [None]:
# Class balance: list every distinct label together with how many series carry it.
unique_elements, counts = np.unique(categories, return_counts=True)
for position in range(unique_elements.shape[0]):
    element, count = unique_elements[position], counts[position]
    print(f"Element: {element}, Count: {count}")

Element: A, Count: 5728
Element: B, Count: 10987
Element: C, Count: 10017
Element: D, Count: 10016
Element: E, Count: 10975
Element: F, Count: 277


In [None]:
# Locate the positions where the label changes from one series to the next.
# The labels turn out to be stored in contiguous per-class runs, so these
# positions are the class boundaries inside the dataset.
first = categories[0]
print(first)

# Index i is reported when categories[i-1] != categories[i]. The comparison is
# vectorised, and the loop no longer rebinds the builtin `next` in the notebook's
# global namespace (which would break any later `next(iterator)` call).
change_indices = np.flatnonzero(categories[1:] != categories[:-1]) + 1
cambi = len(change_indices)  # number of label changes ("cambi" = changes)
for i in change_indices:
    print(categories[i], 'i: ', i)


D
E i:  10016
A i:  20991
C i:  26719
B i:  36736
F i:  47723


###valid_periods

In [None]:
# Per-series pair of indices, used below as the start:end bounds of the
# non-padded part of each series.
valid_periods=np.load('valid_periods.npy')

In [None]:
# One (start, end) row per series.
valid_periods.shape

(48000, 2)

###training_data

In [None]:
# Padded time series, one per row; valid_periods gives the useful window of each row.
training_data=np.load('training_data.npy')

#Extract valid signals
Having understood how the x-axis relates to valid_periods, I now want to see how the individual signals behave along the y-axis and whether their distributions are comparable. To do this I first have to extract, for each series, the window in which the signal is not padding: if a series of length 2500 is zero for 2000 samples, its mean will be dominated by those zeros. Extracting the windows also lets me look at the distribution of the valid-period lengths.

In [None]:
def extract_valid_signals(training_data,valid_periods):
  """Strip the padding from every series, keeping only its valid window.

  Args:
    training_data: 2-D array with one padded series per row
      (described in the original notes as shape (48000, 2776)).
    valid_periods: 2-D integer array with one (start, end) pair per row;
      series i keeps training_data[i, start:end] (end excluded).

  Returns:
    1-D numpy array of dtype object with one entry per row of
    training_data; entry i is the 1-D slice holding only the valid samples
    of series i. Entries generally have different lengths.
  """
  n_series = training_data.shape[0]

  # Fill a pre-allocated object array instead of calling np.array() on a list
  # of slices: np.array() raises ValueError on ragged input in NumPy >= 1.24
  # and would return a 2-D array if every window had the same length.
  # This way the result always has shape (n_series,).
  valid_signal = np.empty(n_series, dtype=object)
  for i in range(n_series):
    start, end = valid_periods[i][0], valid_periods[i][1]
    valid_signal[i] = training_data[i, start:end]

  return valid_signal

In [None]:
# Unpadded version of every series; this is what gets split by class below.
valid_signal=extract_valid_signals(training_data,valid_periods)

In [None]:
# One entry per series (each entry is itself an array of that series' valid samples).
valid_signal.shape

(48000,)

# STRATIFICATION

In [None]:
# Group the unpadded signals by class label.
# The subsets are selected with the labels themselves rather than with slice
# bounds copied from a printed output, so the cell stays correct if the dataset
# is reloaded, reordered or extended. A boolean mask keeps the original order
# of the series inside each class.
x_a = valid_signal[categories == 'A']
x_b = valid_signal[categories == 'B']
x_c = valid_signal[categories == 'C']
x_d = valid_signal[categories == 'D']
x_e = valid_signal[categories == 'E']
x_f = valid_signal[categories == 'F']

In [None]:
# Sanity check: should match the number of series labelled 'A'.
x_a.shape

(5728,)

In [None]:
# Fractions of each class assigned to the train / validation / test splits.
p_train, p_val, p_test = 0.7, 0.15, 0.15

In [None]:
# Stratified split: every class is divided on its own, so each split keeps the
# class proportions of the full dataset.
classes = [x_a,x_b,x_c,x_d,x_e,x_f]
n_classes = len(classes)

# One slot per class. Object arrays are filled slot by slot because np.array()
# on per-class arrays of different lengths raises ValueError in NumPy >= 1.24.
X_train = np.empty(n_classes, dtype=object)
X_val = np.empty(n_classes, dtype=object)
X_test = np.empty(n_classes, dtype=object)

for i, class_signals in enumerate(classes):
  # Training part: a random sample (without replacement) of p_train of the class.
  n_samples = class_signals.shape[0]
  train_idx = np.random.choice(np.arange(0, n_samples), size=int(n_samples*p_train), replace=False)
  X_train[i] = class_signals[train_idx]
  remaining = np.delete(class_signals, train_idx)

  # Validation part: drawn from what is left, with the fraction rescaled to the
  # remainder. NOTE: p_val/(1-p_train) evaluates to slightly less than 0.5 in
  # floating point, so int() rounds down even for an even-sized remainder and
  # validation then gets one sample fewer than test.
  n_remaining = remaining.shape[0]
  val_idx = np.random.choice(np.arange(0, n_remaining), size=int(n_remaining*(p_val/(1-p_train))), replace=False)
  X_val[i] = remaining[val_idx]

  # Test part: everything not picked for training or validation.
  X_test[i] = np.delete(remaining, val_idx)

In [None]:
# Train / validation / test sizes for the first class in the list (x_a).
print(X_train[0].shape, X_val[0].shape, X_test[0].shape)

(4009,) (859,) (860,)


In [None]:
# One entry per class, each holding that class's test signals.
X_test.shape

(6,)

In [None]:
# Merge the per-class training subsets into one flat collection of signals.
# np.concatenate joins the per-class object arrays end to end and keeps every
# signal as a single element. Calling np.array() on a list of signals with
# different lengths raises ValueError in NumPy >= 1.24.
X_train_final = np.concatenate(list(X_train))

In [None]:
# Merge the per-class validation subsets into one flat collection of signals.
# np.concatenate joins the per-class object arrays end to end and keeps every
# signal as a single element. Calling np.array() on a list of signals with
# different lengths raises ValueError in NumPy >= 1.24.
X_val_final = np.concatenate(list(X_val))

In [None]:
# Total number of training and validation signals after merging the classes.
print(X_train_final.shape, X_val_final.shape)

(33596,) (7198,)


In [None]:
# From here on X_train / X_val refer to the flat (all classes merged) collections.
# NOTE: this replaces the per-class containers, so the flattening cells above
# must not be re-run after this cell without re-running the split first.
X_train, X_val = X_train_final, X_val_final

In [None]:
# Persist the training and validation splits to Drive.
# The arrays hold variable-length signals (object dtype), so np.save pickles them
# and they must be read back with np.load(..., allow_pickle=True).
# NOTE(review): this path uses "MyDrive" while the %cd cells use "My Drive" -
# presumably both resolve to the same folder on Colab; confirm.
# NOTE(review): X_test is not saved here - confirm whether it is persisted elsewhere.
np.save('/gdrive/MyDrive/[2023-2024] AN2DL/Challenge2/X_train.npy', X_train)
np.save('/gdrive/MyDrive/[2023-2024] AN2DL/Challenge2/X_val.npy', X_val)