# Extracción de características

In [1]:
from os import listdir, makedirs, path  # Filesystem helpers for reading/writing files

import librosa                    # Audio-analysis library; one of its helpers is handy here
import numpy as np
import pandas as pd
import sklearn
import statsmodels as sm
from scipy.signal import find_peaks, find_peaks_cwt, peak_prominences, periodogram, stft, peak_widths
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split
from statsmodels.tsa import stattools
from statsmodels.tsa.holtwinters import ExponentialSmoothing

In [2]:
def _forecast_trailing_gap(history, horizon, fallback, max_lags=40):
    """Forecast the ``horizon`` samples that follow ``history``.

    An additive-seasonal Holt-Winters model (no trend) is fitted on ``history``;
    its seasonal period is derived from the autocorrelation of ``history``.
    When the history is too short or constant, so that no seasonal model can be
    fitted, every forecast value is ``fallback`` instead.

    Parameters
    ----------
    history : 1-D float ndarray without NaN
        Known samples that precede the gap.
    horizon : int
        Number of samples to produce.
    fallback : float
        Value used when a seasonal fit is not possible.
    max_lags : int, default 40
        Largest autocorrelation lag inspected when choosing the period.

    Returns
    -------
    ndarray of shape (horizon,)
    """
    # The autocorrelation can be evaluated for at most len(history) - 1 lags.
    n_lags = min(max_lags, history.size - 1)
    if n_lags >= 1:
        autocorrelation = stattools.acf(history, nlags=n_lags, fft=False)
        lagged = autocorrelation[1:]  # skip lag 0 (the series against itself)
        # A zero-variance history yields NaN autocorrelations: nothing to pick from.
        if np.isfinite(lagged).all():
            # NOTE(review): lagged[i] is the autocorrelation at lag i + 1, so this
            # selects (best lag + 1) as the period, exactly like the previous
            # implementation. It also keeps the period >= 2, which the seasonal
            # model requires. Confirm that the extra +1 is intended.
            period = int(np.argmax(lagged)) + 2
            # The seasonal component needs at least two full cycles of data.
            if history.size >= 2 * period:
                model = ExponentialSmoothing(history, trend=None, seasonal="add",
                                             seasonal_periods=period,
                                             initialization_method="estimated")
                return np.asarray(model.fit(optimized=True).forecast(horizon))
    return np.full(horizon, fallback, dtype=float)


def impute_df(df, whole_stats):
    """Fill the missing values of every column of ``df`` (modified in place).

    For each column that contains NaN:

    * a column with no valid sample at all is filled with the column's global
      mean, read from ``whole_stats.loc[column, "mean"]``;
    * NaN located before the last valid sample are filled with that mean;
    * NaN located after the last valid sample are forecast with Holt-Winters
      fitted on everything up to and including the last valid sample, falling
      back to the mean when such a model cannot be fitted.

    Parameters
    ----------
    df : pandas.DataFrame
        Numeric columns whose names are present in ``whole_stats.index``.
    whole_stats : pandas.DataFrame
        Table indexed by column name with a ``"mean"`` column.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, without NaN in the processed columns.
    """
    nrows = df.shape[0]
    nans_per_sensor = df.isna().sum(axis=0)
    for sensor in df.columns:
        if not nans_per_sensor[sensor]:
            continue

        mean = whole_stats.loc[sensor, "mean"]
        # Work on a private, writable copy: `.values` may be a read-only view.
        values = df[sensor].to_numpy(dtype=float, copy=True)
        missing = np.isnan(values)
        # flatnonzero always returns a 1-D array, even for a single valid sample.
        observed = np.flatnonzero(~missing)

        if observed.size == 0:
            values[:] = mean
        else:
            last_observed = observed[-1]

            # Leading and interior gaps -> global mean.
            head_gaps = missing.copy()
            head_gaps[last_observed + 1:] = False
            values[head_gaps] = mean

            # Trailing gap -> forecast from all the (now complete) samples before it.
            horizon = nrows - 1 - last_observed
            if horizon > 0:
                values[last_observed + 1:] = _forecast_trailing_gap(
                    values[:last_observed + 1], horizon, mean)

        # Assign the whole column back instead of a chained in-place fillna.
        df[sensor] = values

    return df

In [3]:
def estandarizar(df):
    """Standardise every column of ``df`` to zero mean and unit variance.

    The mean and standard deviation are computed per column, ignoring NaN, and
    kept in full precision; rounding them to float16 first would shift the
    result (a mean of 2049 becomes 2048) without saving any memory.

    The standard deviation uses ``ddof=1``. This pins what pandas has
    historically computed when ``np.nanstd`` is passed to ``DataFrame.agg``.

    Columns with zero variance cannot be standardised and come back as all-NaN.

    Parameters
    ----------
    df : pandas.DataFrame
        Numeric columns.

    Returns
    -------
    pandas.DataFrame
        New frame with the same shape and labels as ``df``.
    """
    mean = df.mean()
    std = df.std(ddof=1)
    # Make the zero-variance case explicit: NaN instead of a division by zero.
    std = std.where(std > 0)
    standarized_df = (df - mean) / std
    return standarized_df

In [4]:
def _sensor_features(signal, fs):
    """Peak and spectral features of one 1-D signal, in output-column order.

    Returns a 7-tuple: number of peaks, max and mean peak width, max and mean
    peak prominence, square root of the largest periodogram value, and the
    mean periodogram value. The peak features are 0 when the signal has no
    peaks; the spectral features are 0 when the signal is entirely NaN.
    """
    # Detect the peaks once and reuse them for every peak-based feature.
    peaks = find_peaks(signal)[0]
    if len(peaks):
        widths = peak_widths(signal, peaks)[0]
        prominences = peak_prominences(signal, peaks)[0]
        peak_features = (len(peaks),
                         np.max(widths), np.mean(widths),
                         np.max(prominences), np.mean(prominences))
    else:
        peak_features = (0, 0.0, 0.0, 0.0, 0.0)

    if np.isnan(signal).all():
        spectral_features = (0.0, 0.0)
    else:
        power = periodogram(signal, fs)[1]
        # The square root of the spectral maximum serves as an RMS estimate.
        spectral_features = (np.sqrt(np.max(power)), np.mean(power))

    return peak_features + spectral_features


def get_chars(df, nombre, fs=100):
    """Build the feature row of one segment.

    Parameters
    ----------
    df : pandas.DataFrame
        One numeric column per sensor.
    nombre : str
        File name of the segment; its stem must be the integer segment id
        (for example ``"123.csv"``).
    fs : float, default 100
        Sampling frequency handed to ``scipy.signal.periodogram``.

    Returns
    -------
    list
        ``[segment_id]`` followed by eight groups of ``len(df.columns)`` values,
        each group holding one feature for every column in column order:
        zero crossings, peak count, max peak width, mean peak width, max peak
        prominence, mean peak prominence, square root of the periodogram
        maximum, and periodogram mean.
    """
    cars = [int(path.splitext(nombre)[0])]

    # Zero crossings, counted per column.
    cars.extend(librosa.zero_crossings(df.values, axis = 0).sum(axis = 0))

    # Remaining features: computed sensor by sensor ...
    per_sensor = [_sensor_features(column.to_numpy(dtype=float), fs)
                  for _, column in df.items()]
    # ... then transposed so that each feature is listed for all sensors in turn.
    for feature_values in zip(*per_sensor):
        cars.extend(feature_values)

    return cars

In [6]:
# Per-sensor statistics table, indexed by sensor name (first CSV column).
# impute_df reads its "mean" column to fill missing values.
# Each collaborator keeps their own path: leave uncommented the one that applies.
whole_stats_path = "../csvs/whole_stats.csv" # Óscar's path
#whole_stats_path = ""                        # Dante's path (to be filled in)
whole_stats = pd.read_csv(whole_stats_path, index_col=0)
whole_stats

Unnamed: 0,count,mean,std,min,max
sensor_1,265864431.0,0.02525,203.303099,-1171.0,1278.0
sensor_2,265864431.0,-0.431776,418.222197,-4759.0,4118.0
sensor_3,265864431.0,0.233829,178.179004,-1288.0,1212.0
sensor_4,265864431.0,0.281562,200.602547,-1280.0,1532.0
sensor_5,265864431.0,-0.00065,161.583577,-1225.0,1370.0
sensor_6,265864431.0,0.044916,299.005016,-3226.0,2575.0
sensor_7,265864431.0,4.59019,204.95691,-986.0,983.0
sensor_8,265864431.0,-2.9755,247.54213,-934.0,1060.0
sensor_9,265864431.0,2.170547,196.562165,-1210.0,1316.0
sensor_10,265864431.0,-120.953801,937.478541,-3743.0,3229.0


In [8]:
%%time
path_originales = "/home/oscar/Escritorio/predict-volcanic-eruptions-ingv-oe" # Óscar
path_copias = "/home/oscar/Escritorio/estandarizados2"                        # Óscar
#path_originales = ""                                                         # Dante
#path_copias = ""                                                             # Dante
for carpeta in ["train", "test"]:
    camino = path.join(path_originales, carpeta)
    print(f"Comienzo con {carpeta}")
    for file in listdir(camino):
        lectura = path.join(camino, file)
        escritura = path.join(path_copias, carpeta, file)
        df = pd.read_csv(lectura)
        df = impute_df(df, whole_stats)
        s_df = estandarizar(df).astype("Float16")
        s_df.to_csv(escritura, index = False)

Comienzo con train








Comienzo con test








CPU times: user 1h 29min 2s, sys: 46.5 s, total: 1h 29min 49s
Wall time: 1h 1min 54s


In [9]:
%%time

path_train = "/home/oscar/Escritorio/estandarizados2/train" # Óscar
#path_train = ""                                             # Dante
todos = []
i = 0
for archivo in listdir(path_train):
    camino = path.join(path_train, archivo)
    df = pd.read_csv(camino)
    try:
        chars = get_chars(df, archivo)
        todos.append(chars)
    except Exception as e:
        print(e,archivo)
    #print(i)
    i += 1
    
caracteristicas = "zcr peaks peak_wmax peak_wmean peak_promax peak_promean period_max period_mean". split()
cols = ["segment_id"] + [f"sensor_{idx}_{car}" for car in caracteristicas for idx in range(1,11)]
df_todos = pd.DataFrame(todos, columns = cols)
df_todos.to_csv("/home/oscar/Escritorio/estandarizados2/stats_per_file_signal_train.csv", index = False)

  ret = data - np.expand_dims(np.mean(data, axis), axis)


CPU times: user 16min 5s, sys: 3.94 s, total: 16min 9s
Wall time: 16min 20s
