In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

from threading import Thread
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
#for dirname, _, filenames in os.walk('/kaggle/input'):
#    for filename in filenames:
        #print(os.path.join(dirname, filename))
#        pass

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Directory containing the training CSV files (machine-specific absolute path).
train_dir = "/home/oscar/Escritorio/predict-volcanic-eruptions-ingv-oe/train"
# os.listdir returns entries in arbitrary order; sorting makes the file order
# (and therefore the row order of any per-file summary table) reproducible.
train_filenames = sorted(os.listdir(train_dir))


In [None]:
# Load the first listed training file as a sample and display its summary statistics.
train_filename = train_filenames[0]
# dtype="Int16" (capital I) is pandas' nullable integer dtype, so missing
# readings are kept as <NA> instead of forcing the column to float.
data = pd.read_csv( os.path.join(train_dir,train_filename), dtype="Int16")
data.describe()

# Checando qué tan buena estuvo la prank

Al revisar distintos archivos, se ve que algunos no tienen valores registrados (NaNs) o tienen cosas raras, como puros ceros. Sería importante ver cómo está dicha situación para todos los datos.

In [2]:
def check_prank(data):
    """Summarise missing values and constant columns of one DataFrame.

    Parameters
    ----------
    data : pandas.DataFrame
        Table whose columns are inspected one by one.

    Returns
    -------
    nan_counts : list
        Number of missing values in each column, in column order. This is a
        count, not an "all missing" flag; compare it with ``len(data)`` to
        detect columns that have no recorded values at all.
    is_constant : list of bool
        ``True`` where the column's standard deviation is exactly 0.
        Columns whose standard deviation is undefined (e.g. every value is
        missing) are reported as ``False``.
    """
    nan_counts = [data[sensor].isna().sum() for sensor in data.columns]

    is_constant = []
    for sensor in data.columns:
        spread = data[sensor].std()
        # std() yields NaN / pd.NA when it cannot be computed, and
        # `pd.NA == 0` is itself pd.NA rather than a boolean. Check for a
        # missing result first so every flag is a plain bool.
        is_constant.append(bool(pd.notna(spread) and spread == 0))

    return nan_counts, is_constant

In [3]:
def yield_partition(data, npartitions):
    """Split ``data`` into ``npartitions`` contiguous, near-equal slices.

    Exactly ``npartitions`` slices are yielded, in order, and together they
    cover ``data`` without overlap. Slice sizes differ by at most one
    element: when ``len(data)`` is not a multiple of ``npartitions`` the
    leftover elements are handed out one each to the first slices. If
    ``len(data) < npartitions`` the trailing slices are empty.

    Parameters
    ----------
    data : sequence
        Any object supporting ``len()`` and slice indexing (e.g. a list).
    npartitions : int
        Number of slices to produce; must be at least 1.

    Yields
    ------
    Slices of ``data`` (same type that ``data[i:j]`` returns).

    Raises
    ------
    ValueError
        If ``npartitions`` is smaller than 1 (raised on first iteration,
        since this is a generator).
    """
    if npartitions < 1:
        raise ValueError("npartitions must be a positive integer")

    base_size, leftover = divmod(len(data), npartitions)

    start = 0
    for k in range(npartitions):
        # The first `leftover` slices take one extra element each so the
        # remainder is spread out instead of piling up on the last slice.
        stop = start + base_size + (1 if k < leftover else 0)
        yield data[start:stop]
        start = stop

In [4]:
def parallelize(nthreads):
    """Decorator factory that spreads a file-processing function over threads.

    The decorated function must accept the positional arguments
    ``(dir_path, filenames, verbose, parallel_ans)`` and append one
    ``(df_nans, df_constants)`` tuple of DataFrames to the list it receives
    as ``parallel_ans``.

    The function exposed to callers has the signature
    ``(dir_path, filenames, verbose=True)``. It splits ``filenames`` with
    ``yield_partition`` into ``nthreads`` slices, runs the original function
    on each non-empty slice in its own thread, waits for all of them, and
    returns ``(concatenated_nans, concatenated_constants)``.

    Results are concatenated in partition order (not in thread completion
    order), so row k of each returned frame corresponds to ``filenames[k]``
    as long as the decorated function emits one row per file in order.
    Row labels are those produced by each worker and are not reset.

    Raises
    ------
    RuntimeError
        If a started worker finished without delivering a result (typically
        because it raised; its traceback is printed by the threading module).
    """
    def decorator(original_function):
        def parallelized_function(dir_path, filenames, verbose=True):
            threads = []
            # One private result list per worker: keeps results tied to their
            # partition so the final order does not depend on thread timing.
            buckets = []

            # Split the input and start one thread per non-empty slice.
            for partition in yield_partition(filenames, nthreads):
                if len(partition) == 0:
                    # Nothing to process; a worker on an empty slice has no
                    # file to take column names from.
                    continue
                bucket = []
                thread = Thread(target=original_function, args=(dir_path, partition, verbose, bucket))
                threads.append(thread)
                buckets.append(bucket)
                print("Starting",len(threads))
                thread.start()

            for thread in threads:
                thread.join()

            # An exception inside a thread does not propagate through join();
            # the only trace it leaves here is an empty result list.
            failed = [k for k, bucket in enumerate(buckets, 1) if not bucket]
            if failed:
                raise RuntimeError(
                    f"{len(failed)} of {len(buckets)} worker thread(s) produced no result "
                    f"(partitions {failed}); see the traceback(s) printed above"
                )

            results = [pair for bucket in buckets for pair in bucket]
            if not results:
                # No input at all: return empty tables instead of letting
                # pd.concat fail on an empty list.
                return pd.DataFrame(), pd.DataFrame()

            list_nans = [df_nan for df_nan, _ in results]
            list_constants = [df_constant for _, df_constant in results]
            concatenation_nans = pd.concat(list_nans)
            concatenation_constants = pd.concat(list_constants)
            return concatenation_nans, concatenation_constants

        # Keep the original name and docstring visible on the wrapper.
        parallelized_function.__name__ = getattr(original_function, "__name__", "parallelized_function")
        parallelized_function.__doc__ = getattr(original_function, "__doc__", None)
        return parallelized_function

    return decorator

In [5]:
@parallelize(8)
def check_pranks(dir_path, filenames, verbose=True, parallel_ans = None):
    """Run ``check_prank`` on every listed CSV file and tabulate the results.

    Because of the ``@parallelize(8)`` decorator, the name ``check_pranks``
    refers to the threaded wrapper, which is called as
    ``check_pranks(dir_path, filenames, verbose=True)``; this body is what
    each worker thread executes on its share of ``filenames``.

    Parameters
    ----------
    dir_path : str
        Directory that contains the files.
    filenames : sequence of str
        File names (relative to ``dir_path``) to read with ``pd.read_csv``
        using the nullable ``Int16`` dtype.
    verbose : bool
        When true, print the running file count every 10 files.
    parallel_ans : list or None
        If given, the ``(df_nans, df_constants)`` tuple is appended to it
        and nothing is returned; otherwise the tuple is returned.

    Returns
    -------
    (df_nans, df_constants) or None
        One row per file, one column per CSV column. ``df_nans`` holds the
        first list returned by ``check_prank`` and ``df_constants`` the
        second. Column labels come from the last file read; both frames are
        empty when ``filenames`` is empty.
    """
    nans = []
    constants = []
    # Stays None when there are no files, so the DataFrame constructors below
    # build empty frames instead of hitting an unbound `df`.
    columns = None
    for i,filename in enumerate(filenames,1):
        df = pd.read_csv(os.path.join(dir_path,filename), dtype="Int16")
        only_nans, only_constants = check_prank(df)
        nans.append(only_nans)
        constants.append(only_constants)
        columns = df.columns
        if verbose and i%10 == 0:
            print(i)

    df_nans = pd.DataFrame(nans, columns=columns)
    df_constants = pd.DataFrame(constants, columns=columns)

    # Identity test: `!= None` would be evaluated element-wise (and fail) if a
    # caller ever passed an array-like collector.
    if parallel_ans is not None:
        parallel_ans.append( (df_nans, df_constants) )
        return None
    return df_nans, df_constants

## Datos de entrenamiento 

In [None]:
%%time
# Scan every training file: one row per file with per-column NaN counts
# (df_train_nans) and zero-standard-deviation flags (df_train_constants).
df_train_nans, df_train_constants = check_pranks(train_dir, train_filenames)

In [None]:
# Save the per-file NaN counts; index=False omits the row labels from the file.
df_train_nans.to_csv("df_train_nans.csv", index=False)

In [None]:
# Column-wise total: missing values per column summed over all training files.
df_train_nans.sum(axis=0)

In [None]:
# Column-wise total of the flags: number of training files in which each
# column had a standard deviation of exactly 0.
df_train_constants.sum(axis=0)

In [None]:
# Save the per-file constant-column flags; index=False omits the row labels.
df_train_constants.to_csv("df_train_constants.csv", index=False)

## Datos de prueba

In [6]:
# Directory containing the test CSV files (machine-specific absolute path).
test_dir = "/home/oscar/Escritorio/predict-volcanic-eruptions-ingv-oe/test"
# os.listdir returns entries in arbitrary order; sorting makes the file order
# (and therefore the row order of any per-file summary table) reproducible.
test_filenames = sorted(os.listdir(test_dir))

In [7]:
%%time
# Scan every test file: one row per file with per-column NaN counts
# (df_test_nans) and zero-standard-deviation flags (df_test_constants).
df_test_nans, df_test_constants = check_pranks(test_dir, test_filenames)

Starting 1
Starting 2
Starting 3
Starting 4
Starting 5
Starting 6
Starting 7
Starting 8
10
10
10
10
10
10
10
10
20
20
20
2020

20
20
20
30
30
30
30
30
30
30
30
40
40
40
40
40
40
40
40
50
50
50
50
50
50
50
50
60
60
60
60
60
60
60
60
70
70
70
70
70
70
70
70
80
80
80
80
80
80
80
80
90
90
90
9090

90
90
90
100
100
100
100100

100
100
100
110
110
110
110
110
110
110
110
120
120
120
120
120
120120

120
130
130
130
130
130
130
130
130
140
140
140
140
140140

140
140
150
150
150
150
150
150
150
150
160160

160
160
160
160
160
160
170
170170

170170
170

170
170
180
180
180
180
180
180
180180

190
190
190
190
190
190
190
190
200200

200
200200

200
200200

210
210
210
210
210210

210
210
220
220
220
220
220220

220
220
230
230
230
230
230
230230

230
240
240
240
240
240240

240
240
250250250


250
250
250
250
250
260
260
260
260260

260260

260
270
270
270
270
270
270
270
270
280
280
280
280
280
280
280280

290
290
290
290
290
290
290
290
300
300
300
300
300
300
300
300
310
310
310
310
310
310


In [10]:
# Save both test-set summary tables; index=False omits the row labels.
df_test_nans.to_csv("df_test_nans.csv", index=False)
df_test_constants.to_csv("df_test_constants.csv", index=False)

In [8]:
# Column-wise total: missing values per column summed over all test files.
df_test_nans.sum(axis=0)

sensor_1     23328531
sensor_2     75807628
sensor_3     26593834
sensor_4        13406
sensor_5     92651510
sensor_6        57658
sensor_7      2713162
sensor_8     17653700
sensor_9     51208744
sensor_10    46050548
dtype: int64

In [9]:
# Column-wise total of the flags: number of test files in which each column
# had a standard deviation of exactly 0.
df_test_constants.sum(axis=0)

sensor_1       0
sensor_2       0
sensor_3       0
sensor_4       0
sensor_5       0
sensor_6       0
sensor_7       0
sensor_8       0
sensor_9       0
sensor_10    815
dtype: int64

----------------
# Zona después del \end{document}

`time_to_eruption` es el número de muestras antes de la siguiente erupción