In [1]:
import pandas as pd
import numpy as np
import struct
import matplotlib
import matplotlib.pyplot as plt
from os import listdir
from os.path import isfile, join
import pickle

import tsfresh
from tsfresh import extract_features
from tsfresh.feature_extraction import settings, ComprehensiveFCParameters, EfficientFCParameters
from tsfresh.utilities.dataframe_functions import impute

from sklearn.neural_network import MLPClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import ExtraTreesClassifier, BaggingClassifier, RandomForestClassifier
from sklearn.svm import SVC

from sklearn.model_selection import train_test_split, cross_val_score, cross_val_predict
from sklearn.metrics import classification_report

%matplotlib inline

In [2]:
matplotlib.rcParams['figure.figsize'] = (20, 10)

In [5]:
path = "../SUHA_Paris2017"

all_files = [path + "/" + f for f in listdir(path) if isfile(join(path, f))]

start = 29 # part * int(len(all_files) / nb_part)
stop = 69 # (part + 1) * int(len(all_files) / nb_part)
some_files = all_files[start:stop:4]
some_files

['../SUHA_Paris2017/conBR_ir031_tostack.rsf@',
 '../SUHA_Paris2017/conBR_ir035_tostack.rsf@',
 '../SUHA_Paris2017/conBR_ir039_tostack.rsf@',
 '../SUHA_Paris2017/conBR_ir043_tostack.rsf@',
 '../SUHA_Paris2017/conBR_ir048_tostack.rsf@',
 '../SUHA_Paris2017/conBR_ir052_tostack.rsf@',
 '../SUHA_Paris2017/conBR_ir056_tostack.rsf@',
 '../SUHA_Paris2017/conBR_ir029_tostack.rsf@',
 '../SUHA_Paris2017/conBR_ir060_tostack.rsf@',
 '../SUHA_Paris2017/conBR_ir064_tostack.rsf@']

In [6]:
n1_convH = 75
n2_convH = 5400
n3_convH = 100

In [7]:

def load_data(filename):
    """
    Charger le fichier de données
    """
    n_elems = n1_convH * n2_convH * n3_convH
    f = open(filename, "rb")
    data = struct.unpack('f' * n_elems, f.read(4 * n_elems))
    f.close()
    return np.reshape(data, (n3_convH, n2_convH, n1_convH))

In [8]:
def save_obj(obj, name ):
    with open(name + '.pkl', 'wb') as f:
        pickle.dump(obj, f, pickle.HIGHEST_PROTOCOL)

def load_obj(name):
    with open(name + '.pkl', 'rb') as f:
        return pickle.load(f)

In [9]:
convH = load_data(all_files[50])

In [10]:
def normalize_signals(data):
    """
    Normaliser le signal par rapport au temps
    """
    for receiver in range(n3_convH):
        for trace in range(n2_convH):
            data[receiver, trace] = data[receiver, trace, :] / np.amax(data[receiver, trace], axis=0)
    return data

In [11]:
n_convH = normalize_signals(convH)

In [12]:
def compute_sums(data, n_tracesByGroups=20):
    """
    Calculer la somme des traces par groupe de 20
    """
    n3, n2, n1 = data.shape
    n_groups = int(n2 / n_tracesByGroups)

    sum_traces_grouped = np.zeros((n3, n_groups, n1))
    sum_traces = np.zeros((n3, n1))
    
    for receiver in range(n3):
        traces = data[receiver]
        sum_traces[receiver, :] = traces.sum(axis=0) / np.amax(np.abs(traces.sum(axis=0)))
        for n_group in range(n_groups):
            start = n_tracesByGroups * n_group
            stop = n_tracesByGroups * (n_group + 1)
            sum_traces_grouped[receiver, n_group, :] = traces[start:stop, :].sum(axis=0) / np.amax(np.abs(traces[start:stop, :].sum(axis=0)))
    
    return sum_traces, sum_traces_grouped

In [13]:
def labelize(sumH, receiver):
    """
    Etiquetter l'échantillon
    """
    decision = np.median(sumH[receiver]) # 2.5
    y_todf = np.zeros(sumH[receiver].shape)
    y_todf = y_todf + (sumH[receiver] < decision).astype(np.int)
    return pd.DataFrame(y_todf)[0]

In [14]:
def extractFeatures(sum_traces_grouped, receiver, main_features=None):
    """
    Extraire les caractéristiques pour chaque récepteur
    """
    master_df = pd.DataFrame(sum_traces_grouped[receiver, 0])
    master_df['id'] = 0
    for ii in range(1, sum_traces_grouped.shape[1]):
        temp_df = pd.DataFrame(sum_traces_grouped[receiver, ii])
        temp_df['id'] = ii
        master_df = pd.DataFrame(np.vstack([master_df, temp_df]))
    # 75 * 270 -1 = 20 249
    
    extraction_settings = EfficientFCParameters()
    extraction_settings.IMPUTE = impute # Interpolation pour éviter les valeurs NaN
    
    kind_to_fc_parameters = {}
    if main_features !=None:
        kind_to_fc_parameters = load_obj(main_features)
    
    return extract_features(master_df, column_id=1,
                            default_fc_parameters=extraction_settings,
                            kind_to_fc_parameters=kind_to_fc_parameters).sort_index()

In [15]:
def compute_SSE(u_vgt, u):
    """
    Calculer la somme de l'écart quadratique
    """
    n3, n2, n1 = u.shape
    SSE = np.zeros((n3, n2))
    for receiver in range(n3):
        for group in range(n2):
            SSE[receiver, group] = np.square(u_vgt[receiver] - u[receiver, group, :]).sum()
    return SSE

In [33]:
def fitPredictAllReceivers(receivers, sumH, sum_traces_grouped, traces, echantillonage=2, debug=True):
    """
    Entraîner et prédire un classifieur pour chaque récepteur
    """
    result = np.zeros((int(len(receivers) / echantillonage), 75))
    
    for receiver in receivers:
        if receiver % echantillonage == 0:
            print("Recepteur %d" %(receiver))
            y = labelize(sumH, receiver)
            X = extractFeatures(sum_traces_grouped, receiver, "main_features")
            new_X = extractFeatures(traces, receiver, "main_features")

            clf = ExtraTreesClassifier(n_estimators=200)
            clf.fit(X, y)
            new_y = clf.predict(new_X)
            
            result[int(receiver / echantillonage)] = np.sum(traces[receiver, ii, :] for ii in range(traces[receiver].shape[0]) if not new_y[ii])
            
            
            if debug:
                y_pred = cross_val_predict(clf, X, y, cv=10)
                classification_metrics(y, y_pred)
        
    return result

In [34]:
def fitPredictAllFiles(files, n_tracesByGroups=20, echantillonage=2, debug=True):
    """
    Entraîner et prédire un classifieur pour chaque récepteur des fichiers
    """
    
    for indice, file in enumerate(files):
        print("Fichier %s" %(file))
        convH = load_data(file)
        n_convH = normalize_signals(convH)
        sum_traces, sum_traces_grouped = compute_sums(n_convH, n_tracesByGroups)
        sumH = compute_SSE(sum_traces, sum_traces_grouped)
        
        result=fitPredictAllReceivers(range(n3_convH), sumH, sum_traces_grouped,
                               n_convH, echantillonage, debug=debug)
        
        result.astype('float32').tofile('result_' + str(indice))

In [35]:
fitPredictAllFiles(some_files[0:2], n_tracesByGroups=20, echantillonage=10, debug=False)

Fichier ../SUHA_Paris2017/conBR_ir031_tostack.rsf@
Recepteur 0


Feature Extraction: 100%|██████████| 270/270 [00:04<00:00, 55.20it/s]
Feature Extraction: 100%|██████████| 5400/5400 [00:50<00:00, 106.81it/s] 


Recepteur 10


Feature Extraction: 100%|██████████| 270/270 [00:02<00:00, 110.32it/s]
Feature Extraction: 100%|██████████| 5400/5400 [00:48<00:00, 111.66it/s] 


Recepteur 20


Feature Extraction: 100%|██████████| 270/270 [00:02<00:00, 108.27it/s]
Feature Extraction: 100%|██████████| 5400/5400 [00:47<00:00, 113.11it/s] 


Recepteur 30


Feature Extraction: 100%|██████████| 270/270 [00:02<00:00, 99.59it/s]
Feature Extraction: 100%|██████████| 5400/5400 [00:53<00:00, 100.38it/s] 


Recepteur 40


Feature Extraction: 100%|██████████| 270/270 [00:02<00:00, 102.55it/s]
Feature Extraction: 100%|██████████| 5400/5400 [00:50<00:00, 106.90it/s] 


Recepteur 50


Feature Extraction: 100%|██████████| 270/270 [00:02<00:00, 104.99it/s]
Feature Extraction: 100%|██████████| 5400/5400 [00:48<00:00, 110.25it/s] 


Recepteur 60


Feature Extraction: 100%|██████████| 270/270 [00:02<00:00, 107.25it/s]
Feature Extraction: 100%|██████████| 5400/5400 [00:48<00:00, 111.25it/s] 


Recepteur 70


Feature Extraction: 100%|██████████| 270/270 [00:02<00:00, 109.56it/s]
Feature Extraction: 100%|██████████| 5400/5400 [00:48<00:00, 112.40it/s] 


Recepteur 80


Feature Extraction: 100%|██████████| 270/270 [00:02<00:00, 98.34it/s]
Feature Extraction: 100%|██████████| 5400/5400 [00:48<00:00, 110.88it/s] 


Recepteur 90


Feature Extraction: 100%|██████████| 270/270 [00:02<00:00, 106.96it/s]
Feature Extraction: 100%|██████████| 5400/5400 [00:49<00:00, 109.29it/s] 


Fichier ../SUHA_Paris2017/conBR_ir035_tostack.rsf@
Recepteur 0


Feature Extraction: 100%|██████████| 270/270 [00:02<00:00, 106.38it/s]
Feature Extraction: 100%|██████████| 5400/5400 [00:48<00:00, 110.72it/s] 


Recepteur 10


Feature Extraction: 100%|██████████| 270/270 [00:02<00:00, 108.97it/s]
Feature Extraction: 100%|██████████| 5400/5400 [00:48<00:00, 111.22it/s] 


Recepteur 20


Feature Extraction: 100%|██████████| 270/270 [00:02<00:00, 106.46it/s]
Feature Extraction: 100%|██████████| 5400/5400 [00:49<00:00, 109.70it/s] 


Recepteur 30


Feature Extraction: 100%|██████████| 270/270 [00:02<00:00, 108.27it/s]
Feature Extraction: 100%|██████████| 5400/5400 [00:50<00:00, 107.20it/s] 


Recepteur 40


Feature Extraction: 100%|██████████| 270/270 [00:02<00:00, 97.58it/s]
Feature Extraction: 100%|██████████| 5400/5400 [00:49<00:00, 109.39it/s] 


Recepteur 50


Feature Extraction: 100%|██████████| 270/270 [00:02<00:00, 107.27it/s]
Feature Extraction: 100%|██████████| 5400/5400 [00:50<00:00, 106.54it/s] 


Recepteur 60


Feature Extraction: 100%|██████████| 270/270 [00:03<00:00, 84.68it/s]
Feature Extraction: 100%|██████████| 5400/5400 [00:50<00:00, 107.59it/s] 


Recepteur 70


Feature Extraction: 100%|██████████| 270/270 [00:02<00:00, 110.43it/s]
Feature Extraction: 100%|██████████| 5400/5400 [00:51<00:00, 105.72it/s] 


Recepteur 80


Feature Extraction: 100%|██████████| 270/270 [00:02<00:00, 110.24it/s]
Feature Extraction: 100%|██████████| 5400/5400 [00:48<00:00, 111.10it/s] 


Recepteur 90


Feature Extraction: 100%|██████████| 270/270 [00:02<00:00, 108.96it/s]
Feature Extraction: 100%|██████████| 5400/5400 [00:50<00:00, 107.82it/s] 
