# OBJECTIF

L'objectif de ce notebook est d'explorer les données et d'effectuer les premiers tests de classification par Machine Learning sur le set de **test**

In [1]:
# Python libraries:
# to handle datasets
import numpy as np # linear algebra
import random
import time

# Initialization:
np.random.seed(1001)

import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os # accessing directory structure
import time


from sklearn.preprocessing import scale, StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.decomposition import PCA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from sklearn import svm, neighbors, metrics
from sklearn.tree import DecisionTreeClassifier
from sklearn import ensemble
from sklearn.ensemble import BaggingClassifier


# for sound analysis
import librosa
import librosa.display
import IPython.display as ipd
from scipy import fftpack
from scipy import signal

from tqdm import tqdm_notebook #tqdm

# manage file storage
import feather

# for plotting
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns

import warnings
warnings.filterwarnings('ignore')

#show pandas version
pd.__version__

'1.3.4'

# CREATION DU FICHIER CSV

In [2]:
# Dataset locations, all expressed relative to the current working directory.
ROOT_AUDIO     = 'DCASE_DATASET' # root directory that holds every WAV file and the CSV caches
DEV_DATA_PATH       = os.path.join(ROOT_AUDIO, 'dev_data') # 'dev_data' sub-directory of the root
EVAL_DATA_PATH      = os.path.join(ROOT_AUDIO, 'eval_data') # 'eval_data' sub-directory of the root

def load_sound_file(audio_path, duration=None):
    """
    Read an audio file with librosa, keeping the file's own sampling rate.

    PARAMS
    ======
        audio_path (string) - location of the WAV file to open
        duration (float or None) - forwarded to librosa.load; None loads the
                                   whole file

    RETURNS
    =======
        signal (numpy array) - sound signal
        sampling_rate - sampling rate reported by librosa for the file
    """
    # sr=None disables resampling, so the rate stored in the file is returned.
    samples, sampling_rate = librosa.load(audio_path, sr=None, duration=duration)
    return np.array(samples), sampling_rate

def generate_dataset(data_dir):
    """
    Walk a directory tree and build a metadata table with one row per WAV file.

    The metadata is parsed from each file's location, which is expected to
    look like
    ``.../<machine_type>/<data_split>/<condition>_id_<machine_id>_<sample_id>.wav``.
    Every WAV file is loaded once to measure its duration and sampling rate.

    Rows are assembled as lists of values (not as a space-joined string that is
    split again), so paths containing spaces are handled and ``durations`` /
    ``samplingrate`` keep their numeric types, matching what is obtained when
    the table is reloaded from its CSV cache.

    PARAMS
    ======
        data_dir (string) - Root directory to audio files

    RETURNS
    =======
        dataset (dataframe) - one row per WAV file with the columns pathname,
                              filename, machine_id, sample_id, audio_format,
                              machine_type, machine_kind, data_split,
                              condition, durations, samplingrate

    RAISES
    ======
        IndexError - if a WAV file name has fewer than three '_'-separated
                     parts
    """
    header = ['pathname', 'filename', 'machine_id', 'sample_id', 'audio_format',
              'machine_type', 'machine_kind', 'data_split', 'condition',
              'durations', 'samplingrate']
    records = []

    for audioroot, _, audiofiles in os.walk(data_dir):
        print(audioroot)

        # Everything derived from the directory is the same for all of its
        # files, so compute it once. os.path keeps this portable (the directory
        # separator is not always "/"), and normpath drops a trailing separator.
        split_dir = os.path.normpath(audioroot)
        data_split = os.path.basename(split_dir)
        machine_type = os.path.basename(os.path.dirname(split_dir))

        # Case-insensitive test: the folders are named 'ToyCar' / 'ToyConveyor',
        # which a plain `'toy' in machine_type` never matches.
        machine_kind = 'toys' if 'toy' in machine_type.lower() else 'real_machine'

        for filename in tqdm_notebook(audiofiles, desc='Extracting features'):
            stem, file_ext = os.path.splitext(filename)
            if file_ext != '.wav':
                continue

            audiopath = os.path.join(audioroot, filename)
            # signal = numpy array of samples, sr = sampling rate of the file
            signal, sr = load_sound_file(audiopath)

            # <condition>_id_<machine_id>_<sample_id>
            name_parts = stem.split('_')
            records.append([
                audiopath,
                filename,
                name_parts[2],        # machine_id
                name_parts[-1],       # sample_id
                'wav',                # audio_format
                machine_type,
                machine_kind,
                data_split,
                name_parts[0],        # condition
                len(signal) / sr,     # durations, in seconds
                sr,                   # samplingrate
            ])

    return pd.DataFrame(records, columns=header)



In [3]:
# Location of the raw audio files and of the CSV cache of their metadata.
files = DEV_DATA_PATH
data_location_csv = os.path.join(ROOT_AUDIO, 'dev_data.csv')

if os.path.exists(data_location_csv):
    print('CSV : Train data already exists, loading from file...')
else:
    # First run: scan the audio tree once and cache the metadata on disk.
    data_csv = generate_dataset(files)
    print('Saving csv training data to disk...')
    data_csv.to_csv(data_location_csv)

# Always read the table back from the CSV, even right after creating it, so
# that a first run and a cached run give exactly the same columns and dtypes
# (the saved index comes back as 'Unnamed: 0', ids stay strings).
data_csv = pd.read_csv(data_location_csv, dtype={"machine_id": "str", "sample_id": "str"})
print('Done.')


CSV : Train data already exists, loading from file...
Done.


In [4]:
data = data_csv.copy()
del data_csv
data.head()

Unnamed: 0.1,Unnamed: 0,pathname,filename,machine_id,sample_id,audio_format,machine_type,machine_kind,data_split,condition,durations,samplingrate
0,0,/Users/jsmmvondo/Documents/GitHub/PySaDML/DCAS...,anomaly_id_01_00000176.wav,1,176,wav,ToyConveyor,real_machine,test,anomaly,10.0,16000
1,1,/Users/jsmmvondo/Documents/GitHub/PySaDML/DCAS...,normal_id_03_00000329.wav,3,329,wav,ToyConveyor,real_machine,test,normal,10.0,16000
2,2,/Users/jsmmvondo/Documents/GitHub/PySaDML/DCAS...,normal_id_01_00000291.wav,1,291,wav,ToyConveyor,real_machine,test,normal,10.0,16000
3,3,/Users/jsmmvondo/Documents/GitHub/PySaDML/DCAS...,normal_id_01_00000285.wav,1,285,wav,ToyConveyor,real_machine,test,normal,10.0,16000
4,4,/Users/jsmmvondo/Documents/GitHub/PySaDML/DCAS...,anomaly_id_01_00000162.wav,1,162,wav,ToyConveyor,real_machine,test,anomaly,10.0,16000


In [5]:
data.shape

(30987, 12)

Séparation des données de test et de train

In [6]:
train_data = data[data['data_split'] == 'train'].reset_index(drop=True)
test_data = data[data['data_split'] == 'test'].reset_index(drop=True)
del data
print(train_data.shape, test_data.shape)

(20119, 12) (10868, 12)


In [7]:
# 'Unnamed: 0' is the row index that was saved into the CSV cache and read back
# as an ordinary column. errors='ignore' keeps this cell safe to re-run and
# valid when the column is absent (e.g. the table was not loaded from the CSV).
train_data = train_data.drop(columns='Unnamed: 0', errors='ignore')
test_data = test_data.drop(columns='Unnamed: 0', errors='ignore')

In [8]:
test_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10868 entries, 0 to 10867
Data columns (total 11 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   pathname      10868 non-null  object 
 1   filename      10868 non-null  object 
 2   machine_id    10868 non-null  object 
 3   sample_id     10868 non-null  object 
 4   audio_format  10868 non-null  object 
 5   machine_type  10868 non-null  object 
 6   machine_kind  10868 non-null  object 
 7   data_split    10868 non-null  object 
 8   condition     10868 non-null  object 
 9   durations     10868 non-null  float64
 10  samplingrate  10868 non-null  int64  
dtypes: float64(1), int64(1), object(9)
memory usage: 934.1+ KB


# PREPARATION DES SET AVEC FEATURES MEL

In [9]:
"""
        n_mels (integer) - number of Mel buckets (default: 64)
        n_fft (integer) - length of the windowed signal to compute the short 
                          Fourier transform on
        hop_length (integer) - window increment when computing STFT
        fmin (integer) - low frequency cutoff point that will be used when generating our Mel spectrograms

"""
n_mels = 64 # number of mel bands ("bins") the mel scale is broken up into (alternative noted by the author: 128)
n_fft = 2**13 # = 8192 samples: size of the window used for each discrete Fourier transform along the signal (alternatives noted by the author: 2**10, 2**11 = 2048)
hop_length = 2**11 # = 2048 samples: how far the window is shifted along the signal between two STFT steps (alternatives noted by the author: n_fft // 2, 2**9 = 512)
fmin = 20 # low-frequency cutoff in Hz for the mel spectrograms; 20 Hz was chosen as the bottom of the audible spectrum for most humans


def mel_features_extractor(file, min_s, resizeaudio=False):
    """
    Compute the log-scaled (dB) mel spectrogram of one audio file.

    The spectrogram uses the notebook-level settings n_fft, hop_length, n_mels
    and fmin. fmin is now actually passed to librosa (it was defined and
    documented but silently ignored), so feature files cached before this
    change were computed with librosa's default cutoff and should be
    regenerated to stay comparable.

    PARAMS
    ======
        file (string) - location to the WAV file to open
        min_s (int) - length in seconds to which the audio is limited when
                      resizeaudio is True; ignored otherwise
        resizeaudio (boolean) - if True, load at most min_s seconds of audio

    RETURNS
    =======
        mel_features (numpy array) - mel spectrogram converted to dB with
                                     ref=np.max, i.e. relative to the maximum
                                     power found in this file
    """
    # duration=None makes load_sound_file read the whole file.
    duration = min_s if resizeaudio else None
    audio, sample_rate = load_sound_file(file, duration=duration)

    mel = librosa.feature.melspectrogram(
        y=audio,
        sr=sample_rate,
        n_fft=n_fft,
        hop_length=hop_length,
        n_mels=n_mels,
        fmin=fmin,
    )
    return librosa.power_to_db(mel, ref=np.max)


def create_mel_data(data, file, ext, duration=None):
    """
    Build, or reload from its on-disk cache, the table of flattened mel features.

    The cache lives at ROOT_AUDIO/<file>.<ext>. If it exists it is read back
    with pandas.read_csv; otherwise one feature vector is extracted per row of
    `data`, the result is written to that path as CSV, and the table is
    returned.

    PARAMS
    ======
        data (dataframe) - must contain the columns pathname, machine_type,
                           machine_kind and condition
        file (string) - cache file name without extension
        ext (string) - cache file extension (the file is always written as CSV)
        duration (float or None) - seconds of audio loaded per file, forwarded
                                   to mel_features_extractor

    RETURNS
    =======
        df (dataframe) - the four metadata columns followed by mel0, mel1, ...
                         with the same row index as `data`
    """
    file = file + '.' + ext
    mel_data_train_location_ftr = os.path.join(ROOT_AUDIO, file)

    if os.path.exists(mel_data_train_location_ftr):
        print(file + ' already exists, loading from file...')
        df = pd.read_csv(mel_data_train_location_ftr, index_col=0)
        print('Done.')
        return df

    print('Saving ' + file + ' to disk...')
    # Explicit copy: this is our own frame, never a view on the caller's data.
    meta = data[['pathname', 'machine_type', 'machine_kind', 'condition']].copy()

    start = time.process_time()
    features = [
        mel_features_extractor(path, duration, resizeaudio=True).flatten()
        for path in meta['pathname']
    ]
    end = time.process_time()
    duree = end - start
    print(f'The time to extract the feature from mel spectrogram is : {duree} s')

    # Reuse the metadata index so the column-wise concat lines rows up even
    # when `data` is not indexed 0..n-1 (a default RangeIndex here would be
    # aligned by label and fill the table with NaN).
    featuredf = pd.DataFrame(np.array(features), index=meta.index)
    featuredf = featuredf.rename(columns=lambda x: "mel" + str(x))

    df = pd.concat([meta, featuredf], axis=1)
    df.to_csv(mel_data_train_location_ftr)
    print('Done.')

    return df

In [10]:
# duration of audio file
duree_min = train_data['durations'].min()
duree_max = train_data['durations'].max()

Extraction des données de test qui nous permettront de réaliser notre programme

In [11]:
ext = 'csv'
df_mel_test = create_mel_data(test_data, 'mel_data_test', ext, duration=10)
df_mel_test.head()

mel_data_test.csv already exists, loading from file...
Done.


Unnamed: 0,pathname,machine_type,machine_kind,condition,mel0,mel1,mel2,mel3,mel4,mel5,...,mel5046,mel5047,mel5048,mel5049,mel5050,mel5051,mel5052,mel5053,mel5054,mel5055
0,/Users/jsmmvondo/Documents/GitHub/PySaDML/DCAS...,ToyConveyor,real_machine,anomaly,-23.383516,-20.639809,-20.313389,-19.91984,-20.708096,-21.00571,...,-58.287483,-58.87516,-59.718616,-60.044518,-59.241707,-59.647648,-61.364803,-61.21647,-61.27778,-60.499603
1,/Users/jsmmvondo/Documents/GitHub/PySaDML/DCAS...,ToyConveyor,real_machine,normal,-9.71612,-13.840788,-16.189808,-16.143745,-20.798363,-17.417233,...,-57.864384,-55.64717,-51.30796,-50.295303,-50.646072,-50.68862,-50.857357,-50.7922,-50.905914,-50.680363
2,/Users/jsmmvondo/Documents/GitHub/PySaDML/DCAS...,ToyConveyor,real_machine,normal,-11.043976,-13.051968,-12.825278,-11.441542,-13.392862,-18.139688,...,-49.337036,-45.840664,-44.83549,-45.83506,-46.721954,-46.043938,-45.828293,-46.405354,-45.50654,-44.75505
3,/Users/jsmmvondo/Documents/GitHub/PySaDML/DCAS...,ToyConveyor,real_machine,normal,-14.224063,-15.483646,-19.831749,-18.87388,-18.577896,-20.894018,...,-57.052177,-57.19272,-57.34806,-57.202305,-57.442368,-57.32332,-58.38926,-59.217964,-58.47119,-56.79356
4,/Users/jsmmvondo/Documents/GitHub/PySaDML/DCAS...,ToyConveyor,real_machine,anomaly,-12.548519,-16.282982,-21.708023,-19.428146,-16.607712,-17.457146,...,-59.692936,-58.468052,-58.695305,-60.51649,-60.010902,-58.574123,-58.851357,-61.584087,-61.107944,-60.508068


In [12]:
df_mel_test.describe()

Unnamed: 0,mel0,mel1,mel2,mel3,mel4,mel5,mel6,mel7,mel8,mel9,...,mel5046,mel5047,mel5048,mel5049,mel5050,mel5051,mel5052,mel5053,mel5054,mel5055
count,10868.0,10868.0,10868.0,10868.0,10868.0,10868.0,10868.0,10868.0,10868.0,10868.0,...,10868.0,10868.0,10868.0,10868.0,10868.0,10868.0,10868.0,10868.0,10868.0,10868.0
mean,-13.13106,-13.40655,-13.65975,-13.64646,-13.61384,-13.66445,-13.68814,-13.67498,-13.69462,-13.693406,...,-43.744841,-44.005329,-44.236835,-44.39122,-44.482684,-44.574196,-44.602526,-44.560735,-44.608898,-44.835362
std,7.170292,7.349426,7.5614,7.527538,7.474636,7.524399,7.555624,7.55745,7.604239,7.610246,...,10.999693,11.076692,11.188524,11.292147,11.34231,11.3508,11.361698,11.381956,11.394811,11.277655
min,-31.01499,-31.51073,-32.79271,-31.55895,-33.62399,-33.528,-35.00544,-34.5823,-35.64434,-35.523716,...,-72.05107,-72.44511,-72.86848,-72.54656,-72.727615,-74.31855,-75.32391,-73.95743,-74.390854,-75.59479
25%,-18.85645,-19.2964,-19.83202,-19.75784,-19.6722,-19.75815,-19.74243,-19.74755,-19.82113,-19.85523,...,-52.521459,-52.73802,-52.980767,-53.222355,-53.419433,-53.583654,-53.47777,-53.447161,-53.538555,-53.563472
50%,-14.24364,-14.8276,-15.1712,-15.14493,-15.188,-15.19805,-15.19311,-15.23716,-15.12927,-15.230897,...,-44.844811,-45.411369,-45.841839,-46.285503,-46.47543,-46.579081,-46.764496,-46.748531,-46.706007,-46.86737
75%,-6.405146,-6.034147,-5.967202,-5.910112,-5.981748,-5.979725,-6.010886,-5.93703,-5.985215,-5.884086,...,-35.039701,-35.124534,-35.030107,-35.01116,-35.125039,-35.176024,-35.214005,-35.095238,-35.05206,-35.523346
max,9.536743e-07,9.536743e-07,9.536743e-07,9.536743e-07,9.536743e-07,9.536743e-07,9.536743e-07,9.536743e-07,9.536743e-07,2e-06,...,-7.399979,-5.502842,-3.315913,-6.046356,-4.646518,-5.38528,-5.385519,-8.2392,-6.28241,-5.526649


# Classification ML avec des données de test

Numérisation des conditions 'normales' et 'anormales'

In [13]:
def condition(x):
    """Encode a condition label as an integer: 'anomaly' -> 0, anything else -> 1."""
    return 0 if x == 'anomaly' else 1

In [14]:
condition('normal')

1

In [15]:
from sklearn.feature_selection import VarianceThreshold, SelectKBest, SelectFromModel, f_regression, mutual_info_regression, RFE, RFECV
from sklearn.model_selection import train_test_split, KFold, GridSearchCV
from sklearn.linear_model import LinearRegression, Lasso


# Predictors: every column except the label and the descriptive metadata.
feats = df_mel_test.drop(columns = ['condition', 'pathname', 'machine_type', 'machine_kind'])
# Target: 0 for 'anomaly', 1 for anything else (see condition()).
target = df_mel_test['condition'].apply(condition)
# A fixed random_state makes the 80/20 split identical on every execution of
# this cell, whatever was run before it, so the scores below are reproducible.
X_train, X_test, y_train, y_test = train_test_split(feats, target, test_size = .2, random_state = 1001)

In [16]:
X_train.head()

Unnamed: 0,mel0,mel1,mel2,mel3,mel4,mel5,mel6,mel7,mel8,mel9,...,mel5046,mel5047,mel5048,mel5049,mel5050,mel5051,mel5052,mel5053,mel5054,mel5055
6857,-26.509476,-26.652481,-24.955196,-24.559902,-24.480627,-25.674871,-23.561615,-21.357353,-22.122292,-25.616161,...,-45.3263,-45.41749,-46.444542,-46.654522,-46.53746,-46.058815,-45.323765,-46.150604,-46.626762,-46.828365
4917,-3.486188,-1.613215,-3.615394,-4.570935,-4.08082,-3.901645,-2.682524,-1.136232,-0.123943,-1.645732,...,-39.38711,-38.532494,-39.385765,-39.500977,-38.85629,-39.91114,-40.700462,-39.386,-35.33732,-35.608032
8156,-2.234361,-3.385105,-3.828234,-2.093819,-2.297787,-2.44388,-3.009644,-1.801927,-3.114199,-2.75272,...,-36.98696,-35.176178,-37.116554,-39.16086,-38.896446,-38.35045,-38.47246,-37.06395,-38.277336,-36.813034
9577,-4.006505,-4.873897,-6.088479,-5.35925,-4.941333,-5.228132,-3.978307,-4.154682,-4.009797,-3.504462,...,-31.813894,-32.116592,-32.42524,-32.300365,-32.27614,-33.089394,-33.15571,-33.826664,-34.772514,-34.793343
7038,-21.423805,-22.963032,-20.532854,-21.12465,-25.009516,-25.919796,-26.35846,-21.977173,-20.499184,-19.374653,...,-42.899284,-45.07972,-47.17914,-49.123375,-49.78086,-52.02762,-52.824432,-51.80477,-50.7089,-51.614265


## Réduction de features

### WRAPPER METHOD (RFE)

* Les méthodes de ce type sont très coûteuses en calcul.
* Ces méthodes sont donc à réserver à des problèmes de taille modeste.

### PCA Method

* PCA Method : pas besoin de la cible
* N'identifie que les corrélations linéaires entre variables

In [17]:
from sklearn.decomposition import PCA

# A float n_components asks scikit-learn to keep as many components as needed
# for the cumulative explained variance to reach that fraction (here 90 %).
pca = PCA(n_components = 0.9)
# Fit the projection on the training split only, then apply the very same
# projection to the held-out split.
X_train_pca = pca.fit_transform(X_train)
X_test_pca = pca.transform(X_test)

# n_components_ is the number of components actually retained by the fit.
print("Nombre de composantes retenues :", pca.n_components_)

Nombre de composantes retenues : 29


### LDA Method

In [18]:
# LDA is a supervised method: it needs the labels (y_train) to be fitted.
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
lda = LDA()
X_train_lda = lda.fit_transform(X_train, y_train)
X_test_lda = lda.transform(X_test)
X_train_lda.shape
# The author judged this LDA result aberrant.
# NOTE(review): a single output column is what LDA is expected to give for a
# two-class target (at most n_classes - 1 discriminant axes), so the shape
# alone does not indicate a malfunction.
# LDA is only suited to supervised classification problems with continuous features.

(8694, 1)

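Résultat LDA jugé aberrant : une seule composante est conservée (avec deux classes, la LDA fournit au plus n_classes − 1 = 1 axe discriminant).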

## Classification

Pour la suite de nos tests, nous retiendrons la méthode PCA pour la réduction des données lorsque cela sera nécessaire, car elle nous semble être la plus cohérente

### Classifier Random Forest

In [19]:
# Random Forest trained on the full feature set (no dimensionality reduction).
from sklearn.ensemble import RandomForestClassifier

rfn = RandomForestClassifier()

# Wall-clock timing covers both the fit and the prediction below.
start_time = time.time()

rfn.fit(X_train, y_train)
# Predict the labels of the held-out X_test.

y_predn = rfn.predict(X_test)

# From here on end_time holds the elapsed time in seconds, not a timestamp.
end_time = time.time() - start_time

# score() is the mean accuracy on the held-out split.
print(rfn.score(X_test, y_test))
print('Durée :', " %s seconds " % end_time)

0.8063477460901564
Durée :  46.84954285621643 seconds 


In [20]:

pd.crosstab(y_test, y_predn, rownames = ['classe réelle'], colnames = ['classe predites'])

classe predites,0,1
classe réelle,Unnamed: 1_level_1,Unnamed: 2_level_1
0,939,191
1,230,814


In [21]:
print(metrics.classification_report(y_test, y_predn))

              precision    recall  f1-score   support

           0       0.80      0.83      0.82      1130
           1       0.81      0.78      0.79      1044

    accuracy                           0.81      2174
   macro avg       0.81      0.81      0.81      2174
weighted avg       0.81      0.81      0.81      2174



Utilisons maintenant la classification avec réduction des features pour un test plus rapide

In [140]:
# Random Forest trained on the PCA-reduced features.
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier()

# Wall-clock timing covers both the fit and the prediction below.
start_time = time.time()

rf.fit(X_train_pca, y_train)

# Predict the labels of the PCA-reduced held-out split.

y_pred = rf.predict(X_test_pca)

# From here on end_time holds the elapsed time in seconds, not a timestamp.
end_time = time.time() - start_time

# score() is the mean accuracy on the held-out split.
print("score :", rf.score(X_test_pca, y_test))
print('Durée : %s seconds' %end_time)

score : 0.7727690892364305
Durée : 2.9867210388183594 seconds


Nous constatons qu'avec des scores proches (0,77 contre 0,81), l'entraînement et la prédiction avec les données réduites sont environ 16 fois plus rapides (≈ 3 s contre ≈ 47 s)

In [141]:

pd.crosstab(y_test, y_pred, rownames = ['classe réelle'], colnames = ['classe predites'])

classe predites,0,1
classe réelle,Unnamed: 1_level_1,Unnamed: 2_level_1
0,888,242
1,252,792


In [142]:
# Fonction faisant ressortir clairement la condition normale ou anormale
def condition_r(x):
    """Turn encoded predictions back into labels: 0 -> 'anomaly', anything else -> 'normal'."""
    return ['anomaly' if code == 0 else 'normal' for code in x]

In [143]:
# Prediction results broken down by machine type.
# X_test.index holds row *labels* inherited from df_mel_test, so the rows are
# selected with .loc (label-based); .iloc would treat those labels as positions
# and only works by coincidence when the index is exactly 0..n-1.
set_test = pd.concat([df_mel_test.loc[X_test.index].reset_index(),
                      pd.Series(condition_r(y_pred)).rename('predictions')], axis=1)

pd.crosstab([set_test['machine_type'], set_test['condition']], columns = set_test['predictions'])

Unnamed: 0_level_0,predictions,anomaly,normal
machine_type,condition,Unnamed: 2_level_1,Unnamed: 3_level_1
ToyCar,anomaly,195,28
ToyCar,normal,26,248
ToyConveyor,anomaly,69,171
ToyConveyor,normal,18,451
fan,anomaly,296,9
fan,normal,58,21
pump,anomaly,76,7
pump,normal,51,26
slider,anomaly,177,6
slider,normal,50,12


In [52]:
from sklearn import metrics
target_names = ['anomaly', 'normal']

print(metrics.classification_report(y_test, y_pred, target_names = target_names))

              precision    recall  f1-score   support

     anomaly       0.78      0.79      0.79      1130
      normal       0.77      0.76      0.76      1044

    accuracy                           0.78      2174
   macro avg       0.78      0.77      0.77      2174
weighted avg       0.78      0.78      0.78      2174



### Classifier KNN

In [144]:
# k-nearest neighbours on the full feature set (no dimensionality reduction).

knn = neighbors.KNeighborsClassifier(n_neighbors=7, metric='minkowski')

# Wall-clock timing covers both the fit and the prediction below.
start_time = time.time()
knn.fit(X_train, y_train)

# Predict the labels of the held-out X_test (full features, not the PCA projection).

y_pred = knn.predict(X_test)
# From here on end_time holds the elapsed time in seconds, not a timestamp.
end_time = time.time() - start_time

# score() is the mean accuracy on the held-out split.
print("score :", knn.score(X_test, y_test))
print('Durée : %s seconds' %end_time)

score : 0.7580496780128795
Durée : 3.3550188541412354 seconds


In [145]:

# k-nearest neighbours on the PCA-reduced features.
knn = neighbors.KNeighborsClassifier(n_neighbors=7, metric='minkowski')

# Wall-clock timing covers both the fit and the prediction below.
start_time = time.time()
knn.fit(X_train_pca, y_train)

# Predict the labels of the PCA-reduced held-out split.

y_pred = knn.predict(X_test_pca)
# From here on end_time holds the elapsed time in seconds, not a timestamp.
end_time = time.time() - start_time

# score() is the mean accuracy on the held-out split.
print("score :", knn.score(X_test_pca, y_test))
print('Durée : %s seconds' %end_time)

score : 0.7263109475620975
Durée : 0.39591002464294434 seconds


In [146]:
pd.crosstab(y_test, y_pred, rownames = ['classe réelle'], colnames = ['classe predites'])

classe predites,0,1
classe réelle,Unnamed: 1_level_1,Unnamed: 2_level_1
0,802,328
1,267,777


In [147]:
# Prediction results broken down by machine type.
# X_test.index holds row *labels* inherited from df_mel_test, so the rows are
# selected with .loc (label-based); .iloc would treat those labels as positions
# and only works by coincidence when the index is exactly 0..n-1.
set_test = pd.concat([df_mel_test.loc[X_test.index].reset_index(),
                      pd.Series(condition_r(y_pred)).rename('predictions')], axis=1)

pd.crosstab([set_test['machine_type'], set_test['condition']], columns = set_test['predictions'])

Unnamed: 0_level_0,predictions,anomaly,normal
machine_type,condition,Unnamed: 2_level_1,Unnamed: 3_level_1
ToyCar,anomaly,162,61
ToyCar,normal,32,242
ToyConveyor,anomaly,69,171
ToyConveyor,normal,34,435
fan,anomaly,272,33
fan,normal,60,19
pump,anomaly,72,11
pump,normal,44,33
slider,anomaly,161,22
slider,normal,44,18


In [28]:
print(metrics.classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.75      0.71      0.73      1130
           1       0.70      0.74      0.72      1044

    accuracy                           0.73      2174
   macro avg       0.73      0.73      0.73      2174
weighted avg       0.73      0.73      0.73      2174



#### Par rapport à la méthode Random Forest, on a un score qui se dégrade nettement malgré le calcul plus rapide

### Arbre de décision

In [29]:
# Decision tree on the full feature set (no dimensionality reduction).

# Entropy split criterion, depth capped at 4 levels.
dt = DecisionTreeClassifier(criterion='entropy',max_depth=4)

# Wall-clock timing covers both the fit and the prediction.
start_time = time.time()
dt.fit(X_train, y_train)
y_pred = dt.predict(X_test)
# From here on end_time holds the elapsed time in seconds, not a timestamp.
end_time = time.time() - start_time

# score() is the mean accuracy on the held-out split.
print("score :", dt.score(X_test, y_test))
print('Durée : %s seconds' %end_time)

score : 0.6747930082796688
Durée : 19.955955028533936 seconds


In [148]:
# Decision tree on the PCA-reduced features.
# Entropy split criterion, depth capped at 4 levels.
dt = DecisionTreeClassifier(criterion='entropy',max_depth=4)

# Wall-clock timing covers both the fit and the prediction.
start_time = time.time()
dt.fit(X_train_pca, y_train)
y_pred = dt.predict(X_test_pca)
# From here on end_time holds the elapsed time in seconds, not a timestamp.
end_time = time.time() - start_time

# score() is the mean accuracy on the held-out split.
print("score :", dt.score(X_test_pca, y_test))
print('Durée : %s seconds' %end_time)

score : 0.6706531738730451
Durée : 0.1425790786743164 seconds


In [149]:
pd.crosstab(y_test, y_pred, rownames = ['classe réelle'], colnames = ['classe predites'])

classe predites,0,1
classe réelle,Unnamed: 1_level_1,Unnamed: 2_level_1
0,851,279
1,437,607


In [150]:
# Prediction results broken down by machine type.
# X_test.index holds row *labels* inherited from df_mel_test, so the rows are
# selected with .loc (label-based); .iloc would treat those labels as positions
# and only works by coincidence when the index is exactly 0..n-1.
set_test = pd.concat([df_mel_test.loc[X_test.index].reset_index(),
                      pd.Series(condition_r(y_pred)).rename('predictions')], axis=1)

pd.crosstab([set_test['machine_type'], set_test['condition']], columns = set_test['predictions'])

Unnamed: 0_level_0,predictions,anomaly,normal
machine_type,condition,Unnamed: 2_level_1,Unnamed: 3_level_1
ToyCar,anomaly,156,67
ToyCar,normal,98,176
ToyConveyor,anomaly,83,157
ToyConveyor,normal,81,388
fan,anomaly,288,17
fan,normal,69,10
pump,anomaly,77,6
pump,normal,73,4
slider,anomaly,168,15
slider,normal,48,14


In [32]:
print(metrics.classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.66      0.75      0.70      1130
           1       0.69      0.58      0.63      1044

    accuracy                           0.67      2174
   macro avg       0.67      0.67      0.67      2174
weighted avg       0.67      0.67      0.67      2174



#### Même constat que précédemment : on a un score qui se dégrade nettement malgré le calcul plus rapide

### Support vector Machine SVM

Pour cette méthode, il est nécessaire d'avoir des données à la même échelle donc standardisées

In [151]:
# Standardise the data: the scaler is fitted on the PCA-reduced training split
# and then applied unchanged to the PCA-reduced held-out split.

scaler = StandardScaler()
X_train_scal = scaler.fit_transform(X_train_pca)
X_test_scal = scaler.transform(X_test_pca)

# SVM model with a polynomial kernel.

svc = svm.SVC(kernel='poly', gamma=0.01)

# Wall-clock timing covers both the fit and the prediction below.
start_time = time.time()

svc.fit(X_train_scal, y_train)
y_pred = svc.predict(X_test_scal)
# From here on end_time holds the elapsed time in seconds, not a timestamp.
end_time = time.time() - start_time


# score() is the mean accuracy on the held-out split.
print(svc.score(X_test_scal, y_test))
print('Durée : %s seconds' %end_time)


0.5611775528978841
Durée : 4.88338303565979 seconds


In [152]:
pd.crosstab(y_test, y_pred, rownames = ['classe réelle'], colnames = ['classe predites'])

classe predites,0,1
classe réelle,Unnamed: 1_level_1,Unnamed: 2_level_1
0,263,867
1,87,957


In [153]:
# Prediction results broken down by machine type.
# X_test.index holds row *labels* inherited from df_mel_test, so the rows are
# selected with .loc (label-based); .iloc would treat those labels as positions
# and only works by coincidence when the index is exactly 0..n-1.
set_test = pd.concat([df_mel_test.loc[X_test.index].reset_index(),
                      pd.Series(condition_r(y_pred)).rename('predictions')], axis=1)

pd.crosstab([set_test['machine_type'], set_test['condition']], columns = set_test['predictions'])

Unnamed: 0_level_0,predictions,anomaly,normal
machine_type,condition,Unnamed: 2_level_1,Unnamed: 3_level_1
ToyCar,anomaly,7,216
ToyCar,normal,1,273
ToyConveyor,anomaly,0,240
ToyConveyor,normal,0,469
fan,anomaly,88,217
fan,normal,20,59
pump,anomaly,16,67
pump,normal,15,62
slider,anomaly,109,74
slider,normal,20,42


In [154]:
print(metrics.classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.75      0.23      0.36      1130
           1       0.52      0.92      0.67      1044

    accuracy                           0.56      2174
   macro avg       0.64      0.57      0.51      2174
weighted avg       0.64      0.56      0.51      2174

