## Connexion au Drive et importation des librairies

In [1]:
import pandas as pd
import numpy as np
import os
from google.colab import drive
import time

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from sklearn.model_selection import train_test_split

In [2]:
drive.mount('/content/drive')

os.chdir("/content/drive/MyDrive/EPM/H2021/INF8225/Projet")
print(f"Répertoire de travail: {os.getcwd()}")

Mounted at /content/drive
Répertoire de travail: /content/drive/MyDrive/EPM/H2021/INF8225/Projet


## Importation et traitement des données

In [3]:
def get_dataset_all_files(train=True):
    if train:
        dossier = "data/second_samples/normalized/train/"
    else:
        dossier = "data/normalized/test/"

    files = [f for f in os.listdir(dossier) if f.startswith("labels")]
    path = os.path.join(dossier, files[0])
    df = pd.read_csv(path, sep=',')
    df.drop(['node_number', 'parent_node_number', 'value'], axis=1, inplace=True)

    for i,f in enumerate(files):
        path = os.path.join(dossier, f)

        if i==0:
            pass
        else:
            df_to_concat = pd.read_csv(path, sep=',')
            df_to_concat.drop(['node_number', 'parent_node_number', 'value'], axis=1, inplace=True)
            df = pd.concat([df, df_to_concat], ignore_index=True)

    print(f"{dossier}: all datasets have been concatenated !")

    return df


In [4]:
def data_loader_whole_dataset(df, shuffle, batch_size):

    data = df.values.tolist()
    input_tensor = torch.tensor(data, dtype=torch.float32)

    print(f"input size : {input_tensor.size()}\n")

    data_loader = DataLoader(input_tensor, batch_size=batch_size, shuffle=shuffle, num_workers=2)

    return data_loader

In [5]:
df_train = get_dataset_all_files()

for column in df_train.columns:
    mean = df_train[column].mean()
    df_train[column].fillna(mean, inplace=True)

print(df_train.isnull().sum())
df_train.describe()

data/second_samples/normalized/train/: all datasets have been concatenated !
var_cost                                       0
frac_val                                       0
fraction_conflicting_columns                   0
fraction_conflicting_columns_positive_value    0
min_cost_conflicting_column                    0
min_cost_conflicting_column_positive_value     0
number_cols_in_mp                              0
dual_cost_min                                  0
dual_cost_max                                  0
dual_cost_avg                                  0
frac_pairing_tasks_fixed                       0
nb_pairing_tasks                               0
dtype: int64


Unnamed: 0,var_cost,frac_val,fraction_conflicting_columns,fraction_conflicting_columns_positive_value,min_cost_conflicting_column,min_cost_conflicting_column_positive_value,number_cols_in_mp,dual_cost_min,dual_cost_max,dual_cost_avg,frac_pairing_tasks_fixed,nb_pairing_tasks
count,139353.0,139353.0,139353.0,139353.0,139353.0,139353.0,139353.0,139353.0,139353.0,139353.0,139353.0,139353.0
mean,4.6152730000000003e-17,0.74199,0.054628,0.001193,-1.379692,-0.702695,7.921412000000001e-17,-4.4479670000000005e-18,5.597602e-18,2.2435020000000002e-18,0.499396,-4.3827970000000004e-17
std,0.9968811,0.137527,0.056038,0.001731,0.428385,0.706835,0.9985652,0.8847659,0.8896963,0.874095,0.260543,0.726375
min,-2.755592,0.00062,6.1e-05,3.3e-05,-2.76064,-2.76064,-3.106548,-1.788854,-1.78885,-1.770607,0.022964,-1.93345
25%,-0.7412193,0.659396,0.018507,0.000327,-1.650335,-1.190492,-0.84879,-0.6823491,-0.5973203,-0.6953152,0.283093,-0.5197221
50%,0.006249577,0.742903,0.036331,0.000681,-1.402961,-0.733364,-0.1071322,0.1243279,-0.2891524,-0.09519165,0.49118,0.004508753
75%,0.665751,0.837794,0.069977,0.001353,-1.078287,-0.203932,0.7846468,0.6053587,0.7027876,0.6883701,0.721361,0.514205
max,5.603851,0.999987,0.763158,0.050847,2.260579,4.166964,6.662968,1.78885,1.788852,1.783217,0.995704,2.891937


In [None]:
"""
df_test = get_dataset_all_files(train=False)

for column in df_test.columns:
    mean = df_test[column].mean()
    df_test[column].fillna(mean, inplace=True)

print(df_test.isnull().sum())
df_test.describe()
"""

## Autoencoder

In [14]:
class Autoencoder(nn.Module):

    def __init__(self, final_dim, n_features=len(df_train.columns)):
        super().__init__()
        self.encoder_hidden_layer = nn.Linear(in_features=n_features, out_features=8)
        self.encoder_output_layer = nn.Linear(in_features=8, out_features=final_dim)
        self.decoder_hidden_layer = nn.Linear(in_features=final_dim, out_features=8)
        self.decoder_output_layer = nn.Linear(in_features=8, out_features=n_features)
        self.selu = nn.SELU()

    def forward(self, input, train=True):
        activation = self.encoder_hidden_layer(input)
        activation = torch.selu(activation)

        code = self.encoder_output_layer(activation)
        code = torch.selu(code)

        if train == False:
            return code

        activation = self.decoder_hidden_layer(code)
        activation = torch.selu(activation)
        activation = self.decoder_output_layer(activation)
        
        reconstructed = torch.tanh(activation)

        return reconstructed


In [15]:
n_epochs = 50
final_dim = 6
lr = 5e-4
batch_size = 32

In [16]:
print("Train set:")
train_loader = data_loader_whole_dataset(df_train, shuffle=True, batch_size=batch_size)

"""
print("Test set:")
test_loader = data_loader_whole_dataset(df_test, shuffle=True, batch_size=batch_size)
"""

Train set:
input size : torch.Size([139353, 12])



'\nprint("Test set:")\ntest_loader = data_loader_whole_dataset(df_test, shuffle=True, batch_size=batch_size)\n'

In [17]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model = Autoencoder(final_dim=final_dim).to(device)

optimizer = optim.Adam(model.parameters(), lr=lr)

criterion = nn.MSELoss()

### Entraînement du modèle

In [18]:
for epoch in range(1, n_epochs+1):
    loss = 0
    start = time.time()

    for input in train_loader:
      input = input.to(device)
      optimizer.zero_grad()
      
      output = model(input)

      train_loss = criterion(output, input)
      train_loss.backward()
      optimizer.step()
      loss += train_loss.item()

    loss = loss/len(train_loader)

    if epoch % 5 == 0:
        print(f"epoch {epoch+1}/{n_epochs} ({time.time() - start:.3f}sec), loss = {loss:.6f}")

epoch 6/50 (11.253sec), loss = 0.090758
epoch 11/50 (11.238sec), loss = 0.089474
epoch 16/50 (11.156sec), loss = 0.088736
epoch 21/50 (11.102sec), loss = 0.088144
epoch 26/50 (11.067sec), loss = 0.087934
epoch 31/50 (11.385sec), loss = 0.087766
epoch 36/50 (11.041sec), loss = 0.087620
epoch 41/50 (11.184sec), loss = 0.087502
epoch 46/50 (11.176sec), loss = 0.087391
epoch 51/50 (11.049sec), loss = 0.087241


### Test et sauvegarde du modèle

In [None]:
"""
loss = 0

start = time.time()
for input in test_loader:
  input = input.to(device)
  
  output = model(input)

  train_loss = criterion(output, input)
  loss += train_loss.item()

loss = loss/len(train_loader)

print(f"Test ({time.time() - start:.3f}sec), loss = {loss:.6f}")
"""

In [19]:
# Enregistrement du modèl entraîné

path = 'autoencoder.pt'
torch.save(model.state_dict(), path)

## Récupération de la sortie de l'autoecoder

In [None]:
# Chargement du modèle entraîné
model = Autoencoder(final_dim).to(device)
model.load_state_dict(torch.load('autoencoder.pt'))
model.eval()

In [None]:
def dimmension_reduction(path, train=True):
    df_enc = pd.read_csv(path, sep=',')
    input_enc = torch.tensor(df_enc.value.to_list(), dtype=torch.float32).to(device)

    if train:
        output = model(input, train=True)
    else:
        output = model(input, train=False)

    df_out = pd.DataFrame(output.tolist())

    return df_out

In [None]:
file = 'data/normalized/train/labels_NW_727_test_1_1_win_0'

print("Dataset original\n")
df_ex = pd.read_csv(file, sep=',')
df_ex.drop(['node_number', 'parent_node_number', 'value'], axis=1, inplace=True)
df_ex.head(10)

Dataset original



Unnamed: 0,var_cost,frac_val,fraction_conflicting_columns,fraction_conflicting_columns_positive_value,min_cost_conflicting_column,min_cost_conflicting_column_positive_value,number_cols_in_mp,dual_cost_min,dual_cost_max,dual_cost_avg,frac_pairing_tasks_fixed,nb_pairing_tasks
0,-1.030902,0.995628,0.061187,0.001854,-1.032505,-1.032505,1.333778,-0.932997,-0.273465,-0.89395,0.806515,-0.600622
1,-1.440795,0.747977,0.026576,0.001854,-1.443036,-1.443036,1.333778,0.283354,-0.236493,-0.135318,0.806515,-1.099601
2,-1.473401,0.630383,0.043263,0.002472,-1.475692,-1.475692,1.333778,1.597928,-1.305377,0.391381,0.806515,-1.099601
3,-0.038773,0.630383,0.103214,0.001854,-1.250211,-0.979633,1.333778,-0.469141,1.387026,1.479016,0.806515,-0.101644
4,-1.234296,0.568715,0.072312,0.003708,-1.480357,-1.360619,1.333778,-0.479145,0.428309,-0.841129,0.806515,-0.600622
5,-1.440795,0.771597,0.062706,0.0033,-1.443036,-1.443036,-0.902608,-0.140135,0.032802,-1.282456,0.810464,-1.099601
6,-1.473401,0.611126,0.052805,0.006601,-1.475692,-1.475692,-0.902608,1.750018,-1.537373,-0.531518,0.810464,-1.099601
7,-0.038773,0.611126,0.082508,0.009901,-1.250211,-0.979633,-0.902608,-0.473494,1.00281,0.976775,0.810464,-0.101644
8,0.917646,0.575589,0.110561,0.008251,-1.250211,-0.503791,-0.902608,-0.572259,0.723646,-0.121653,0.810464,0.896313
9,1.514895,0.558639,0.176568,0.016502,-1.443036,-1.443036,-0.902608,-0.56413,-0.221885,0.958853,0.810464,0.646824


In [None]:
print("Dataset reconstruit par l'autoencoder\n")
df_ex_reconstructed = dimmension_reduction(file, train=True)
df_ex_reconstructed.head(10)

Dataset reconstruit par l'autoencoder



Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11
0,0.856554,0.846129,0.037386,0.000299,-0.999998,-0.970956,0.998631,-0.958487,0.986236,0.994004,0.396969,0.306146
1,-0.999238,0.669945,0.054126,-0.000341,-1.0,-0.999537,-0.948367,0.99327,-0.972878,-0.958767,0.561759,-0.971982
2,0.7281,0.691775,0.086974,0.000793,-1.0,-0.88645,-0.764936,0.679688,-0.7278,0.979603,0.551545,0.09494
3,0.961344,0.684867,0.1013,0.000979,-0.999949,0.465964,-0.711195,0.479372,-0.479887,-0.003654,0.537393,0.895851
4,-0.600701,0.6998,0.063862,0.000399,-0.999996,-0.96717,-0.78022,0.498036,0.817911,0.445515,0.537026,-0.493002
5,-0.30415,0.870516,0.019428,-0.000422,-0.999779,-0.706844,0.999667,0.458006,-0.442188,-0.948304,0.372862,0.495393
6,-0.999716,0.849939,-0.007131,-0.00133,-1.0,-0.999766,0.997119,0.881686,-0.988588,-0.987289,0.414,-0.969804
7,0.316772,0.869496,0.019341,-0.000355,-0.999792,-0.332182,0.999567,0.649234,-0.733489,0.221352,0.378319,0.347951
8,0.915202,0.66181,0.094857,0.000917,-0.999978,0.38515,-0.934084,-0.799755,0.56486,0.956646,0.56161,0.528594
9,-0.699708,0.735986,0.068624,8.2e-05,-0.999996,-0.789265,-0.091743,-0.988232,0.161261,-0.446246,0.519817,-0.320055


In [None]:
print("Sortie de l'autoencoder sur le dataset\n")
df_ex_encoder = dimmension_reduction(file, train=False)
df_ex_encoder.head()

Sortie de l'autoencoder sur le dataset



Unnamed: 0,0,1,2,3,4,5
0,0.456379,-0.587782,-0.257624,-0.139295,0.635718,-0.542595
1,-0.463726,0.773509,0.579686,-0.421352,-0.205366,-0.176476
2,0.349477,0.534811,0.406434,-0.085701,0.546622,-0.308439
3,0.542759,0.497811,0.315521,0.140885,0.145491,-0.367145
4,0.098002,0.568902,-0.05109,-0.158279,0.216314,-0.310949
