<a href="https://colab.research.google.com/github/BlackCat1606/ProjetPseudo/blob/master/pseudo_label_DL.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
!pip install MulticoreTSNE
%matplotlib inline
from MulticoreTSNE import MulticoreTSNE as TSNE
from matplotlib import pyplot as plt
import torch
from torchvision import datasets, transforms
from torch import nn
import torch.nn.functional as F
import numpy as np

torch.manual_seed(42)
np.random.seed(42)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False



In [0]:
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [0]:
import pandas as pd 
train_data = "./gdrive/My Drive/ProjetPseudo/data/mnist_train.csv"
test_data = "./gdrive/My Drive/ProjetPseudo/data/mnist_test.csv"
UNLABELED_BS = 256
TRAIN_BS = 32
TEST_BS = 1024

num_train_samples = 1000
samples_per_class = int(num_train_samples/9)

x = pd.read_csv(train_data)
y = x['label']
x.drop(['label'], inplace = True, axis = 1)

x_test = pd.read_csv(test_data)
y_test = x_test['label']
x_test.drop(['label'], inplace = True, axis = 1)


Maintenant, on va diviser l'ensemble des données en ensembles étiquetée et ensembles non étiquetés. Pour l'ensemble entrainé, nous nous assurerons d'avoir des échantillons égaux pour les 10 classes. (Équilibrage des classes)


In [0]:
x_train, x_unlabeled = x[y.values == 0].values[:samples_per_class], x[y.values == 0].values[samples_per_class:]
y_train = y[y.values == 0].values[:samples_per_class]

for i in range(1,10):
    x_train = np.concatenate([x_train, x[y.values == i].values[:samples_per_class]], axis = 0)
    y_train = np.concatenate([y_train, y[y.values == i].values[:samples_per_class]], axis = 0)
    
    x_unlabeled = np.concatenate([x_unlabeled, x[y.values == i].values[samples_per_class:]], axis = 0)

Ensuite, nous normaliserons les données, les convertirons en tensors et créerons les chargeurs de données pour les ensembles entrainés, sans étiquette et de test.

In [0]:

from sklearn.preprocessing import Normalizer

normalizer = Normalizer()
x_train = normalizer.fit_transform(x_train)
x_unlabeled = normalizer.transform(x_unlabeled)
x_test = normalizer.transform(x_test.values)

In [0]:
x_train = torch.from_numpy(x_train).type(torch.FloatTensor)
y_train = torch.from_numpy(y_train).type(torch.LongTensor) 

x_test = torch.from_numpy(x_test).type(torch.FloatTensor)
y_test = torch.from_numpy(y_test.values).type(torch.LongTensor)

In [0]:

train = torch.utils.data.TensorDataset(x_train, y_train)
test = torch.utils.data.TensorDataset(x_test, y_test)

train_loader = torch.utils.data.DataLoader(train, batch_size = TRAIN_BS, shuffle = True, num_workers = 8)

unlabeled_train = torch.from_numpy(x_unlabeled).type(torch.FloatTensor)

unlabeled = torch.utils.data.TensorDataset(unlabeled_train)
unlabeled_loader = torch.utils.data.DataLoader(unlabeled, batch_size = UNLABELED_BS, shuffle = True, num_workers = 8)

test_loader = torch.utils.data.DataLoader(test, batch_size = TEST_BS, shuffle = True, num_workers = 8)

### Architecture de réseau

Nous utiliserons un simple réseau de couche Conv + 2 FC à 2 couches avec le dropout.


In [0]:
# Architecture  : https://github.com/peimengsui/semi_supervised_mnist
class Net(nn.Module):
        def __init__(self):
            super(Net, self).__init__()
            self.conv1 = nn.Conv2d(1, 20, kernel_size=5)
            self.conv2 = nn.Conv2d(20, 40, kernel_size=5)
            self.conv2_drop = nn.Dropout2d()
            self.fc1 = nn.Linear(640, 150)
            self.fc2 = nn.Linear(150, 10)
            self.log_softmax = nn.LogSoftmax(dim = 1)

        def forward(self, x):
            x = x.view(-1,1,28,28)
            x = F.relu(F.max_pool2d(self.conv1(x), 2))
            x = F.relu(F.max_pool2d(self.conv2_drop(self.conv2(x)), 2))
            x = x.view(-1, 640)
            x = F.relu(self.fc1(x))
            x = F.dropout(x, training=self.training)
            x = F.relu(self.fc2(x))
            x = self.log_softmax(x)
            return x
        
net = Net().cuda()

Définissons maintenant une fonction pour évaluer le réseau et obtenir des valeurs de perte et de précision.

In [0]:
def evaluate(model, test_loader):
    model.eval()
    correct = 0 
    loss = 0
    with torch.no_grad():
        for data, labels in test_loader:
            data = data.cuda()
            output = model(data)
            predicted = torch.max(output,1)[1]
            correct += (predicted == labels.cuda()).sum()
            loss += F.nll_loss(output, labels.cuda()).item()

    return (float(correct)/len(test)) *100, (loss/len(test_loader))


Tout d'abord, entraînons le modèle sur l'ensemble étiqueté pour 100 époques


In [0]:
from tqdm import tqdm_notebook
def train_supervised(model, train_loader, test_loader):
    optimizer = torch.optim.SGD( model.parameters(), lr = 0.1)
    EPOCHS = 100
    model.train()
    for epoch in tqdm_notebook(range(EPOCHS)):
        correct = 0
        running_loss = 0
        for batch_idx, (X_batch, y_batch) in enumerate(train_loader):
            X_batch, y_batch = X_batch.cuda(), y_batch.cuda()
            
            output = model(X_batch)
            labeled_loss = F.nll_loss(output, y_batch)
                       
            optimizer.zero_grad()
            labeled_loss.backward()
            optimizer.step()
            running_loss += labeled_loss.item()
        
        if epoch %10 == 0:
            test_acc, test_loss = evaluate(model, test_loader)
            print('Epoch: {} : Train Loss : {:.5f} | Test Acc : {:.5f} | Test Loss : {:.3f} '.format(epoch, running_loss/(10 * len(train)), test_acc, test_loss))
            model.train()

In [0]:
train_supervised(net, train_loader, test_loader)

HBox(children=(IntProgress(value=0), HTML(value='')))

Epoch: 0 : Train Loss : 0.00726 | Test Acc : 12.74000 | Test Loss : 2.302 
Epoch: 10 : Train Loss : 0.00725 | Test Acc : 29.36000 | Test Loss : 2.296 
Epoch: 20 : Train Loss : 0.00534 | Test Acc : 54.52000 | Test Loss : 1.459 
Epoch: 30 : Train Loss : 0.00153 | Test Acc : 88.05000 | Test Loss : 0.406 
Epoch: 40 : Train Loss : 0.00100 | Test Acc : 92.33000 | Test Loss : 0.246 
Epoch: 50 : Train Loss : 0.00076 | Test Acc : 93.85000 | Test Loss : 0.208 
Epoch: 60 : Train Loss : 0.00062 | Test Acc : 93.89000 | Test Loss : 0.194 
Epoch: 70 : Train Loss : 0.00053 | Test Acc : 94.40000 | Test Loss : 0.180 
Epoch: 80 : Train Loss : 0.00036 | Test Acc : 94.60000 | Test Loss : 0.183 
Epoch: 90 : Train Loss : 0.00032 | Test Acc : 95.04000 | Test Loss : 0.181 



In [0]:
test_acc, test_loss = evaluate(net, test_loader)
print('Test Acc : {:.5f} | Test Loss : {:.3f} '.format(test_acc, test_loss))
torch.save(net.state_dict(), 'saved_models/supervised_weights')

Test Acc : 94.95000 | Test Loss : 0.182 


In [0]:
net.load_state_dict(torch.load('saved_models/supervised_weights'))

<All keys matched successfully>

In [0]:
T1 = 100
T2 = 700
af = 3

def alpha_weight(epoch):
    if epoch < T1:
        return 0.0
    elif epoch > T2:
        return af
    else:
         return ((epoch-T1) / (T2-T1))*af

In [0]:
# Concept : https://github.com/peimengsui/semi_supervised_mnist

from tqdm import tqdm_notebook

acc_scores = []
unlabel = []
pseudo_label = []

alpha_log = []
test_acc_log = []
test_loss_log = []
def semisup_train(model, train_loader, unlabeled_loader, test_loader):
    optimizer = torch.optim.SGD(model.parameters(), lr = 0.1)
    EPOCHS = 150
    
    # Au lieu d'utiliser l'époque actuelle, nous utilisons une variable "step" pour calculer l'alpha_weight
    # Cela permet au modèle de converger plus rapidement

    step = 100 
    
    model.train()
    for epoch in tqdm_notebook(range(EPOCHS)):
        for batch_idx, x_unlabeled in enumerate(unlabeled_loader):
            
            
            # Pass avant pour obtenir les pseudo étiquettes
            x_unlabeled = x_unlabeled[0].cuda()
            model.eval()
            output_unlabeled = model(x_unlabeled)
            _, pseudo_labeled = torch.max(output_unlabeled, 1)
            model.train()
            
            
            """ UNIQUEMENT POUR LA VISUALISATION"""
            if (batch_idx < 3) and (epoch % 10 == 0):
                unlabel.append(x_unlabeled.cpu())
                pseudo_label.append(pseudo_labeled.cpu())
            """ ********************** """
            
            # Maintenant, calculez la perte sans étiquette en utilisant le pseudo label
            output = model(x_unlabeled)
            unlabeled_loss = alpha_weight(step) * F.nll_loss(output, pseudo_labeled)   
            
            # Propagation arrière
            optimizer.zero_grad()
            unlabeled_loss.backward()
            optimizer.step()
            
            
            # Pour 50 lots, entraînez une époque sur des données étiquetées
            if batch_idx % 50 == 0:
                
                # Procédure d'entraînement normale
                for batch_idx, (X_batch, y_batch) in enumerate(train_loader):
                    X_batch = X_batch.cuda()
                    y_batch = y_batch.cuda()
                    output = model(X_batch)
                    labeled_loss = F.nll_loss(output, y_batch)

                    optimizer.zero_grad()
                    labeled_loss.backward()
                    optimizer.step()
                                   
                # Maintenant, nous incrémentons l'étape de 1
                step += 1
                

        test_acc, test_loss =evaluate(model, test_loader)
        print('Epoch: {} : Alpha Weight : {:.5f} | Test Acc : {:.5f} | Test Loss : {:.3f} '.format(epoch, alpha_weight(step), test_acc, test_loss))
        
        """ LOGGING VALUES """
        alpha_log.append(alpha_weight(step))
        test_acc_log.append(test_acc/100)
        test_loss_log.append(test_loss)
        """ ************** """
        model.train()
        

In [0]:
semisup_train(net, train_loader, unlabeled_loader, test_loader)

HBox(children=(IntProgress(value=0, max=150), HTML(value='')))

Epoch: 0 : Alpha Weight : 0.02500 | Test Acc : 94.78000 | Test Loss : 0.185 
Epoch: 1 : Alpha Weight : 0.05000 | Test Acc : 95.08000 | Test Loss : 0.178 
Epoch: 2 : Alpha Weight : 0.07500 | Test Acc : 94.90000 | Test Loss : 0.190 
Epoch: 3 : Alpha Weight : 0.10000 | Test Acc : 95.32000 | Test Loss : 0.185 
Epoch: 4 : Alpha Weight : 0.12500 | Test Acc : 94.88000 | Test Loss : 0.194 
Epoch: 5 : Alpha Weight : 0.15000 | Test Acc : 95.11000 | Test Loss : 0.187 
Epoch: 6 : Alpha Weight : 0.17500 | Test Acc : 95.57000 | Test Loss : 0.173 
Epoch: 7 : Alpha Weight : 0.20000 | Test Acc : 95.61000 | Test Loss : 0.180 
Epoch: 8 : Alpha Weight : 0.22500 | Test Acc : 95.71000 | Test Loss : 0.172 
Epoch: 9 : Alpha Weight : 0.25000 | Test Acc : 95.84000 | Test Loss : 0.162 
Epoch: 10 : Alpha Weight : 0.27500 | Test Acc : 95.96000 | Test Loss : 0.161 
Epoch: 11 : Alpha Weight : 0.30000 | Test Acc : 95.95000 | Test Loss : 0.158 
Epoch: 12 : Alpha Weight : 0.32500 | Test Acc : 95.78000 | Test Loss : 0.1

In [0]:
test_acc, test_loss = evaluate(net, test_loader)
print('Test Acc : {:.5f} | Test Loss : {:.3f} '.format(test_acc, test_loss))
torch.save(net.state_dict(), 'saved_models/semi_supervised_weights')

Test Acc : 98.31000 | Test Loss : 0.072 


## Visualizations

In [0]:
print(len(unlabel))
unlabe = np.concatenate([u.cpu().numpy() for u in unlabel])
pseudo_labe = np.concatenate([u.cpu().numpy() for u in pseudo_label])

45


In [0]:
!pip install wurlitzer
%load_ext wurlitzer

Collecting wurlitzer
  Downloading https://files.pythonhosted.org/packages/24/5e/f3bd8443bfdf96d2f5d10097d301076a9eb55637b7864e52d2d1a4d8c72a/wurlitzer-2.0.0-py2.py3-none-any.whl
Installing collected packages: wurlitzer
Successfully installed wurlitzer-2.0.0


In [0]:

train_data = "./gdrive/My Drive/ProjetPseudo/data/mnist_train.csv"
x = pd.read_csv(train_data)
y = x['label']
x.drop(['label'], inplace = True, axis = 1)

x = normalizer.transform(x.values)

tsne_x0 = np.concatenate([x, x_train,unlabel[0]])
print(1)
tsne_x1 = np.concatenate([x, x_train,unlabel[1]])
tsne_x2 = np.concatenate([x, x_train,unlabel[2]])
tsne_x = np.concatenate([tsne_x0, tsne_x1,tsne_x2])
tsne_y0 = np.concatenate([y.values, y_train, pseudo_label[0]])
tsne_y1 = np.concatenate([y.values, y_train, pseudo_label[1]])
tsne_y2 = np.concatenate([y.values, y_train, pseudo_label[2]])
tsne_y = np.concatenate([tsne_y0, tsne_y1,tsne_y2])
print(2)



embeddings = TSNE(perplexity = 30, n_jobs=8, verbose = 1, n_iter = 100).fit_transform(tsne_x)

1


tcmalloc: large alloc 1154670592 bytes == 0x145a06000 @  0x7fa92a9261e7 0x7fa9228e7f71 0x7fa92294b55d 0x7fa92294b733 0x7fa9229e9768 0x7fa9229e9fc4 0x7fa9229ea112 0x5678b3 0x5a067e 0x7fa92293706d 0x50a8af 0x50c5b9 0x508245 0x50a080 0x50aa7d 0x50c5b9 0x508245 0x516915 0x50a8af 0x50c5b9 0x508245 0x50a080 0x50aa7d 0x50c5b9 0x508245 0x50a080 0x50aa7d 0x50d390 0x508245 0x509642 0x595311


2


Performing t-SNE using 8 cores.
Using no_dims = 2, perplexity = 30.000000, and theta = 0.500000
Computing input similarities...
Building tree...
 - point 18409 of 184098
 - point 36818 of 184098
 - point 55227 of 184098
 - point 73636 of 184098
 - point 92045 of 184098
 - point 110454 of 184098
 - point 128863 of 184098
 - point 147272 of 184098
 - point 165681 of 184098
 - point 184090 of 184098
Done in 1494.00 seconds (sparsity = 0.000743)!
Learning embedding...
Iteration 51: error is 130.563781 (50 iterations in 56.00 seconds)
Iteration 100: error is 131.172893 (50 iterations in 59.00 seconds)
Fitting performed in 115.00 seconds.


In [0]:
from tqdm import tqdm_notebook
%matplotlib
plt.figure(figsize=(15,10))
step_size = UNLABELED_BS * 3
base_index = x.shape[0]
epoch = 0
for i in tqdm_notebook(range(0,np.array([t.numpy() for t in unlabel]).shape[0], step_size)):
    plt.scatter(embeddings[:base_index, 0], embeddings[:base_index, 1], c=tsne_y[:base_index], cmap=plt.cm.get_cmap("jet", 10), marker='s', alpha = 0.002, s = 14**2)
    a = base_index
    b = base_index + num_train_samples
    plt.scatter(embeddings[a:b, 0], embeddings[a:b, 1], c=tsne_y[a:b], cmap=plt.cm.get_cmap("jet", 10), marker='o', alpha = 0.3, s = 90**1)
    a = base_index + num_train_samples + i
    b = base_index + num_train_samples + i + step_size
    plt.scatter(embeddings[a:b, 0], embeddings[a:b, 1], c=tsne_y[a:b], cmap=plt.cm.get_cmap("jet", 10), marker='*', s = 150**1)
    plt.colorbar(ticks=range(10))
    plt.clim(-0.5, 9.5)
    plt.title('Epoch : ' + str(epoch) +'  Test Acc : {:.2f}%'.format(test_acc_log[epoch]*100), fontsize = 20)
    plt.savefig('imgs/tsne' + str(i) + '.png')
    plt.draw()
    plt.pause(5)
    plt.clf()
    epoch += 10


Using matplotlib backend: agg


HBox(children=(IntProgress(value=0, max=1), HTML(value='')))


