## Preparing imdb.json

In [1]:
import json
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import TensorDataset, DataLoader
import time
import psutil
import os
import gc

In [2]:
with open('/content/drive/MyDrive/Colab Notebooks/KM3/imdb_dataset_prepared.json', 'r', encoding='utf-8') as file:
    data = json.load(file)

In [3]:
# Wyciągnięcie danych
X_train = np.array(data['X_train'])
X_test = np.array(data['X_test'])
y_train = np.array(data['y_train'])
y_test = np.array(data['y_test'])
embeddings = np.array(data['embeddings'])
vocab = data['vocab']

# Dodanie jednego pustego elementu, by dane treningowe i testowe sie zgadzaly
vocab.insert(0, '')
embeddings = np.insert(embeddings, 0, np.zeros(50), axis=0) # Update embeddings here

vocab_size = len(vocab)
embedding_dim = embeddings.shape[1]
sequence_length = X_train.shape[1]

In [4]:
print(f"X_train shape: {X_train.shape}")
print(f"X_test shape: {X_test.shape}")
print(f"y_train shape: {y_train.shape}")
print(f"y_test shape: {y_test.shape}")
print(f"Vocab size: {vocab_size}")
print(f"Embeddings shape: {embeddings.shape}")
print(f"Embeddings dim: {embedding_dim}")
print(f"Sequence length: {sequence_length}")

X_train shape: (40000, 130)
X_test shape: (10000, 130)
y_train shape: (40000,)
y_test shape: (10000,)
Vocab size: 12850
Embeddings shape: (12850, 50)
Embeddings dim: 50
Sequence length: 130


## Konwersja danych do tensorów PyTorch i DataLoader

In [5]:
train_texts_tensor = torch.tensor(X_train, dtype=torch.long)
train_label_tensor = torch.tensor(y_train, dtype=torch.float)

test_texts_tensor = torch.tensor(X_test, dtype=torch.long)
test_label_tensor = torch.tensor(y_test, dtype=torch.float)

train_dataset = TensorDataset(train_texts_tensor, train_label_tensor)
test_dataset = TensorDataset(test_texts_tensor, test_label_tensor)

batch_size = 64
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_dataloader = DataLoader(test_dataset, batch_size=batch_size)

In [6]:
print(f"train_texts_tensor shape: {train_texts_tensor.shape}")
print(f"train_label_tensor shape: {train_label_tensor.shape}")
print(f"test_texts_tensor shape: {test_texts_tensor.shape}")
print(f"test_label_tensor shape: {test_label_tensor.shape}")

print(f"Liczba batchy w zbiorze treningowym: {len(train_dataloader)}")
print(f"Liczba batchy w zbiorze testowym: {len(test_dataloader)}")

train_texts_tensor shape: torch.Size([40000, 130])
train_label_tensor shape: torch.Size([40000])
test_texts_tensor shape: torch.Size([10000, 130])
test_label_tensor shape: torch.Size([10000])
Liczba batchy w zbiorze treningowym: 625
Liczba batchy w zbiorze testowym: 157


## Tworzenie sieci CNN

In [7]:
class TextCNN(nn.Module):
  def __init__(self, vocab_size, embedding_dim, glove_embeddings, max_sequence_length):
    super(TextCNN, self).__init__()

    # 1. embedding layer
    self.embedding = nn.Embedding(vocab_size, embedding_dim)
    self.embedding.load_state_dict({'weight': torch.from_numpy(embeddings)})

    # 2. conv layer
    self.conv1d = nn.Conv1d(in_channels=embedding_dim, out_channels=8, kernel_size=3)

    # 3. maxpool layer
    self.maxpool1d = nn.MaxPool1d(kernel_size = 8)

    # 4. dense layer
    self.fc_input_features = 8 * ((sequence_length - self.conv1d.kernel_size[0] + 1 - self.maxpool1d.kernel_size) // self.maxpool1d.kernel_size + 1)
    expected_dense_input = 128
    if self.fc_input_features != expected_dense_input:
      raise ValueError(f"Expected dense input to be {expected_dense_input}, but got {self.fc_input_features}")

    self.fc = nn.Linear(self.fc_input_features, 1)
    self.sigmoid = nn.Sigmoid()

  def forward(self, x):
    x = self.embedding(x)
    x = x.permute(0, 2, 1)
    x = F.relu(self.conv1d(x))
    x = self.maxpool1d(x)
    x = torch.flatten(x, 1)
    x = self.fc(x)
    x = self.sigmoid(x)
    return x


model = TextCNN(
    vocab_size=vocab_size,
    embedding_dim=embedding_dim,
    glove_embeddings=embeddings,
    max_sequence_length=sequence_length
)

In [8]:
print(model)

TextCNN(
  (embedding): Embedding(12850, 50)
  (conv1d): Conv1d(50, 8, kernel_size=(3,), stride=(1,))
  (maxpool1d): MaxPool1d(kernel_size=8, stride=8, padding=0, dilation=1, ceil_mode=False)
  (fc): Linear(in_features=128, out_features=1, bias=True)
  (sigmoid): Sigmoid()
)


## Trening CNN

In [10]:
device = torch.device("cpu")
model.to(device)
criterion = nn.BCELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
epochs = 5

def get_memory_usage_gb():
    """Zwraca aktualne zużycie pamięci RAM w GiB"""
    process = psutil.Process(os.getpid())
    return process.memory_info().rss / 1024 / 1024 / 1024  # GiB

model.train()
for epoch in range(epochs):
    print(f"Epoch: {epoch+1}")

    # Pomiar czasu epoki i GC
    epoch_start_time = time.time()
    gc_start_time = time.time()

    total_loss = 0.0

    for batch_idx, (texts, labels) in enumerate(train_dataloader):
        texts = texts.to(device)
        labels = labels.to(device)

        optimizer.zero_grad()
        outputs = model(texts)
        loss = criterion(outputs.squeeze(1), labels)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    # Wymuszenie garbage collection i pomiar czasu GC
    gc.collect()
    gc_end_time = time.time()
    gc_time = gc_end_time - gc_start_time

    # Obliczenie metryk epoki
    epoch_end_time = time.time()
    epoch_duration = epoch_end_time - epoch_start_time
    total_memory_gb = get_memory_usage_gb()
    avg_loss = total_loss / len(train_dataloader)

    # Wyświetlenie wyników w żądanym formacie
    print(f"Epoch: {epoch+1} \tTrain: (l: {avg_loss:.4f}) \tTotal Epoch Time: {epoch_duration:.4f}s \tTotal Alloc: {total_memory_gb:.3f} GiB \tGC Time: {gc_time:.4f}s")

Epoch: 1
Epoch: 1 	Train: (l: 0.4114) 	Total Epoch Time: 9.8159s 	Total Alloc: 0.961 GiB 	GC Time: 9.8159s
Epoch: 2
Epoch: 2 	Train: (l: 0.2925) 	Total Epoch Time: 9.6786s 	Total Alloc: 0.961 GiB 	GC Time: 9.6786s
Epoch: 3
Epoch: 3 	Train: (l: 0.2344) 	Total Epoch Time: 8.4488s 	Total Alloc: 0.961 GiB 	GC Time: 8.4488s
Epoch: 4
Epoch: 4 	Train: (l: 0.1871) 	Total Epoch Time: 9.9511s 	Total Alloc: 0.961 GiB 	GC Time: 9.9511s
Epoch: 5
Epoch: 5 	Train: (l: 0.1461) 	Total Epoch Time: 9.8260s 	Total Alloc: 0.960 GiB 	GC Time: 9.8260s


## Test sieci CNN

In [None]:
# Rozpoczęcie pomiaru czasu testowania
test_start_time = time.time()

model.eval()
correct_predictions = 0
total_samples = 0
test_loss = 0.0

with torch.no_grad():
    for texts, labels in test_dataloader:
        texts = texts.to(device)
        labels = labels.to(device)
        outputs = model(texts)

        # Obliczenie loss dla testu
        loss = criterion(outputs.squeeze(1), labels)
        test_loss += loss.item()

        # Obliczenie predykcji i accuracy
        predictions = (outputs > 0.5).float()
        correct_predictions += (predictions.squeeze(1) == labels).sum().item()
        total_samples += labels.size(0)

# Zakończenie pomiaru czasu
test_end_time = time.time()
test_duration = test_end_time - test_start_time

# Obliczenie średnich wartości
avg_test_loss = test_loss / len(test_dataloader)
accuracy = correct_predictions / total_samples * 100  # w procentach

# Wyświetlenie wyników w żądanym formacie
print(f"Test Loss (czas: {test_duration:.2f}s): {avg_test_loss:.4f}")
print(f"Test Accuracy: {accuracy} %")

Test Loss (czas: 0.53s): 0.3290
Test Accuracy: 87.71 %
