In [10]:
# loading the data text
with open("data/Philosophy by Bertrand Russell.txt", 'r', encoding="utf-8") as f:
    text = f.read();

print(len(text))

628096


In [11]:
# tokenize

chars = sorted(set(text))
print(chars)

['\n', ' ', '!', '#', '$', '&', '(', ')', '*', '+', ',', '-', '.', '/', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':', ';', '=', '?', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', '[', ']', '_', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '\xa0', '²', '·', 'Æ', '×', 'æ', 'è', 'ö', 'œ', 'α', 'β', 'γ', '–', '‘', '’', '“', '”', '′', '\ufeff']


In [12]:
#encode

string_to_int = {char: index for index, char in enumerate(chars)}
int_to_string = {index: char for index, char in enumerate(chars)}

encode = lambda s: [string_to_int[c] for c in s]
decode = lambda l: "".join(int_to_string[i] for i in l)

print(encode("hello"))
print(decode(encode("hello")))

[64, 61, 68, 68, 71]
hello


In [13]:
import torch

# creating tensors of text
data = torch.tensor(encode(text), dtype=torch.long)
print(data[:10])

tensor([101,  47,  64,  61,   1,  43,  74,  71,  66,  61])


In [14]:
# splitting train and validation
n = int(len(data) * 0.8)
train_data = data[:n]
val_data = data[n:]

print(n, len(data)-n)

502476 125620


In [15]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(device)

cuda


# Comparison with torch and numpy

In [18]:
import numpy as np
import time

In [22]:
torch_rand1 = torch.rand(10000,10000).to(device)
torch_rand2 = torch.rand(10000,10000).to(device)
np.rand1 = torch.rand(10000,10000)
np.rand2 = torch.rand(10000,10000)

start_time = time.time()

# multiply
result = torch_rand1 @ torch_rand2

elapsed_time = time.time() - start_time
print(f"eslapsed time for torch: {elapsed_time:.8f}")

start_time = time.time()

# multiply
result = np.rand1 @ np.rand2

elapsed_time = time.time() - start_time
print(f"eslapsed time for numpy: {elapsed_time:.8f}")

eslapsed time for torch: 0.04200673
eslapsed time for numpy: 1.04733181


In [23]:
torch_rand1 = torch.rand(100,100,100,100).to(device)
torch_rand2 = torch.rand(100,100,100,100).to(device)
np.rand1 = torch.rand(100,100,100,100)
np.rand2 = torch.rand(100,100,100,100)

start_time = time.time()

# multiply
result = torch_rand1 @ torch_rand2

elapsed_time = time.time() - start_time
print(f"eslapsed time for torch: {elapsed_time:.8f}")

start_time = time.time()

# multiply
result = np.rand1 @ np.rand2

elapsed_time = time.time() - start_time
print(f"eslapsed time for numpy: {elapsed_time:.8f}")

eslapsed time for torch: 0.05235982
eslapsed time for numpy: 0.10469270
