Section 4.3.3 & 4.3.4
Depending on the nature of my labels it may make more sense to use either 4.3.3 method or 4.3.4 method to keep track of the labels. For example, in the wine rating case they look at in the book, keeping the integer scores makes sense. Grape variety on the other hand would be something that would make sense to use one-hot encoding for. One-hot encoding also works for the cases when the scores are only integers and the values in between don't have a meaning.

In [1]:
import torch

target = torch.tensor([1,2,3,4,5,6,7,8,9])

In [3]:
target_onehot = torch.zeros(target.shape[0],10)

Remember that when the command ends in an underscore the method will not return a new tensor, but will modify the tensor in place

In [4]:
target_onehot.scatter_(1,target.unsqueeze(1),1.0)

tensor([[0., 1., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 1., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 1., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 1., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 1., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 1., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 1., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 1., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 1.]])

--------------------------------------------------------------------------------------

### Section 4.5

In [5]:
import urllib.request
import zipfile
import os

# Define the URL of the zip file
zip_file_url = "https://gutenberg.org/files/1342/1342-0.zip"

# Define the file name after extraction
extracted_file_name = "1342-0.txt"

# Download the zip file
zip_file_path, _ = urllib.request.urlretrieve(zip_file_url, "data.zip")

# Extract the contents of the zip file
with zipfile.ZipFile(zip_file_path, "r") as zip_ref:
    zip_ref.extractall("extracted_data")

# Open and read the text file
with open(os.path.join("extracted_data", extracted_file_name), "r") as file:
    content = file.read()



In [6]:
lines = content.split('\n')
line = lines[200]
line


'for there was a distinctly feminine element in “Mr. Spectator,” and in'

In [7]:
letter_t = torch.zeros(len(line),128)
letter_t.shape

torch.Size([70, 128])

In [8]:
for i, letter in enumerate(line.lower().strip()): #makes everything lower case and removes blank spaces
  letter_index = ord(letter) if ord(letter) < 128 else 0
  letter_t[i][letter_index] = 1

In [14]:
letter_t

tensor([[0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        ...,
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.]])

### One-hot encoding whole words





In [9]:
def clean_words(input_str):
  punctuation = '.,;:"!?”“_-'
  word_list = input_str.lower().replace('\n',' ').split()
  word_list = [word.strip(punctuation) for word in word_list]
  return word_list

In [10]:
words_in_line = clean_words(line)
line , words_in_line

('for there was a distinctly feminine element in “Mr. Spectator,” and in',
 ['for',
  'there',
  'was',
  'a',
  'distinctly',
  'feminine',
  'element',
  'in',
  'mr',
  'spectator',
  'and',
  'in'])

In [11]:
word_list = sorted(set(clean_words(content)))
word2index_dict = {word: i for (i, word) in enumerate(word_list)}


In [13]:
len(word2index_dict), word2index_dict['feminine']

(7787, 2802)

In [15]:
word_t = torch.zeros(len(words_in_line), len(word2index_dict))
for i, word in enumerate(words_in_line):
  word_index = word2index_dict[word]
  word_t[i][word_index] = 1
  print('{:2} {:4} {}'.format(i, word_index, word))
print(word_t.shape)

 0 2915 for
 1 6897 there
 2 7460 was
 3  193 a
 4 2180 distinctly
 5 2802 feminine
 6 2374 element
 7 3647 in
 8 4600 mr
 9 6495 spectator
10  497 and
11 3647 in
torch.Size([12, 7787])


Clearly, one-hot encoding of words isnt the best approach. It already requires defining tensors with more than 7000 columns and if a new word were to be added we would have to add another column and retrain the model.

-------------------------------------------------------------------------------------------------------------------------------

## Exercises section 4

#### Ex 1

In [2]:
from google.colab import drive
drive.mount('/content/drive')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


The PIL library can load images in python. However, it is better to use imageio because this library is designed to load the images as np.arrays. Probably torch vision would be best in this case.

In [18]:
import imageio
import os
import numpy as np

data_path = '/content/drive/My Drive/Colab Notebooks/Colors'
image_list = []
batch_size = 3
batch = torch.zeros(batch_size, 3, 4080, 3072, dtype=torch.uint8)
filenames = [name for name in os.listdir(data_path) if os.path.splitext(name)[-1] == '.jpg']
for i, filename in enumerate(filenames):
        image_path = os.path.join(data_path, filename)
        image = imageio.imread(image_path)
        image_t = torch.from_numpy(image)
        image_t = image_t.permute(2,0,1)
        image_t = image_t[:3]
        batch[i] = image_t


  image = imageio.imread(image_path)


In [19]:
# convert batch data into float
batch = batch.float()

In [23]:
for c in range(batch_size):
  mean = torch.mean(batch[c,:])
  print('for image', c , 'the mean is ', mean.item())

for image 0 the mean is  91.8802719116211
for image 1 the mean is  93.59111022949219
for image 2 the mean is  114.51429748535156


In [24]:
n_channels = batch.shape[1]
for img in range(batch_size):
  for c in range(n_channels):
    mean = torch.mean(batch[img,c])
    print('for image', img, 'the mean is', mean.item())

for image 0 the mean is 140.0339813232422
for image 0 the mean is 64.24269104003906
for image 0 the mean is 71.36410522460938
for image 1 the mean is 100.17790222167969
for image 1 the mean is 98.92607116699219
for image 1 the mean is 81.66934967041016
for image 2 the mean is 113.93766021728516
for image 2 the mean is 115.5621337890625
for image 2 the mean is 114.04309844970703


#### Ex 2

In [25]:
python_file_path = '/content/drive/My Drive/Colab Notebooks/chapter4_ex2_aux_file.py'

with open(python_file_path, "r") as file:
    python_file = file.read()


In [26]:
code_lines = python_file.split('\n')
code_lines[10]

'from classy import Class'

In [29]:
def clean_code(input_str):
  special_chars = 'r"[^a-zA-Z0-9_]+"'
  code_list = input_str.replace('\n',' ').split()
  for i in range(len(code_list)):
    for char in special_chars:
      code_list[i] = code_list[i].replace(char, ' ')
  return code_list

In [30]:
trial_line = clean_code(code_lines[10])
code_lines[10], trial_line

('from classy import Class', ['f om', 'cl ssy', 'impo t', 'Cl ss'])

In [31]:
code = sorted(set(clean_code(python_file)))
code2index_dict = {element: i for (i, element) in enumerate(code)}

In [35]:
len(code2index_dict), code2index_dict['fo ']

(277, 207)

create one-hot encoding for the source code

In [45]:
## Define a function to one-hot encode the entire source file
## The output will be in batches to make it easier for a NN to import and manipulate
## For this reason I want all the batches to have the same length - add padding

def one_hot_encode_batch(python_file, code2index_dict):
  tokenized_lines = [line for line in clean_code(python_file)]
  max_sequence_length = max(len(line) for line in tokenized_lines)

  batch_size = len(tokenized_lines)
  num_classes = len(code2index_dict)
  batch = torch.zeros(batch_size, max_sequence_length,num_classes)

  for l, line in enumerate(tokenized_lines):
    for j, word in enumerate(line):
      code_index = code2index_dict.get(word, 0) #use 0 for unknown words
      batch[l, j, code_index] = 1

  return batch

In [46]:
test_batch = one_hot_encode_batch(python_file,code2index_dict)

In [48]:
test_batch.shape

torch.Size([674, 89, 277])

In [50]:
test_batch[0,1,:]

tensor([1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 