<a href="https://colab.research.google.com/github/Bohdan-at-Kulinich/Bohdan-at-Kulinich/blob/main/Class_Regress.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Loading the imdb dataset  

 # keep only the top 10,000 most frequently occurring words in the training data
 # rare words are discarded to keep the data at a manageable size
 # the whole ds contains 85,585 unique words, some occurring in only a single sample,
 # which can't be meaningfully used in classification

from tensorflow.keras.datasets import imdb
# Unpack the training and test splits into their (data, labels) pairs.
(train_data, train_labels), (test_data, test_labels) = imdb.load_data(
    num_words=10000
)


In [None]:
 # train_data and test_data are lists of reviews
 # each review is a list of word indices
 # train_labels and test_labels are lists of 0 (negative) and 1 (positive)

# A notebook cell only displays the value of its last expression, so show the
# first review and its label together as a single (review, label) tuple.
train_data[0], train_labels[0]


In [5]:
# the max value of the word index:
# find the max val for each seq and then extract the max from the final list


max(map(max, train_data))

9999

In [16]:
# decoding reviews back to words:

# Word -> integer index mapping provided by the imdb module.
word_index = imdb.get_word_index()

# Invert the mapping so that an integer index can be looked up to recover its word.
reverse_word_index = {index: word for word, index in word_index.items()}

# Stored indices are shifted by 3 because 0, 1 and 2 are reserved for
# 'padding', 'start of sequence' and 'unknown'; any index without an entry
# in the inverted mapping is rendered as "?".
decoded_review = " ".join(
    reverse_word_index.get(token - 3, "?") for token in train_data[1]
)



In [None]:
# Preview only the first few (word, index) pairs; printing the whole
# vocabulary floods the cell output.
print(list(word_index.items())[:10])

In [None]:
# Preview only the first few (index, word) pairs instead of dumping the
# whole mapping into the cell output.
print(list(reverse_word_index.items())[:10])

In [None]:
# Print the text decoded from the word indices of train_data[1].
print(decoded_review)

### Preparing the data 

In [18]:
# The lists of integers must be turned into tensors before a model can use them.
# There are two ways to do that:
# 1) pad the lists so they all have the same length and turn them into an integer
#    tensor of shape (samples, max_length), then start the model with an Embedding layer;
# 2) multi-hot encode the lists to turn them into vectors of 0s and 1s,
#    then use a Dense layer as the first layer of the model.

In [25]:
# Encoding the integer sequences via multi-hot encoding 

import numpy as np 
def vectroize_sequences(sequences, dimension=10000):
  """Multi-hot encode integer sequences into a 2-D float matrix.

  Row ``i`` of the result holds 1.0 in every column whose index appears in
  ``sequences[i]`` and 0.0 everywhere else; repeated indices count once.

  Args:
    sequences: sized iterable of sequences of non-negative integer indices.
    dimension: number of columns of the result; every index must be
      strictly smaller than this value.

  Returns:
    A float64 NumPy array of shape ``(len(sequences), dimension)``.

  Raises:
    IndexError: if an index is not an integer, is negative, or is greater
      than or equal to ``dimension``.
  """
  # Start from an all-zero matrix with one row per sequence.
  results = np.zeros((len(sequences), dimension))
  for row, sequence in enumerate(sequences):
    indices = np.asarray(sequence)
    if indices.size == 0:
      # Nothing to mark for an empty sequence; the row stays all zeros.
      continue
    if not np.issubdtype(indices.dtype, np.integer):
      raise IndexError(
          f"sequence {row} contains non-integer indices (dtype {indices.dtype})")
    if indices.min() < 0:
      # NumPy would silently wrap a negative index around to the last
      # columns, marking a word that never occurred in the sequence.
      raise IndexError(
          f"sequence {row} contains a negative index ({indices.min()})")
    # One fancy-indexed assignment per row replaces the per-element Python
    # loop; indices >= dimension still raise IndexError here.
    results[row, indices] = 1.0
  return results

# Multi-hot encode the reviews of both splits into float matrices.
x_train, x_test = vectroize_sequences(train_data), vectroize_sequences(test_data)

# Cast the 0/1 labels of both splits to float32 arrays.
y_train, y_test = (
    np.asarray(train_labels).astype(np.float32),
    np.asarray(test_labels).astype(np.float32),
)

### Building the model 

In [26]:
# the input data is vectors, the labels are scalars (0, 1)
# this kind of problem is handled well by a plain stack of densely connected (Dense) layers with relu activations

# Architecture: three-layer model 
# two intermediate layers with 16 units each
# third layer to output the scalar predictions 

from tensorflow import keras 
from tensorflow.keras import layers 

# each Dense layer with a relu activation implements the chain of tensor operations:
# output = relu(dot(input, W) + b)
# with 16 units the weight matrix W will have shape (input_dimension, 16)
# relu (rectified linear unit) is meant to zero out negative values
# sigmoid squashes arbitrary values into the [0, 1] interval, outputting something that can be interpreted as a probability
# Two 16-unit relu hidden layers followed by a single sigmoid output unit.
model = keras.Sequential(
    layers=[
        layers.Dense(units=16, activation="relu"),
        layers.Dense(units=16, activation="relu"),
        layers.Dense(units=1, activation="sigmoid"),
    ]
)