# Workshop on Recurrent networks 

## 1. Define some constants and some imports

In [1]:
no_reviews = 21967    # no of reviews that will be read from file. 
max_review_length = 100 # no of words per review.  reviews will be  truncated or padded to be of this length.
max_words = 5000        # this is the size of the index (i.e. most common top words that will be used as features)
                        # note code assumes there are enough words in reviews.
embedding_dim = 100     # length of embedding based on Glove
validation_prop = 0.2   # prop of data for validation set
no_epochs =   10         # No of training cycles for the networks
batch_size = 128        # batch size for training

In [2]:
import os
import gzip
import numpy as np

## 2. Read the Data

In [3]:
import csv
import pandas as pd

def read_data(file_path, no_reviews):
    # Split the data into input (review text) and output (rating)
    reviews = []
    ratings = []
    with open(file_path, mode='r', newline='', encoding='utf-8') as file:
        csv_reader = csv.reader(file)
        next(csv_reader)  # Skip the header row if it exists
        for row in csv_reader:
            # Assuming stars is at index 3 and review_body is at index 5
            ratings.append(row[3])
            reviews.append(row[5])

    return reviews, ratings
                                                              

In [4]:
file_path = "./reviews_for_classification.csv"

reviews, ratings = read_data(file_path, no_reviews)

print(f"Read in {len(ratings)} lines from {file_path}")
      
for i in range(6):
    print(f"{i} Rating: {ratings[i]}: {reviews[i]}")

Read in 21966 lines from ./reviews_for_classification.csv
0 Rating: 5: The best in all that matters! It's a great platform, easy and simple to use, and beginner-friendly. The only one in crypto that offers you to actually call a phone number and get to someone with your questions just like that. Highly recommend! I have been a customer for more than a year and I have only good things to say about Celsius network.
1 Rating: 5: If you are looking for the best #HomeForCrypto and where to earn steady yield then there is no better place than Celsius Network. The app is easy to use and understand with a recent update as well. The company is led by a great CEO who is fully engaged with the Celsius community! The mission of Celsius is to do good then do well and they live up to that very much. Rewards compound and pay out weekly!
2 Rating: 1: I despise it so much. Transferring to other wallets is difficult, especially because you cannot swap your coins, and there is a long waiting period when 

## 3. Pre-process the Data

In [5]:
# Use the tokenizer to code the reviews

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(reviews)

word_index = tokenizer.word_index
print(f'Found {len(word_index)} unique tokens')

sequences = tokenizer.texts_to_sequences(reviews)

print("Line 1:",reviews[1])
print("\nCoded : ", sequences[1])


Found 15368 unique tokens
Line 1: If you are looking for the best #HomeForCrypto and where to earn steady yield then there is no better place than Celsius Network. The app is easy to use and understand with a recent update as well. The company is led by a great CEO who is fully engaged with the Celsius community! The mission of Celsius is to do good then do well and they live up to that very much. Rewards compound and pay out weekly!

Coded :  [52, 16, 35, 407, 10, 3, 105, 4, 225, 2, 461, 3047, 776, 84, 72, 8, 27, 192, 405, 103, 159, 520, 3, 64, 8, 33, 2, 46, 4, 311, 12, 5, 1134, 624, 47, 172, 3, 82, 8, 2475, 97, 5, 44, 859, 158, 8, 1337, 4353, 12, 3, 159, 667, 3, 3048, 11, 159, 8, 2, 45, 76, 84, 45, 172, 4, 7, 451, 57, 2, 15, 40, 165, 366, 3049, 4, 116, 59, 680]


In [6]:
data = pad_sequences(sequences, maxlen= max_review_length)

ratings = np.asarray(ratings)
print('Shape of Data =', data.shape)
print('Shape of Labels =', ratings.shape)

Shape of Data = (21966, 100)
Shape of Labels = (21966,)


In [7]:
len_val = int(len(data) * validation_prop)

x_val = data[:len_val]
partial_x_train = data[len_val:]

y_val = ratings[:len_val]
partial_y_train = ratings[len_val:]

print('Length of validation set =', len(x_val))
print('Length of training set =', len(partial_x_train))
      

Length of validation set = 4393
Length of training set = 17573


## 4. Load the Glove Embeddings

In [8]:
glove_dir = './Glove 6B'

embeddings_index = {}

f = open(os.path.join(glove_dir,'glove.6B.100d.txt'),encoding='utf-8')
for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype = 'float32')
    embeddings_index[word] = coefs
f.close()

print('no of words in glove embeddings =', len(embeddings_index))


no of words in glove embeddings = 400000


In [9]:
#look for word embeddings

embedding_matrix = np.zeros((max_words, embedding_dim))

for word,i in word_index.items():
    if i < max_words:
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector
        

print("shape of embeddings matrix is:",  embedding_matrix.shape)

# print some entries
    
for word,i in word_index.items():
    if i > 10: break 
    print(f'{i}:{word}\t--> { embedding_matrix[i, 0:6]}')



shape of embeddings matrix is: (5000, 100)
1:i	--> [-0.046539    0.61966002  0.56647003 -0.46584001 -1.18900001  0.44599   ]
2:to	--> [-0.18970001  0.050024    0.19084001 -0.049184   -0.089737    0.21006   ]
3:the	--> [-0.038194   -0.24487001  0.72812003 -0.39961001  0.083172    0.043953  ]
4:and	--> [-0.071953    0.23127     0.023731   -0.50638002  0.33923     0.19589999]
5:a	--> [-0.27085999  0.044006   -0.02026    -0.17395     0.6444      0.71213001]
6:my	--> [ 0.080273   -0.10861     0.72066998 -0.45135999 -0.74959999  0.63782001]
7:they	--> [-0.07954     0.30171001  0.079516   -0.74662    -0.67878997  0.35029   ]
8:is	--> [-0.54263997  0.41475999  1.03219998 -0.40244001  0.46691     0.21816   ]
9:it	--> [-0.30664     0.16821     0.98510998 -0.33605999 -0.24160001  0.16186   ]
10:for	--> [-0.14401001  0.32554001  0.14257    -0.099227    0.72535998  0.19321001]


## 5. Define the network

In [10]:
print(embedding_matrix.shape)
print(max_words)
print(embedding_dim)

(5000, 100)
5000
100


In [11]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, SimpleRNN, Flatten, Dense

model = Sequential()
model.add(Embedding(max_words, embedding_dim))
model.add(SimpleRNN(32))
model.add(Dense(1, activation='sigmoid'))

model.compile(optimizer='rmsprop', loss='binary_crossentropy', metrics=['acc'])

model.layers[0].set_weights([embedding_matrix])
model.layers[0].trainable = False


model.summary()

ValueError: You called `set_weights(weights)` on layer 'embedding' with a weight list of length 1, but the layer was expecting 0 weights.

##  6. Plot results