### Data Preprocessing 
You may use your own preprocessing approach to improve the results. The best results will earn bonus points.

In [None]:
import pandas as pd
import nltk
import tensorflow as tf
import torch
import torch.nn as nn
import numpy as np
from wordcloud import WordCloud
import matplotlib.pyplot as plt

In [None]:
# TODO: replace ?? with the path to the reviews CSV file. Later cells read
# its `review` and `sentiment` columns.
reviews = pd.read_csv(??)

In [None]:
# Lazily-filled cache of the NLTK word sets used by preprocess(), so the
# corpora are read and converted only once instead of on every call.
_PREPROCESS_WORD_SETS = {}


def _preprocess_word_sets():
    """Return ``(stop_words, vocabulary)`` as frozensets, loading them once."""
    if not _PREPROCESS_WORD_SETS:
        # Build both sets before storing either, so a failed corpus lookup
        # never leaves the cache half-populated.
        stop_words = frozenset(nltk.corpus.stopwords.words('english'))
        vocabulary = frozenset(nltk.corpus.words.words())
        _PREPROCESS_WORD_SETS.update(
            stop_words=stop_words, vocabulary=vocabulary)
    return (_PREPROCESS_WORD_SETS['stop_words'],
            _PREPROCESS_WORD_SETS['vocabulary'])


def preprocess(text):
    """Clean one review and wrap it in ``<start>`` / ``<end>`` markers.

    The text is lower-cased, split into runs of word characters (which drops
    punctuation), stripped of English stop words, and restricted to tokens
    that appear in NLTK's English word list.

    Args:
        text: The raw review as a string.

    Returns:
        A single space-separated string that begins with ``<start>`` and
        ends with ``<end>``.
    """
    # Set membership is O(1); the original scanned two lists for every token.
    stop_words, vocabulary = _preprocess_word_sets()

    tokens = nltk.RegexpTokenizer(r"\w+").tokenize(text.lower())
    kept = [word for word in tokens
            if word not in stop_words and word in vocabulary]

    return ' '.join(['<start>', *kept, '<end>'])

In [None]:
def encode_text(text):
    """Fit a Keras tokenizer on ``text`` and encode it as padded sequences.

    Args:
        text: A list of strings, one per document.

    Returns:
        A ``(padded_sequences, tokenizer)`` tuple: the integer-encoded texts
        padded at the end, and the fitted tokenizer whose vocabulary has
        been extended with a ``'<pad>'`` entry at index 0.
    """
    # Keras' default filter set minus '<' and '>', so that marker tokens such
    # as '<start>' / '<end>' are kept intact instead of being reduced to the
    # ordinary words 'start' / 'end'.
    tokenizer = tf.keras.preprocessing.text.Tokenizer(
        filters='!"#$%&()*+,-./:;=?@[\\]^_`{|}~\t\n')
    tokenizer.fit_on_texts(text)

    # Converting to sequences
    sequences = tokenizer.texts_to_sequences(text)

    # Register the padding token under index 0 in both lookup tables so the
    # padded positions can be decoded back to a symbol.
    tokenizer.word_index['<pad>'] = 0
    tokenizer.index_word[0] = '<pad>'
    padded_sequences = tf.keras.preprocessing.sequence.pad_sequences(
        sequences, padding='post')

    return padded_sequences, tokenizer

In [None]:
# Clean the first ten reviews, then encode them as padded integer sequences.
text = [preprocess(review) for review in reviews.review[:10]]
encodings, tokenizer = encode_text(text)

In [None]:
# Model inputs: one padded integer sequence per encoded review.
X = encodings
# Binary labels: 1 where the sentiment is 'positive', 0 otherwise. They are
# truncated to len(X) so that X and y always have the same number of rows.
# NOTE(review): assumes X was built from the first len(X) rows of `reviews`.
y = np.where(np.array(reviews.sentiment)[:len(X)] == 'positive', 1, 0)

### Sequence models
The following is example code for a simple single-layer LSTM. Your implementation should be generic, so that the user can create multiple layers if required.

In [None]:
class LSTM(nn.Module):
    """Single-layer LSTM that returns the hidden state of the last time step.

    The four gate pre-activations (input, forget, cell candidate, output) are
    computed together from one input projection and one recurrent projection
    and then split into four equal parts.

    Args:
        input_dim: Size of the feature vector at every time step.
        hidden_dim: Size of the hidden state and of the cell state.

    Raises:
        ValueError: If ``input_dim`` or ``hidden_dim`` is not positive.
    """

    def __init__(self, input_dim, hidden_dim):

        super().__init__()
        if input_dim <= 0 or hidden_dim <= 0:
            raise ValueError("input_dim and hidden_dim must be positive")

        self.input_dim = input_dim
        self.hidden_dim = hidden_dim

        # Uniform initialisation in [-1/sqrt(hidden_dim), 1/sqrt(hidden_dim)].
        bound = hidden_dim ** -0.5

        def make_param(*shape):
            # nn.Parameter sets requires_grad=True and registers the tensor
            # with the module, so model.parameters() exposes it to optimizers.
            return nn.Parameter(torch.empty(*shape).uniform_(-bound, bound))

        # The last axis stacks the four gates, hence 4 * hidden_dim.
        self.W_x = make_param(input_dim, 4 * hidden_dim)
        self.W_h = make_param(hidden_dim, 4 * hidden_dim)
        self.b_x = make_param(4 * hidden_dim)
        self.b_h = make_param(4 * hidden_dim)

    def lstm_step(self, inp, prev_hidden_cell):
        """Advance the LSTM by one time step.

        Args:
            inp: Input of shape ``(batch_size, input_dim)``.
            prev_hidden_cell: Tuple ``(h_prev, c_prev)``, each of shape
                ``(batch_size, hidden_dim)``.

        Returns:
            Tuple ``(updated_h, updated_c)``, each of shape
            ``(batch_size, hidden_dim)``.
        """
        h_prev, c_prev = prev_hidden_cell

        # The activation vector, shape (batch_size, 4 * hidden_dim)
        activation = inp @ self.W_x + self.b_x + h_prev @ self.W_h + self.b_h

        # The activation is split into four parts
        ai, af, ac, ao = activation.chunk(4, dim=-1)

        in_gate = torch.sigmoid(ai)
        forget_gate = torch.sigmoid(af)
        cell_gate = torch.tanh(ac)
        out_gate = torch.sigmoid(ao)

        # Keep part of the old cell state and add the gated candidate.
        updated_c = forget_gate * c_prev + in_gate * cell_gate
        updated_h = out_gate * torch.tanh(updated_c)

        return updated_h, updated_c

    def forward(self, inp):
        """Run the LSTM over a whole sequence.

        Args:
            inp: Tensor of shape ``(seq_len, batch_size, input_dim)``; the
                first axis is iterated as time.

        Returns:
            The hidden state after the last time step, of shape
            ``(batch_size, hidden_dim)``.
        """
        batch_size = inp.shape[1]

        # Initial hidden and cell states are zeros of shape
        # (batch_size, hidden_dim), created with the weights' dtype and device.
        h = self.W_x.new_zeros(batch_size, self.hidden_dim)
        c = self.W_x.new_zeros(batch_size, self.hidden_dim)

        # Loop through the whole sequence and update h_t and c_t at every
        # time step. Shape of x is (batch_size, input_dim).
        for x in inp:
            h, c = self.lstm_step(x, (h, c))

        return h

In [None]:
# TODO: replace each ?? placeholder below before running this cell.
model = LSTM(??)
loss = ??
lr = ??
optimizer = ??
# In the training function you can use the same back-propagation calls as in
# assignment 1:
# loss.backward()
# optimizer.step()
# For testing and validation, run the model inside `with torch.no_grad():`.

### Word Cloud

In [None]:
# Word cloud that shows the positive and negative words and their frequencies.
# Make a dictionary of words with their word counts.
counts = tokenizer.word_counts
# Use your trained model to predict the sentiment of these words.
??
# Separate the positive-sentiment words and the negative-sentiment words,
# keeping their counts.
??
# Make the word cloud from those frequencies.
plt.figure(figsize=(??,??))
wordcloud = WordCloud(background_color=??,max_words=??).generate_from_frequencies(??)
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")
plt.show()

In [None]:
# Word Cloud that shows main words used in reviews that are positive or negative  
# Randomly select one sentence that has positive sentiment and one with negative sentiment 
??
# make word cloud of those sentences and see what are main words used in positive review and in negative reviews
# Similarly you can make word clouds of all positive reviews and negative reviews to see the most repeating words in positive sentiments and negative sentiments
plt.figure(figsize=(??,??))
plt.title('Prediceted Sentiment on this review: '+??)
wordcloud = WordCloud(background_color='White',max_words=100).generate(??)
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")
plt.show()