# Group 11 DL Project - Part A 

# Imports 

In [1]:
import gensim
import tqdm
import math
import numpy as np
import pandas as pd
import sklearn as sk
import torch
import torch.nn as nn
import torch.nn.functional as F
from sklearn.model_selection import KFold
from os import listdir
from os.path import isfile, join
import re
import nltk
from nltk.corpus import stopwords
en_stops = set(stopwords.words('english'))
####
from sklearn.model_selection import train_test_split
import torch.optim as optim
import time
from sklearn.metrics import roc_curve, auc


## Data Preprocessing

In [2]:
# Now we will make a new dataset from the files with positive and negative reviews

def make_clean_comments(dir_path, comment_type):
    #get all the files from the directory
    onlyfiles = [f for f in listdir(dir_path) if isfile(join(dir_path, f))]
    files_paths = [dir_path + "\\" + f for f in onlyfiles]
    
    comments_tokens = [] #{[tokens_from_1_file]:0/1,[tokens_from_2_file]...} e.t.c
    regex = re.compile('[^a-zA-Z]') #regex for removing all the symbols that are not letters
    
    #for each comment(=file in pos/neg directory)
    for path_to_file in files_paths:
        with open(path_to_file, encoding="utf8") as f:
            full_comment = f.read() #the data from file with comments
            words = full_comment.split() 
            clean_list = [] #this list will contain only words that doesn't have symbols other than letters
            
            #for each words in comment (=file)
            for w in words:
                clean_word_data = regex.sub('', w) #clean the word by regex
                
                if len(clean_word_data) > 1:
                    #cleaning the stopwords from the sentence
                    clean_word_data = clean_word_data.lower()
                    #checking that the word is not a stop word (=a word that shows up frequently)
                    if clean_word_data not in en_stops:
                        clean_list.append(clean_word_data) #creates a clean comment
                        
        comments_tokens.append(clean_list)
        #creating list of comment type for the later creation of dataset
    if comment_type == "pos":          
        comments_type = [1 for i in range(len(comments_tokens))]
    else:
        comments_type = [0 for i in range(len(comments_tokens))]

    return comments_tokens, comments_type

## Positive comments tokenization ##
pos_dir_path = fr'C:\Users\YuvalZiv\Desktop\nlp project\train\pos'
pos_tokens_sentences, comments_type_list_pos = make_clean_comments(pos_dir_path, "pos")
## Negative comments tokenization ##
neg_dir_path = fr'C:\Users\YuvalZiv\Desktop\nlp project\train\neg'
neg_tokens_sentences, comments_type_list_neg = make_clean_comments(neg_dir_path, "neg")
    

In [3]:
#preparing the labled data
data_pos = {'comment':pos_tokens_sentences,
        'lable':comments_type_list_pos}
data_neg = {'comment':neg_tokens_sentences,
        'lable':comments_type_list_neg}

#preparing data for embedding
data_for_embedding = []
for s in pos_tokens_sentences:
    data_for_embedding.append(s)
for s in neg_tokens_sentences:
    data_for_embedding.append(s)


#creating a dataset of all the data together
dataset = pd.DataFrame(data_pos)
df_temp = pd.DataFrame(data_neg)
dataset.append(df_temp)


Unnamed: 0,comment,lable
0,"[bromwell, high, cartoon, comedy, ran, time, p...",1
1,"[homelessness, houselessness, george, carlin, ...",1
2,"[brilliant, overacting, lesley, ann, warren, b...",1
3,"[easily, underrated, film, inn, brooks, cannon...",1
4,"[typical, mel, brooks, film, much, less, slaps...",1
...,...,...
18745,"[towards, end, movie, felt, technical, felt, l...",0
18746,"[kind, movie, enemies, content, watch, time, b...",0
18747,"[saw, descent, last, night, stockholm, film, f...",0
18748,"[films, pick, pound, turn, rather, good, rd, c...",0


In [5]:
# Training the embedding models
from gensim.models import FastText
model_w2v = gensim.models.word2vec.Word2Vec(sentences=data_for_embedding, size=100, window=5, min_count=1, workers=4)
if torch.cuda.is_available():
    model_w2v.cuda()
model_ft = FastText(sentences=data_for_embedding, size=100, window=5, min_count=1, workers=4)
if torch.cuda.is_available():
    model_w2v.cuda()
w2v_weights = torch.FloatTensor(model_w2v.wv.vectors)
# Implementing the models into a list
models_lst=[model_w2v,model_ft]

embed_model = models_lst[0]

In [8]:
w2v_y=[]
w2v_X=[]
for index, row in dataset.iterrows():
    w2v_X.append(model_w2v.wv[row["comment"]])
w2v_y=dataset['lable'].to_list

In [9]:
w2v_X

[array([[-0.05930763, -0.13205804,  0.01484363, ...,  0.09772535,
         -0.02474234, -0.05056662],
        [ 0.6573876 , -0.4099651 , -2.414823  , ..., -0.04281988,
         -2.179221  , -0.8884415 ],
        [ 1.3866792 , -0.5271607 ,  0.75765157, ..., -1.0563396 ,
         -0.5964049 , -0.14948183],
        ...,
        [ 0.3076118 , -0.30062005, -0.5986949 , ..., -0.17266755,
         -0.0645469 ,  0.05038321],
        [ 0.27888092,  0.16152965, -0.12113065, ...,  0.26934773,
          0.09262791,  0.16256368],
        [ 0.9969112 , -2.5668886 , -1.4325025 , ...,  0.31459516,
         -1.0294696 ,  1.3644029 ]], dtype=float32),
 array([[ 1.83817744e-02,  2.62314435e-02,  5.54550104e-02, ...,
          9.58180055e-03,  3.44996788e-02,  7.60520995e-02],
        [-6.35785423e-03,  9.96026117e-03, -1.26321223e-02, ...,
         -1.96342706e-03,  1.27366744e-03,  1.72542175e-03],
        [ 4.28848043e-02,  4.69768226e-01, -3.30831796e-01, ...,
         -9.40559685e-01, -2.80003220e-01

In [10]:
class GRUCell(nn.Module):

    """
    An implementation of GRUCell.

    """

    def __init__(self, input_size, hidden_size, bias=True):
        super(GRUCell, self).__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.bias = bias
        self.x2h = nn.Linear(input_size, 3 * hidden_size, bias=bias)
        self.h2h = nn.Linear(hidden_size, 3 * hidden_size, bias=bias)
        self.reset_parameters()



    def reset_parameters(self):
        std = 1.0 / math.sqrt(self.hidden_size)
        for w in self.parameters():
            w.data.uniform_(-std, std)
    
    def forward(self, x, hidden):
        
        x = x.view(-1, x.size(1))
        
        gate_x = self.x2h(x) 
        gate_h = self.h2h(hidden)
        
        gate_x = gate_x.squeeze()
        gate_h = gate_h.squeeze()
        
        i_r, i_i, i_n = gate_x.chunk(3, 1)
        h_r, h_i, h_n = gate_h.chunk(3, 1)
        
        
        resetgate = F.sigmoid(i_r + h_r)
        inputgate = F.sigmoid(i_i + h_i)
        newgate = F.tanh(i_n + (resetgate * h_n))
        
        hy = newgate + inputgate * (hidden - newgate)
        
        
        return hy

In [11]:
class GRUModel(nn.Module):
    def __init__(self, input_dim, hidden_dim, layer_dim, output_dim,w_embeddings, bias=True):
        super(GRUModel, self).__init__()
        # Hidden dimensions
        self.hidden_dim = hidden_dim
        self.embedding = nn.Embedding.from_pretrained(w_embeddings)
        # Number of hidden layers
        self.layer_dim = layer_dim
         
       
        self.gru_cell = GRUCell(input_dim, hidden_dim, layer_dim)
        
        
        self.fc = nn.Linear(hidden_dim, output_dim)
     
    
    
    def forward(self, x):
        
        # Initialize hidden state with zeros
        #######################
        #  USE GPU FOR MODEL  #
        #######################
        #print(x.shape,"x.shape")100, 28, 28
        if torch.cuda.is_available():
            h0 = Variable(torch.zeros(self.layer_dim, x.size(0), self.hidden_dim).cuda())
        else:
            h0 = Variable(torch.zeros(self.layer_dim, x.size(0), self.hidden_dim))
         
       
        outs = []
        
        hn = h0[0,:,:]
        
        for seq in range(x.size(1)):
            hn = self.gru_cell(x[:,seq,:], hn) 
            outs.append(hn)
            

        out = outs[-1].squeeze()
        
        out = self.fc(out) 
        # out.size() --> 100, 10
        
        x_embeded = self.embedding(x)
        pred = self.gru_cell(x_embeded)
        return pred
 

In [12]:
'''
STEP 4: INSTANTIATE MODEL CLASS
'''


input_dim = 28
hidden_dim = 128
layer_dim = 1  # ONLY CHANGE IS HERE FROM ONE LAYER TO TWO LAYER
output_dim = 2
 
GRU_model = GRUModel(input_dim, hidden_dim, layer_dim, output_dim,w2v_weights)

#######################
#  USE GPU FOR MODEL  #
#######################
 
if torch.cuda.is_available():
    model.cuda()
     
'''
STEP 5: INSTANTIATE LOSS CLASS
'''
criterion = nn.BCELoss()
 
'''
STEP 6: INSTANTIATE OPTIMIZER CLASS
'''
learning_rate = 0.01
 

In [14]:
'''
STEP 7: TRAIN THE MODEL
'''
def training_loop(data_input,label_input, data_size, val, classifier, epochs, criterion):
    X_input=torch.LongTensor([seq for seq in data_input])
    y=torch.LongTensor([label for label in label_input])
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=val, random_state=42)
    optimizer = optim.Adam(model_name.parameters(), lr = 0.01, weight_decay = 1e-3)
    k_folds=5
    kfold = KFold(n_splits=k_folds, shuffle=True)
    for fold, (train_ids, test_ids) in enumerate(kfold.split(dataset)):
        # Sample elements randomly from a given list of ids, no replacement.
        train_subsampler = torch.utils.data.SubsetRandomSampler(train_ids)
        test_subsampler = torch.utils.data.SubsetRandomSampler(test_ids)

        # Define data loaders for training and testing data in this fold
        trainloader = torch.utils.data.DataLoader(
                          dataset, 
                          batch_size=10, sampler=train_subsampler)
        testloader = torch.utils.data.DataLoader(
                          dataset,
                          batch_size=10, sampler=test_subsampler)    

        # Number of steps to unroll
        seq_dim = 28 

        loss_list = []
        iter = 0
        for epoch in range(num_epochs):
            for i, (images, labels) in enumerate(train_subsampler):
                # Load images as Variable
                #######################
                #  USE GPU FOR MODEL  #
                #######################

                if torch.cuda.is_available():
                    images = Variable(images.view(-1, seq_dim, input_dim).cuda())
                    labels = Variable(labels.cuda())
                else:
                    images = Variable(images.view(-1, seq_dim, input_dim))
                    labels = Variable(labels)

                # Clear gradients w.r.t. parameters
                optimizer.zero_grad()

                # Forward pass to get output/logits
                # outputs.size() --> 100, 10
                outputs = model(images)

                # Calculate Loss: softmax --> cross entropy loss
                loss = criterion(outputs, labels)

                if torch.cuda.is_available():
                    loss.cuda()

                # Getting gradients w.r.t. parameters
                loss.backward()

                # Updating parameters
                optimizer.step()

                loss_list.append(loss.item())
                iter += 1

                # Calculate Accuracy         
                correct = 0
                total = 0
                # Iterate through test dataset
                for images, labels in test_loader:
                    #######################
                    #  USE GPU FOR MODEL  #
                    #######################
                    if torch.cuda.is_available():
                        images = Variable(images.view(-1, seq_dim, input_dim).cuda())
                    else:
                        images = Variable(images.view(-1 , seq_dim, input_dim))

                    # Forward pass only to get logits/output
                    outputs = model(images)

                    # Get predictions from the maximum value
                    _, predicted = torch.max(outputs.data, 1)

                    # Total number of labels
                    total += labels.size(0)

                    # Total correct predictions
                    #######################
                    #  USE GPU FOR MODEL  #
                    #######################
                    if torch.cuda.is_available():
                        correct += (predicted.cpu() == labels.cpu()).sum()
                    else:
                        correct += (predicted == labels).sum()

                accuracy = 100 * correct / total

                # Print Loss
                print('Iteration: {}. Loss: {}. Accuracy: {}'.format(iter, loss.item(), accuracy))

In [15]:
train_model=training_loop(w2v_X,w2v_y, 1000, 0.2, GRU_model, 10, criterion)

  X_input=torch.LongTensor([seq for seq in data_input])
  X_input=torch.LongTensor([seq for seq in data_input])


ValueError: expected sequence of length 71 at dim 1 (got 215)