# Group 11 DL Project - Part A 

# Imports 

In [24]:
import gensim
import tqdm
import math
import numpy as np
import pandas as pd
from torch.autograd import Variable
import sklearn as sk
import torch
import torch.nn as nn
import torch.nn.functional as F
from sklearn.model_selection import KFold
from os import listdir
from os.path import isfile, join
import re
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
en_stops = set(stopwords.words('english'))
####
from sklearn.model_selection import train_test_split
import torch.optim as optim
import time
from sklearn.metrics import roc_curve, auc
from torch.utils.tensorboard import SummaryWriter


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\gilad\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


ModuleNotFoundError: No module named 'tensorboard'

## Data Preprocessing

In [4]:
# Now we will make a new dataset from the files with positive and negative reviews

def make_clean_comments(dir_path, comment_type):
    #get all the files from the directory
    onlyfiles = [f for f in listdir(dir_path) if isfile(join(dir_path, f))]
    files_paths = [dir_path + "\\" + f for f in onlyfiles]
    
    comments_tokens = [] #{[tokens_from_1_file]:0/1,[tokens_from_2_file]...} e.t.c
    regex = re.compile('[^a-zA-Z]') #regex for removing all the symbols that are not letters
    
    #for each comment(=file in pos/neg directory)
    for path_to_file in files_paths:
        with open(path_to_file, encoding="utf8") as f:
            full_comment = f.read() #the data from file with comments
            words = full_comment.split() 
            clean_list = [] #this list will contain only words that doesn't have symbols other than letters
            
            #for each words in comment (=file)
            for w in words:
                clean_word_data = regex.sub('', w) #clean the word by regex
                
                if len(clean_word_data) > 1:
                    #cleaning the stopwords from the sentence
                    clean_word_data = clean_word_data.lower()
                    #checking that the word is not a stop word (=a word that shows up frequently)
                    if clean_word_data not in en_stops:
                        clean_list.append(clean_word_data) #creates a clean comment
                        
        comments_tokens.append(clean_list)
        #creating list of comment type for the later creation of dataset
    if comment_type == "pos":          
        comments_type = [1 for i in range(len(comments_tokens))]
    else:
        comments_type = [0 for i in range(len(comments_tokens))]

    return comments_tokens, comments_type

## Positive comments tokenization ##
pos_dir_path = fr"D:\School\YearD\SemesterB\DeepLearning2\Project\train\pos"
pos_tokens_sentences, comments_type_list_pos = make_clean_comments(pos_dir_path, "pos")
## Negative comments tokenization ##
neg_dir_path = fr"D:\School\YearD\SemesterB\DeepLearning2\Project\train\neg"
neg_tokens_sentences, comments_type_list_neg = make_clean_comments(neg_dir_path, "neg")
    

In [5]:
#preparing the labled data
data_pos = {'comment':pos_tokens_sentences,
        'lable':comments_type_list_pos}
data_neg = {'comment':neg_tokens_sentences,
        'lable':comments_type_list_neg}

#preparing data for embedding
data_for_embedding_w2v = []
data_for_embedding_ft = []
max_sentence = 0
for s in pos_tokens_sentences:
    if len(s) > max_sentence:
        max_sentence = len(s)
    data_for_embedding_w2v.append(s)
    data_for_embedding_ft.append(s)
for s in neg_tokens_sentences:
    if len(s) > max_sentence:
        max_sentence = len(s)
    data_for_embedding_w2v.append(s)
    data_for_embedding_ft.append(s)




In [6]:
#creating a dataset of all the data together
df_temp_1 = pd.DataFrame(data_pos, index=range(0,18750))
df_temp_2 = pd.DataFrame(data_neg, index=range(18750,37500))
dataset = pd.concat([df_temp_1,df_temp_2])


dataset.shape

(37500, 2)

In [7]:
dataset

Unnamed: 0,comment,lable
0,"[bromwell, high, cartoon, comedy, ran, time, p...",1
1,"[homelessness, houselessness, george, carlin, ...",1
2,"[brilliant, overacting, lesley, ann, warren, b...",1
3,"[easily, underrated, film, inn, brooks, cannon...",1
4,"[typical, mel, brooks, film, much, less, slaps...",1
5,"[isnt, comedic, robin, williams, quirkyinsane,...",1
6,"[yes, art, successfully, make, slow, paced, th...",1
7,"[critically, acclaimed, psychological, thrille...",1
8,"[night, listener, robin, williams, toni, colle...",1
9,"[know, robin, williams, god, bless, constantly...",1


### Embeddings training

In [8]:
# Training the embedding models
from gensim.models import FastText

model_w2v = gensim.models.word2vec.Word2Vec(sentences=data_for_embedding_w2v, vector_size=100, window=5, min_count=1, workers=4)
model_w2v.train(data_for_embedding_w2v, total_examples=model_w2v.corpus_count, epochs=30)
if torch.cuda.is_available():
    model_w2v.cuda()
    
model_ft = FastText(sentences=data_for_embedding_ft, vector_size=100, window=5, min_count=1, workers=4)
model_ft.train(data_for_embedding_ft, total_examples=model_ft.corpus_count, epochs=30)
if torch.cuda.is_available():
    model_ft.cuda()
    


#w2v_weights = torch.FloatTensor(model_w2v.wv.vectors)
# Implementing the models into a list
#models_lst=[model_w2v,model_ft]

#embed_model = models_lst[0]

In [11]:
''' Padding the Embeddings '''

def make_indexed_torch(comment, embed_model):
    if embed_model == "w2v":
        
        return torch.LongTensor([model_w2v.wv.get_index(word) for word in comment])
    else:
        return torch.LongTensor([model_ft.wv.get_index(word) for word in comment])

def padding_tensors_to_the_same_dim(com_torch, target_size):
    nump_torch = com_torch.numpy()
    pad_size = target_size - nump_torch.size
    nump_torch = np.pad(nump_torch, (0,pad_size), 'constant')
    return torch.from_numpy(nump_torch)
    
    
    
comments = dataset['comment']
w2v_X_pp = comments.apply(lambda x: make_indexed_torch(x, "w2v"))
w2v_X = w2v_X_pp.apply(lambda x: padding_tensors_to_the_same_dim(x, max_sentence))
w2v_y = dataset['lable']


#smth = make_indexed_torch(data_for_embedding_w2v[1],"w2v")
#smth2 = make_indexed_torch(data_for_embedding_ft[1],"ft")
#print(smth)
#print(smth2)
"""
w2v_X_pp = data_for_embedding.apply(lambda x: make_indexed_torch(x, "w2v"))
w2v_X = w2v_X_pp.values

w2v_y=dataset['lable'].to_list

ft_X_pp = data_for_embedding.apply(lambda x: make_indexed_torch(x, "ft"))
ft_X = ft_X_pp.values

ft_y=dataset['lable'].to_list
"""

'\nw2v_X_pp = data_for_embedding.apply(lambda x: make_indexed_torch(x, "w2v"))\nw2v_X = w2v_X_pp.values\n\nw2v_y=dataset[\'lable\'].to_list\n\nft_X_pp = data_for_embedding.apply(lambda x: make_indexed_torch(x, "ft"))\nft_X = ft_X_pp.values\n\nft_y=dataset[\'lable\'].to_list\n'

In [16]:
### Getting the weights from the training ###
w2v_weights = torch.FloatTensor(model_w2v.wv.vectors)
ft_weights = torch.FloatTensor(model_ft.wv.vectors)

## Building the Network

In [13]:
class LSTM(nn.Module):
    def __init__(self, input_sz: int, hidden_sz: int, label_sz: int):
        super().__init__()
        self.input_size = input_sz
        self.hidden_size = hidden_sz
        self.label_size = label_sz

        self.U_i = nn.Parameter(torch.Tensor(input_sz, hidden_sz))
        self.V_i = nn.Parameter(torch.Tensor(hidden_sz, hidden_sz))
        self.b_i = nn.Parameter(torch.Tensor(hidden_sz))

        self.U_f = nn.Parameter(torch.Tensor(input_sz, hidden_sz))
        self.V_f = nn.Parameter(torch.Tensor(hidden_sz, hidden_sz))
        self.b_f = nn.Parameter(torch.Tensor(hidden_sz))

        self.U_c = nn.Parameter(torch.Tensor(input_sz, hidden_sz))
        self.V_c = nn.Parameter(torch.Tensor(hidden_sz, hidden_sz))
        self.b_c = nn.Parameter(torch.Tensor(hidden_sz))

        self.U_o = nn.Parameter(torch.Tensor(input_sz, hidden_sz))
        self.V_o = nn.Parameter(torch.Tensor(hidden_sz, hidden_sz))
        self.b_o = nn.Parameter(torch.Tensor(hidden_sz))

        self.hidden2label = nn.Parameter(torch.Tensor(hidden_sz, label_sz))
        
        self.init_weights()
    
    def init_weights(self):
            stdv = 1.0 / math.sqrt(self.hidden_size)
            for weight in self.parameters():
                weight.data.uniform_(-stdv, stdv)
                
    
    def forward(self, x, init_states=None):
            """
            assumes x.shape represents (batch_size, sequence_size, input_size)
            """
            seq_sz, embeded_sz = x.size()
            
            if init_states is None:
                h_t, c_t = (
                    torch.zeros(self.hidden_size).to(x.device),
                    torch.zeros(self.hidden_size).to(x.device),
                )
            else:
                h_t, c_t = init_states

            #loop over all words in sequence
            
            for t in range(seq_sz):
                x_t = x[t]
                i_t = torch.sigmoid(x_t @ self.U_i + h_t @ self.V_i + self.b_i)
                f_t = torch.sigmoid(x_t @ self.U_f + h_t @ self.V_f + self.b_f)
                g_t = torch.tanh(x_t @ self.U_c + h_t @ self.V_c + self.b_c)
                o_t = torch.sigmoid(x_t @ self.U_o + h_t @ self.V_o + self.b_o)
                c_t = f_t * c_t + i_t * g_t
                h_t = o_t * torch.tanh(c_t)
                    
                
                # last word:
                if t==seq_sz-1:
                    dropout = nn.Dropout(p=0.2)
                    out_ht = dropout(h_t)
                    y_pred = torch.sigmoid(out_ht @ self.hidden2label)
            
            return  y_pred

In [14]:
class Net(nn.Module):
    def __init__(self,input_sz: int, hidden_sz: int, label_sz: int, embedding_weights):
        super().__init__()
        self.embedding = nn.Embedding.from_pretrained(embedding_weights)
        self.lstm = LSTM(input_sz, hidden_sz, label_sz)
    
    def forward(self,x):
        x_embeded = self.embedding(x)
        pred = self.lstm(x_embeded)
        return pred

## Training the model

In [35]:
'''
Creating the training loop
'''
def run_model_loop(X_input,y_input, data_size, test_p, classifier, epochs, criterion, optimizer,tb_dirname):
    
    X = X_input[:data_size]
    y = y_input[:data_size]
    y = torch.tensor(y)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_p, random_state=42)
    
    train_loss_list = []
    test_loss_list = []
    train_auc_list=[]
    test_auc_list=[]

#     writer = SummaryWriter(log_dir=f'{tb_dir}/{tb_dirname}_{time.time()}')
    j = 0

    for e in range(epochs):
        train_labels_list = []
        train_prediction_list = []
        test_labels_list = []
        test_prediction_list = []

        running_loss = 0
        classifier.train()

        for i, review in enumerate(X_train):
            j+=1
            label = y_train[i].view(1)
            review, label = review.to(device), label.to(device)
            optimizer.zero_grad()
            #use model to predict sentiment
            pred = classifier(review).to(torch.float)
            #to be safe 
            label = label.to(torch.float)
            loss = criterion(pred, label)
            #calculate loss
            running_loss+=loss.item()
            loss.backward()
            optimizer.step()
            train_prediction_list.append(pred.item())
            train_labels_list.append(label.item())
        
#             writer.add_scalar(tag='loss/batch_loss_train', scalar_value=loss, global_step=j)

        epoch_train_loss = running_loss/len(X_train)
        train_loss_list.append(epoch_train_loss)
        fpr_t, tpr_t, thresholds_t = roc_curve(np.array(train_labels_list), np.array(train_prediction_list))
        train_auc = auc(fpr_t, tpr_t)
        train_auc_list.append(train_auc)
        
        test_loss = 0
        classifier.eval()
        # we dont need to update weights, so we define no_grad() to save memory
        with torch.no_grad():
            for i, review in enumerate(X_test):
                label= y_test[i].view(1)
                review, label = review.to(device), label.to(device)
                test_pred = classifier(review).to(torch.float)
                label = label.to(torch.float)
                loss=criterion(test_pred, label)
                test_loss+=loss.item()
                test_prediction_list.append(test_pred.item())
                test_labels_list.append(label.item())
        
        epoch_test_loss = test_loss/len(X_test)
        test_loss_list.append(epoch_test_loss)
        fpr_v, tpr_v, thresholds_v = roc_curve(np.array(test_labels_list),np.array(test_prediction_list))
        test_auc = auc(fpr_v, tpr_v)
        test_auc_list.append(test_auc)
        
        #Tensorboard documentation
        #loss
#         writer.add_scalar(tag='loss/train', scalar_value=epoch_train_loss, global_step=e)
#         writer.add_scalar(tag='loss/test', scalar_value=epoch_test_loss, global_step=e)
#         #auc
#         writer.add_scalar(tag='auc/train', scalar_value=train_auc, global_step=e)
#         writer.add_scalar(tag='auc/test', scalar_value=test_auc, global_step=e)

    return {'train_loss':train_loss_list,
            'train_auc':train_auc_list,
            'test_auc':test_auc_list,
            'test_loss':test_loss_list,
            'fpr_v':fpr_v,
            'tpr_v':tpr_v
            }


In [33]:
w2v_X.values
# w2v_X = X_w2v_full.values

array([tensor([28776,   213,   944,  ...,     0,     0,     0]),
       tensor([26430, 93382,   616,  ...,     0,     0,     0]),
       tensor([  412,  3601, 17375,  ...,     0,     0,     0]), ...,
       tensor([ 105, 4421,  132,  ...,    0,    0,    0]),
       tensor([  23, 1093, 6660,  ...,    0,    0,    0]),
       tensor([   3, 6818,   23,  ...,    0,    0,    0])], dtype=object)

In [36]:
# from torch.utils.tensorboard import SummaryWriter

device = torch.device("cuda" if torch.cuda.is_available() else "cpu") 
criterion = nn.BCELoss()
epochs = 10

net_w2v = Net(50, 10, 1 , w2v_weights).to(device)
net_ft = Net(50, 10, 1 , ft_weights).to(device)

optimizer_w2v = optim.Adam(net_w2v.parameters(), lr = 0.01, weight_decay = 1e-3)
optimizer_ft = optim.Adam(net_ft.parameters(), lr = 0.01, weight_decay = 1e-3)

### Run loop for Word2Vec ###
run_model_loop(w2v_X.values,w2v_y,2000,0.2,net_w2v,epochs,criterion,optimizer_w2v,"model_testing")


### Run loop for FastText ###
# run_model_loop(ft_X,ft_y,2000,0.2,net_ft,epochs,criterion,optimizer_ft,"model_testing")

TypeError: take(): argument 'index' (position 1) must be Tensor, not numpy.ndarray

In [92]:
'''
STEP 4: INSTANTIATE MODEL CLASS
'''

w2v_weights = torch.FloatTensor(model_w2v.wv.vectors)
input_dim = 28
hidden_dim = 128
layer_dim = 1  # ONLY CHANGE IS HERE FROM ONE LAYER TO TWO LAYER
output_dim = 2
 
GRU_model = LSTM(input_dim, hidden_dim, layer_dim, output_dim,w2v_weights)

#######################
#  USE GPU FOR MODEL  #
#######################
 
if torch.cuda.is_available():
    model.cuda()
     
'''
STEP 5: INSTANTIATE LOSS CLASS
'''
criterion = nn.BCEWithLogitsLoss ()
 
'''
STEP 6: INSTANTIATE OPTIMIZER CLASS
'''
learning_rate = 0.01
 

In [93]:
'''
STEP 7: TRAIN THE MODEL
'''
device = torch.device("cuda" if torch.cuda.is_available() else "cpu") 
def training_loop(X_input,y_input, data_size, test_p, classifier, epochs, criterion):
    X=X_input[:data_size]
    y = y_input[:data_size]
    y=torch.FloatTensor(y)
    #y=torch.LongTensor([label for label in label_input])
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_p, random_state=42)
    optimizer = optim.Adam(classifier.parameters(), lr = 0.01, weight_decay = 1e-3)
    
    train_loss_list = []
    test_loss_list = []
    train_auc_list=[]
    test_auc_list=[]

    j = 0
#     k_folds=5
#     kfold = KFold(n_splits=k_folds, shuffle=True)
#     for fold, (train_ids, test_ids) in enumerate(kfold.split(dataset)):
#         # Sample elements randomly from a given list of ids, no replacement.
#         train_subsampler = torch.utils.data.SubsetRandomSampler(train_ids)
#         test_subsampler = torch.utils.data.SubsetRandomSampler(test_ids)

#         # Define data loaders for training and testing data in this fold
#         trainloader = torch.utils.data.DataLoader(
#                           dataset, 
#                           batch_size=10, sampler=train_subsampler)
#         testloader = torch.utils.data.DataLoader(
#                           dataset,
#                           batch_size=10, sampler=test_subsampler) 

    for e in range(epochs):
        train_labels_list = []
        train_prediction_list = []
        test_labels_list = []
        test_prediction_list = []

        running_loss = 0
        classifier.train()

        for i, review in enumerate(X_train):
            
            j+=1
            label = y_train[i].view(1)
            review, label = review.to(device), label.to(device)
            optimizer.zero_grad()
            #use model to predict sentiment
            pred = classifier(review).to(torch.float)
            #to be safe 
            label = label.to(torch.float)
            loss = criterion(pred, label)
            #calculate loss
            running_loss+=loss.item()
            loss.backward()
            optimizer.step()
            train_prediction_list.append(pred.item())
            train_labels_list.append(label.item())
        
#             writer.add_scalar(tag='loss/batch_loss_train', scalar_value=loss, global_step=j)

        epoch_train_loss = running_loss/len(X_train)
        train_loss_list.append(epoch_train_loss)
        fpr_t, tpr_t, thresholds_t = roc_curve(np.array(train_labels_list), np.array(train_prediction_list))
        train_auc = auc(fpr_t, tpr_t)
        train_auc_list.append(train_auc)
        
        test_loss = 0
        classifier.eval()
        # we dont need to update weights, so we define no_grad() to save memory
        with torch.no_grad():
            for i, review in enumerate(X_test):
                label= y_test[i].view(1)
                review, label = review.to(device), label.to(device)
                test_pred = classifier(review).to(torch.float)
                label = label.to(torch.float)
                loss=criterion(test_pred, label)
                test_loss+=loss.item()
                test_prediction_list.append(test_pred.item())
                test_labels_list.append(label.item())
        
        epoch_test_loss = test_loss/len(X_test)
        test_loss_list.append(epoch_test_loss)
        fpr_v, tpr_v, thresholds_v = roc_curve(np.array(test_labels_list),np.array(test_prediction_list))
        test_auc = auc(fpr_v, tpr_v)
        test_auc_list.append(test_auc)
        
        #Tensorboard documentation
        #loss
#         writer.add_scalar(tag='loss/train', scalar_value=epoch_train_loss, global_step=e)
#         writer.add_scalar(tag='loss/test', scalar_value=epoch_test_loss, global_step=e)
#         #auc
#         writer.add_scalar(tag='auc/train', scalar_value=train_auc, global_step=e)
#         writer.add_scalar(tag='auc/test', scalar_value=test_auc, global_step=e)

    return {'train_loss':train_loss_list,
            'train_auc':train_auc_list,
            'test_auc':test_auc_list,
            'test_loss':test_loss_list,
            'fpr_v':fpr_v,
            'tpr_v':tpr_v
            }

In [94]:
train_model=training_loop(w2v_X,w2v_y, 1000, 0.2, GRU_model, 10, criterion)

IndexError: Dimension out of range (expected to be in range of [-1, 0], but got 1)