In [None]:
import torch
from torch.utils.data import DataLoader, TensorDataset, Subset
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import KFold,train_test_split
from sklearn.metrics import precision_recall_fscore_support,accuracy_score,classification_report
import os
import pandas as pd
import gc
import numpy as np
from gensim.models import KeyedVectors

In [None]:
class wordVector:
    """Bundle one vocabulary entry: its token, its row index and its vector.

    Attributes:
        vector: The embedding vector stored for the token.
        index: Integer position assigned to the token (0 by default).
        key: The token text ('<pad>' by default).
    """

    def __init__(self, vector, index=0, key='<pad>'):
        self.vector = vector
        self.key, self.index = key, index
def get_embeddings(wv_objs):
    """Collect the `.vector` of every value in `wv_objs` into one numpy array.

    Args:
        wv_objs: Mapping whose values expose a `.vector` attribute.

    Returns:
        np.ndarray built from the vectors, in the mapping's iteration order.
    """
    return np.array([entry.vector for entry in wv_objs.values()])



In [None]:
def encode_Train_Lables(label, num_classes=5):
    """One-hot encode a 1-based class label.

    Args:
        label: Integer class label in the range 1..num_classes.
        num_classes: Length of the one-hot vector (defaults to 5).

    Returns:
        np.ndarray of float zeros with a single 1 at position `label - 1`.

    Raises:
        IndexError: If `label` is outside 1..num_classes.
    """
    # Without this check a label of 0 (or any negative value) would wrap around
    # through negative indexing and silently be encoded as a valid class.
    if not 1 <= label <= num_classes:
        raise IndexError(
            "label {0!r} is outside the valid range 1..{1}".format(label, num_classes)
        )
    one_hot = np.zeros(num_classes)
    one_hot[label - 1] = 1
    return one_hot
def encode_Test_Lables(label):
    """Convert a 1-based label into a 0-based Python int class index."""
    zero_based = label - 1
    return int(zero_based)

def splitsentences(sentences):
    """Split every sentence on whitespace.

    Args:
        sentences: Iterable of strings (a list, or a pandas Series of text).

    Returns:
        List of token lists, one per input sentence, in input order.
    """
    # Iterate the values directly: positional `sentences[x]` is a label lookup
    # on a pandas Series and fails when the index is not 0..n-1.
    return [sentence.split() for sentence in sentences]
def encodeSentence(sentence,embeds,max):
    """Encode a tokenised sentence as a fixed-length array of vocabulary indices.

    Args:
        sentence: List of tokens.
        embeds: Mapping from token to an object exposing an integer `.index`.
        max: Length of the returned array. Longer sentences are truncated;
            shorter ones are padded with 0.

    Returns:
        Integer np.ndarray of length `max`. Tokens missing from `embeds`
        are encoded as 0, the same value used for padding.
    """
    encoding = np.zeros(max, dtype=int)
    for position, token in enumerate(sentence[:max]):
        try:
            encoding[position] = embeds[token].index
        except KeyError:
            # Out-of-vocabulary token: leave the slot at 0. Only the lookup
            # miss is swallowed so that genuine errors are not hidden.
            pass
    return encoding
def encodeSentences(data,embeds,max):
    """Tokenise every sentence in `data` and encode each to `max` indices.

    Args:
        data: Collection of sentence strings, passed to `splitsentences`.
        embeds: Token lookup forwarded to `encodeSentence`.
        max: Fixed encoding length forwarded to `encodeSentence`.

    Returns:
        np.ndarray stacking one encoded row per sentence.
    """
    token_lists = splitsentences(data)
    return np.array([encodeSentence(tokens, embeds, max) for tokens in token_lists])
def shrinkEmbeds(corpus,embeds):
    """Build a compact vocabulary limited to corpus words that have a vector.

    Args:
        corpus: Iterable of words to keep.
        embeds: Pretrained lookup; indexed by word, and by 0 for the vector
            stored under the '<pad>' entry.

    Returns:
        dict mapping each kept word to a `wordVector`. '<pad>' always has
        index 0; the remaining words get consecutive indices starting at 1,
        in corpus order. Words with no vector in `embeds` are skipped.
    """
    # The padding slot reuses whatever vector `embeds[0]` returns.
    words_index = {'<pad>': wordVector(embeds[0], 0, '<pad>')}
    next_index = 1
    for word in corpus:
        if word in words_index:
            continue
        try:
            vector = embeds[word]
        except KeyError:
            # Word has no pretrained vector: leave it out of the vocabulary.
            # Only the lookup miss is swallowed; other errors propagate.
            continue
        words_index[word] = wordVector(vector, next_index, word)
        next_index += 1
    return words_index
def prep_data(filename,samp_size,embeds,min_df,max_df,random_state=None):
    """Load a review CSV, sample it and encode the text as index sequences.

    The file is read from `<parent of the working directory>/model_data/<filename>`
    and must contain the columns "Review_text" and "Rating".

    Args:
        filename: Name of the CSV file inside the `model_data` folder.
        samp_size: Fraction of rows to keep; clamped to the range [0.1, 1].
        embeds: Pretrained embedding lookup passed to `shrinkEmbeds`.
        min_df: `min_df` forwarded to `TfidfVectorizer`.
        max_df: `max_df` forwarded to `TfidfVectorizer`.
        random_state: Optional seed forwarded to `DataFrame.sample` so the
            sample can be reproduced. The default (None) keeps the sampling
            unseeded.

    Returns:
        Tuple `(X, Y, embeds)`: the encoded reviews from `encodeSentences`,
        the "Rating" column of the sampled frame, and the reduced vocabulary
        returned by `shrinkEmbeds`.
    """
    # Reading data
    parent_dir = os.path.dirname(os.getcwd())
    path = os.path.join(parent_dir, "model_data", filename)
    df = pd.read_csv(path)

    # Sampling data: keep the requested fraction, clamped to [0.1, 1].
    samp_size = min(max(samp_size, 0.1), 1)
    # drop=True: the shuffled old index is not needed and would otherwise be
    # inserted into the frame as an extra "index" column.
    df = df.sample(n=int(len(df) * samp_size), random_state=random_state).reset_index(drop=True)

    # Preparing features: the fitted vectorizer's vocabulary (after the
    # min_df/max_df filtering) decides which words keep an embedding.
    vectorizer = TfidfVectorizer(min_df=min_df, max_df=max_df)
    corpus = vectorizer.fit(df["Review_text"]).get_feature_names_out()
    embeds = shrinkEmbeds(corpus, embeds)

    # Every review is encoded to the (truncated) mean token count.
    seq_len = int(df["Review_text"].str.split().str.len().mean())
    X = encodeSentences(df["Review_text"], embeds, seq_len)

    # Preparing labels
    Y = df["Rating"]

    return X, Y, embeds


In [None]:
class RNNClassifier(torch.nn.Module):
    """ReLU RNN over trainable pretrained embeddings with a 5-way linear head.

    Args:
        embeddings: 2-D numpy array of pretrained vectors, one row per
            vocabulary index.
        vocabSize: Number of vocabulary entries. Kept for interface
            compatibility; the embedding table is sized from `embeddings`.
        embedsize: Embedding dimension, used as the RNN input size.
        hidden_size: Size of the RNN hidden state.
        dropout: Dropout value passed to `torch.nn.RNN`.
        num_layers: Number of stacked RNN layers.
        device: Device on which the layers are created.
    """

    def __init__(self,embeddings,vocabSize,embedsize, hidden_size,dropout,num_layers,device):
        super().__init__()
        self.num_layers = num_layers
        self.hidden_size = hidden_size
        self.device = device

        # torch.tensor copies the data. Sharing memory with the numpy array
        # (as torch.from_numpy does) would let CPU training overwrite the
        # caller's embedding matrix in place.
        pretrained = torch.tensor(embeddings, dtype=torch.float32, device=device)
        # freeze=False keeps the embedding weights trainable.
        self.embeds = torch.nn.Embedding.from_pretrained(pretrained, freeze=False)

        self.rnn = torch.nn.RNN(
            embedsize,
            hidden_size,
            num_layers,
            dropout=dropout,
            nonlinearity="relu",
            batch_first=True,
            device=device,
        )
        self.fc = torch.nn.Linear(hidden_size, 5, device=device)
        # Not applied in forward(): the model returns raw scores. Kept so the
        # attribute remains available to existing code.
        self.softMax = torch.nn.LogSoftmax(dim=-1)

    def forward(self, x):
        """Return one row of 5 unnormalised class scores per input sequence.

        Args:
            x: Integer tensor of vocabulary indices, indexed (batch, position).

        Returns:
            Tensor of shape (batch, 5) computed from the RNN output at the
            last position of each sequence.
        """
        embedded = self.embeds(x)
        # Allocate the initial state next to the activations, so the model
        # keeps working after it has been moved with .to().
        h0 = torch.zeros(
            self.num_layers,
            embedded.size(0),
            self.hidden_size,
            device=embedded.device,
            dtype=embedded.dtype,
        )
        out, _ = self.rnn(embedded, h0)
        # Keep only the output at the final position.
        out = out[:, -1, :]
        return self.fc(out)

In [None]:
# Load the pretrained word2vec vectors (binary format) from the "embeds"
# folder under the current working directory.
current_dir = os.getcwd() + '//embeds//'
word2vec_path = current_dir + 'GoogleNews-vectors-negative300.bin.gz'
embeddings = KeyedVectors.load_word2vec_format(word2vec_path, binary=True)

In [None]:
# Hyperparameters that can be changed.

# Data preparation
sample_size = .25
min_df = .01
max_df = .85

# Model
n_layers = 2
hidden_size = 20
dropout = .05

# Optimisation
batch_size = 50
epochs = 15
chosen_lr = .00001
chosen_momentum = .003
decay = .001

# Run on the GPU when CUDA is available, otherwise on the CPU.
torch.set_default_dtype(torch.float32)
gpu_available = torch.cuda.is_available()
device = 'cuda' if gpu_available else "cpu"


# Load and encode the data set. `embeddings` is rebound here to the reduced
# vocabulary returned by prep_data.
filename = "data_set_1.csv"
X_data, Y_data, embeddings = prep_data(
    filename,
    sample_size,
    embeddings,
    min_df,
    max_df,
)


In [None]:
# Replace the word -> wordVector mapping returned by prep_data with the
# numpy array of its vectors.
# NOTE(review): this rebinds `embeddings`, so the cell cannot be re-run on its
# own (get_embeddings calls .values() on its argument); re-run from the
# word2vec loading cell instead.
embeddings = get_embeddings(embeddings)

In [None]:

# Read both sizes from the embedding matrix itself, so they always match the
# vectors that were actually loaded instead of a hard-coded dimension.
vocab_size = embeddings.shape[0]
embed_size = embeddings.shape[1]

In [None]:
# Evaluation method
def Evaluate(model,X_data,Y_data,epochs):
    """Cross-validate `model` with K-fold splits and print per-fold results.

    For every fold the model is reset to the weights it had on entry, trained
    for `epochs` epochs on the training part, then scored on the held-out
    part. The mean validation loss and a classification report are printed.

    Reads the notebook-level names `batch_size`, `chosen_lr`, `decay`,
    `chosen_momentum` and `device`.

    Args:
        model: The torch module to train and evaluate. It is modified in
            place; on return it holds the weights trained on the last fold.
        X_data: numpy array of encoded sequences, indexable by row ids.
        Y_data: pandas Series of 1-based labels aligned with `X_data`.
        epochs: Number of training epochs per fold.
    """
    # Snapshot the starting weights. Without the per-fold reset below, the
    # model would carry training from earlier folds into later ones and be
    # validated on samples it has already been trained on.
    initial_state = {
        name: tensor.detach().clone() for name, tensor in model.state_dict().items()
    }
    criterion = torch.nn.CrossEntropyLoss()
    kfold = KFold(shuffle=True)

    for fold, (train_ids, test_ids) in enumerate(kfold.split(X_data,Y_data)):
        print(f"FOLD {fold+1}")
        model.load_state_dict(initial_state)

        train_x = X_data[train_ids].astype(np.int32)
        train_y = Y_data.iloc[train_ids].astype(int).apply(encode_Train_Lables).to_numpy()
        # Stack the per-row one-hot vectors into a 2-D array; cast to float32
        # so the targets use torch's default dtype instead of numpy's float64.
        train_y = np.stack(train_y).astype(np.float32)

        val_x = X_data[test_ids].astype(np.int32)
        val_y = Y_data.iloc[test_ids].apply(encode_Test_Lables).to_numpy()

        train_dataset = TensorDataset(torch.from_numpy(train_x),torch.from_numpy(train_y))
        train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True,pin_memory=True,num_workers=4)

        val_dataset = TensorDataset(torch.from_numpy(val_x),torch.from_numpy(val_y))
        # The validation metrics do not depend on order, so no shuffling.
        val_dataloader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False,pin_memory=True,num_workers=4)

        optimizer = torch.optim.SGD(model.parameters(), lr=chosen_lr,weight_decay=decay,momentum=chosen_momentum)

        # Training loop
        model.train()
        for epoch in range(epochs):
            for text,lables in train_dataloader:
                text = text.to(device)
                lables = lables.to(device)

                optimizer.zero_grad()

                yhat = model(text)
                loss = criterion(yhat,lables)
                loss.backward()
                optimizer.step()

        # Results: eval mode so dropout is disabled while scoring.
        model.eval()
        with torch.no_grad():
            totalLoss = 0
            preds = []
            targets = []
            for text,lables in val_dataloader:
                text = text.to(device)
                lables = lables.to(device)

                yhat = model(text)
                loss = criterion(yhat,lables)

                preds.extend(torch.argmax(yhat.cpu(),dim=1).numpy())
                targets.extend(lables.cpu().numpy())

                totalLoss += loss.item()

            # Mean of the per-batch mean losses.
            print("Avg Total Loss: {0:.2f}".format(totalLoss/len(val_dataloader)))
            print(classification_report(targets,preds,zero_division=0.0))

In [None]:
# Build the classifier from the embedding matrix and the hyperparameters
# defined in the earlier cells.
model = RNNClassifier(embeddings,vocab_size,embed_size,hidden_size,dropout,n_layers,device)
# Move the model's parameters to the selected device.
model.to(device)

In [None]:
# Evaluation: run K-fold cross-validation on the model built above. Evaluate
# trains the model and prints the validation loss and a classification report
# for each fold.
Evaluate(model,X_data,Y_data,epochs)

In [None]:
# Hold out 20% of the encoded reviews for validation.
train_x, test_x, train_y, test_y = train_test_split(X_data, Y_data, test_size=.20)

# Training split: int32 token indices and stacked one-hot labels.
train_x = train_x.astype(np.int32)
train_y = np.stack(train_y.astype(int).apply(encode_Train_Lables))
train_dataset = TensorDataset(torch.from_numpy(train_x), torch.from_numpy(train_y))
train_dataloader = DataLoader(
    train_dataset,
    batch_size=batch_size,
    shuffle=True,
    pin_memory=True,
    num_workers=4,
)

# Validation split: int32 token indices and 0-based class indices.
val_x = test_x.astype(np.int32)
val_y = test_y.apply(encode_Test_Lables).to_numpy()
val_dataset = TensorDataset(torch.from_numpy(val_x), torch.from_numpy(val_y))
val_dataloader = DataLoader(
    val_dataset,
    batch_size=batch_size,
    shuffle=True,
    pin_memory=True,
    num_workers=4,
)

# New model instance for the hold-out run, with its loss and optimizer.
model = RNNClassifier(embeddings, vocab_size, embed_size, hidden_size, dropout, n_layers, device)
model.to(device)
criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(
    model.parameters(),
    lr=chosen_lr,
    weight_decay=decay,
    momentum=chosen_momentum,
)



In [None]:
def getMetrics(targets,preds,average='micro'):
    """Compute accuracy, precision, recall and F1 for a set of predictions.

    NOTE(review): a later cell defines `getMetrics` again with macro
    averaging; when the notebook runs top to bottom that definition replaces
    this one.

    Args:
        targets: True class labels.
        preds: Predicted class labels, aligned with `targets`.
        average: Averaging mode forwarded to
            `precision_recall_fscore_support` (defaults to 'micro').

    Returns:
        dict with the keys 'accuracy', 'f1', 'precision' and 'recall'.
    """
    precision, recall, f1, _ = precision_recall_fscore_support(
        targets, preds, average=average, zero_division=0.0
    )
    return {
        'accuracy': accuracy_score(targets, preds),
        'f1': f1,
        'precision': precision,
        'recall': recall
    }



In [None]:

def getMetrics(targets,preds,average='macro'):
    """Compute accuracy, precision, recall and F1 for a set of predictions.

    Args:
        targets: True class labels.
        preds: Predicted class labels, aligned with `targets`.
        average: Averaging mode forwarded to
            `precision_recall_fscore_support` (defaults to 'macro').

    Returns:
        dict with the keys 'accuracy', 'f1', 'precision' and 'recall'.
    """
    precision, recall, f1, _ = precision_recall_fscore_support(
        targets, preds, average=average, zero_division=0.0
    )
    return {
        'accuracy': accuracy_score(targets, preds),
        'f1': f1,
        'precision': precision,
        'recall': recall
    }

#Training Loop
# Each epoch trains on the full training loader and then scores the model on
# the validation loader, so every epoch (including the last) is reported.
for epoch in range(epochs):
    model.train()
    for text,lables in train_dataloader:
        text = text.to(device)
        lables = lables.to(device)

        optimizer.zero_grad()

        yhat = model(text)
        loss = criterion(yhat,lables)

        loss.backward()
        optimizer.step()

    # Validation pass: eval mode and no gradient tracking.
    model.eval()
    totalLoss = 0
    preds = []
    targets = []
    with torch.no_grad():
        for text,lables in val_dataloader:
            text = text.to(device)
            lables = lables.to(device)

            yhat = model(text)
            val_loss = criterion(yhat,lables)

            preds.extend(torch.argmax(yhat.cpu(),dim=1).numpy())
            targets.extend(lables.cpu().numpy())

            # criterion returns the batch mean; weight it by the batch size
            # so that dividing by len(val_dataset) gives the per-sample mean.
            totalLoss += val_loss.item() * lables.size(0)

    results = getMetrics(targets,preds)
    # "Epoch" is the number of completed training epochs; "Loss" is the loss
    # of the last training batch of this epoch.
    print("Epoch:  {0}   Loss: {1: .4f} Val Loss:   {2: .4f}  Val Accuracy: {3: .2%} Precision: {4: .2f} Recall: {5: .2f} f1 Score: {6: .2f}".format(epoch + 1,loss.item(),totalLoss/len(val_dataset),results['accuracy'],results['precision'],results['recall'],results['f1']))
            

  

In [None]:
# NOTE(review): everything below is a single string literal, not executed
# code. It holds a disabled copy of a validation pass over val_dataloader.
''''
#Results

with torch.no_grad():
    totalLoss = 0
    
    preds = []
    targets = []
    for text,lables in val_dataloader:
       
        text = text.to(device)
        lables = lables.to(device)
        
        yhat = model(text)

        preds.extend(torch.argmax(yhat,dim=1).numpy())
        targets.extend(lables.numpy())

        loss = criterion(yhat,lables)
        totalLoss+=loss.item()*len(val_dataloader)
    
    print("Total Loss: {0:.2f}".format(totalLoss))
    print(classification_report(targets,preds,zero_division=0.0))
    '''