In [1]:
import torch
from torch.utils.data import DataLoader, TensorDataset
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
import os
import pandas as pd
import gc
import numpy as np
from gensim.models import KeyedVectors

In [2]:
class wordVector:
    """Small record tying a vocabulary key to its row index and embedding vector.

    Attributes:
        index: integer position assigned to the entry (defaults to 0).
        key: the token string (defaults to '<pad>').
        vector: the embedding vector stored for this token.
    """

    def __init__(self, vector, index=0, key='<pad>'):
        # Assigned in one statement; attribute order is index, key, vector.
        self.index, self.key, self.vector = index, key, vector
def get_embeddings(wv_objs):
    """Collect the `.vector` of every value in `wv_objs` into one numpy array.

    Args:
        wv_objs: mapping whose values expose a `.vector` attribute.

    Returns:
        np.ndarray built from the vectors, in the mapping's iteration order.
    """
    return np.array([entry.vector for entry in wv_objs.values()])



In [3]:
def encode_Lables(label, num_classes=5):
    """One-hot encode a 1-based class label.

    Args:
        label: integer class label in the range 1..num_classes.
        num_classes: length of the returned vector (defaults to 5).

    Returns:
        np.ndarray of length `num_classes`, all zeros except a 1 at
        position `label - 1`.

    Raises:
        IndexError: if `label` is outside 1..num_classes. Without this check
            a label of 0 (or a small negative label) would wrap around via
            negative indexing and be silently encoded as another class.
    """
    if not 1 <= label <= num_classes:
        raise IndexError(
            "label {0!r} is outside the valid range 1..{1}".format(label, num_classes)
        )
    temp = np.zeros(num_classes)
    temp[label - 1] = 1
    return temp
def splitsentences(sentences):
    """Split every sentence on whitespace.

    Iterates over `sentences` directly instead of indexing with 0..n-1, so a
    pandas Series whose index is not the default 0..n-1 range (for example
    after sampling or filtering) is handled positionally rather than by label.

    Args:
        sentences: iterable of strings.

    Returns:
        list of token lists, one per sentence, in iteration order.
    """
    return [sentence.split() for sentence in sentences]
def encodeSentence(sentence,embeds,max):
    """Encode a token list as a fixed-length array of vocabulary indices.

    Tokens beyond position `max` are dropped; shorter sentences are padded
    with 0. Tokens missing from `embeds` are also encoded as 0.

    Args:
        sentence: sequence of token strings.
        embeds: mapping from token to an object exposing an `.index` attribute.
        max: length of the returned encoding.

    Returns:
        np.ndarray of integer dtype with length `max`.
    """
    encoding = np.zeros(max, dtype=int)
    # zip stops at whichever is shorter, which gives both truncation and padding.
    for position, token in zip(range(len(encoding)), sentence):
        try:
            encoding[position] = embeds[token].index
        except KeyError:
            # Out-of-vocabulary token: keep the padding index 0. Only KeyError
            # is swallowed so that genuine problems (e.g. entries without an
            # `.index`) are not silently turned into padding.
            pass
    return encoding
def encodeSentences(data,embeds,max):
    """Tokenise every sentence in `data` and encode each one to length `max`.

    Args:
        data: collection of sentence strings, passed to `splitsentences`.
        embeds: vocabulary mapping forwarded to `encodeSentence`.
        max: encoding length forwarded to `encodeSentence`.

    Returns:
        np.ndarray built from the per-sentence encodings.
    """
    token_lists = splitsentences(data)
    return np.array([encodeSentence(tokens, embeds, max) for tokens in token_lists])
def shrinkEmbeds(corpus,embeds):
    """Build a reduced vocabulary containing only corpus words found in `embeds`.

    Args:
        corpus: iterable of candidate words.
        embeds: lookup supporting `embeds[key]` that raises KeyError for
            unknown keys.

    Returns:
        dict mapping each kept word to a `wordVector`. '<pad>' always has
        index 0; the remaining words receive consecutive indices starting at
        1, in the order they are first seen in `corpus`.
    """
    # NOTE(review): the pad entry reuses whatever `embeds[0]` returns. With a
    # gensim KeyedVectors object that is presumably the vector of the first
    # vocabulary word rather than a zero vector; confirm this is intended.
    words_index = {'<pad>': wordVector(embeds[0],0,'<pad>')}
    next_index = 1
    for word in corpus:
        if word in words_index:
            continue
        try:
            vector = embeds[word]
        except KeyError:
            # Word has no pretrained vector: leave it out of the vocabulary.
            continue
        words_index[word] = wordVector(vector,next_index,word)
        next_index += 1
    return words_index
def prep_data(filename,samp_size,embeds,random_state=None):
    """Load a review CSV, sample it, and turn it into model inputs.

    The CSV is read from `<parent of cwd>/model_data/<filename>` and must
    contain the columns "Review_text" and "Rating".

    Args:
        filename: name of the CSV file inside the `model_data` directory.
        samp_size: fraction of rows to keep; clamped to the range [0.1, 1].
        embeds: pretrained embedding lookup passed to `shrinkEmbeds`.
        random_state: optional seed forwarded to `DataFrame.sample` so the
            sampled subset can be reproduced. None keeps the sampling unseeded.

    Returns:
        (X, Y, vocab) where X is the array of encoded sentences returned by
        `encodeSentences`, Y is a pandas Series of one-hot label arrays, and
        vocab is the reduced vocabulary returned by `shrinkEmbeds`.
    """
    # Reading data: build the path with os.path.join so separators are not mixed.
    parent_dir = os.path.dirname(os.getcwd())
    path = os.path.join(parent_dir, "model_data", filename)
    df = pd.read_csv(path)

    # Sampling data: clamp the requested fraction to [0.1, 1].
    samp_size = min(max(samp_size, .1), 1)
    df = df.sample(n=int(len(df) * samp_size), random_state=random_state)
    # drop=True avoids carrying the old index along as an extra "index" column.
    df = df.reset_index(drop=True)

    # Preparing features: the vectorizer is only used to collect the vocabulary.
    # NOTE(review): TfidfVectorizer's default tokenisation (lowercasing,
    # punctuation stripping) differs from the plain str.split() used when
    # encoding, so capitalised or punctuated tokens presumably fall back to
    # the pad index; confirm whether the text should be normalised first.
    vectorizer = TfidfVectorizer(min_df=1,max_df=.95)
    corpus = vectorizer.fit(df["Review_text"]).get_feature_names_out()
    vocab = shrinkEmbeds(corpus,embeds)

    # Every sentence is padded/truncated to the mean whitespace-token count.
    seq_len = int(df["Review_text"].str.split().str.len().mean())
    X = encodeSentences(df["Review_text"],vocab,seq_len)

    # Preparing labels: one one-hot array per row.
    Y = df["Rating"].astype(int).apply(encode_Lables)

    return X,Y,vocab


In [4]:
class RNNClassifier(torch.nn.Module):
    """Embedding -> multi-layer ReLU RNN -> linear classifier.

    Args:
        embeddings: 2-D array-like of pretrained vectors, one row per
            vocabulary index. It is copied to a float32 tensor, so training
            never writes back into the caller's array.
        vocabSize: number of vocabulary rows (kept for interface
            compatibility; the embedding table is sized from `embeddings`).
        embedsize: width of one embedding vector; input size of the RNN.
        hidden_size: size of the RNN hidden state.
        dropout: dropout probability passed to `torch.nn.RNN`.
        num_layers: number of stacked RNN layers.
        device: stored on the instance as `self.device`.
        num_classes: number of output logits (defaults to 5).
    """

    def __init__(self,embeddings,vocabSize,embedsize, hidden_size,dropout,num_layers,device,num_classes=5):
        super().__init__()
        self.num_layers = num_layers
        self.hidden_size = hidden_size
        self.device = device

        # clone() guarantees the parameter does not alias the input array.
        pretrained = torch.as_tensor(embeddings, dtype=torch.float32).clone()
        # freeze=False keeps the pretrained vectors trainable.
        self.embeds = torch.nn.Embedding.from_pretrained(pretrained, freeze=False)
        self.rnn = torch.nn.RNN(embedsize, hidden_size, num_layers, dropout=dropout,
                                nonlinearity="relu", batch_first=True)

        self.fc = torch.nn.Linear(hidden_size, num_classes)
        # Kept as an attribute but not applied in forward(): the model
        # returns raw logits.
        self.softMax = torch.nn.LogSoftmax(dim=-1)

    def forward(self, x):
        """Return class logits for a batch of index sequences.

        Args:
            x: integer tensor of vocabulary indices, batch dimension first.

        Returns:
            Tensor with one row of `num_classes` logits per batch element.
        """
        embedded = self.embeds(x)
        # Build the initial state on the same device/dtype as the activations
        # so the module keeps working after being moved with .to(...).
        h0 = torch.zeros(self.num_layers, embedded.size(0), self.hidden_size,
                         device=embedded.device, dtype=embedded.dtype)
        out, _ = self.rnn(embedded, h0)

        # Classify from the output at the last time step.
        # NOTE(review): if sequences are right-padded, the last step is
        # presumably a pad token for short inputs; confirm this is acceptable.
        out = out[:, -1, :]

        return self.fc(out)

In [5]:
# Load the pretrained word2vec vectors from the `embeds` folder next to this
# notebook. os.path.join is used instead of hand-written '//' separators.
current_dir = os.path.join(os.getcwd(), 'embeds')
embeddings = KeyedVectors.load_word2vec_format(
    os.path.join(current_dir, 'GoogleNews-vectors-negative300.bin.gz'),
    binary=True,
)

In [6]:
# Hyper-parameters that can be changed
batch_size = 32
n_layers = 2
epochs = 10
sample_size = .10
dropout = .20
decay = .001
chosen_lr = .005
chosen_momentum = .01
hidden_size = 20

# Run on the GPU when torch can see one, otherwise stay on the CPU.
torch.set_default_dtype(torch.float32)
gpu_available = torch.cuda.is_available()
device = 'cuda' if gpu_available else "cpu"


# NOTE: `embeddings` is rebound here from the loaded vectors to the reduced
# vocabulary returned by prep_data, so this cell cannot be re-run without
# reloading the embeddings first.
filename = "data_set_1.csv"
X_data,Y_data,embeddings = prep_data(filename,sample_size,embeddings)


FileNotFoundError: [Errno 2] No such file or directory: 'c:\\Users\\taldan00\\Documents\\MLFP_DataAnalysis_and_DataPrep\\Code/model_data/data_set_1.csv'

In [None]:
# Convert the reduced vocabulary (mapping of word -> wordVector) into a numpy
# array of vectors. NOTE: this rebinds `embeddings` again, so the cell is not
# re-runnable: get_embeddings calls `.values()`, which a numpy array lacks.
embeddings = get_embeddings(embeddings)

In [None]:

# Derive both dimensions from the embedding matrix itself instead of
# hard-coding the vector width, so a different embedding file keeps working.
vocab_size, embed_size = embeddings.shape

In [None]:

# Hold out 20% of the samples; the fixed random_state keeps the split stable.
train_x, test_x, train_y, test_y = train_test_split(
    X_data,
    Y_data,
    test_size=.20,
    random_state=64,
)


In [None]:
# Stack the per-row one-hot arrays into 2-D float32 matrices. np.asarray
# accepts both the original pandas Series and an already-stacked array, so
# re-running this cell is safe. float32 matches the model's default dtype.
train_y = np.stack(np.asarray(train_y)).astype(np.float32)
train_x = train_x.astype(np.int32)

val_y = np.stack(np.asarray(test_y)).astype(np.float32)
val_x = test_x.astype(np.int32)

# Pinned host memory only helps host-to-GPU copies; skip it on CPU-only runs.
use_pinned_memory = device == 'cuda'

train_dataset = TensorDataset(torch.from_numpy(train_x), torch.from_numpy(train_y))
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True,
                              pin_memory=use_pinned_memory, num_workers=4)

# Validation batches are not shuffled: order does not matter for the
# aggregated metrics and a fixed order keeps evaluation deterministic.
val_dataset = TensorDataset(torch.from_numpy(val_x), torch.from_numpy(val_y))
val_dataloader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False,
                            pin_memory=use_pinned_memory, num_workers=4)

In [None]:
# Build the classifier on top of the reduced pretrained embedding matrix.
model = RNNClassifier(
    embeddings=embeddings,
    vocabSize=vocab_size,
    embedsize=embed_size,
    hidden_size=hidden_size,
    dropout=dropout,
    num_layers=n_layers,
    device=device,
)

In [None]:
# Cross-entropy over the model's logits, optimised with SGD using the
# learning rate, momentum and weight decay chosen in the hyper-parameter cell.
criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(
    model.parameters(),
    lr=chosen_lr,
    momentum=chosen_momentum,
    weight_decay=decay,
)

In [None]:
def sumCorrect(predictions,labels):
    """Count the rows where the predicted class matches the labelled class.

    The class of a row is the argmax over its last dimension, for both the
    predictions and the (one-hot) labels. The comparison is done in a single
    vectorised operation instead of a per-sample Python loop.

    Args:
        predictions: 2-D tensor of per-class scores, one row per sample.
        labels: 2-D tensor of the same shape holding one-hot labels.

    Returns:
        int: number of rows whose argmax agrees.
    """
    predicted_classes = predictions.argmax(dim=-1)
    true_classes = labels.argmax(dim=-1)
    return int((predicted_classes == true_classes).sum().item())
def _evaluate(model, dataloader, criterion, device):
    """Return (mean loss per sample, accuracy) of `model` over `dataloader`.

    The model is switched to eval mode for the pass (so dropout is off) and
    restored to its previous mode afterwards. `criterion` returns the mean
    loss of a batch, so each batch loss is weighted by its number of samples
    before averaging; this stays correct when the last batch is smaller.
    """
    was_training = model.training
    model.eval()
    loss_sum = 0.0
    correct = 0
    seen = 0
    with torch.no_grad():
        for batch_text, batch_labels in dataloader:
            batch_text = batch_text.to(device)
            batch_labels = batch_labels.to(device)

            logits = model(batch_text)
            batch_size_actual = batch_labels.size(0)
            loss_sum += criterion(logits, batch_labels).item() * batch_size_actual
            correct += sumCorrect(logits, batch_labels)
            seen += batch_size_actual
    model.train(was_training)
    if seen == 0:
        return float("nan"), 0.0
    return loss_sum / seen, correct / seen


model.to(device)
# Report roughly five times per epoch; never 0, which would make the modulo
# below raise ZeroDivisionError on small datasets.
printNum = max(1, int((len(train_dataset)/batch_size)*.2))

model.train()
#Training Loop
for epoch in range(epochs):
    BatchCounter = 0
    for text,lables in train_dataloader:
        BatchCounter += 1

        text = text.to(device)
        lables = lables.to(device)

        optimizer.zero_grad()

        yhat = model(text)
        loss = criterion(yhat,lables)

        loss.backward()
        optimizer.step()
        if BatchCounter % printNum == 0:
            # The helper uses its own local names, so the training batch
            # variables above are not overwritten by the validation pass.
            val_loss, accuracy = _evaluate(model, val_dataloader, criterion, device)
            print("Epoch:  {0} Batch:   {1}   Loss: {2: .4f} Val Loss:   {3: .4f}  Val Accuracy: {4: .2%}".format(epoch + 1,BatchCounter,loss.item(),val_loss,accuracy))

#Results

# Final evaluation runs in eval mode (inside _evaluate) so dropout does not
# perturb the reported numbers.
final_loss, accuracy = _evaluate(model, val_dataloader, criterion, device)
print("The total Loss is: {:.3}".format(final_loss))
print("The Accuracy of the model is: {:.2%}".format(accuracy))