In [41]:
import os
import pickle
from collections import Counter
from getpass import getpass
from string import punctuation

import dvc.api
import git
import mlflow
import numpy as np
import torch
import torch.nn as nn
from sklearn.model_selection import train_test_split
from spacy.lang.en import STOP_WORDS
from torch.utils.data import TensorDataset, DataLoader
from tqdm import tqdm

In [2]:
# Prompt for the DAGsHub credentials and publish them as environment
# variables; the access token is read through getpass rather than input.
os.environ.update({
    'MLFLOW_TRACKING_USERNAME': input('Enter your DAGsHub username: '),
    'MLFLOW_TRACKING_PASSWORD': getpass('Enter your DAGsHub access token: '),
    'MLFLOW_TRACKING_PROJECTNAME': input('Enter your DAGsHub project name: '),
})

Enter your DAGsHub username:  elshehawy
Enter your DAGsHub access token:  ········································
Enter your DAGsHub project name:  sentiment-analysis


In [3]:
# Build the DAGsHub tracking URI from the username and project name that were
# stored in the environment, instead of hard-coding a single account/project.
mlflow.set_tracking_uri(
    'https://dagshub.com/{}/{}.mlflow'.format(
        os.environ['MLFLOW_TRACKING_USERNAME'].strip(),
        os.environ['MLFLOW_TRACKING_PROJECTNAME'].strip(),
    )
)

In [4]:
# Locations of the raw review texts and their sentiment labels.
reviews_path = './data/reviews.txt'
labels_path = './data/labels.txt'

# Read each file completely into a single string.
with open(labels_path, 'r') as labels_file, open(reviews_path, 'r') as reviews_file:
    labels = labels_file.read()
    reviews = reviews_file.read()

In [5]:
# Peek at the raw data: the first 2000 characters of the reviews, one blank
# line, then the first 26 characters of the labels.
print(reviews[:2000], end='\n\n')
print(labels[:26])

bromwell high is a cartoon comedy . it ran at the same time as some other programs about school life  such as  teachers  . my   years in the teaching profession lead me to believe that bromwell high  s satire is much closer to reality than is  teachers  . the scramble to survive financially  the insightful students who can see right through their pathetic teachers  pomp  the pettiness of the whole situation  all remind me of the schools i knew and their students . when i saw the episode in which a student repeatedly tried to burn down the school  i immediately recalled . . . . . . . . . at . . . . . . . . . . high . a classic line inspector i  m here to sack one of your teachers . student welcome to bromwell high . i expect that many adults of my age think that bromwell high is far fetched . what a pity that it isn  t   
story of a man who has unnatural feelings for a pig . starts out with a opening scene that is a terrific example of absurd comedy . a formal orchestra audience is turn

In [6]:
# Display the characters in string.punctuation; these are the characters
# stripped from the reviews during preprocessing.
from string import punctuation
print(punctuation)

!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~


In [7]:
import git
import dvc.api

In [12]:
mlflow.set_experiment('process data')
with mlflow.start_run(run_name="create words"):
    # Lower-case the corpus, then delete every character that appears in
    # string.punctuation in a single translate pass.
    reviews = reviews.lower()
    all_text = reviews.translate(str.maketrans('', '', punctuation))

    # One review per line: keep the per-review list for later tokenization
    # and flatten the corpus into one space-separated string.
    reviews_split = all_text.split('\n')
    all_text = ' '.join(reviews_split)

    # NOTE: the cleaned text is written back over the file it was loaded from.
    with open(reviews_path, 'w') as f:
        f.write(all_text)

    # Record what this step did and the data versions involved.
    for param_name, param_value in (
        ("operation", 'split and remove punctuation'),
        ("requested version", 'v1'),
        ('version', 'v2'),
        ('number of characters', len(all_text)),
    ):
        mlflow.log_param(param_name, param_value)

INFO: 'process data' does not exist. Creating a new experiment


In [13]:
# Install a blanket 'ignore' filter so Python warnings raised from this point
# on are not shown in the notebook output.
import warnings
warnings.filterwarnings('ignore')

In [14]:
from spacy.lang.en import STOP_WORDS

2021-12-07 20:20:46.443611: W tensorflow/stream_executor/platform/default/dso_loader.cc:60] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2021-12-07 20:20:46.443656: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.


In [15]:
from collections import Counter
import pickle

In [17]:
with mlflow.start_run(run_name="create vocab_to_int file"):
    # Re-read the cleaned corpus that the "create words" step wrote to disk.
    with open(reviews_path, 'r') as f:
        all_text = f.read()

    # Split on whitespace and drop spaCy's English stop words.
    words = all_text.split()
    words = [word for word in words if word not in STOP_WORDS]

    # Order the vocabulary from most to least frequent word.
    counter = Counter(words)
    vocab = sorted(counter, key=counter.get, reverse=True)
    # Build a dictionary that maps words to integers, starting at 1 so that
    # 0 is never assigned to a word.
    vocab_to_int = {word: i for i, word in enumerate(vocab, 1)}

    # Persist the mapping; the context manager guarantees the file is
    # flushed and closed even if pickling fails.
    file_name = './data/vocab_to_int.sav'
    with open(file_name, 'wb') as vocab_file:
        pickle.dump(vocab_to_int, vocab_file)

    mlflow.log_param("operation", 'create vocab to int file')
    mlflow.log_param("requested version", 'v2')
    mlflow.log_param('version', 'vocab_v1')
    mlflow.log_param('number of characters', 'N/A')

In [26]:
with mlflow.start_run(run_name="tokenize reviews"):
    # Turn every review into a list of vocabulary ids, skipping stop words
    # (they were excluded when the vocabulary was built).
    reviews_ints = [
        [vocab_to_int[word] for word in review.split() if word not in STOP_WORDS]
        for review in reviews_split
    ]

    # Note: the value logged under 'number of characters' is the review count.
    for param_name, param_value in (
        ("operation", 'tokenize reviews'),
        ("requested version", 'N/A'),
        ('version', 'rev_tok_v1'),
        ('number of characters', len(reviews_ints)),
    ):
        mlflow.log_param(param_name, param_value)

In [27]:
with mlflow.start_run(run_name="encode labels"):
    # One label per line; 'positive' becomes 1 and anything else becomes 0.
    labels_split = labels.split('\n')
    encoded_labels = np.array([int(label == 'positive') for label in labels_split])

    # Note: the value logged under 'number of characters' is the label count.
    for param_name, param_value in (
        ("operation", 'encode labels'),
        ("requested version", 'v1'),
        ('version', 'v2'),
        ('number of characters', len(encoded_labels)),
    ):
        mlflow.log_param(param_name, param_value)

In [34]:
with mlflow.start_run(run_name="remove outliers"):
    # Histogram of review lengths (in tokens), taken before filtering.
    review_lens = Counter(map(len, reviews_ints))

    # Positions of the reviews that still contain at least one token.
    non_zero_idx = [idx for idx, tokens in enumerate(reviews_ints) if len(tokens) != 0]

    # Filter reviews and labels with the same indices so they stay aligned.
    reviews_ints = [reviews_ints[idx] for idx in non_zero_idx]
    encoded_labels = np.array([encoded_labels[idx] for idx in non_zero_idx])

    # Note: the value logged under 'number of characters' is the number of
    # samples that remain.
    for param_name, param_value in (
        ("operation", 'remove outliers'),
        ("requested version", 'v2'),
        ('version', 'v3'),
        ('number of characters', len(encoded_labels)),
    ):
        mlflow.log_param(param_name, param_value)

In [35]:
def pad_features(reviews_ints, seq_length):
    """Left-pad (with zeros) or truncate every review to ``seq_length`` ids.

    Parameters
    ----------
    reviews_ints : sequence of sequences of int
        Tokenized reviews. Empty reviews are allowed and produce an
        all-zero row.
    seq_length : int
        Number of columns in the returned matrix.

    Returns
    -------
    numpy.ndarray
        Integer array of shape ``(len(reviews_ints), seq_length)``. Reviews
        shorter than ``seq_length`` are right-aligned with leading zeros;
        longer reviews keep only their first ``seq_length`` ids.
    """
    features = np.zeros((len(reviews_ints), seq_length), dtype=int)

    for i, review in enumerate(reviews_ints):
        tokens = review[:seq_length]
        n_tokens = len(tokens)
        # A positive start index is used instead of ``-len(review)``: for an
        # empty review ``-0:`` would select the whole row and the assignment
        # would fail to broadcast.
        if n_tokens:
            features[i, seq_length - n_tokens:] = tokens

    return features

In [38]:
from sklearn.model_selection import train_test_split

In [39]:
with mlflow.start_run(run_name="pad features"):
    # Bring every review to a fixed length of 200 token ids.
    seq_length = 200
    features = pad_features(reviews_ints, seq_length=seq_length)

    # Hold out `split_frac` of the samples, then cut that hold-out in half to
    # obtain the validation and test sets. Both splits are stratified on the
    # labels and use a fixed random_state.
    split_frac = 0.2
    X, y = features, encoded_labels
    X_train, X_val, y_train, y_val = train_test_split(
        X, y,
        test_size=split_frac, shuffle=True, random_state=42, stratify=y,
    )
    X_val, X_test, y_val, y_test = train_test_split(
        X_val, y_val,
        test_size=0.5, shuffle=True, random_state=42, stratify=y_val,
    )

    # Note: the value logged under 'number of characters' is the split ratio.
    for param_name, param_value in (
        ("operation", 'pad featues'),
        ("requested version", 'v3'),
        ('version', 'v4'),
        ('number of characters', '0.8, 0.1, 0.1'),
    ):
        mlflow.log_param(param_name, param_value)

In [42]:
with mlflow.start_run(run_name="build datasets"):
    batch_size = 64

    # Wrap each NumPy split as a TensorDataset of (features, labels) pairs.
    train_data = TensorDataset(*map(torch.from_numpy, (X_train, y_train)))
    valid_data = TensorDataset(*map(torch.from_numpy, (X_val, y_val)))
    test_data = TensorDataset(*map(torch.from_numpy, (X_test, y_test)))

    # Only the (shuffled) test loader is created in this cell.
    test_loader = DataLoader(test_data, batch_size=batch_size, shuffle=True)

    for param_name, param_value in (
        ("operation", 'buil datasets'),
        ("requested version", 'v4'),
        ('version', 'v5'),
        ('number of characters', '0.8, 0.1, 0.1'),
    ):
        mlflow.log_param(param_name, param_value)

### build the model

In [44]:
# Make 'build the model' the active MLflow experiment, so the model-related
# runs below are kept apart from the 'process data' runs.
mlflow.set_experiment('build the model')

INFO: 'build the model' does not exist. Creating a new experiment


In [45]:
class SentimentRNN(nn.Module):
    """
    The RNN model that will be used to perform Sentiment analysis.

    Pipeline: embedding -> stacked LSTM -> dropout -> linear -> sigmoid.
    Only the last sigmoid value of every sequence is returned.
    """

    def __init__(self, vocab_size, output_size, embedding_dim, hidden_dim, n_layers, drop_prob=0.5):
        """
        Initialize the model by setting up the layers.

        :param vocab_size: number of rows in the embedding table
        :param output_size: number of output units of the final linear layer
        :param embedding_dim: size of each embedding vector
        :param hidden_dim: number of features in the LSTM hidden state
        :param n_layers: number of stacked LSTM layers
        :param drop_prob: dropout probability passed to the LSTM
        """
        super().__init__()

        self.output_size = output_size
        self.n_layers = n_layers
        self.hidden_dim = hidden_dim

        # define all layers
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, n_layers,
                            dropout=drop_prob, batch_first=True)

        # dropout applied to the LSTM output before the linear layer
        self.dropout = nn.Dropout(0.3)

        self.fc = nn.Linear(hidden_dim, output_size)
        self.sig = nn.Sigmoid()

    def forward(self, x, hidden):
        """
        Perform a forward pass of our model on some input and hidden state.

        :param x: batch of token ids; the first dimension is the batch
        :param hidden: ``(h, c)`` tuple as produced by ``init_hidden``
        :return: ``(sig_out, hidden)`` where ``sig_out`` holds one value per
                 sample (the last sigmoid output) and ``hidden`` is the
                 updated LSTM state
        """
        batch_size = x.size(0)

        embeds = self.embedding(x)
        lstm_out, hidden = self.lstm(embeds, hidden)

        # stack the outputs of all time steps so the linear layer sees
        # rows of size hidden_dim
        lstm_out = lstm_out.contiguous().view(-1, self.hidden_dim)

        out = self.dropout(lstm_out)
        out = self.fc(out)

        sig_out = self.sig(out)

        # regroup per sample and keep only the last value of each row
        sig_out = sig_out.view(batch_size, -1)
        sig_out = sig_out[:, -1]
        # return last sigmoid output and hidden state
        return sig_out, hidden

    def init_hidden(self, batch_size):
        ''' Initializes hidden state

        Returns a ``(h, c)`` tuple of zero tensors of shape
        ``n_layers x batch_size x hidden_dim``. The tensors are created with
        the dtype and device of the model's own parameters, so the state
        follows the model to the GPU without relying on any global flag.
        '''
        weight = next(self.parameters()).detach()
        shape = (self.n_layers, batch_size, self.hidden_dim)

        return (weight.new_zeros(shape), weight.new_zeros(shape))

In [46]:
def train(model, n_epochs, optimizer, criterion, train_on_gpu, save_path, batch_size=64, print_every=10, clip=5):
    """Train ``model`` and checkpoint it whenever the validation loss improves.

    The training and validation samples are read from the notebook-level
    ``train_data`` and ``valid_data`` datasets.

    Parameters
    ----------
    model : nn.Module
        Network exposing ``init_hidden(batch_size)`` whose forward call takes
        ``(inputs, hidden)`` and returns ``(output, hidden)``.
    n_epochs : int
        Number of passes over the training data.
    optimizer : torch.optim.Optimizer
        Optimizer stepped once per training batch.
    criterion : callable
        Loss called as ``criterion(output, labels.float())``.
    train_on_gpu : bool
        When true, the model and every batch are moved to CUDA.
    save_path : str
        Destination of the ``state_dict`` checkpoint.
    batch_size : int
        Batch size of both loaders and of the hidden state.
    print_every : int
        Epoch losses are printed when ``epoch_index % print_every == 0``.
    clip : float
        Maximum gradient norm passed to ``clip_grad_norm_``.

    Raises
    ------
    ValueError
        If either dataset holds fewer than ``batch_size`` samples, because no
        full batch could be formed.
    """
    print('Start Training on "{}" for {} epochs...'.format('GPU' if train_on_gpu else 'cpu', n_epochs))

    # The hidden state is allocated for exactly `batch_size` sequences, so a
    # trailing partial batch cannot be processed; drop_last skips it.
    train_loader = DataLoader(train_data, shuffle=True, batch_size=batch_size, drop_last=True)
    valid_loader = DataLoader(valid_data, shuffle=True, batch_size=batch_size, drop_last=True)
    if len(train_loader) == 0 or len(valid_loader) == 0:
        raise ValueError(
            'train_data and valid_data must each contain at least '
            'batch_size ({}) samples'.format(batch_size)
        )

    # move model to GPU, if available
    if train_on_gpu:
        model.cuda()

    val_loss_min = np.inf
    # train for some number of epochs
    for e in range(n_epochs):
        print('epoch:', e+1,'train...')

        ############################## TRAINING ###################################
        train_loss = 0.0
        train_total = 0
        # initialize hidden state once per epoch; it is carried across batches
        h = model.init_hidden(batch_size)
        model.train()
        for inputs, labels in tqdm(train_loader):
            if train_on_gpu:
                inputs, labels = inputs.cuda(), labels.cuda()

            # Detach the hidden state, otherwise we'd backprop through the
            # entire training history
            h = tuple(each.detach() for each in h)

            # zero accumulated gradients
            model.zero_grad()

            # get the output from the model
            output, h = model(inputs, h)

            # reshape(-1) keeps a 1-D tensor even for a batch of one sample
            loss = criterion(output.reshape(-1), labels.float())

            # accumulate the sample-weighted loss for the epoch average
            train_loss += loss.item() * len(labels)
            train_total += len(labels)

            loss.backward()
            # `clip_grad_norm` helps prevent the exploding gradient problem in RNNs / LSTMs.
            nn.utils.clip_grad_norm_(model.parameters(), clip)
            optimizer.step()

        ############################## VALIDATION #################################
        print('validation...')
        val_loss = 0.0
        val_total = 0
        val_h = model.init_hidden(batch_size)
        model.eval()
        # no gradients are needed while evaluating
        with torch.no_grad():
            for inputs, labels in tqdm(valid_loader):
                if train_on_gpu:
                    inputs, labels = inputs.cuda(), labels.cuda()

                output, val_h = model(inputs, val_h)
                # keep the batch loss in its own name so the running total
                # is not overwritten
                batch_loss = criterion(output.reshape(-1), labels.float())

                val_loss += batch_loss.item() * len(labels)
                val_total += len(labels)

        # per-sample averages over the epoch
        train_loss = train_loss / train_total
        val_loss = val_loss / val_total
        if e % print_every == 0:
            print("Epoch: {}/{}...".format(e+1, n_epochs),
                  "Train Loss: {:.6f}...".format(train_loss),
                  "Val Loss: {:.6f}".format(val_loss))

        if val_loss < val_loss_min:
            print('Validation loss decreased from: {:.6f}, to: {:.6f}\tSAVING MODEL... in Epoch: {}\n'.format(val_loss_min, val_loss, e+1))

            # save the model
            torch.save(model.state_dict(), save_path)

            # update minimum val loss
            val_loss_min = val_loss

In [47]:
with mlflow.start_run(run_name="model structure"):
    # Hyperparameters of the network.
    embedding_dim, hidden_dim, n_layers = 100, 128, 2
    output_size = 1  # pos, or neg
    vocab_size = 1 + len(vocab_to_int)  # +1 for 0 padding

    net = SentimentRNN(
        vocab_size,
        output_size,
        embedding_dim,
        hidden_dim,
        n_layers,
    )

    # Record the architecture choices with the run.
    for param_name, param_value in (
        ("operation", 'instantiate the model'),
        ("embedding_dim", embedding_dim),
        ('hidden_dm', hidden_dim),
        ('lstm_layers', n_layers),
    ):
        mlflow.log_param(param_name, param_value)