In [21]:
import numpy as np
import mlflow
import os
from getpass import getpass

In [2]:
# Prompt for the DAGsHub settings and export each answer as an environment
# variable. The access token is read with getpass so it is not echoed.
for env_key, prompt_text, ask in (
    ('MLFLOW_TRACKING_USERNAME', 'Enter your DAGsHub username: ', input),
    ('MLFLOW_TRACKING_PASSWORD', 'Enter your DAGsHub access token: ', getpass),
    ('MLFLOW_TRACKING_PROJECTNAME', 'Enter your DAGsHub project name: ', input),
):
    os.environ[env_key] = ask(prompt_text)

Enter your DAGsHub username:  elshehawy
Enter your DAGsHub access token:  ········································
Enter your DAGsHub project name:  sentiment-analysis


In [3]:
# Point the MLflow client at the tracking server used for this project.
# NOTE(review): the URI is fixed here; the MLFLOW_TRACKING_PROJECTNAME value
# collected at the prompt is not read by this cell.
tracking_uri = 'https://dagshub.com/elshehawy/sentiment-analysis.mlflow'
mlflow.set_tracking_uri(tracking_uri)

In [4]:
# Locations of the raw inputs, relative to the notebook's working directory.
labels_path = './data/labels.txt'
reviews_path = './data/reviews.txt'

# Read both files fully into memory as single strings.
with open(labels_path, 'r') as labels_file, open(reviews_path, 'r') as reviews_file:
    labels = labels_file.read()
    reviews = reviews_file.read()

In [5]:
# Preview the first 2000 characters of the reviews and the first 26 characters
# of the labels, separated by a blank line.
print(reviews[:2000], '', labels[:26], sep='\n')

bromwell high is a cartoon comedy . it ran at the same time as some other programs about school life  such as  teachers  . my   years in the teaching profession lead me to believe that bromwell high  s satire is much closer to reality than is  teachers  . the scramble to survive financially  the insightful students who can see right through their pathetic teachers  pomp  the pettiness of the whole situation  all remind me of the schools i knew and their students . when i saw the episode in which a student repeatedly tried to burn down the school  i immediately recalled . . . . . . . . . at . . . . . . . . . . high . a classic line inspector i  m here to sack one of your teachers . student welcome to bromwell high . i expect that many adults of my age think that bromwell high is far fetched . what a pity that it isn  t   
story of a man who has unnatural feelings for a pig . starts out with a opening scene that is a terrific example of absurd comedy . a formal orchestra audience is turn

In [6]:
from string import punctuation
# Display the characters contained in string.punctuation; the "create words"
# cell below removes exactly these characters from the reviews.
print(punctuation)

!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~


In [7]:
import git
import dvc.api

In [12]:
mlflow.set_experiment('process data')
with mlflow.start_run(run_name="create words"):
    # Normalise case, then delete every character listed in string.punctuation.
    # A translate() deletion table removes the same characters as a
    # per-character filter, in one pass over the text.
    reviews = reviews.lower()
    cleaned_text = reviews.translate(str.maketrans('', '', punctuation))

    # One entry per input line; the tokenisation cell iterates over this list.
    reviews_split = cleaned_text.split('\n')

    # Persist the cleaned text with its newlines intact. Writing the
    # space-joined form back to reviews_path would erase the line boundaries,
    # so re-running this notebook on the rewritten file would make
    # split('\n') return the whole corpus as one review. Readers that use
    # str.split() see the same words either way.
    with open(reviews_path, 'w') as f:
        f.write(cleaned_text)

    # Space-joined copy of the corpus kept in memory; it has the same length
    # as the text written above, so the logged character count is unchanged.
    all_text = ' '.join(reviews_split)

    mlflow.log_param("operation", 'split and remove punctuation')
    mlflow.log_param("requested version", 'v1')
    mlflow.log_param('version', 'v2')
    mlflow.log_param('number of characters', len(all_text))

INFO: 'process data' does not exist. Creating a new experiment


In [13]:
import warnings
warnings.filterwarnings('ignore')

In [14]:
from spacy.lang.en import STOP_WORDS

2021-12-07 20:20:46.443611: W tensorflow/stream_executor/platform/default/dso_loader.cc:60] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2021-12-07 20:20:46.443656: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.


In [15]:
from collections import Counter
import pickle

In [17]:
with mlflow.start_run(run_name="create vocab_to_int file"):
    # Reload the processed text written by the "create words" run.
    with open(reviews_path, 'r') as f:
        all_text = f.read()

    # Split on whitespace and drop stop words.
    words = [word for word in all_text.split() if word not in STOP_WORDS]

    # Order the distinct words from most to least frequent.
    counter = Counter(words)
    vocab = sorted(counter, key=counter.get, reverse=True)

    # Build a dictionary that maps words to integers. Ids start at 1, so 0 is
    # never assigned to a word.
    vocab_to_int = {word: i for i, word in enumerate(vocab, 1)}

    # Persist the mapping. The context manager guarantees the file is flushed
    # and closed; passing a bare open(...) to pickle.dump leaves the handle
    # open until it happens to be garbage-collected.
    file_name = './data/vocab_to_int.sav'
    with open(file_name, 'wb') as vocab_file:
        pickle.dump(vocab_to_int, vocab_file)

    mlflow.log_param("operation", 'create vocab to int file')
    mlflow.log_param("requested version", 'v2')
    mlflow.log_param('version', 'vocab_v1')
    mlflow.log_param('number of characters', 'N/A')

In [None]:
with mlflow.start_run(run_name="tokenize reviews"):
    # Convert each review in reviews_split into a list of integer ids:
    # stop words are skipped, every other word is looked up in vocab_to_int.
    reviews_ints = [
        [vocab_to_int[token] for token in text.split() if token not in STOP_WORDS]
        for text in reviews_split
    ]

    # Record the run metadata. The value stored under 'number of characters'
    # is the number of tokenised reviews.
    for param_name, param_value in (
        ("operation", 'tokenize reviews'),
        ("requested version", 'N/A'),
        ('version', 'rev_tok_v1'),
        ('number of characters', len(reviews_ints)),
    ):
        mlflow.log_param(param_name, param_value)

In [25]:
with mlflow.start_run(run_name="encode labels"):
    # One label per line of the labels text.
    # NOTE(review): if `labels` ends with a newline, split('\n') yields a
    # trailing empty string, which is encoded as 0 like any non-'positive' line.
    labels_split = labels.split('\n')

    # 1 for the exact string 'positive', 0 for anything else.
    encoded_labels = np.array([int(label == 'positive') for label in labels_split])

    # Record the run metadata. The value stored under 'number of characters'
    # is the number of encoded labels.
    for param_name, param_value in (
        ("operation", 'encode labels'),
        ("requested version", 'v1'),
        ('version', 'v2'),
        ('number of characters', len(encoded_labels)),
    ):
        mlflow.log_param(param_name, param_value)