# PyTorch

- CNN Network using PyTorch
- [Ray Tune](https://docs.ray.io/en/latest/tune/index.html) for hyperparameter search

&rarr; [Experiment 4 - PyTorch](#Experiment-4-(PyTorch))


**Author:** BrenoAV

**Last Date Modified:** 2/4/2024

# Load Dataset

In [1]:
import pandas as pd

In [2]:
# Load the dataset; despite the .csv extension the file is read as tab-separated UTF-8 text.
df = pd.read_csv("data.csv", sep="\t", encoding="utf-8")

In [3]:
# Show the loaded DataFrame as the cell output.
df

Unnamed: 0,sentence,target,source
0,So there is no way for me to plug it in here i...,0,amazon
1,"Good case, Excellent value.",1,amazon
2,Great for the jawbone.,1,amazon
3,Tied to charger for conversations lasting more...,0,amazon
4,The mic is great.,1,amazon
...,...,...,...
2743,I think food should have flavor and texture an...,0,yelp
2744,Appetite instantly gone.,0,yelp
2745,Overall I was not impressed and would not go b...,0,yelp
2746,"The whole experience was underwhelming, and I ...",0,yelp


## Split dataset

In [4]:
from sklearn.model_selection import train_test_split

In [5]:
# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    df["sentence"],
    df["target"],
    test_size=0.2,
    random_state=123,
)

In [6]:
# Sanity check: training features and labels must have the same number of rows.
X_train.shape, y_train.shape

((2198,), (2198,))

In [7]:
# Sanity check: test features and labels must have the same number of rows.
X_test.shape, y_test.shape

((550,), (550,))

In [8]:
import os
import torch
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator
from torch.nn.utils.rnn import pad_sequence

# Text preprocessing settings
tokenizer = get_tokenizer("basic_english")  # torchtext's "basic_english" tokenizer
MAX_LEN = 100  # sequence length cap applied to the encoded sentences
NUM_WORDS = 5_000  # passed as `max_tokens` when the vocabulary is built

# Lazily tokenize every text of an iterable
def yield_tokens(data_iter):
    """Yield, for each text in ``data_iter``, the result of ``tokenizer(text)``."""
    yield from map(tokenizer, data_iter)

# Transform text -> vocabulary indices (the caller wraps the result in a tensor)
def text_pipeline(text):
    """Tokenize ``text`` and pass the tokens through ``vocab``."""
    tokens = tokenizer(text)
    return vocab(tokens)

In [9]:
# Build the vocabulary from the training sentences only, capped at NUM_WORDS
# entries. "<unk>" is the only special token; it is made the default index so
# that tokens missing from the vocabulary map to index 0.
vocab = build_vocab_from_iterator(yield_tokens(X_train),
                                  specials=["<unk>"],
                                  max_tokens=NUM_WORDS)
vocab.set_default_index(0)

# Encode every sentence as a 1-D tensor of vocabulary indices.
X_train_encoded = [torch.tensor(text_pipeline(text), dtype=torch.long) for text in X_train]
X_test_encoded = [torch.tensor(text_pipeline(text), dtype=torch.long) for text in X_test]


def _pad_or_truncate(sequences, max_len):
    """Stack variable-length index tensors into a (num_sequences, max_len) tensor.

    Longer sequences are cut at ``max_len``; shorter ones are right-padded with 0.
    ``pad_sequence`` alone only pads up to the longest sequence it is given, so
    without the extra padding step the width could end up below ``max_len`` (and
    differ between the train and test splits), which the model cannot accept.
    """
    batch = pad_sequence(sequences, batch_first=True)[:, :max_len]
    missing = max_len - batch.size(1)
    if missing > 0:
        batch = torch.nn.functional.pad(batch, (0, missing))
    return batch


# NOTE: the padding value 0 is also the "<unk>" index, so padding and unknown
# tokens share one embedding row.
X_train_encoded = _pad_or_truncate(X_train_encoded, MAX_LEN)
X_test_encoded = _pad_or_truncate(X_test_encoded, MAX_LEN)

In [10]:
from torch.utils.data import TensorDataset, DataLoader

# Mini-batch size shared by the training and the evaluation loader.
batch_size = 256

# Training split: float32 labels, batches reshuffled on every pass.
y_train_tensor = torch.tensor(y_train.values, dtype=torch.float32)
train_dataset = TensorDataset(X_train_encoded, y_train_tensor)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

# Test split: same label conversion, iterated in dataset order.
y_test_tensor = torch.tensor(y_test.values, dtype=torch.float32)
test_dataset = TensorDataset(X_test_encoded, y_test_tensor)
test_loader = DataLoader(test_dataset, batch_size=batch_size)

# MLFlow

In [11]:
import mlflow
from mlflow.data.pandas_dataset import PandasDataset

# Point MLflow at the tracking server expected at this local address.
mlflow.set_tracking_uri(uri="http://127.0.0.1:8080")

In [12]:
# Wrap the full DataFrame as an MLflow dataset, recording "data.csv" as its source.
dataset: PandasDataset = mlflow.data.from_pandas(df, source="data.csv")

  return _dataset_source_registry.resolve(
  string_columns = trimmed_df.columns[(df.applymap(type) == str).all(0)]


# Experiment 4 (PyTorch)

In [13]:
import torch.optim as optim
import torch.nn as nn
import torch.nn.functional as F
from ray import tune, train
from ray.tune.search.basic_variant import BasicVariantGenerator
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

In [14]:
class Net(nn.Module):
    """Embedding -> Conv1d -> ReLU -> max-pool -> linear -> sigmoid classifier.

    Layout note: the convolution receives the embedding output as-is, so the
    ``max_len`` token positions act as the Conv1d input channels and the kernel
    slides along the embedding dimension.

    Args:
        vocab_size: number of rows of the embedding table.
        embedding_dim: size of each embedding vector.
        out_channels: number of convolution filters.
        kernel_size: width of the convolution kernel.
        max_len: length of every input sequence; defaults to the module-level
            ``MAX_LEN`` when omitted.

    Raises:
        ValueError: if the kernel is too wide for ``embedding_dim``.
    """

    def __init__(self, vocab_size, embedding_dim, out_channels, kernel_size, max_len=None):
        super().__init__()
        if max_len is None:
            max_len = MAX_LEN

        padding = (kernel_size - 1) // 2
        # Length of the convolution output. It equals embedding_dim for odd
        # kernels but is one shorter for even ones, so it is computed rather
        # than assumed when sizing the linear layer.
        conv_len = embedding_dim + 2 * padding - kernel_size + 1
        if conv_len < 1:
            raise ValueError(
                f"kernel_size={kernel_size} is too large for embedding_dim={embedding_dim}"
            )

        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.cnn = nn.Conv1d(in_channels=max_len,
                             out_channels=out_channels,
                             kernel_size=kernel_size,
                             padding=padding)
        self.pool = nn.MaxPool1d(kernel_size=2)
        # The pool halves (rounding down) the flattened feature vector.
        self.fc = nn.Linear((out_channels * conv_len) // 2, 1)

    def forward(self, x):
        """Return one probability per input row, shaped (batch, 1).

        ``x`` holds integer token indices with ``max_len`` columns.
        """
        embedded = self.embedding(x)                # (batch, max_len, embedding_dim)
        features = F.relu(self.cnn(embedded))       # (batch, out_channels, conv_len)
        flat = features.flatten(start_dim=1)        # (batch, out_channels * conv_len)
        # Pool over the flattened features through an explicit channel axis
        # instead of relying on MaxPool1d treating a 2-D tensor as unbatched.
        pooled = self.pool(flat.unsqueeze(1)).squeeze(1)
        return torch.sigmoid(self.fc(pooled))

In [15]:
def train_func(model, optimizer, train_loader, num_epochs=50, checkpoint_path="./model.pth"):
    """Train ``model`` with binary cross-entropy and checkpoint it after every epoch.

    Args:
        model: module whose output is a (batch, 1) tensor of probabilities.
        optimizer: optimizer already bound to ``model``'s parameters.
        train_loader: iterable of ``(inputs, labels)`` batches with float labels
            of shape (batch,); must support ``len()``.
        num_epochs: number of passes over ``train_loader``.
        checkpoint_path: file the model ``state_dict`` is written to at the end
            of each epoch (overwritten every time).

    Raises:
        ValueError: if ``train_loader`` yields no batches.
    """
    if len(train_loader) == 0:
        raise ValueError("train_loader is empty")

    # Run on whatever device the model already lives on, so inputs and weights
    # can never end up on different devices.
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device("cpu")

    criterion = nn.BCELoss()

    # Train the model
    for epoch in range(num_epochs):
        model.train()
        total_loss = 0.0
        correct = 0
        total = 0
        for inputs, labels in train_loader:
            inputs = inputs.to(device)
            # Add a trailing axis so the targets line up with the (batch, 1) outputs.
            targets = labels.to(device).unsqueeze(1)

            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, targets)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()

            predicted = (outputs > 0.5).float()
            total += targets.size(0)
            correct += (predicted == targets).sum().item()

        average_loss = total_loss / len(train_loader)
        accuracy = correct / total
        print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {average_loss:.4f}, Accuracy: {accuracy:.4f}')

        torch.save(model.state_dict(), checkpoint_path)

def test_func(model, test_loader):
    """Evaluate ``model`` on ``test_loader`` and return classification metrics.

    Outputs above 0.5 are counted as the positive class.

    Args:
        model: module whose output is a (batch, 1) tensor of probabilities.
        test_loader: iterable of ``(inputs, labels)`` batches.

    Returns:
        dict with the keys ``"accuracy"``, ``"precision"``, ``"recall"`` and ``"f1"``.
    """
    # Evaluate on the device the model already lives on.
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device("cpu")

    # Evaluate the model
    model.eval()
    y_pred = []
    y_test = []
    with torch.no_grad():
        for inputs, labels in test_loader:
            inputs, labels = inputs.to(device), labels.to(device)
            outputs = model(inputs)
            predicted = (outputs > 0.5).float()
            # reshape(-1) always gives a 1-D tensor. A bare squeeze() collapses
            # a batch of one to 0-d, whose tolist() is a float that
            # list.extend() cannot consume.
            y_pred.extend(predicted.reshape(-1).tolist())
            y_test.extend(labels.reshape(-1).tolist())

    return {
        "accuracy": accuracy_score(y_true=y_test, y_pred=y_pred),
        "precision": precision_score(y_true=y_test, y_pred=y_pred),
        "recall": recall_score(y_true=y_test, y_pred=y_pred),
        "f1": f1_score(y_true=y_test, y_pred=y_pred),
    }

In [16]:
# Hyperparameter search space: every entry is a categorical choice for Ray Tune.
search_space = dict(
    out_channels=tune.choice([32, 64, 128]),
    kernel_size=tune.choice([3, 5]),
    embedding_dim=tune.choice([50, 75, 100]),
    lr=tune.choice([1e-2, 1e-3]),
)

In [17]:
def train_sentiment_analysis(config):
    """Ray Tune trainable: fit one ``Net`` for ``config`` and report its test accuracy.

    ``config`` supplies "embedding_dim", "out_channels", "kernel_size" and "lr".
    The accuracy measured on ``test_loader`` is reported under the key "acc".
    """
    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")

    # Architecture hyperparameters go straight into the model constructor.
    architecture = {
        name: config[name]
        for name in ("embedding_dim", "out_channels", "kernel_size")
    }
    model = Net(vocab_size=len(vocab) + 1, **architecture)
    model.to(device)

    # The loss criterion is created inside train_func; only the optimizer is built here.
    learning_rate = config["lr"]
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)

    # Fit on the training split, then score on the held-out split.
    train_func(model, optimizer, train_loader)
    test_metrics = test_func(model, test_loader)

    train.report({"acc": test_metrics["accuracy"]})

In [18]:
# Reserve one GPU per trial only when CUDA is available. The training code
# already falls back to the CPU, and a GPU request on a machine without one
# would leave the trials unable to be scheduled.
trial_resources = {"gpu": 1} if torch.cuda.is_available() else {"cpu": 1}
trainable_with_gpu = tune.with_resources(train_sentiment_analysis, trial_resources)
# TODO: Create an early stopping version -> https://docs.ray.io/en/latest/tune/tutorials/tune-stopping.html
# TODO: Create the checkpoint
# Draw 25 configurations from the search space and run one trial for each.
tuner = tune.Tuner(trainable_with_gpu,
                   param_space=search_space,
                   tune_config=tune.TuneConfig(num_samples=25))
results = tuner.fit()

0,1
Current time:,2024-02-04 17:13:06
Running for:,00:01:02.76
Memory:,12.2/31.2 GiB

Trial name,status,loc,embedding_dim,kernel_size,lr,out_channels,iter,total time (s),acc
train_sentiment_analysis_a9887_00000,TERMINATED,192.168.1.105:106432,50,3,0.01,32,1,2.61052,0.754545
train_sentiment_analysis_a9887_00001,TERMINATED,192.168.1.105:106432,75,5,0.001,32,1,2.44345,0.718182
train_sentiment_analysis_a9887_00002,TERMINATED,192.168.1.105:106432,50,3,0.001,128,1,1.66826,0.738182
train_sentiment_analysis_a9887_00003,TERMINATED,192.168.1.105:106432,50,3,0.001,64,1,1.63661,0.718182
train_sentiment_analysis_a9887_00004,TERMINATED,192.168.1.105:106432,75,3,0.01,128,1,2.4876,0.494545
train_sentiment_analysis_a9887_00005,TERMINATED,192.168.1.105:106432,50,5,0.001,32,1,1.59709,0.709091
train_sentiment_analysis_a9887_00006,TERMINATED,192.168.1.105:106432,50,3,0.001,64,1,1.71176,0.716364
train_sentiment_analysis_a9887_00007,TERMINATED,192.168.1.105:106432,100,3,0.001,128,1,2.33246,0.716364
train_sentiment_analysis_a9887_00008,TERMINATED,192.168.1.105:106432,100,5,0.001,32,1,3.12177,0.721818
train_sentiment_analysis_a9887_00009,TERMINATED,192.168.1.105:106432,50,5,0.01,32,1,1.73966,0.730909


[36m(train_sentiment_analysis pid=106432)[0m Epoch [1/50], Loss: 1.3523, Accuracy: 0.4995
[36m(train_sentiment_analysis pid=106432)[0m Epoch [2/50], Loss: 0.8961, Accuracy: 0.4936
[36m(train_sentiment_analysis pid=106432)[0m Epoch [3/50], Loss: 0.7374, Accuracy: 0.5446
[36m(train_sentiment_analysis pid=106432)[0m Epoch [4/50], Loss: 0.6143, Accuracy: 0.6661
[36m(train_sentiment_analysis pid=106432)[0m Epoch [5/50], Loss: 0.5269, Accuracy: 0.7775
[36m(train_sentiment_analysis pid=106432)[0m Epoch [6/50], Loss: 0.4281, Accuracy: 0.8276
[36m(train_sentiment_analysis pid=106432)[0m Epoch [7/50], Loss: 0.3201, Accuracy: 0.8899
[36m(train_sentiment_analysis pid=106432)[0m Epoch [8/50], Loss: 0.2152, Accuracy: 0.9386
[36m(train_sentiment_analysis pid=106432)[0m Epoch [9/50], Loss: 0.1356, Accuracy: 0.9704
[36m(train_sentiment_analysis pid=106432)[0m Epoch [10/50], Loss: 0.0777, Accuracy: 0.9873
[36m(train_sentiment_analysis pid=106432)[0m Epoch [11/50], Loss: 0.0450, Acc

[36m(train_sentiment_analysis pid=106432)[0m   _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


[36m(train_sentiment_analysis pid=106432)[0m Epoch [1/50], Loss: 5.6598, Accuracy: 0.5141
[36m(train_sentiment_analysis pid=106432)[0m Epoch [2/50], Loss: 1.4325, Accuracy: 0.4809
[36m(train_sentiment_analysis pid=106432)[0m Epoch [3/50], Loss: 0.8772, Accuracy: 0.4986
[36m(train_sentiment_analysis pid=106432)[0m Epoch [4/50], Loss: 0.7357, Accuracy: 0.5259
[36m(train_sentiment_analysis pid=106432)[0m Epoch [5/50], Loss: 0.6216, Accuracy: 0.6465
[36m(train_sentiment_analysis pid=106432)[0m Epoch [6/50], Loss: 0.5101, Accuracy: 0.8258
[36m(train_sentiment_analysis pid=106432)[0m Epoch [7/50], Loss: 0.3963, Accuracy: 0.8803
[36m(train_sentiment_analysis pid=106432)[0m Epoch [8/50], Loss: 0.2821, Accuracy: 0.9222
[36m(train_sentiment_analysis pid=106432)[0m Epoch [9/50], Loss: 0.1805, Accuracy: 0.9509
[36m(train_sentiment_analysis pid=106432)[0m Epoch [10/50], Loss: 0.1059, Accuracy: 0.9759
[36m(train_sentiment_analysis pid=106432)[0m Epoch [11/50], Loss: 0.0653, Acc

2024-02-04 17:13:06,600	INFO tune.py:1042 -- Total run time: 63.04 seconds (62.75 seconds for the tuning loop).


[36m(train_sentiment_analysis pid=106432)[0m Epoch [49/50], Loss: 0.0125, Accuracy: 1.0000
[36m(train_sentiment_analysis pid=106432)[0m Epoch [50/50], Loss: 0.0121, Accuracy: 1.0000


In [19]:
# Show the tuning results as a DataFrame (reported metric, timing and config per trial).
results.get_dataframe()

Unnamed: 0,acc,timestamp,checkpoint_dir_name,done,training_iteration,trial_id,date,time_this_iter_s,time_total_s,pid,hostname,node_ip,time_since_restore,iterations_since_restore,config/out_channels,config/kernel_size,config/embedding_dim,config/lr,logdir
0,0.754545,1707077533,,False,1,a9887_00000,2024-02-04_17-12-13,2.610516,2.610516,106432,mint-desktop,192.168.1.105,2.610516,1,32,3,50,0.01,a9887_00000
1,0.718182,1707077535,,False,1,a9887_00001,2024-02-04_17-12-15,2.443449,2.443449,106432,mint-desktop,192.168.1.105,2.443449,1,32,5,75,0.001,a9887_00001
2,0.738182,1707077537,,False,1,a9887_00002,2024-02-04_17-12-17,1.668264,1.668264,106432,mint-desktop,192.168.1.105,1.668264,1,128,3,50,0.001,a9887_00002
3,0.718182,1707077538,,False,1,a9887_00003,2024-02-04_17-12-18,1.636613,1.636613,106432,mint-desktop,192.168.1.105,1.636613,1,64,3,50,0.001,a9887_00003
4,0.494545,1707077541,,False,1,a9887_00004,2024-02-04_17-12-21,2.4876,2.4876,106432,mint-desktop,192.168.1.105,2.4876,1,128,3,75,0.01,a9887_00004
5,0.709091,1707077543,,False,1,a9887_00005,2024-02-04_17-12-23,1.597095,1.597095,106432,mint-desktop,192.168.1.105,1.597095,1,32,5,50,0.001,a9887_00005
6,0.716364,1707077544,,False,1,a9887_00006,2024-02-04_17-12-24,1.71176,1.71176,106432,mint-desktop,192.168.1.105,1.71176,1,64,3,50,0.001,a9887_00006
7,0.716364,1707077547,,False,1,a9887_00007,2024-02-04_17-12-27,2.332456,2.332456,106432,mint-desktop,192.168.1.105,2.332456,1,128,3,100,0.001,a9887_00007
8,0.721818,1707077550,,False,1,a9887_00008,2024-02-04_17-12-30,3.121769,3.121769,106432,mint-desktop,192.168.1.105,3.121769,1,32,5,100,0.001,a9887_00008
9,0.730909,1707077552,,False,1,a9887_00009,2024-02-04_17-12-32,1.739657,1.739657,106432,mint-desktop,192.168.1.105,1.739657,1,32,5,50,0.01,a9887_00009


In [20]:
experiment_name = "sentiment_analysis_torch_cnn"

experiment_tags = {
    "nlp.framework": "PyTorch",
    "nlp.encoding": "Tokenizer",
    "nlp.model": "CNN Network",
    "nlp.task": "Sentiment Analysis"
}

# create_experiment fails when the name is already taken, which made this cell
# impossible to re-run. Reuse the existing experiment instead (its tags are
# left as they are).
existing_experiment = mlflow.get_experiment_by_name(experiment_name)
if existing_experiment is None:
    experiment_id = mlflow.create_experiment(name=experiment_name,
                                             artifact_location="mlartifacts",
                                             tags=experiment_tags)
else:
    experiment_id = existing_experiment.experiment_id

# Show the experiment ID as the cell output.
experiment_id

'868307336683898676'

In [21]:
mlflow.set_experiment(experiment_name=experiment_name)  # The experiment ID could be used instead

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Pick the trial with the highest reported accuracy (looked up once).
best_result = results.get_best_result("acc", mode="max")
best_hps = best_result.config
logdir = best_result.path

# Rebuild the network with the winning hyperparameters and restore its weights.
model = Net(vocab_size=len(vocab) + 1,
            embedding_dim=best_hps["embedding_dim"],
            out_channels=best_hps["out_channels"],
            kernel_size=best_hps["kernel_size"])
model.to(device)
# NOTE(review): assumes the trial wrote "model.pth" into its own directory,
# i.e. Ray ran the trial with that directory as working directory; confirm.
# map_location keeps the load working when the weights were saved on another device.
state_dict = torch.load(os.path.join(logdir, "model.pth"), map_location=device)
model.load_state_dict(state_dict)
metrics = test_func(model=model, test_loader=test_loader)

run_name = "_".join([f"{k}_{v}" for k, v in best_hps.items()])

# The model is only usable together with the vocabulary that encoded its
# inputs, so persist the vocabulary and log it next to the model.
vocab_path = "vocab.pth"
torch.save(vocab, vocab_path)

with mlflow.start_run(run_name=run_name):
    # Log the hyperparameters
    mlflow.log_params(best_hps)

    # Log the metrics
    mlflow.log_metrics(metrics)

    # Log the dataset
    mlflow.log_input(dataset, context="training")

    # Log the model; a handful of rows is enough as an input example, the
    # whole training matrix is not needed.
    mlflow.pytorch.log_model(pytorch_model=model,
                             artifact_path="mlartifacts",
                             input_example=X_train_encoded[:5].numpy())

    mlflow.log_artifact(vocab_path, artifact_path="mlartifacts")

    # "tokenizer.pkl" is not produced anywhere in this notebook; log it only
    # when the file is actually present instead of failing the run.
    if os.path.exists("tokenizer.pkl"):
        mlflow.log_artifact("tokenizer.pkl", artifact_path="mlartifacts")



This Jupyter Notebook was **created by BrenoAV**. For any inquiries or feedback, please feel free to create an issue on [GitHub](https://github.com/BrenoAV/NLP-Sentiment-Analysis/issues) 📣.