# PyTorch

In [1]:
# TODO: Random Search

# Load Dataset

In [1]:
import pandas as pd

In [2]:
df = pd.read_csv("data.csv", sep="\t", encoding="utf-8")

In [3]:
df = df[df["source"] == "yelp"]

In [4]:
df

Unnamed: 0,sentence,target,source
1748,Wow... Loved this place.,1,yelp
1749,Crust is not good.,0,yelp
1750,Not tasty and the texture was just nasty.,0,yelp
1751,Stopped by during the late May bank holiday of...,1,yelp
1752,The selection on the menu was great and so wer...,1,yelp
...,...,...,...
2743,I think food should have flavor and texture an...,0,yelp
2744,Appetite instantly gone.,0,yelp
2745,Overall I was not impressed and would not go b...,0,yelp
2746,"The whole experience was underwhelming, and I ...",0,yelp


## Split dataset

In [5]:
from sklearn.model_selection import train_test_split

In [6]:
X_train, X_test, y_train, y_test = train_test_split(df["sentence"], df["target"], test_size=0.2, random_state=123)

In [7]:
X_train.shape, y_train.shape

((800,), (800,))

In [8]:
X_test.shape, y_test.shape

((200,), (200,))

In [9]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

NUM_WORDS = 5000
MAX_LEN = 100

tokenizer = Tokenizer(num_words=NUM_WORDS)
tokenizer.fit_on_texts(X_train)  # Important be only the train data!!!

X_train_encoded = tokenizer.texts_to_sequences(X_train)
X_test_encoded = tokenizer.texts_to_sequences(X_test)

X_train_encoded = pad_sequences(X_train_encoded, maxlen=MAX_LEN, padding="post")
X_test_encoded = pad_sequences(X_test_encoded, maxlen=MAX_LEN, padding="post")




In [10]:
import torch
from torch.utils.data import TensorDataset, DataLoader

batch_size = 256
X_train_tensor = torch.tensor(X_train_encoded, dtype=torch.long)
X_test_tensor = torch.tensor(X_test_encoded, dtype=torch.long)
y_train_tensor = torch.tensor(y_train.values, dtype=torch.float32)
y_test_tensor = torch.tensor(y_test.values, dtype=torch.float32)
train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
test_dataset = TensorDataset(X_test_tensor, y_test_tensor)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size)

# MLFlow

In [11]:
import mlflow
from mlflow.data.pandas_dataset import PandasDataset

mlflow.set_tracking_uri(uri="http://127.0.0.1:8080")

In [12]:
dataset: PandasDataset = mlflow.data.from_pandas(df, source="data.csv")

  return _dataset_source_registry.resolve(


# Experiment 4 (PyTorch)

In [13]:
import torch.optim as optim
import torch.nn as nn
import torch.nn.functional as F
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

In [14]:
class Net(nn.Module):
    def __init__(self, vocab_size, out_channels, kernel_size):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, 50)
        self.cnn = nn.Conv1d(in_channels=MAX_LEN, 
                             out_channels=out_channels,
                             kernel_size=kernel_size,
                             padding=int((kernel_size - 1)/2))
        self.pool = nn.MaxPool1d(kernel_size=2)
        self.fc = nn.Linear(int((out_channels * 50)/2), 1)

    def forward(self, x):
        embedded = self.embedding(x)
        x = F.relu(self.cnn(embedded)).view(embedded.size(0), -1)
        x = self.pool(x)
        output = self.fc(x)
        return torch.sigmoid(output)

In [15]:
experiment_name = "sentiment_analysis_torch_cnn"

experiment_tags = {
    "nlp.framework": "PyTorch",
    "nlp.encoding": "Tokenizer",
    "nlp.model": "CNN Network",
    "nlp.task": "Sentiment Analysis"
}

mlflow.create_experiment(name=experiment_name,
                         artifact_location="mlartifacts",
                         tags=experiment_tags)

'737991649699542434'

In [16]:
mlflow.set_experiment(experiment_name=experiment_name)  # It could be use the ID too

params_list = [
    {
        "out_channels": 32,
        "kernel_size": 5,
        "epochs": 10
    },
    {
        "out_channels": 128,
        "kernel_size": 3,
        "epochs": 40
    },
    {
        "out_channels": 128,
        "kernel_size": 5,
        "epochs": 60
    }
]

for i, params in enumerate(params_list):

    model = Net(vocab_size=len(tokenizer.index_word) + 1, 
                out_channels=params["out_channels"], 
                kernel_size=params["kernel_size"])
    print(model)
    criterion = nn.BCELoss()
    optimizer = optim.Adam(model.parameters(), lr=0.001)

    # Train the model
    num_epochs = params["epochs"]
    for epoch in range(num_epochs):
        model.train()
        total_loss = 0
        correct = 0
        total = 0
        for inputs, labels in train_loader:
            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, labels.unsqueeze(1))
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
            
            predicted = (outputs > 0.5).float()
            total += labels.size(0)
            correct += (predicted == labels.unsqueeze(1)).sum().item()
            
        average_loss = total_loss / len(train_loader)
        accuracy = correct / total
        print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {average_loss:.4f}, Accuracy: {accuracy:.4f}')

    # Evaluate the model
    model.eval()
    y_pred = []
    y_test = []
    with torch.no_grad():
        for inputs, labels in test_loader:
            outputs = model(inputs)
            predicted = (outputs > 0.5).float()
            y_pred.extend(predicted.squeeze().tolist())
            y_test.extend(labels.tolist())
    
    accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
    precision = precision_score(y_true=y_test, y_pred=y_pred)
    recall = recall_score(y_true=y_test, y_pred=y_pred)
    f1 = f1_score(y_true=y_test, y_pred=y_pred)
    
    metrics = {
        "accuracy": accuracy,
        "precision": precision,
        "recall": recall,
        "f1": f1
    }
    
    run_name = "_".join([f"{k}_{v}" for k, v in params.items()])
    
    with mlflow.start_run(run_name=run_name):
        # Log the hyperparameters
        mlflow.log_params(params)
    
        # Log the metrics
        mlflow.log_metrics(metrics)

        # Log the dataset
        mlflow.log_input(dataset, context="training")

    
        # Log the model
        mlflow.pytorch.save_model(pytorch_model=model, 
                                  path=f"models/yelp_model_torch_{i}", 
                                  input_example=X_train_encoded)

        mlflow.log_artifact(f"models/yelp_model_torch_{i}", artifact_path="mlartifacts")

Net(
  (embedding): Embedding(1807, 50)
  (cnn): Conv1d(100, 32, kernel_size=(5,), stride=(1,), padding=(2,))
  (pool): MaxPool1d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (fc): Linear(in_features=800, out_features=1, bias=True)
)
Epoch [1/10], Loss: 0.7173, Accuracy: 0.5162
Epoch [2/10], Loss: 0.7025, Accuracy: 0.5288
Epoch [3/10], Loss: 0.6815, Accuracy: 0.5637
Epoch [4/10], Loss: 0.6811, Accuracy: 0.5375
Epoch [5/10], Loss: 0.6731, Accuracy: 0.6425
Epoch [6/10], Loss: 0.6605, Accuracy: 0.6650
Epoch [7/10], Loss: 0.6534, Accuracy: 0.6600
Epoch [8/10], Loss: 0.6568, Accuracy: 0.5212
Epoch [9/10], Loss: 0.6488, Accuracy: 0.6763
Epoch [10/10], Loss: 0.6501, Accuracy: 0.6475


  return _infer_schema(self._df)


Net(
  (embedding): Embedding(1807, 50)
  (cnn): Conv1d(100, 128, kernel_size=(3,), stride=(1,), padding=(1,))
  (pool): MaxPool1d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (fc): Linear(in_features=3200, out_features=1, bias=True)
)
Epoch [1/40], Loss: 0.8121, Accuracy: 0.4888
Epoch [2/40], Loss: 0.7303, Accuracy: 0.5162
Epoch [3/40], Loss: 0.6782, Accuracy: 0.5400
Epoch [4/40], Loss: 0.6932, Accuracy: 0.5175
Epoch [5/40], Loss: 0.6708, Accuracy: 0.5725
Epoch [6/40], Loss: 0.6597, Accuracy: 0.6125
Epoch [7/40], Loss: 0.6537, Accuracy: 0.6275
Epoch [8/40], Loss: 0.6437, Accuracy: 0.5850
Epoch [9/40], Loss: 0.6217, Accuracy: 0.7225
Epoch [10/40], Loss: 0.6270, Accuracy: 0.6850
Epoch [11/40], Loss: 0.6160, Accuracy: 0.7000
Epoch [12/40], Loss: 0.6061, Accuracy: 0.7350
Epoch [13/40], Loss: 0.5933, Accuracy: 0.7100
Epoch [14/40], Loss: 0.6060, Accuracy: 0.7137
Epoch [15/40], Loss: 0.5798, Accuracy: 0.7400
Epoch [16/40], Loss: 0.5482, Accuracy: 0.8013
Epoch [17/40], 