# Imports

In [1]:
from numpy import bincount
from torch.optim import AdamW
import matplotlib.pyplot as plt
from tqdm import tqdm as progress
from torch.nn.utils import clip_grad_norm_ 
from pandas import read_csv, DataFrame, concat
from torch.nn.functional import relu, max_pool1d
from torch import IntTensor, FloatTensor, no_grad
from torch import device, cuda, save, load, sigmoid  
from sklearn.model_selection import train_test_split
from torch.nn import ModuleList, Embedding, Dropout, Conv1d, GRU, Linear, BCELoss
from torch.utils.data import TensorDataset, DataLoader, WeightedRandomSampler, SequentialSampler
from sklearn.metrics import accuracy_score, precision_score, recall_score, precision_recall_curve, auc, confusion_matrix, ConfusionMatrixDisplay

# Constant Variables
batch_size = 256  # mini-batch size shared by the train / valid / test loaders
# Prefer CUDA when it is usable, otherwise run on the CPU.
# NOTE: this rebinds the imported `device` constructor to the selected device instance.
device = device("cuda") if cuda.is_available() else device("cpu")

# Loading SMS Data

In [2]:
# Read the SMS corpus and discard its "Unnamed: 0" column in a single chained expression
df = read_csv("../../data/sms.csv").drop(columns=["Unnamed: 0"])
# Preview five random rows
df.sample(5)

Unnamed: 0,text,type
1618,"[3, 2772, 893, 1549, 3347, 531, 123, 1315, 449...",0
878,"[57, 1038, 1039, 325, 3, 218, 651, 57, 16, 429...",1
4683,"[622, 554, 624, 15, 825, 283, 622, 15, 825, 28...",0
3337,"[156, 15, 57, 1652, 463, 148, 17, 17, 0, 0, 0,...",0
2139,"[660, 211, 539, 873, 736, 159, 17, 17, 0, 0, 0...",0


# Preparing SMS Data


In [3]:
# Load & Prepare Data
def _parse_token_ids(sms):
    """Return the token ids of one SMS as a list of ints.

    Accepts the raw CSV cell (a string such as "[3, 27, 0]") as well as an
    already-parsed sequence, so re-running this cell after `df["text"]` has
    been converted does not fail.
    """
    if isinstance(sms, str):
        # Tolerates "[]" and irregular spacing around the commas.
        return [int(token) for token in sms.strip().strip("[]").split(",") if token.strip()]
    return [int(token) for token in sms]


df["text"] = df["text"].apply(_parse_token_ids)
df["type"] = df["type"].apply(int)

# Train / Valid / Test Split (70% / 15% / 15%, stratified on the label, fixed seed)
x_train, x_test, y_train, y_test = train_test_split(
    df["text"], df["type"], test_size=0.30, stratify=df["type"], random_state=2022
)
x_valid, x_test, y_valid, y_test = train_test_split(
    x_test, y_test, test_size=0.50, stratify=y_test, random_state=2022
)

# Training weight: every training sample is weighted by the inverse frequency of its class
train_weight = (1. / bincount(y_train))[y_train.to_numpy()]

# Train / Valid / Test TensorDataset
train_dataset = TensorDataset(IntTensor(x_train.to_list()), FloatTensor(y_train.to_list()))
valid_dataset = TensorDataset(IntTensor(x_valid.to_list()), FloatTensor(y_valid.to_list()))
test_dataset = TensorDataset(IntTensor(x_test.to_list()), FloatTensor(y_test.to_list()))

# Train / Valid / Test Loaders
# The training loader resamples according to `train_weight`; valid / test are read in order.
# NOTE(review): the sampler draws len(train_weight)-1 samples per epoch, one fewer than
# the training set size — confirm the -1 is intentional before changing it.
train_loader = DataLoader(train_dataset, sampler=WeightedRandomSampler(train_weight, len(train_weight)-1), batch_size=batch_size)
valid_loader = DataLoader(valid_dataset, sampler=SequentialSampler(valid_dataset), batch_size=batch_size)
test_loader = DataLoader(test_dataset, sampler=SequentialSampler(test_dataset), batch_size=batch_size)

# Model

In [4]:
class SMSClassifier(ModuleList):
    """CNN + bidirectional-GRU binary classifier over integer-encoded SMS messages.

    The network embeds each token id, runs three Conv1d/ReLU/max-pool stages,
    three bidirectional GRU/ReLU/max-pool stages and a three-layer dense head
    that ends in a sigmoid, so `forward` returns one probability per message.

    `conv1` has 300 input channels and is applied directly to the embedding
    output, so every input sequence must contain exactly 300 token ids (the
    token axis is used as the convolution channel axis).

    NOTE(review): the class derives from ModuleList although it is used as a
    plain module; the base class is left unchanged here.
    """

    def __init__(self, path):
        """Build all layers.

        Args:
            path: File path used by `save` / `load` for the state-dict checkpoint.
        """
        super(SMSClassifier, self).__init__()
        self.path = path
        self.embedding = Embedding(num_embeddings=178348, embedding_dim=8, padding_idx=0)
        self.dropout1 = Dropout(0.5)
        self.conv1 = Conv1d(300, 200, kernel_size=3, stride=2, padding=1)
        self.dropout2 = Dropout(0.5)
        self.conv2 = Conv1d(200, 100, kernel_size=3, stride=2, padding=1)
        self.dropout3 = Dropout(0.5)
        self.conv3 = Conv1d(100, 64, kernel_size=3, stride=2, padding=1)
        self.dropout4 = Dropout(0.5)
        self.gru1 = GRU(input_size=1, hidden_size=8, batch_first=True, bidirectional=True)
        self.dropout5 = Dropout(0.5)
        self.gru2 = GRU(input_size=8, hidden_size=16, batch_first=True, bidirectional=True)
        self.dropout6 = Dropout(0.5)
        self.gru3 = GRU(input_size=16, hidden_size=32, batch_first=True, bidirectional=True)
        self.dropout7 = Dropout(0.5)
        self.fc1 = Linear(64, 32)
        self.fc2 = Linear(32, 16)
        self.fc3 = Linear(16, 1)

    def forward(self, sentence):
        """Return the predicted probability for every message in the batch.

        Args:
            sentence: Integer tensor of token ids, shape (batch, 300).

        Returns:
            Tensor of shape (batch,) with values in [0, 1].
        """
        sentence = self.embedding(sentence)
        sentence = self.dropout1(sentence)
        # Convolutional stages: conv -> ReLU -> max-pool -> dropout
        sentence = self.conv1(sentence)
        sentence = relu(sentence, inplace=False)
        sentence = max_pool1d(sentence, kernel_size=3, stride=2, padding=1)
        sentence = self.dropout2(sentence)
        sentence = self.conv2(sentence)
        sentence = relu(sentence, inplace=False)
        sentence = max_pool1d(sentence, kernel_size=3, stride=2, padding=1)
        sentence = self.dropout3(sentence)
        sentence = self.conv3(sentence)
        sentence = relu(sentence, inplace=False)
        sentence = max_pool1d(sentence, kernel_size=3, stride=2, padding=1)
        sentence = self.dropout4(sentence)
        # Recurrent stages. A GRU returns (output, h_n); only the per-step output is used.
        # The pooling halves the feature axis so it matches the next GRU's input_size.
        sentence, _ = self.gru1(sentence)
        sentence = relu(sentence, inplace=False)
        sentence = max_pool1d(sentence, kernel_size=3, stride=2, padding=1)
        sentence = self.dropout5(sentence)
        sentence, _ = self.gru2(sentence)
        sentence = relu(sentence, inplace=False)
        sentence = max_pool1d(sentence, kernel_size=3, stride=2, padding=1)
        sentence = self.dropout6(sentence)
        sentence, _ = self.gru3(sentence)
        sentence = relu(sentence, inplace=False)
        sentence = max_pool1d(sentence, kernel_size=32, stride=64, padding=1)
        # Dense head
        sentence = sentence.view(-1, 64)
        sentence = self.dropout7(sentence)
        sentence = self.fc1(sentence)
        sentence = relu(sentence, inplace=True)
        sentence = self.fc2(sentence)
        sentence = relu(sentence, inplace=True)
        # Drop only the trailing feature axis: a bare squeeze() would also remove the
        # batch axis of a single-sample batch and break the loss and `evaluate`.
        sentence = self.fc3(sentence).squeeze(-1)
        return sigmoid(sentence)

    def fit(self, loader, epochs, optim, loss_fn):
        """Train the model, checkpoint the best validation epoch and plot the losses.

        Args:
            loader: Sequence whose item 0 is the training loader and item 1 the
                validation loader.
            epochs: Number of epochs to run.
            optim: Optimizer that updates this model's parameters.
            loss_fn: Callable `loss_fn(prediction, target)` returning a scalar tensor.
        """
        # Index 0 is a placeholder of 1 so that list index == 1-based epoch number
        # and both curves start at the top of the plot.
        train_loss, valid_loss = [1], [1]
        best_loss, best_epoch = float("inf"), 0
        for epoch in (tracker := progress(range(epochs))):
            tracker.set_description(f"Epoch #{epoch+1}")
            # Train Step
            train_loss.append(self.train_step(loader[0], optim, loss_fn))
            # Eval Step
            current_loss = self.eval_step(loader[1], loss_fn)
            valid_loss.append(current_loss)
            # Saving Model If Performance Improves (ties keep the most recent epoch)
            if current_loss <= best_loss:
                best_loss, best_epoch = current_loss, epoch + 1
                self.save()
            # Updating tracker
            if epoch == epochs-1:
                tracker.set_description(f"Best Epoch #{best_epoch}")
                tracker.set_postfix_str(f"Training Loss = {train_loss[best_epoch]}, Validiation Loss = {valid_loss[best_epoch]}")
            else:
                tracker.set_postfix_str(f"Training Loss = {train_loss[-1]}, Validiation Loss = {valid_loss[-1]}")

        # Plotting Results
        fig, ax = plt.subplots(figsize=(15, 5), dpi=300)
        ax.set_title("Training Loss (Blue) vs Validation Loss (Red)")
        ax.plot(train_loss, "b")
        ax.plot(valid_loss, "r")
        ax.set_xlabel("Epoch")
        ax.set_ylabel("Loss")
        ax.set_xlim((0, len(train_loss)-1))
        ax.set_ylim((0, 1))

    def train_step(self, loader, optim, loss_fn):
        """Run one training epoch and return the mean batch loss.

        Gradients are clipped to an L1 norm of 1.0 before each optimizer step.
        """
        losses = []
        self.train()
        for batch in loader:
            self.zero_grad()
            # Training Step
            prediction = self(batch[0].to(device))
            loss = loss_fn(prediction, batch[1].float().to(device))
            # Recording Training Result
            losses.append(loss.item())
            # Refresh Step
            loss.backward()
            clip_grad_norm_(self.parameters(), max_norm=1.0, norm_type=1)
            optim.step()
        return sum(losses)/len(losses)

    def eval_step(self, loader, loss_fn):
        """Return the mean batch loss over `loader` in eval mode, without gradients."""
        losses = []
        self.eval()
        with no_grad():
            for batch in loader:
                # Evaluation Step
                prediction = self(batch[0].to(device))
                loss = loss_fn(prediction, batch[1].float().to(device))
                # Recording Evaluation Result
                losses.append(loss.item())
        return sum(losses)/len(losses)

    def evaluate(self, loader):
        """Load the saved checkpoint, score `loader` and report the metrics.

        Prints the metrics, shows the confusion matrix and returns
        `(accuracy, precision, recall, pr_auc)`. Hard labels use a 0.5
        threshold; the precision-recall curve is built from the predicted
        probabilities so that the area under it is meaningful.
        """
        self.load()
        y_true, y_score = [], []
        self.eval()
        with no_grad():
            for batch in loader:
                scores = self(batch[0].to(device)).cpu().reshape(-1)
                y_score.extend(scores.tolist())
                y_true.extend(batch[1].int().tolist())
        y_pred = [int(score > 0.5) for score in y_score]
        accuracy = accuracy_score(y_true, y_pred)
        precision = precision_score(y_true, y_pred, zero_division=0)
        recall = recall_score(y_true, y_pred, zero_division=0)
        precision_curve, recall_curve, _ = precision_recall_curve(y_true, y_score)
        pr_auc = auc(recall_curve, precision_curve)
        print(f"Accuracy: {accuracy}\nPrecision: {precision}\nRecall: {recall}\nPR_AUC: {pr_auc}")
        ConfusionMatrixDisplay(confusion_matrix=confusion_matrix(y_true, y_pred), display_labels=[0, 1]).plot()
        plt.show()
        return accuracy, precision, recall, pr_auc

    def saveResults(self, lib, nam, typ, acc, pre, rec, pr):
        """Prepend one metrics row to ../../data/results.csv.

        Args:
            lib: Value for the "Library" column.
            nam: Value for the "Model" column.
            typ: Value for the "Type" column.
            acc, pre, rec, pr: Accuracy, precision, recall and PR-AUC values.

        The results file must already exist; it is rewritten in place.
        """
        results = read_csv("../../data/results.csv")
        new_row = DataFrame([[lib, nam, typ, acc, pre, rec, pr]],
                            columns=["Library", "Model", "Type", "Accuracy", "Precision", "Recall", "PR_AUC"])
        results = concat([new_row, results])
        # to_csv below writes the index, which comes back as "Unnamed: 0" on the next read.
        if "Unnamed: 0" in results:
            results = results.drop(columns=["Unnamed: 0"])
        results.to_csv("../../data/results.csv")

    def save(self):
        """Write the model's state dict to `self.path`."""
        save(self.state_dict(), self.path)

    def load(self):
        """Restore the state dict stored at `self.path`.

        Tensors are mapped onto the active `device`, so a checkpoint written on
        a GPU can still be loaded on a CPU-only machine.
        """
        self.load_state_dict(load(self.path, map_location=device))

# Model Summary

In [5]:
# Build the classifier, move its parameters to the selected device, then print the layer summary
model = SMSClassifier(path="../../models/cnn_gru.pt")
model = model.to(device)
print(model)

SMSClassifier(
  (embedding): Embedding(178348, 8, padding_idx=0)
  (dropout1): Dropout(p=0.5, inplace=False)
  (conv1): Conv1d(300, 200, kernel_size=(3,), stride=(2,), padding=(1,))
  (dropout2): Dropout(p=0.5, inplace=False)
  (conv2): Conv1d(200, 100, kernel_size=(3,), stride=(2,), padding=(1,))
  (dropout3): Dropout(p=0.5, inplace=False)
  (conv3): Conv1d(100, 64, kernel_size=(3,), stride=(2,), padding=(1,))
  (dropout4): Dropout(p=0.5, inplace=False)
  (gru1): GRU(1, 8, batch_first=True, bidirectional=True)
  (dropout5): Dropout(p=0.5, inplace=False)
  (gru2): GRU(8, 16, batch_first=True, bidirectional=True)
  (dropout6): Dropout(p=0.5, inplace=False)
  (gru3): GRU(16, 32, batch_first=True, bidirectional=True)
  (dropout7): Dropout(p=0.5, inplace=False)
  (fc1): Linear(in_features=64, out_features=32, bias=True)
  (fc2): Linear(in_features=32, out_features=16, bias=True)
  (fc3): Linear(in_features=16, out_features=1, bias=True)
)


# Model Training

In [None]:
# Train for 300 epochs with AdamW (lr=1e-3) on binary cross-entropy
loaders = [train_loader, valid_loader]
optimizer = AdamW(model.parameters(), lr=1e-3)
criterion = BCELoss()
model.fit(loaders, 300, optimizer, criterion)

Epoch #245:  81% 244/300 [01:49<00:25,  2.23it/s, Training Loss = 0.013033062444264942, Validiation Loss = 0.11628417391330004]

# Model Evaluation

In [None]:
# Score the training split in its natural order and class balance. `train_loader`
# draws a weighted random resample (with replacement), so metrics computed through
# it change from run to run and are not comparable with the valid / test metrics.
train_eval_loader = DataLoader(train_dataset, sampler=SequentialSampler(train_dataset), batch_size=batch_size)
acc, pre, re, pr = model.evaluate(train_eval_loader)
model.saveResults("pytorch", "cnn_gru", "train", acc, pre, re, pr)

In [None]:
# Metrics on the validation split, recorded in the shared results file
valid_metrics = model.evaluate(valid_loader)
acc, pre, re, pr = valid_metrics
model.saveResults("pytorch", "cnn_gru", "valid", *valid_metrics)

In [None]:
# Metrics on the held-out test split, recorded in the shared results file
test_metrics = model.evaluate(test_loader)
acc, pre, re, pr = test_metrics
model.saveResults("pytorch", "cnn_gru", "test", *test_metrics)