# Clean.py

In [1]:
import pandas as pd
from transformers import AutoTokenizer
import torch.nn.functional as F
import nltk
import string
import os
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
nltk.download('punkt')
nltk.download('stopwords')
import json


tokenizer = AutoTokenizer.from_pretrained("sbcBI/sentiment_analysis_model")
tokenizer_path = os.path.join("artifact", "tokenizer")
tokenizer.save_pretrained(tokenizer_path)
# model = AutoModelForSequenceClassification.from_pretrained("sbcBI/sentiment_analysis_model", num_labels = 9, ignore_mismatched_sizes=True, problem_type="multi_label_classification")


json_data = pd.read_json("primate_dataset.json")

def clean_text(data):
  result = []
  for text in data:
      text = text.lower()
      text_p = "".join([char for char in text if char not in string.punctuation])
      words = word_tokenize(text_p)
      stop_words = stopwords.words('english')
      filtered_words = [word for word in words if word not in stop_words]
      porter = PorterStemmer()
      final = [porter.stem(word) for word in filtered_words]
      final = " ".join([porter.stem(word) for word in filtered_words])
      result.append(final)
  return result

def clean_annotations(data):
    Y = []
    for row in data:
        # print (row)
        individual = []
        for r in row:
            if r[1]=='yes':
                individual.append(1)
            else:
                individual.append(0)
        Y.append(individual)
    return Y


Y = clean_annotations(json_data['annotations'])
X = clean_text(json_data['post_text'])

print (len(X))
print (len(Y))


def split_string(text):
    words = text.split(" ")
    total_words = len(words)
    midpoint_index = total_words // 2
    first_half = ' '.join(words[:midpoint_index])
    second_half = ' '.join(words[midpoint_index:])
    return first_half, second_half


while (True):
    print ("================================")
    tokenized_texts = tokenizer(X, padding="max_length", truncation=True, truncation_strategy='only_last')
    print("tokenization done")
    indexes = [] # indexes to remove
    for i, t in enumerate(tokenized_texts['input_ids']):
      if t[-1] != 0:
        indexes.append(i)

    print(f"Indexes to remove -> {len(indexes)}")
    if len(indexes) == 0:
        break

    X_to_break = []
    y_to_break = []
    for ind in indexes:
        X_to_break.append(X[ind])
        y_to_break.append(Y[ind])

    print (f"X_to_break -> {len(X_to_break)}")
    new_x = []
    new_y = []
    for i, x in enumerate(X_to_break):
        a, b = split_string(x)
        new_x.append(a)
        new_x.append(b)
        new_y.append(y_to_break[i])
        new_y.append(y_to_break[i])
    print (f"new_x ->{len(new_x)}")

    # deletion from original data
    index_to_remove = indexes.copy()
    index_to_remove.sort(reverse=True)
    for ind in index_to_remove:
        del X[ind]
        del Y[ind]

    X = X + new_x
    Y = Y + new_y
    print(len(X))
    print(len(Y))

print("Final")
print(len(X))
print(len(Y))
tokenized_texts = tokenizer(X, padding="max_length", truncation=True, truncation_strategy='only_last')

overflowed = []
for i, t in enumerate(tokenized_texts['input_ids']):
    if (t[-1]!=0):
        overflowed.append(i)

print (len(overflowed))





# Define file paths
file_path1 = os.path.join("artifact", "inputs.json")
file_path2 = os.path.join("artifact", "outputs.json")

# Save list1 to JSON
with open(file_path1, "w") as f:
    json.dump(X, f)

# Save list2 to JSON
with open(file_path2, "w") as f:
    json.dump(Y, f)

print("Lists saved as JSON files.")





[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/333 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

2003
2003
tokenization done
Indexes to remove -> 74
X_to_break -> 74
new_x ->148
2077
2077
tokenization done
Indexes to remove -> 14
X_to_break -> 14
new_x ->28
2091
2091
tokenization done
Indexes to remove -> 0
Final
2091
2091
0
Lists saved as JSON files.


# train_test_split_model.py

In [2]:
import json
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
import os
import json
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


file_path1 = os.path.join("artifact", "inputs.json")
file_path2 = os.path.join("artifact", "outputs.json")


with open(file_path1, "r") as f:
    X = json.load(f)


with open(file_path2, "r") as f:
    Y = json.load(f)


model = AutoModelForSequenceClassification.from_pretrained("sbcBI/sentiment_analysis_model", num_labels = 9, ignore_mismatched_sizes=True, problem_type="multi_label_classification")
tokenizer_path = os.path.join("artifact", "tokenizer")
tokenizer = AutoTokenizer.from_pretrained(tokenizer_path)

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

X_train_tokenized = tokenizer(X_train, padding="max_length", truncation=True, return_tensors="pt")
X_test_tokenized = tokenizer(X_test, padding="max_length", truncation=True, return_tensors="pt")
Y_train_tensor = torch.tensor(Y_train)
Y_test_tensor = torch.tensor(Y_test)


torch.save(X_test_tokenized, os.path.join('artifact', 'X_test_tokenized.pt'))
torch.save(Y_test_tensor, os.path.join('artifact', 'Y_test_tensor.pt'))
torch.save(X_train_tokenized, os.path.join('artifact', 'X_train_tokenized.pt'))
torch.save(Y_train_tensor, os.path.join('artifact', 'Y_train_tensor.pt'))


model.save_pretrained(os.path.join('artifact', "downloaded_model"))


print("Done splitting the data and downloaded the model")

config.json:   0%|          | 0.00/769 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at sbcBI/sentiment_analysis_model and are newly initialized because the shapes did not match:
- classifier.weight: found shape torch.Size([3, 768]) in the checkpoint and torch.Size([9, 768]) in the model instantiated
- classifier.bias: found shape torch.Size([3]) in the checkpoint and torch.Size([9]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Done splitting the data and downloaded the model


# train.py

In [6]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
from torch.utils.data import DataLoader, TensorDataset
import os
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
from tqdm import tqdm



tokenizer_path = os.path.join("artifact", "tokenizer")
tokenizer = AutoTokenizer.from_pretrained(tokenizer_path)

model = AutoModelForSequenceClassification.from_pretrained(os.path.join("artifact", "downloaded_model"))

X_train_tokenized = torch.load(os.path.join("artifact", 'X_train_tokenized.pt')).to(device)
Y_train_tensor = torch.load(os.path.join("artifact", 'Y_train_tensor.pt')).to(device)
model.to(device)


optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)
dataset_train = TensorDataset(X_train_tokenized.input_ids, X_train_tokenized.attention_mask, Y_train_tensor.float())
train_dataloader = DataLoader(dataset_train, batch_size=8, shuffle=True)

print("Done loading model")
num_epochs = 1
for epoch in range(num_epochs):
    model.train()
    total_loss = 0.0
    print(f"Running Epoch Number -> {epoch}")
    progress_bar = tqdm(enumerate(train_dataloader), total=len(train_dataloader), desc="Training")
    for step, batch in progress_bar:
        optimizer.zero_grad()
        input_ids, attention_mask, labels = batch[0].to(device), batch[1].to(device), batch[2].to(device)
        output = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = output.loss
        total_loss += loss.item()
        loss.backward()
        optimizer.step()
    avg_train_loss = total_loss / len(train_dataloader)
    print(f'Epoch {epoch + 1}/{num_epochs}, Average Training Loss: {avg_train_loss:.4f}')

model.save_pretrained(os.path.join("artifact", "trained_model"))

Done loading model
Running Epoch Number -> 0


Training: 100%|██████████| 209/209 [01:19<00:00,  2.61it/s]


Epoch 1/1, Average Training Loss: 0.5219


# Quantized

In [7]:
import torch
import os
from torch.quantization import quantize_dynamic
from transformers import AutoModelForSequenceClassification, AutoTokenizer
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = AutoModelForSequenceClassification.from_pretrained(os.path.join("artifact", "trained_model"))

quantized_model = quantize_dynamic(model, {torch.nn.Linear}, dtype=torch.qint8)

input_ids = torch.randint(0, 1000, size=(1, 512), dtype=torch.long)
attention_mask = torch.randint(0, 2, size=(1, 512), dtype=torch.long)



dummy_input = (input_ids, attention_mask)
traced_model = torch.jit.trace(quantized_model, dummy_input, strict=False)
torch.jit.save(traced_model, os.path.join("artifact", "quantized_model.pt"))

  mask, torch.tensor(torch.finfo(scores.dtype).min)


# evaluate.py

In [14]:


from transformers import AutoModelForSequenceClassification
import torch
from torch.utils.data import DataLoader, TensorDataset
import os
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
import warnings
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
import numpy as np


warnings.filterwarnings("ignore")
model_untrained = AutoModelForSequenceClassification.from_pretrained(os.path.join("artifact", "downloaded_model")).to(device)
model_unquantized = AutoModelForSequenceClassification.from_pretrained(os.path.join("artifact", "trained_model")).to(device)

model_quantized = torch.jit.load(os.path.join("artifact", "quantized_model.pt"))

models = {
    "Untrained_Model": model_untrained,
    "UnQuantized_Model": model_unquantized,
    "Quantized_Model": model_quantized
}
X_test_tokenized = torch.load(os.path.join('artifact', 'X_test_tokenized.pt')).to(device)
Y_test_tensor = torch.load(os.path.join('artifact', 'Y_test_tensor.pt')).to(device)
dataset_val = TensorDataset(X_test_tokenized.input_ids, X_test_tokenized.attention_mask, Y_test_tensor.float())
val_dataloader = DataLoader(dataset_val, batch_size=4, shuffle=False)



for model_name, model in models.items():
    print(model_name)
    if model_name == "Quantized_Model":
      device = "cpu"
    model.eval()
    y_true = []
    y_pred = []
    with torch.no_grad():
        for input_ids, attention_mask, labels in val_dataloader:
            input_ids, attention_mask, labels = input_ids.to(device), attention_mask.to(device), labels.to(device)
            output = model(input_ids=input_ids, attention_mask=attention_mask)
            try:
                logits = output.logits
            except Exception as e:
                logits = output['logits']
                # print(e)
            probs = torch.sigmoid(logits)
            threshold = 0.5
            binary_preds = (probs >= threshold).float()
            predictions = binary_preds
            y_true.extend(labels.cpu().numpy())
            y_pred.extend(predictions.cpu().numpy())


    # Convert lists to numpy arrays
    y_true = np.array(y_true)
    y_pred = np.array(y_pred)

    # Calculate evaluation metrics
    accuracy = accuracy_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred, average='weighted')
    recall = recall_score(y_true, y_pred, average='weighted')
    f1 = f1_score(y_true, y_pred, average='weighted')
    auc_roc = roc_auc_score(y_true, y_pred)

    print(f'Accuracy: {accuracy:.4f}')
    print(f'Precision: {precision:.4f}')
    print(f'Recall: {recall:.4f}')
    print(f'F1 Score: {f1:.4f}')
    print(f'AUC-ROC Score: {auc_roc:.4f}')

    # print(model.parameters)
    num_params = sum(p.numel() for p in model.parameters())
    size = num_params * 4 / (1024**2)
    print(f"Size of the {model_name} -> {size} MB")

    warnings.resetwarnings()
    print("================================================")

Untrained_Model
Accuracy: 0.0000
Precision: 0.6011
Recall: 0.7221
F1 Score: 0.6047
AUC-ROC Score: 0.5034
Size of the Untrained_Model -> 255.4336280822754 MB
UnQuantized_Model


  _warn_prf(average, modifier, msg_start, len(result))


Accuracy: 0.0549
Precision: 0.4818
Recall: 0.5473
F1 Score: 0.5119
AUC-ROC Score: 0.5091
Size of the UnQuantized_Model -> 255.4336280822754 MB
Quantized_Model
Accuracy: 0.0549
Precision: 0.4848
Recall: 0.5466
F1 Score: 0.5130
AUC-ROC Score: 0.5101
Size of the Quantized_Model -> 90.99609375 MB


  _warn_prf(average, modifier, msg_start, len(result))
