In [2]:
!pip install transformers==4.49



In [3]:
import boto3
import sagemaker
import tarfile
import os

# check we are connected to the right region
session = boto3.session.Session()
print(session.region_name)

# check role
role = sagemaker.get_execution_role()
print(role)

# check if we can connect to the S3 bucket
s3 = boto3.client("s3")
response = s3.list_buckets()
print("Buckets in this account:")
for bucket in response["Buckets"]:
    print(" -", bucket["Name"])

# check directory + files
root_dir = os.getcwd()
print(os.listdir())
print(root_dir)

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/sagemaker-user/.config/sagemaker/config.yaml
ca-central-1
arn:aws:iam::147795258718:role/service-role/AmazonSageMaker-ExecutionRole-20250827T010343
Buckets in this account:
 - sagemaker-ca-central-1-147795258718
 - sagemaker-studio-147795258718-ywchj6yljj8
['.bashrc', '.sagemaker_sql_editor_api_cache', '.local', '.ipython', '.npm', '.jupyter', '.ipynb_checkpoints', '.cache', '.config', 'user-default-efs', '.virtual_documents', '.keras', 'Deploy.ipynb']
/home/sagemaker-user


In [5]:
# test if s3 file exists
from botocore.exceptions import ClientError
def s3_file_exists(s3_uri):
    # Split the s3:// URI
    parts = s3_uri.replace("s3://", "").split("/", 1)
    bucket = parts[0]
    key = parts[1] if len(parts) > 1 else ""
    try:
        s3.head_object(Bucket=bucket, Key=key)
        return True
    except ClientError as e:
        if e.response['Error']['Code'] == "404":
            return False
        else:
            raise
model_s3_uri = "s3://sagemaker-studio-147795258718-ywchj6yljj8/model.tar.gz"
print(s3_file_exists(model_s3_uri))

True


In [8]:
# define file locations
work_dir = os.getcwd()
print(f'work_dir: {work_dir}')
model_base = 'model'
# S3 locations
s3_bucket_name = "sagemaker-studio-147795258718-ywchj6yljj8"
s3_model_gz = f"{model_base}.tar.gz"
# local locations
local_model_gz  = os.path.join(work_dir, f"{model_base}.tar.gz")
local_model_dir = os.path.join(work_dir, model_base)

work_dir: /home/sagemaker-user


In [9]:
# download files from S3
s3.download_file(s3_bucket_name, s3_model_gz, local_model_gz)
# check if tar.gs is in sagemaker
print("tarfile downloaded:", os.path.exists(local_model_gz))
# extract tar.gz
with tarfile.open(local_model_gz, "r:gz") as tar:
    tar.extractall(path=work_dir, filter='data')
# check if model is in sagemaker
print("model exists:", os.path.exists(local_model_dir))

Tarfile Downloaded? True
Model Exists? True


In [10]:
# load model + weights
import torch
from transformers import DistilBertModel, DistilBertTokenizer

class DBERT_finetuned(torch.nn.Module):

    def __init__(self):

        super().__init__()
    
        self.tokenizer      = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
        self.l1             = DistilBertModel.from_pretrained('distilbert-base-uncased')
        self.pre_classifier = torch.nn.Linear(768,768)
        self.relu           = torch.nn.ReLU()
        self.dropout        = torch.nn.Dropout(0.3)
        self.classifier     = torch.nn.Linear(768,3)

    def forward(self, input_ids, attention_mask):

        output_1     = self.l1(input_ids=input_ids,attention_mask=attention_mask)
        hidden_state = output_1[0]
        x            = hidden_state[:,0]
        x            = self.pre_classifier(x)
        x            = self.relu(x)
        x            = self.dropout(x)
        x            = self.classifier(x)

        return x

model = DBERT_finetuned()
model.load_state_dict(torch.load(local_model_dir, map_location=torch.device('cpu')))
model.eval()

2025-08-28 15:26:01.293029: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: SSE4.1 SSE4.2 AVX AVX2 AVX512F FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


DBERT_finetuned(
  (l1): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): DistilBertSdpaAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
            (lin1): Linear(i

In [12]:
# load the test dataset
import pandas as pd
test_dataset = pd.read_csv("filename.csv")

In [18]:
# perform validation test
MAX_LEN = 512
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

encode_dict = {0:'neutral', 1:'positive', 2:'negative'}
decode_dict = {j:i for i,j in encode_dict.items()}

x = test_dataset.Headline
y = test_dataset.ENCODE_CAT
model.eval()

num_correct = 0
num_wrong = 0

with torch.no_grad():

    for i in range(x.size):
        # print(i)

        title = str(x[i])
        title = " ".join(title.split())
        category = int(y[i])

        inputs = model.tokenizer.encode_plus(
            title,
            None,
            add_special_tokens=True,
            max_length=MAX_LEN,
            padding='max_length',
            return_token_type_ids=True,
            truncation=True
        )
        ids = inputs['input_ids']
        mask = inputs['attention_mask']
        data = {
            'ids': torch.tensor(ids, dtype=torch.long),
            'mask': torch.tensor(mask, dtype=torch.long),
            # 'targets': torch.tensor(y[i], dtype=torch.long)
        }

        ids = data['ids'].unsqueeze(0).to(device, dtype = torch.long)
        mask = data['mask'].unsqueeze(0).to(device, dtype = torch.long)
        # targets = data['targets'].unsqueeze(0).to(device, dtype = torch.long)

        outputs = model(ids, mask).squeeze()

        _val, _idx = torch.max(outputs, dim=0)
        
        # check if model sentiment equals real sentiment
        _model_guess = int(_idx)
        if _model_guess == category:
            num_correct += 1
        else: 
            # print mistakes
            print()
            print(f'model guess: {encode_dict[_model_guess]}')
            print(f'correct: {encode_dict[category]}')
            print(title)
            num_wrong += 1

print(num_correct / (num_correct + num_wrong))


model guess: positive
correct: neutral
Because expenditures must be justified to pass budget approval hurdles , we believe our RoP model can help make it easier for IT and IT security practitioners to make the business case for acquiring enabling security technologies and related control activities .

model guess: neutral
correct: positive
fi is developing cooperation in keyword advertising with Microsoft .

model guess: neutral
correct: negative
Finnish Scanfil , a systems supplier and contract manufacturer to the communications sector and the electronics industry , reports net sales of EUR 49.6 mn in the first quarter of 2009 , which are only a per cent smaller than in the corresponding period in 2008 .

model guess: positive
correct: negative
ADP News - Apr 22 , 2009 - Finnish business information systems developer Solteq Oyj HEL : STQ1V said today its net loss widened to EUR 189,000 USD 245,000 for the first quarter of 2009 from EUR 10,000 for the same peri

model guess: positive


1

In [19]:
print(f'validation score: {num_correct / (num_correct + num_wrong)}')

validation score: 0.9420289855072463


In [41]:
# manual testing
input_data = "Nasdaq edges down as Nvidia falls on China market uncertainty"
inputs = model.tokenizer(input_data, return_tensors="pt")
ids = inputs['input_ids']
mask = inputs['attention_mask']

with torch.no_grad():
    outputs = model(ids, mask)
probabilities = torch.softmax(outputs, dim = 1).numpy()
predicted_class = probabilities.argmax(axis=1)[0]
predicted_label = encode_dict[predicted_class]
print(f'probabilities: {probabilities.tolist()}')
print(f'predicted_label: {predicted_label}')

probabilities: [[0.19796720147132874, 0.02515009231865406, 0.7768826484680176]]
predicted_label: negative
