# **Read Data**

In [1]:
import pandas as pd

In [2]:
train = pd.read_csv('/Users/dimasrafly/Documents/Test Application/Alphalitical Data Scientist Test/train_preprocess.tsv', sep = '\t', header=None, names=['kalimat', 'sentimen'])
test = pd.read_csv('/Users/dimasrafly/Documents/Test Application/Alphalitical Data Scientist Test/test_preprocess.tsv', sep = '\t', header=None, names=['kalimat', 'sentimen'])
valid = pd.read_csv('/Users/dimasrafly/Documents/Test Application/Alphalitical Data Scientist Test/valid_preprocess.tsv',sep='\t', header=None, names=['kalimat', 'sentimen'])

In [3]:
train.head()

Unnamed: 0,kalimat,sentimen
0,warung ini dimiliki oleh pengusaha pabrik tahu...,positive
1,mohon ulama lurus dan k212 mmbri hujjah partai...,neutral
2,lokasi strategis di jalan sumatera bandung . t...,positive
3,betapa bahagia nya diri ini saat unboxing pake...,positive
4,duh . jadi mahasiswa jangan sombong dong . kas...,negative


In [4]:
test.head()

Unnamed: 0,kalimat,sentimen
0,kemarin gue datang ke tempat makan baru yang a...,negative
1,kayak nya sih gue tidak akan mau balik lagi ke...,negative
2,"kalau dipikir-pikir , sebenarnya tidak ada yan...",negative
3,ini pertama kalinya gua ke bank buat ngurusin ...,negative
4,waktu sampai dengan gue pernah disuruh ibu lat...,negative


In [5]:
valid.head()

Unnamed: 0,kalimat,sentimen
0,"meski masa kampanye sudah selesai , bukan bera...",neutral
1,tidak enak,negative
2,restoran ini menawarkan makanan sunda . kami m...,positive
3,lokasi di alun alun masakan padang ini cukup t...,positive
4,betapa bejad kader gerindra yang anggota dprd ...,negative


# **Text Processing**

In [6]:
import re
import torch
from transformers import BertTokenizer

  from .autonotebook import tqdm as notebook_tqdm


In [7]:
# Cleaning Text
def clean_text(text):
    # Lowercase
    text = text.lower()  

    # Remove numbers
    text = re.sub(r'\d+', '', text)  

    # Remove punctuation
    text = re.sub(r'[^\w\s]', '', text)  

    # Remove extra spaces
    text = re.sub(r'\s+', ' ', text)  
    return text

# Apply Function
train['kalimat'] = train['kalimat'].apply(clean_text)
valid['kalimat'] = valid['kalimat'].apply(clean_text)
test['kalimat'] = test['kalimat'].apply(clean_text)

# **Prepare Data for Model**

In [9]:
# Mapping sentiment
label_map = {'positive': 0, 'neutral': 1, 'negative': 2}
train['label'] = train['sentimen'].map(label_map)
valid['label'] = valid['sentimen'].map(label_map)
test['label'] = test['sentimen'].map(label_map)

# BERT Tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')



In [10]:
# Encoding & Tokenization Function
def encode_data(text_list, tokenizer, max_length=128):
    return tokenizer(text_list, 
                     padding='max_length', 
                     truncation=True, 
                     max_length=max_length, 
                     return_tensors='pt')

# Text Tokenization
train_encodings = encode_data(train['kalimat'].tolist(), tokenizer)
valid_encodings = encode_data(valid['kalimat'].tolist(), tokenizer)
test_encodings = encode_data(test['kalimat'].tolist(), tokenizer)

# Label Convertion to tensor PyTorch
train_labels = torch.tensor(train['label'].values)
valid_labels = torch.tensor(valid['label'].values)
test_labels = torch.tensor(test['label'].values)

In [11]:
# Class Dataset 
class SentimentDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = self.labels[idx]
        return item

    def __len__(self):
        return len(self.labels)

# Dataloader for training, validation, and testing
train_dataset = SentimentDataset(train_encodings, train_labels)
valid_dataset = SentimentDataset(valid_encodings, valid_labels)
test_dataset = SentimentDataset(test_encodings, test_labels)

train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=32, shuffle=True)
valid_loader = torch.utils.data.DataLoader(valid_dataset, batch_size=32, shuffle=False)
test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=632, shuffle=False)

# **Load Pretrained BERT Model**

In [12]:
from transformers import BertForSequenceClassification, AdamW

# Load BERT model
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=3)

# Define optimizer
optimizer = AdamW(model.parameters(), lr=1e-5)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


# **Training Loop**

In [13]:
from transformers import get_scheduler
from torch.optim import AdamW
from torch.nn import CrossEntropyLoss
from tqdm import tqdm

In [14]:
# Set up loss function and scheduler
loss_fn = CrossEntropyLoss()
num_epochs = 3
num_training_steps = num_epochs * len(train_loader)
lr_scheduler = get_scheduler(
    name="linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps
)

In [15]:
# Training Loop
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
model.to(device)

for epoch in range(num_epochs):
    model.train()
    train_loss = 0
    for batch in tqdm(train_loader):
        # Move batch to the correct device
        batch = {k: v.to(device) for k, v in batch.items()}

        # Forward pass
        outputs = model(**batch)
        loss = outputs.loss
        train_loss += loss.item()

        # Backward pass
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        lr_scheduler.step()

    avg_train_loss = train_loss / len(train_loader)
    print(f"Epoch {epoch + 1}, Training Loss: {avg_train_loss:.4f}")

  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
100%|██████████| 344/344 [41:58<00:00,  7.32s/it]


Epoch 1, Training Loss: 0.6228


100%|██████████| 344/344 [46:20<00:00,  8.08s/it]


Epoch 2, Training Loss: 0.4311


100%|██████████| 344/344 [41:25<00:00,  7.22s/it]

Epoch 3, Training Loss: 0.3567





# **Validation**

In [16]:
from sklearn.metrics import accuracy_score

# Evaluation Function
def evaluate_model(model, valid_loader):
    model.eval()
    predictions, true_labels = [], []
    val_loss = 0

    with torch.no_grad():
        for batch in tqdm(valid_loader):
            # Move batch to the correct device
            batch = {k: v.to(device) for k, v in batch.items()}
            
            # Forward pass
            outputs = model(**batch)
            loss = outputs.loss
            val_loss += loss.item()

            logits = outputs.logits
            predictions.extend(torch.argmax(logits, dim=-1).cpu().numpy())
            true_labels.extend(batch['labels'].cpu().numpy())

    accuracy = accuracy_score(true_labels, predictions)
    avg_val_loss = val_loss / len(valid_loader)
    print(f"Validation Loss: {avg_val_loss:.4f}, Validation Accuracy: {accuracy:.4f}")

# Evaluate on validation data
evaluate_model(model, valid_loader)

  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
100%|██████████| 40/40 [00:58<00:00,  1.47s/it]

Validation Loss: 0.4277, Validation Accuracy: 0.8357





# **Save the Model**

In [17]:
import os

# Model Path
model_save_path = "/Users/dimasrafly/Documents/Test Application/Alphalitical Data Scientist Test/saved_model"
os.makedirs(model_save_path, exist_ok=True)

# Save the model and tokenizer
model.save_pretrained(model_save_path)
tokenizer.save_pretrained(model_save_path)

('/Users/dimasrafly/Documents/Test Application/Alphalitical Data Scientist Test/saved_model/tokenizer_config.json',
 '/Users/dimasrafly/Documents/Test Application/Alphalitical Data Scientist Test/saved_model/special_tokens_map.json',
 '/Users/dimasrafly/Documents/Test Application/Alphalitical Data Scientist Test/saved_model/vocab.txt',
 '/Users/dimasrafly/Documents/Test Application/Alphalitical Data Scientist Test/saved_model/added_tokens.json')

# **Predict**

In [18]:
# Prediction Function
def predict_sentiment(model, tokenizer, text):
    # Clean and tokenize input text
    text = clean_text(text)
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding="max_length", max_length=256)

    # Move input to the correct device
    inputs = {k: v.to(device) for k, v in inputs.items()}

    # Predict sentiment
    with torch.no_grad():
        outputs = model(**inputs)
        logits = outputs.logits
        predicted_class = torch.argmax(logits, dim=-1).cpu().item()

    # Reverse Label from num to str
    reverse_label_map = {0: 'positive', 1: 'neutral', 2: 'negative'}
    return reverse_label_map[predicted_class]

In [20]:
# Sentiment Prediction
sample_text = "kita akan semakin cerdas jika kita terus menerus melatih otak kita"
predicted_sentiment = predict_sentiment(model, tokenizer, sample_text)
print(f"Predicted Sentiment: {predicted_sentiment}")

Predicted Sentiment: negative
