### Loading Required Libraries

In [23]:
pip install torch langdetect transformers nltk openpyxl

Note: you may need to restart the kernel to use updated packages.


In [2]:
import warnings
warnings.filterwarnings("ignore")

import pandas as pd
import torch
from torch.utils.data import DataLoader, Dataset
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report
import re
import nltk
from langdetect import detect

In [24]:
# nltk for stopword processing
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/azureuser/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


### Define Multilingual Stopwords

In [25]:
# Initialize an empty set for all stopwords
stop_words = set()

# Adding English stopwords from nltk
stop_words.update(stopwords.words('english'))

# Manually define stopwords for other languages
malay_stopwords = {"dan", "adalah", "di", "ke", "yang", "ini", "itu", "saya", "dengan"}  # Expand as needed
chinese_stopwords = {"的", "了", "在", "是", "我", "有", "和", "也", "就"}  # Expand with Chinese stopwords
japanese_stopwords = {"の", "に", "は", "を", "た", "が", "で", "て", "と"}  # Expand with Japanese stopwords
korean_stopwords = {"그리고", "그녀는", "나는", "그들", "이것", "그것", "우리"}  # Expand with Korean stopwords
thai_stopwords = {"และ", "ใน", "เป็น", "ก็", "ว่า", "ได้", "มี", "หรือ"}  # Expand with Thai stopwords
vietnamese_stopwords = {"và", "trong", "là", "của", "để", "một", "có", "không"}  # Expand with Vietnamese stopwords

# Add these lists to the main stop_words set
stop_words.update(malay_stopwords)
stop_words.update(chinese_stopwords)
stop_words.update(japanese_stopwords)
stop_words.update(korean_stopwords)
stop_words.update(thai_stopwords)
stop_words.update(vietnamese_stopwords)

# Adding "rt" for retweet mentions
stop_words.add("rt")

### Symbol Replacement Function

In [26]:
# Define commonly masked symbols and their replacements
symbol_replacements = {"@": "a", "!": "i", "$": "s", "0": "o", "1": "i"}

def replace_symbols(text):
    # Replace symbols in commonly masked words only
    for symbol, letter in symbol_replacements.items():
        text = re.sub(rf"(?<!\w){symbol}(?=\w)|(?<=\w){symbol}(?!\w)", letter, text)
    return text

### Preprocessing Function with Language Detection

In [27]:
def preprocess_mixed_language(text):
    # Detect language (skip if detection fails)
    try:
        lang = detect(text)
    except:
        lang = "unknown"
    
    # Replace symbols in inappropriate words while retaining URLs and special characters
    text = replace_symbols(text)
    
    # Apply language-specific stopwords
    if lang == "en":
        text = " ".join([word for word in text.split() if word.lower() not in stop_words])
    elif lang == "ms":  # Malay
        text = " ".join([word for word in text.split() if word.lower() not in malay_stopwords])
    elif lang == "zh":  # Chinese
        text = " ".join([word for word in text.split() if word.lower() not in chinese_stopwords])
    elif lang == "ja":  # Japanese
        text = " ".join([word for word in text.split() if word.lower() not in japanese_stopwords])
    elif lang == "ko":  # Korean
        text = " ".join([word for word in text.split() if word.lower() not in korean_stopwords])
    elif lang == "th":  # Thai
        text = " ".join([word for word in text.split() if word.lower() not in thai_stopwords])
    elif lang == "vi":  # Vietnamese
        text = " ".join([word for word in text.split() if word.lower() not in vietnamese_stopwords])

    # Remove extra whitespace
    text = re.sub(r"\s+", " ", text).strip()
    return text


### Load and Preprocess Data

In [28]:
# Load your dataset (adjust path as necessary)
df = pd.read_excel('Training_LLM_Modified.xlsx')

# Ensure 'Label' column only contains integers (0 or 1) by filtering out invalid rows
df = df[df['Label'].isin([0, 1])]

# Preprocess text and create 'clean_text' column
df['clean_text'] = df['Text'].apply(preprocess_mixed_language)

# Extract lists for texts and labels after cleaning
texts = df['clean_text'].tolist()
labels = df['Label'].astype(int).tolist()  # Convert labels to integers

# Split into training, validation, and test sets
train_texts, test_texts, train_labels, test_labels = train_test_split(texts, labels, test_size=0.2, random_state=42)
train_texts, val_texts, train_labels, val_labels = train_test_split(train_texts, train_labels, test_size=0.2, random_state=42)


### Tokenization and DataLoader Setup

In [29]:
# Tokenize with mBERT
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')
train_encodings = tokenizer(train_texts, truncation=True, padding=True)
val_encodings = tokenizer(val_texts, truncation=True, padding=True)
test_encodings = tokenizer(test_texts, truncation=True, padding=True)

# Define Dataset class for PyTorch
class CustomDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx], dtype=torch.long)  # Ensure labels are long tensors
        return item

    def __len__(self):
        return len(self.labels)

# Create Dataset and DataLoader objects
train_dataset = CustomDataset(train_encodings, train_labels)
val_dataset = CustomDataset(val_encodings, val_labels)
test_dataset = CustomDataset(test_encodings, test_labels)

train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=8, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=8, shuffle=False)

### Model Setup and Fine-Tuning

In [30]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = BertForSequenceClassification.from_pretrained('bert-base-multilingual-cased', num_labels=2)
model.to(device)
optimizer = AdamW(model.parameters(), lr=1e-5)

# Fine-tuning loop
for epoch in range(5):
    model.train()
    for batch in train_loader:
        optimizer.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()

    # Validation loop
    model.eval()
    val_accuracy = 0
    with torch.no_grad():
        for batch in val_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)
            outputs = model(input_ids, attention_mask=attention_mask)
            predictions = torch.argmax(outputs.logits, dim=1)
            val_accuracy += torch.sum(predictions == labels).item() / labels.size(0)

    avg_val_accuracy = val_accuracy / len(val_loader)
    print(f'Epoch {epoch + 1}, Validation Accuracy: {avg_val_accuracy}')

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1, Validation Accuracy: 0.9722222222222222
Epoch 2, Validation Accuracy: 0.9722222222222222
Epoch 3, Validation Accuracy: 0.9861111111111112
Epoch 4, Validation Accuracy: 0.9742063492063492
Epoch 5, Validation Accuracy: 0.9682539682539683


### Model Evaluation

In [31]:
true_labels = []
predicted_labels = []

model.eval()
with torch.no_grad():
    for batch in test_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        outputs = model(input_ids, attention_mask=attention_mask)
        predictions = torch.argmax(outputs.logits, dim=1)
        true_labels.extend(labels.cpu().numpy())
        predicted_labels.extend(predictions.cpu().numpy())

# Classification Report and Evaluation Metrics
print("Classification Report:\n", classification_report(true_labels, predicted_labels, target_names=['Appropriate', 'Inappropriate']))
accuracy = accuracy_score(true_labels, predicted_labels)
precision = precision_score(true_labels, predicted_labels)
recall = recall_score(true_labels, predicted_labels)
f1 = f1_score(true_labels, predicted_labels)

print(f"Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1-score: {f1}")

Classification Report:
                precision    recall  f1-score   support

  Appropriate       0.95      0.99      0.97       405
Inappropriate       0.99      0.91      0.95       220

     accuracy                           0.96       625
    macro avg       0.97      0.95      0.96       625
 weighted avg       0.97      0.96      0.96       625

Accuracy: 0.9648
Precision: 0.9852941176470589
Recall: 0.9136363636363637
F1-score: 0.9481132075471698


### User Prompt for Prediction 

In [49]:
def classify_text(user_input):
    # Preprocess and tokenize the input text
    preprocessed_text = preprocess_mixed_language(user_input)
    inputs = tokenizer(preprocessed_text, return_tensors="pt", truncation=True, padding="max_length", max_length=128)
    
    # Move inputs to device (CPU or GPU as available)
    inputs = {key: val.to(device) for key, val in inputs.items()}
    
    # Predict with the model
    model.eval()
    with torch.no_grad():
        outputs = model(**inputs)
        probabilities = torch.softmax(outputs.logits, dim=1)
        predicted_label = torch.argmax(probabilities, dim=1).item()
        confidence = probabilities[0][predicted_label].item()

    # Output results
    label_text = "Inappropriate" if predicted_label == 1 else "Appropriate"
    print(f"Text Classification: {label_text} (Label: {confidence:.2f})")

# Example usage
user_input = input("Enter text to classify: ")
classify_text(user_input)

Text Classification: Appropriate (Label: 0.76)


### Save Model for ONNX Conversion 

In [33]:
model_save_path = "fine_tuned_mBERT_model"
model.save_pretrained(model_save_path)
tokenizer.save_pretrained(model_save_path)
print(f"Model saved to {model_save_path} for ONNX conversion.")

Model saved to fine_tuned_mBERT_model for ONNX conversion.


In [None]:
import sys
print(f"Python version: {sys.version}")
