In [22]:
from transformers import BertModel, BertTokenizer
import torch
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Load the pretrained 'bert-base-uncased' tokenizer and encoder once, at module
# level, so get_bert_embeddings can reuse them on every call instead of
# reloading the weights per message.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')

def get_bert_embeddings(text):
    """Return BERT's final-layer hidden state at sequence position 0 for ``text``.

    ``text`` is tokenized with padding/truncation to exactly 128 tokens and
    fed through the module-level ``model`` with gradient tracking disabled.
    Position 0 is where ``BertTokenizer`` places the ``[CLS]`` token by
    default. Size-1 dimensions are squeezed out of the selected slice, and
    the result is returned as a NumPy array.
    """
    encoded = tokenizer(
        text,
        padding='max_length',
        max_length=128,
        truncation=True,
        return_tensors='pt',
    )
    # Inference only: no autograd graph is needed, and .numpy() requires a
    # tensor that does not track gradients.
    with torch.no_grad():
        hidden_states = model(**encoded).last_hidden_state
    first_position = hidden_states[:, 0, :]
    return first_position.squeeze().numpy()

# Tiny hand-labelled training set; each message is paired with its label
# (1 = spam, 0 = not spam) so the two cannot drift out of alignment.
train_samples = [
    ("Your free prize is waiting, click now!", 1),
    ("Hello friend, how are you?", 0),
    ("Special offer just for you, claim it now!", 1),
    ("Meeting rescheduled to next week.", 0),
]
X_train = [get_bert_embeddings(message) for message, _ in train_samples]
y_train = [label for _, label in train_samples]

# fit() returns the estimator itself, so construction and training can chain.
classifier = LogisticRegression(max_iter=1000).fit(X_train, y_train)

# Held-out messages used to sanity-check the classifier; labels follow the
# same convention as y_train (1 = spam, 0 = not spam).
test_messages = [
    "Limited time offer, buy now!",
    "Hey, are you coming to the party tonight?",
    "Get your free gift today, claim now!",
    "Let's catch up soon!",
]
X_test = [get_bert_embeddings(message) for message in test_messages]
y_test = [1, 0, 1, 0]

# Score every test embedding in a single call: predict() accepts the whole
# (n_samples, n_features) collection, so there is no need to wrap each
# embedding in its own one-row batch and loop in Python. y_pred is therefore
# a NumPy array of labels rather than a list.
y_pred = classifier.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy:.4f}')

def predict_spam(input_text):
    """Classify ``input_text``, print the verdict, and return the predicted label.

    The message is embedded with ``get_bert_embeddings`` and handed to the
    module-level ``classifier`` as a one-sample batch. A label equal to 1 is
    reported as "Spam"; any other label is reported as "Not Spam". The label
    is returned exactly as produced by ``classifier.predict``.
    """
    features = get_bert_embeddings(input_text)
    # predict() expects a batch, so wrap the single embedding and unwrap the
    # single result.
    predictions = classifier.predict([features])
    prediction = predictions[0]
    verdict = "Spam" if prediction == 1 else "Not Spam"
    print(f'The input: "{input_text}" is classified as: {verdict}')
    return prediction

# Interactive demo: read one message from standard input and classify it.
prompt = "Enter a message to classify as spam or not: "
input_text = input(prompt)
predict_spam(input_text)


Accuracy: 0.7500
Enter a message to classify as spam or not: Get your gift today claim now!
The input: "Get your gift today claim now!" is classified as: Spam


1