In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report
from sklearn.pipeline import Pipeline
import joblib

In [2]:
# Load data
# Read the raw CSV, keep only columns v1 and v2, and give them descriptive
# names (v1 -> label, v2 -> text) in a single chained expression.
df = (
    pd.read_csv('spam.csv', encoding='ISO-8859-1')[['v1', 'v2']]
    .rename(columns={'v1': 'label', 'v2': 'text'})
)

df.head(5)

Unnamed: 0,label,text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [3]:
# Prepare data for model
# Features are the raw message strings; targets are binary ids (ham -> 0, spam -> 1).
label_to_id = {'ham': 0, 'spam': 1}
X = df['text']
y = df['label'].map(label_to_id)

# Series.map leaves NaN for any label that is missing from the mapping. Fail
# loudly here instead of letting NaN targets flow into the split and the model.
unmapped_labels = df.loc[y.isna(), 'label'].unique()
if len(unmapped_labels) > 0:
    raise ValueError(
        f"Unexpected label values (expected 'ham' or 'spam'): {unmapped_labels.tolist()}"
    )

In [4]:
# Train-test split
# Hold out 20% of the rows for evaluation; the fixed random_state keeps the
# partition identical across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [5]:
# Define the model (fitting is done separately):
# TF-IDF features with English stop words removed, fed into a multinomial
# Naive Bayes classifier.
model = Pipeline(
    steps=[
        ('tfidf', TfidfVectorizer(stop_words='english')),
        ('clf', MultinomialNB()),
    ]
)

In [6]:
# Fit the whole pipeline on the training split.
model.fit(X=X_train, y=y_train)

In [7]:
# Evaluate the model
# Predict on the held-out split, then report overall accuracy and the
# per-class precision / recall / F1 table.
y_pred = model.predict(X_test)
test_accuracy = accuracy_score(y_test, y_pred)
test_report = classification_report(y_test, y_pred)
print(f"Accuracy: {test_accuracy}")
print(f"Classification Report:\n{test_report}")


Accuracy: 0.9668161434977578
Classification Report:
              precision    recall  f1-score   support

           0       0.96      1.00      0.98       965
           1       1.00      0.75      0.86       150

    accuracy                           0.97      1115
   macro avg       0.98      0.88      0.92      1115
weighted avg       0.97      0.97      0.96      1115



In [8]:
# Save the model
# Persist the pipeline to disk; joblib.dump returns the list of file names it
# wrote, which is displayed as the cell output.
joblib.dump(value=model, filename='sms_spam_model.joblib')

['sms_spam_model.joblib']

## Multilingual Spam Detection

In [14]:
from transformers import pipeline

# Build a Hugging Face text-classification pipeline backed by a multilingual
# XLM-RoBERTa language-detection checkpoint (the weights are downloaded on
# first use).
classifier = pipeline(
    task="text-classification",
    model="papluca/xlm-roberta-base-language-detection",
)

def classify_text(text, threshold=0.9):
    """Label ``text`` as "SPAM" or "NOT SPAM" from the classifier's confidence.

    NOTE(review): the module-level ``classifier`` is loaded from a
    language-detection checkpoint, not a model trained on spam labels. The
    result therefore only reflects how confidently the language of ``text``
    was identified and says nothing about its content: fluent text in any
    supported language is reported as "NOT SPAM".
    TODO: replace with a classifier trained for spam detection.

    Args:
        text: The message to classify.
        threshold: Confidence cut-off. A top score strictly greater than this
            value yields "NOT SPAM". Defaults to 0.9, the value that was
            previously hard-coded.

    Returns:
        "NOT SPAM" when the top prediction's score exceeds ``threshold``,
        otherwise "SPAM".
    """
    # truncation=True clips the input to the model's maximum sequence length;
    # without it, long messages fail inside the model's forward pass.
    top_prediction = classifier(text, truncation=True)[0]
    if top_prediction['score'] > threshold:
        return "NOT SPAM"
    return "SPAM"

# Try the heuristic on a German-language sample: a website-design sales pitch.
german_text = """
Hallo,
Wären Sie daran interessiert, Ihr bestehendes Unternehmen mit einer neuen Website online zu stellen?
Ich habe ein erfahrenes Website-Design-Team, das eine sehr professionelle Website erstellt, die sich wirklich einfacher selbst verwalten lässt.
Ich bin sicher, dass Ihnen Ihre neue Homepage im modernen Design zu einem sehr erschwinglichen Preis gefällt.
Ich würde mich sehr freuen, wenn Sie mir Ihre Idee oder Grundvoraussetzung für die Erstellung einer professionellen Website mitteilen könnten. Wir unterbreiten Ihnen dann einen kurzen Unternehmensvorschlag zu einem sehr erschwinglichen Preis.
Mit freundlichen Grüßen,
George
"""

# Classify the sample and report the resulting label.
result = classify_text(text=german_text)
print(f"The text is classified as: {result}")

model.safetensors:   0%|          | 0.00/1.11G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/502 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.08M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

The text is classified as: NOT SPAM


