In [1]:
import numpy as np
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, accuracy_score
import pickle
import re

In [2]:
# Ensure the NLTK resources used by the preprocessing step are available.
# 'stopwords' provides the English stop-word list. 'punkt' backs
# nltk.word_tokenize on older NLTK releases, while NLTK >= 3.8.2 looks up
# 'punkt_tab' instead, so both tokenizer packages are requested.
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('punkt_tab')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/chadsglm/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /Users/chadsglm/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [3]:
# Read the raw CSV, keep only the v1 / v2 columns and give them
# descriptive names (v1 -> label, v2 -> text).
df = (
    pd.read_csv('spam.csv', encoding='ISO-8859-1')
    .loc[:, ['v1', 'v2']]
    .rename(columns={'v1': 'label', 'v2': 'text'})
)

In [4]:
# Text preprocessing is implemented as the TextPreprocessor transformer class in the next cell

In [5]:
from sklearn.base import BaseEstimator, TransformerMixin

class TextPreprocessor(BaseEstimator, TransformerMixin):
    """Scikit-learn transformer that normalises raw text documents.

    Each document is lower-cased, stripped of HTML-like tags and non-word
    characters, tokenised with NLTK, filtered against the English stop-word
    list and Porter-stemmed. The transformer learns nothing from the data,
    so ``fit`` is a no-op.
    """

    def __init__(self):
        self.ps = PorterStemmer()

    def fit(self, X, y=None):
        """Do nothing and return ``self`` (required by the estimator API)."""
        return self

    def transform(self, X, y=None):
        """Clean every document in ``X``.

        Args:
            X: Iterable of text strings.
            y: Ignored; accepted for pipeline compatibility.

        Returns:
            list[str]: One space-joined string of stemmed tokens per document.
        """
        # Load the stop-word list once per batch instead of once per token.
        stop_words = frozenset(stopwords.words('english'))
        return [self._preprocess(text, stop_words) for text in X]

    def _preprocess(self, text, stop_words=None):
        """Clean a single document.

        Args:
            text: The raw document string.
            stop_words: Optional collection of words to drop. Defaults to
                NLTK's English stop-word list when omitted.

        Returns:
            str: The stemmed, stop-word-free tokens joined by single spaces.
        """
        if stop_words is None:
            stop_words = frozenset(stopwords.words('english'))

        text = text.lower()
        text = re.sub(r'<.*?>', '', text)  # Remove HTML tags (non-greedy, single line)
        text = re.sub(r'\W', ' ', text)  # Replace every non-word character with a space
        tokens = nltk.word_tokenize(text)
        # isalnum() drops tokens that still contain '_' (kept by \W above).
        stems = [
            self.ps.stem(word)
            for word in tokens
            if word.isalnum() and word not in stop_words
        ]
        return " ".join(stems)

In [6]:
# Features are the raw message strings; targets are the labels encoded
# as integers (ham -> 0, spam -> 1).
X = df.loc[:, 'text']
y = df.loc[:, 'label'].map({'ham': 0, 'spam': 1})

In [7]:
# Hold out 20% of the rows for final evaluation; the fixed seed keeps the
# split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [8]:
# Chain text cleaning, TF-IDF features (unigrams + bigrams) and a
# multinomial Naive Bayes classifier into a single estimator.
pipeline = Pipeline(steps=[
    ('preprocessor', TextPreprocessor()),
    ('tfidf', TfidfVectorizer(ngram_range=(1, 2))),
    ('model', MultinomialNB()),
])

In [9]:
# Tune the Naive Bayes smoothing parameter (alpha) with 5-fold
# cross-validation, selecting the candidate with the best accuracy.
param_grid = {'model__alpha': [0.1, 0.5, 1.0, 5.0]}
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    cv=5,
    scoring='accuracy',
)
grid_search.fit(X_train, y_train)

In [None]:
# Best model: the pipeline GridSearchCV exposes as best_estimator_, i.e. the
# candidate with the best cross-validated score (refit on the training data
# under GridSearchCV's default refit=True).
best_model = grid_search.best_estimator_

In [None]:
# Score the tuned pipeline on the held-out test split.
y_pred = best_model.predict(X_test)

print(classification_report(y_true=y_test, y_pred=y_pred))
print("Accuracy:", accuracy_score(y_true=y_test, y_pred=y_pred))

In [None]:
# Save the trained model and vectorizer.
# The fitted TF-IDF vectorizer and classifier live inside the tuned pipeline,
# so pull them out by step name before pickling (the bare names `tfidf` and
# `model` were never defined in this notebook).
# NOTE: the 'preprocessor' step is not part of the pickle; whoever loads
# model.pkl must apply equivalent text cleaning before calling tfidf.transform.
tfidf = best_model.named_steps['tfidf']
model = best_model.named_steps['model']

with open('model.pkl', 'wb') as f:
    pickle.dump((tfidf, model), f)

In [None]:
email_text = """
Hallo,

Wären Sie daran interessiert, Ihr bestehendes Unternehmen mit einer neuen Website online zu stellen?

Ich habe ein erfahrenes Website-Design-Team, das eine sehr professionelle Website erstellt, die sich wirklich einfacher selbst verwalten lässt.

Ich bin sicher, dass Ihnen Ihre neue Homepage im modernen Design zu einem sehr erschwinglichen Preis gefällt.

Ich würde mich sehr freuen, wenn Sie mir Ihre Idee oder Grundvoraussetzung für die Erstellung einer professionellen Website mitteilen könnten. Wir unterbreiten Ihnen dann einen kurzen Unternehmensvorschlag zu einem sehr erschwinglichen Preis.

Mit freundlichen Grüßen,
George
"""

# Classify with the fitted pipeline: it runs TextPreprocessor, the TF-IDF
# vectorizer and the Naive Bayes model in sequence, so the cell no longer
# depends on transform_text / tfidf / model being defined elsewhere.
# NOTE(review): the sample is German while the preprocessing filters English
# stop words, so this prediction may be unreliable — confirm the intended use.
result = best_model.predict([email_text])[0]

print("Spam" if result == 1 else "Not Spam")
