In [199]:
import pandas as pd
import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from collections import Counter

In [200]:
# Download the NLTK resources named 'punkt_tab' and 'stopwords'.
for resource_name in ('punkt_tab', 'stopwords'):
    nltk.download(resource_name)

# English stop words, held in a set for fast membership checks.
stop_words = {word for word in stopwords.words('english')}

In [201]:
# Load the two columns of interest and give them descriptive names.
# The rename is keyed by the original column name instead of being assigned
# positionally: read_csv ignores the element order of `usecols`, so a
# positional `data.columns = [...]` would silently swap the names if the file
# stored 'v2' before 'v1'.
data = pd.read_csv(
    'spam.csv', encoding='ISO-8859-1', usecols=['v1', 'v2']
).rename(columns={'v1': 'label', 'v2': 'message'})

In [202]:
# Display the loaded DataFrame to sanity-check the 'label' and 'message' columns.
data 

In [203]:
# Text normalisation helper
def clean_text(text):
    """Return a normalised copy of ``text``.

    The string is lower-cased, every run of digits is removed, every
    character that is neither a word character nor whitespace is removed,
    and leading/trailing whitespace is trimmed. Underscores count as word
    characters and are therefore kept; interior whitespace is left as-is.
    """
    cleaned = text.lower()
    # Order matters only for readability here: digits first, then punctuation.
    for pattern in (r'\d+', r'[^\w\s]'):
        cleaned = re.sub(pattern, '', cleaned)
    return cleaned.strip()

# Normalise every message in place (element-wise call to clean_text).
data['message'] = data['message'].map(clean_text)

In [204]:
# Split each cleaned message into a list of tokens.
data['tokens'] = data['message'].map(word_tokenize)

In [205]:
# clean_text() removes punctuation before tokenization, so contraction stop
# words such as "don't" reach this step as "dont" and would never match the
# raw stop-word list. Compare against the original entries plus their
# punctuation-stripped forms so those tokens are filtered as intended.
normalized_stop_words = stop_words | {
    re.sub(r'[^\w\s]', '', stop_word) for stop_word in stop_words
}
data['filtered_tokens'] = data['tokens'].apply(
    lambda tokens: [token for token in tokens if token not in normalized_stop_words]
)

In [206]:
# Flatten the per-message token lists into one corpus-wide list, then tally
# how often each token occurs.
all_words = []
for message_tokens in data['filtered_tokens']:
    all_words.extend(message_tokens)
word_counts = Counter(all_words)

In [207]:
# Show the ten most frequent tokens as (token, count) pairs.
top_words = word_counts.most_common(10)
print(top_words)

In [208]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, accuracy_score

In [209]:
# The documents are already cleaned, tokenized lists, so both the preprocessor
# and the tokenizer are identity functions. token_pattern=None tells
# scikit-learn the default regex is intentionally unused, which avoids the
# "token_pattern will not be used since tokenizer is not None" UserWarning.
vectorizer = CountVectorizer(
    tokenizer=lambda tokens: tokens,
    preprocessor=lambda tokens: tokens,
    token_pattern=None,
)
# NOTE(review): the vocabulary is fitted on the full dataset before the
# train/test split that follows, so words seen only in the test rows still
# shape the feature space — consider splitting first and fitting on the
# training rows only.
X = vectorizer.fit_transform(data['filtered_tokens'])
y = data['label']

In [210]:
# Hold out 20% of the rows for evaluation; the fixed random_state keeps the
# split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=2,
)

In [211]:
# Fit a multinomial Naive Bayes classifier on the training split.
model = MultinomialNB()
model.fit(X=X_train, y=y_train)

In [212]:
# Predict labels for the held-out rows.
y_pred = model.predict(X=X_test)

In [213]:
# Score the predictions against the true held-out labels.
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
report = classification_report(y_true=y_test, y_pred=y_pred)

In [214]:
# Print the report on its own lines: classification_report returns a
# column-aligned text table, and prefixing its first row with "Report: "
# shifts the header out of line with the per-class rows.
print(f'Accuracy: {accuracy:.2f}')
print(f'Report:\n{report}')