Samodzielnie zbuduj i przetestuj naiwny klasyfikator Bayesa na zbiorze smsspamcollection.csv. Możesz do tego celu wykorzystać dowolne narzędzie (RapidMiner, Python, Orange Data Mining). Wyniki graj na współdzielony dysk Google do niedzieli, 12 maja, godz. 21:00. Jeżeli realizujesz zadanie w notatniku/kodzie Pythona, prześlij ten kod w postaci pliku imie.nazwisko.ipynb. Jeśli wykorzystujesz RapidMiner lub Orange Data Mining, prześlij plik imie.nazwisko.pdf zawierający dwa zrzuty ekranu: zrzut pokazujący cały workflow, oraz zrzut pokazujący macierz pomyłek. Zadanie uznam za zrealizowane, jeśli ogólna dokładność klasyfikatora będzie większa niż 90%.

In [70]:
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import string
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
import numpy as np

In [71]:
data = pd.read_csv('smsspamcollection.csv', sep='\t', header=None, names=['label', 'sms_message'])

In [72]:
def preprocess_text(message):

    message = message.lower()
    message = message.translate(str.maketrans('', '', string.punctuation))
    words = word_tokenize(message)
    filtered_words = [word for word in words if word not in stopwords.words('english')]
    
    return ' '.join(filtered_words)

data['sms_message'] = data['sms_message'].apply(preprocess_text)


In [73]:
data.head()

Unnamed: 0,label,sms_message
0,ham,go jurong point crazy available bugis n great ...
1,ham,ok lar joking wif u oni
2,spam,free entry 2 wkly comp win fa cup final tkts 2...
3,ham,u dun say early hor u c already say
4,ham,nah dont think goes usf lives around though


In [74]:
vectorizer = TfidfVectorizer()
tfidf_data = vectorizer.fit_transform(data['sms_message'])

In [85]:
X_train, X_test, y_train, y_test = train_test_split(tfidf_data, data['label'], test_size=0.2)

In [86]:
class GaussianNaiveBayes:
    def fit(self, X, y):
        self.classes = np.unique(y)
        self.parameters = {}
        for c in self.classes:
            X_c = X[y == c]
            self.parameters[c] = {
                'mean': X_c.mean(axis=0),
                'var': X_c.var(axis=0),
                'prior': X_c.shape[0] / X.shape[0]
            }

    def calculate_likelihood(self, mean, var, x):
        eps = 1e-4
        coeff = 1.0 / np.sqrt(2.0 * np.pi * var + eps)
        exponent = np.exp(-(x - mean) ** 2 / (2 * var + eps))
        return coeff * exponent

    def predict(self, X):
        outputs = []
        for x in X:
            posteriors = []
            for c in self.classes:
                prior = np.log(self.parameters[c]['prior'])
                conditional = np.sum(np.log(self.calculate_likelihood(
                    self.parameters[c]['mean'], self.parameters[c]['var'], x)))
                posterior = prior + conditional
                posteriors.append(posterior)
            outputs.append(self.classes[np.argmax(posteriors)])
        return np.array(outputs)

In [87]:
gnb = GaussianNaiveBayes()
gnb.fit(X_train.toarray(), y_train)
y_pred = gnb.predict(X_test.toarray())

  conditional = np.sum(np.log(self.calculate_likelihood(


In [88]:
accuracy = np.mean(y_pred == y_test)
print("Accuracy:", accuracy)

Accuracy: 0.9004484304932735


In [89]:
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, classification_report

X_train_dense = X_train.toarray()
X_test_dense = X_test.toarray()

model = GaussianNB()

model.fit(X_train_dense, y_train)


In [90]:
y_pred = model.predict(X_test_dense)

accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

print(classification_report(y_test, y_pred))

Accuracy: 0.873542600896861
              precision    recall  f1-score   support

         ham       0.98      0.87      0.92       961
        spam       0.52      0.90      0.66       154

    accuracy                           0.87      1115
   macro avg       0.75      0.88      0.79      1115
weighted avg       0.92      0.87      0.89      1115



In [91]:
my_predictions = gnb.predict(X_test_dense)

sklearn_predictions = model.predict(X_test_dense)


  conditional = np.sum(np.log(self.calculate_likelihood(


In [92]:
from sklearn.metrics import confusion_matrix
my_conf_matrix = confusion_matrix(y_test, my_predictions)

sklearn_conf_matrix = confusion_matrix(y_test, sklearn_predictions)


In [93]:
print(my_conf_matrix)

[[942  19]
 [ 92  62]]


In [94]:
print(sklearn_conf_matrix)

[[836 125]
 [ 16 138]]
