In [1]:
import sys
sys.path.append('..')
from models import BayesClassifier
import pandas as pd
import requests

resp = requests.get('https://ait-main-mdl-euwest1.s3.eu-west-1.amazonaws.com/6a/75/6a751de13886411bffbec196f0771944276a98cf?response-content-disposition=inline%3B%20filename%3D%22SMSSpamCollection%22&response-content-type=text%2Fplain%3B%20charset%3Dutf-8&X-Amz-Content-Sha256=UNSIGNED-PAYLOAD&X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Credential=AKIAWRN6GJFLQXT45LXM%2F20241203%2Feu-west-1%2Fs3%2Faws4_request&X-Amz-Date=20241203T115057Z&X-Amz-SignedHeaders=host&X-Amz-Expires=21543&X-Amz-Signature=ef30adcf69cb5ed4a9f1c1aa361a5bc8377fa22cf0a7ae854e0bfc7d2870e935')
text = resp.text
data = pd.DataFrame(data = [row.split('\t') for row in text.split('\n') if row],columns = ['label','text'])

In [None]:
import numpy as np 

class BayesClassifier:
    __epsilon = 1e-9
    def __init__(self):
        self.classes = None
        self.mean = {}
        self.var = {}
        self.prior = {}

    def fit(self, X, y):
        """
        Fit the classifier to the training data.
        :param X: ndarray of shape (n_samples, n_features) - Training data
        :param y: ndarray of shape (n_samples,) - Class labels
        """
        self.classes = np.unique(y)
        for cls in self.classes:
            X_cls = X[y == cls]
            self.mean[cls] = np.mean(X_cls, axis=0)
            self.var[cls] = np.var(X_cls, axis=0)+self.__epsilon
            self.prior[cls] = (X_cls.shape[0]+1) / (X.shape[0]+len(self.classes))

    def _gaussian_density(self, x, mean, var):
        """
        Compute the Gaussian probability density function.
        :param x: Value to compute density for
        :param mean: Mean of the Gaussian distribution
        :param var: Variance of the Gaussian distribution
        :return: Probability density
        """
        coeff = 1 / np.sqrt(2 * np.pi * var)
        exponent = np.exp(-(x - mean) ** 2 / (2 * var))
        res = coeff * exponent
        res[res <= 10**-320] = 10**-320
        return res #max(coeff * exponent,10**-320)

    def _predict_class(self, x):
        """
        Predict the class for a single sample.
        :param x: ndarray of shape (n_features,) - Single sample
        :return: Predicted class
        """
        posteriors = {}
        for cls in self.classes:
            prior = np.log(self.prior[cls])
            likelihood = np.sum(np.log(self._gaussian_density(x, self.mean[cls], self.var[cls])))
            posteriors[cls] = prior + likelihood
        return max(posteriors, key=posteriors.get)

    def predict(self, X):
        """
        Predict class labels for the input data.
        :param X: ndarray of shape (n_samples, n_features) - Test data
        :return: ndarray of shape (n_samples,) - Predicted class labels
        """
        return np.array([self._predict_class(x) for x in X])

In [2]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
import multipledispatch

class Model:
    def __init__(self,data):
        self.model = BayesClassifier()
        self.countVectorizer = CountVectorizer()
        self.data = data
        self.__fit = False
    
    def fit(self):
        Xtrain, Xtest, ytrain, ytest = train_test_split(self.data.text, self.data.label, test_size=0.2, random_state=42,stratify=self.data.label)
        Xtrain = self.countVectorizer.fit_transform(Xtrain)
        Xtest = self.countVectorizer.transform(Xtest)

        Xtrain = pd.DataFrame(Xtrain.toarray(), columns=self.countVectorizer.get_feature_names_out())
        Xtest = pd.DataFrame(Xtest.toarray(), columns=self.countVectorizer.get_feature_names_out())

        model = BayesClassifier()
        model.fit(Xtrain.values, ytrain.values)

        self.model = model
        self.Xtest = Xtest
        self.ytest = ytest
        self.Xtrain = Xtrain
        self.ytrain = ytrain
        self.__fit = True
    
    @multipledispatch.dispatch()
    def predict(self):
        return self.predict(True)
    
    @multipledispatch.dispatch(bool)
    def predict(self,test:bool=True):
        if not self.__fit:
            self.fit()
        if test:
            return self.model.predict(self.Xtest.values)
        else:
            return self.model.predict(self.Xtrain.values)
    
    @multipledispatch.dispatch(pd.DataFrame)
    def predict(self, data:pd.DataFrame):
        if not self.__fit:
            self.fit()
        data = self.countVectorizer.transform(data.text)
        data = pd.DataFrame(data.toarray(), columns=self.countVectorizer.get_feature_names_out())
        return self.model.predict(data.values)

    @multipledispatch.dispatch(str)
    def predict(self, data:str):
        if not self.__fit:
            self.fit()
        return self.model.predict(self.countVectorizer.transform([data]).toarray())
    
    def score(self, test:bool=True):
        if test:
            pred = self.predict()
            target = self.ytest
        else:
            pred = self.predict(False)
            target = self.ytrain
        return (pred == target).sum() / len(pred)



In [4]:
model = Model(data)
model.fit()
model.score()

0.1336322869955157

In [27]:
import numpy as np
np.log(10**-323)

-743.7469247408213

In [4]:
model.predict()

  likelihood = np.sum(np.log(self._gaussian_density(x, self.mean[cls], self.var[cls])))


array(['ham', 'spam', 'ham', ..., 'ham', 'ham', 'spam'], dtype='<U4')

In [148]:
# the regex for punctuation is r'[^\w\s]'
import re
split = re.compile(r'\s+')
tmp = data.text.str.replace(r'[^\w\s]','',regex=True).str.lower()
tmp.apply(lambda x: split.split(x)).explode().value_counts()[:100]

text
to       2251
i        2239
you      2128
a        1442
the      1333
         ... 
new       136
well      135
later     134
hi        133
think     132
Name: count, Length: 100, dtype: int64

In [114]:
import numpy as np

A_range = np.random.randn(1000,2)+1
B_range = np.random.randn(1000,2)+5

df = pd.DataFrame(data = A_range, columns = ['x','y'])
df['label'] = 'A'
df = pd.concat([df,pd.DataFrame(data = B_range, columns = ['x','y'])])
df.fillna('B',inplace=True)


model = BayesClassifier()
model.fit(df[['x','y']].values,df['label'].values)
df['pred'] = model.predict(df[['x','y']].values)
df['equal'] = df['label'] == df['pred']
df['equal'].sum()

df['color'] = df.apply(lambda row: 'A but should be B' if row['label'] == 'A' and row['pred'] == 'B' else 'B but should be A' if row['label'] == 'B' and row['pred'] == 'A' else row['label'],axis=1)
df.plot(backend = 'plotly',x='x',y='y',color='color',kind='scatter')
