In [1]:
Sports_URLs = ['https://en.wikipedia.org/wiki/Football',
        'https://en.wikipedia.org/wiki/Cricket',
        'https://en.wikipedia.org/wiki/Badminton',
        'https://en.wikipedia.org/wiki/Basketball',
        'https://en.wikipedia.org/wiki/Hockey']

Education_URLs = ['https://en.wikipedia.org/wiki/School',
        'https://en.wikipedia.org/wiki/College',
        'https://en.wikipedia.org/wiki/University',
        'https://en.wikipedia.org/wiki/Professor',
        'https://en.wikipedia.org/wiki/Teacher']

In [2]:
import requests
from bs4 import BeautifulSoup
import re
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize

stemmer = PorterStemmer()
stop_words = set(stopwords.words("english"))

def clean_text(text):
    text = text.lower()
    text = re.sub(r"[^\w\s]", "", text)
    tokens = word_tokenize(text)
    tokens = [stemmer.stem(word) for word in tokens if word not in stop_words]
    text = " ".join(tokens)
    return text

def get_text(URL):
    page = requests.get(URL)
    soup = BeautifulSoup(page.content, "html.parser")
    Ps = soup.find_all("p")
    text = ""
    for p in Ps:
        text += p.text.strip()
    return clean_text(text)

Sports_text = [get_text(URL) for URL in Sports_URLs]
Education_text = [get_text(URL) for URL in Education_URLs]

In [22]:
# Sports_text

In [23]:
# Education_text

In [3]:
# Prepare a dataframe with the text and the category as labels
import pandas as pd

df = pd.DataFrame({
    "text": Sports_text + Education_text,
    "category": ["Sports"]*len(Sports_text) + ["Education"]*len(Education_text)
})

df

Unnamed: 0,text,category
0,footbal famili team sport involv vari degre ki...,Sports
1,firstclass cricketon day internationallimit ov...,Sports
2,badminton racquet sport play use racquet hit s...,Sports
3,basketbal team sport two team commonli five pl...,Sports
4,hockey term use denot famili variou type summe...,Sports
5,school educ institut build design provid learn...,Education
6,colleg latin collegium educ institut constitu ...,Education
7,univers latin universita whole institut higher...,Education
8,professor commonli abbrevi prof1 academ rank u...,Education
9,teacher also call schoolteach formal educ pers...,Education


In [4]:
from collections import Counter
import numpy as np

def get_unigram_counts(texts):
    unigram_counts = Counter()
    for text in texts:
        unigram_counts.update(text.split())
    return unigram_counts

unigram_counts = get_unigram_counts(df["text"])
unigram_counts

def get_unigram_count_matrix(texts, unigram_counts):
    matrix = np.zeros((len(texts), len(unigram_counts)))
    for i, text in enumerate(texts):
        counts = Counter(text.split())
        for j, word in enumerate(unigram_counts):
            matrix[i, j] = counts[word]
    return matrix

unigram_count_matrix = get_unigram_count_matrix(df["text"], unigram_counts)
unigram_count_matrix

unigram_count_df = pd.DataFrame(unigram_count_matrix, columns=unigram_counts.keys())
unigram_count_df

Unnamed: 0,footbal,famili,team,sport,involv,vari,degre,kick,ball,score,...,processknow,nay,nonautocrat,guardian,cherish,ancestor,succeed,never,realli,nurtur
0,211.0,1.0,23.0,30.0,9.0,4.0,2.0,30.0,92.0,6.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,6.0,0.0,61.0,18.0,4.0,0.0,0.0,0.0,64.0,31.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1.0,0.0,10.0,19.0,3.0,0.0,0.0,0.0,8.0,9.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,5.0,0.0,88.0,22.0,4.0,6.0,1.0,2.0,104.0,15.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,2.0,1.0,11.0,30.0,3.0,2.0,0.0,0.0,17.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,2.0,0.0,0.0,0.0,2.0,3.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,0.0,0.0,0.0,1.0,1.0,5.0,50.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,0.0,1.0,0.0,0.0,2.0,3.0,11.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,0.0,0.0,1.0,0.0,0.0,1.0,2.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,0.0,1.0,0.0,0.0,6.0,5.0,12.0,0.0,0.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [26]:
unigram_counts

Counter({'school': 463,
         'univers': 447,
         'colleg': 393,
         'play': 310,
         'ball': 285,
         'game': 278,
         'player': 275,
         'footbal': 225,
         'educ': 211,
         'use': 202,
         'team': 194,
         'teacher': 176,
         'first': 170,
         'also': 167,
         'institut': 162,
         'one': 151,
         'student': 151,
         'may': 149,
         'rule': 142,
         'cricket': 128,
         'state': 123,
         'sport': 120,
         'two': 112,
         'mani': 111,
         'includ': 109,
         'professor': 108,
         'call': 106,
         'hockey': 103,
         'refer': 102,
         'basketbal': 102,
         'nation': 100,
         'match': 98,
         'form': 95,
         'high': 94,
         'often': 92,
         'new': 91,
         'term': 91,
         'teach': 90,
         'known': 89,
         'unit': 88,
         'intern': 88,
         'court': 86,
         'shot': 86,
         'associ': 

In [5]:
def get_bigram_counts(texts):
    bigram_counts = Counter()
    for text in texts:
        words = text.split()
        bigrams = [(words[i], words[i+1]) for i in range(len(words)-1)]
        bigram_counts.update(bigrams)
    return bigram_counts

bigram_counts = get_bigram_counts(df["text"])
bigram_counts

def get_bigram_prob_matrix(texts, bigram_counts, unigram_counts):
    matrix = np.zeros((len(texts), len(bigram_counts)))
    for i, text in enumerate(texts):
        words = text.split()
        bigrams = [(words[i], words[i+1]) for i in range(len(words)-1)]
        for j, bigram in enumerate(bigram_counts):
            matrix[i, j] = bigram_counts[bigram]/unigram_counts[bigram[0]]
    return matrix

bigram_prob_matrix = get_bigram_prob_matrix(df["text"], bigram_counts, unigram_counts)
bigram_prob_matrix

bigram_prob_df = pd.DataFrame(bigram_prob_matrix, columns=bigram_counts.keys())
bigram_counts

Counter({('unit', 'state'): 50,
         ('high', 'school'): 50,
         ('secondari', 'school'): 46,
         ('higher', 'educ'): 42,
         ('game', 'play'): 33,
         ('public', 'school'): 28,
         ('univers', 'colleg'): 27,
         ('colleg', 'univers'): 26,
         ('new', 'zealand'): 25,
         ('ice', 'hockey'): 25,
         ('educ', 'institut'): 25,
         ('associ', 'footbal'): 23,
         ('19th', 'centuri'): 23,
         ('term', 'colleg'): 21,
         ('unit', 'kingdom'): 20,
         ('limit', 'over'): 18,
         ('world', 'cup'): 17,
         ('rugbi', 'footbal'): 16,
         ('footbal', 'club'): 16,
         ('privat', 'school'): 15,
         ('free', 'throw'): 15,
         ('bachelor', 'degre'): 15,
         ('rule', 'footbal'): 14,
         ('ball', 'game'): 14,
         ('may', 'also'): 14,
         ('word', 'colleg'): 14,
         ('american', 'footbal'): 13,
         ('servic', 'court'): 13,
         ('also', 'use'): 13,
         ('institut', 'h

In [28]:
bigram_prob_df.shape

(10, 28150)

In [7]:
def get_tf_matrix(texts, unigram_counts):
    matrix = np.zeros((len(texts), len(unigram_counts)))
    for i, text in enumerate(texts):
        counts = Counter(text.split())
        for j, word in enumerate(unigram_counts):
            # print(f"{j} {word}")
            matrix[i, j] = counts[word]
    return matrix

tf_matrix = get_tf_matrix(df["text"], unigram_counts)
tf_matrix

def get_idf_vector(texts, unigram_counts):
    idf_vector = np.zeros(len(unigram_counts))
    for j, word in enumerate(unigram_counts):
        idf_vector[j] = np.log(len(texts)/sum([1 for text in texts if word in text]))
    return idf_vector

idf_vector = get_idf_vector(df["text"], unigram_counts)
idf_vector

def get_tfidf_matrix(tf_matrix, idf_vector):
    return tf_matrix*idf_vector

tfidf_matrix = get_tfidf_matrix(tf_matrix, idf_vector)
tfidf_matrix

tfidf_df = pd.DataFrame(tfidf_matrix, columns=unigram_counts.keys())
tfidf_df

Unnamed: 0,footbal,famili,team,sport,involv,vari,degre,kick,ball,score,...,processknow,nay,nonautocrat,guardian,cherish,ancestor,succeed,never,realli,nurtur
0,146.254055,0.693147,11.748989,10.700248,2.008292,0.0,0.71335,48.283137,46.995957,2.14005,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,4.158883,0.0,31.160363,6.420149,0.892574,0.0,0.0,0.0,32.69284,11.056923,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.693147,0.0,5.108256,6.776824,0.669431,0.0,0.0,0.0,4.086605,3.210074,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,3.465736,0.0,44.952655,7.846849,0.892574,0.0,0.356675,3.218876,53.125865,5.350124,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1.386294,0.693147,5.619082,10.700248,0.669431,0.0,0.0,0.0,8.684036,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,1.386294,0.0,0.0,0.0,0.0,1.070025,0.0,0.0,0.356675,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,0.0,0.0,0.0,0.356675,0.223144,0.0,17.833747,0.0,0.0,0.356675,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,0.0,0.693147,0.0,0.0,0.446287,0.0,3.923424,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,0.0,0.0,0.510826,0.0,0.0,0.0,0.71335,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,0.0,0.693147,0.0,0.0,1.338861,0.0,4.280099,0.0,0.0,0.356675,...,2.302585,2.302585,2.302585,2.302585,2.302585,2.302585,2.302585,0.916291,2.302585,2.302585


In [30]:
tf_matrix.shape

(10, 6440)

In [14]:
import numpy as np

class ManualMultinomialNB:
    def __init__(self, alpha=1.0):
        self.alpha = alpha
        self.class_probs = None
        self.feature_probs = None
        self.classes = None

    def _calculate_feature_probs(self, X, y):
        num_classes = len(self.classes)
        num_features = X.shape[1]

        feature_counts = np.zeros((num_classes, num_features))
        class_counts = np.zeros(num_classes)

        for i in range(num_classes):
            class_mask = (y == self.classes[i])
            class_counts[i] = np.sum(class_mask)
            feature_counts[i, :] = np.sum(X[class_mask, :], axis=0)

        # Add alpha for smoothing
        feature_probs = (feature_counts + self.alpha) / (
            np.sum(feature_counts, axis=1)[:, np.newaxis] + self.alpha * num_features
        )

        return feature_probs, class_counts / len(y)

    def fit(self, X, y):
        self.classes, y_indices = np.unique(y, return_inverse=True)
        num_classes = len(self.classes)
        num_features = X.shape[1]

        self.class_probs = np.zeros(num_classes)
        self.feature_probs = np.zeros((num_classes, num_features))

        self.feature_probs, self.class_probs = self._calculate_feature_probs(X, y)

    def predict_proba(self, X):
        if self.feature_probs is None or self.class_probs is None:
            raise ValueError("Model not fitted")

        log_probs = np.log(self.feature_probs)
        class_log_probs = np.log(self.class_probs)

        # Calculate the log likelihood of each class for each sample
        log_likelihood = X @ log_probs.T + class_log_probs

        # Calculate the probabilities using the log-sum-exp trick
        exp_log_likelihood = np.exp(log_likelihood - np.max(log_likelihood, axis=1)[:, np.newaxis])
        probs = exp_log_likelihood / np.sum(exp_log_likelihood, axis=1)[:, np.newaxis]

        return probs

    def predict(self, X):
        probas = self.predict_proba(X)
        return self.classes[np.argmax(probas, axis=1)]


In [15]:
def accuracy_score(y_true, y_pred):
    """
    Calculate accuracy score.

    Parameters:
    - y_true: true class labels
    - y_pred: predicted class labels

    Returns:
    - Accuracy score
    """
    correct_predictions = np.sum(y_true == y_pred)
    total_samples = len(y_true)
    
    accuracy = correct_predictions / total_samples
    return accuracy

In [20]:
#custom train test split
def train_test_split(X, y, test_size=0.2, random_state=None):
    if random_state:
        np.random.seed(random_state)
    # indices = np.arange(len(y))
    indices = np.arange(len(y))
    np.random.shuffle(indices)

    test_samples = int(test_size*len(y))

    X_train = X[indices[test_samples:]]
    X_test = X[indices[:test_samples]]
    y_train = y[indices[test_samples:]]
    y_test = y[indices[:test_samples]]

    return X_train, X_test, y_train, y_test

In [21]:
# from sklearn.model_selection import train_test_split
# from sklearn.metrics import accuracy_score

X_train, X_test, y_train, y_test = train_test_split(unigram_count_matrix, df["category"], test_size=0.2, random_state=42)

model = ManualMultinomialNB()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
accuracy_score(y_test, y_pred)

1.0

In [17]:
X_train, X_test, y_train, y_test = train_test_split(bigram_prob_matrix, df["category"], test_size=0.2, random_state=42)

model = ManualMultinomialNB()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
accuracy_score(y_test, y_pred)

0.5

In [18]:
import numpy as np
from scipy.stats import norm

class ManualGaussianNB:
    def __init__(self):
        self.class_probs = None
        self.class_params = None
        self.classes = None

    def fit(self, X, y):
        self.classes, y_indices = np.unique(y, return_inverse=True)
        num_classes = len(self.classes)
        num_features = X.shape[1]

        self.class_probs = np.zeros(num_classes)
        self.class_params = np.zeros((num_classes, 2, num_features))  # 2 parameters for mean and variance

        for i in range(num_classes):
            class_mask = (y == self.classes[i])
            self.class_probs[i] = np.mean(class_mask)
            self.class_params[i, 0, :] = np.mean(X[class_mask, :], axis=0)
            self.class_params[i, 1, :] = np.var(X[class_mask, :], axis=0)
    def predict_proba(self, X):
        if self.class_probs is None or self.class_params is None:
            raise ValueError("Model not fitted")

        num_samples, num_features = X.shape
        num_classes = len(self.classes)

        log_probs = np.zeros((num_samples, num_classes))

        for i in range(num_classes):
            log_probs[:, i] = (
                np.log(self.class_probs[i])
                - 0.5 * np.sum(np.log(2 * np.pi * (self.class_params[i, 1, :] + 1e-10)))  # Add a small constant
                - 0.5 * np.sum(((X - self.class_params[i, 0, :]) ** 2) / (self.class_params[i, 1, :] + 1e-10), axis=1)
            )

        # Calculate the probabilities using the log-sum-exp trick
        exp_log_probs = np.exp(log_probs - np.max(log_probs, axis=1)[:, np.newaxis])
        probs = exp_log_probs / np.sum(exp_log_probs, axis=1)[:, np.newaxis]

        return probs

    def predict(self, X):
        probas = self.predict_proba(X)
        return self.classes[np.argmax(probas, axis=1)]


In [19]:
X_train, X_test, y_train, y_test = train_test_split(tfidf_matrix, df["category"], test_size=0.2, random_state=42)

model = ManualGaussianNB()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
accuracy_score(y_test, y_pred)

1.0