In [2]:
Sports_URLs = ['https://en.wikipedia.org/wiki/2026_FIFA_World_Cup',
        'https://en.wikipedia.org/wiki/2024_ICC_Men%27s_T20_World_Cup',
        'https://en.wikipedia.org/wiki/2024_Indian_Premier_League'
]

Education_URLs = ['https://en.wikipedia.org/wiki/National_Education_Policy_2020',
        'https://en.wikipedia.org/wiki/Central_Board_of_Secondary_Education',
        'https://en.wikipedia.org/wiki/Indian_Certificate_of_Secondary_Education'
]

In [3]:
import requests
from bs4 import BeautifulSoup
import nltk
nltk.download('stopwords')
import re
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize

stemmer = PorterStemmer()
stop_words = set(stopwords.words("english"))

def clean_text(text):
    text = text.lower()
    text = re.sub(r"[^\w\s]", "", text)
    tokens = word_tokenize(text)
    tokens = [stemmer.stem(word) for word in tokens if word not in stop_words]
    text = " ".join(tokens)
    return text

def get_text(URL):
    page = requests.get(URL)
    soup = BeautifulSoup(page.content, "html.parser")
    Ps = soup.find_all("p")
    text = ""
    for p in Ps:
        text += p.text.strip()
    return clean_text(text)

Sports_text = [get_text(URL) for URL in Sports_URLs]
Education_text = [get_text(URL) for URL in Education_URLs]

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\91720\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [4]:
# Prepare a dataframe with the text and the category as labels
import pandas as pd

df = pd.DataFrame({
    "text": Sports_text + Education_text,
    "category": ["Sports"]*len(Sports_text) + ["Education"]*len(Education_text)
})

df

Unnamed: 0,text,category
0,2026 fifa world cup market fifa world cup 262 ...,Sports
1,2024 icc men t20 world cup ninth edit t20 worl...,Sports
2,2024 indian premier leagu also known ipl 17 br...,Sports
3,nation educ polici india 2020 nep 2020 start u...,Education
4,central board secondari educ cbse nation level...,Education
5,indian certif secondari educ ics examin conduc...,Education


In [5]:
from collections import Counter
import numpy as np

def get_unigram_counts(texts):
    unigram_counts = Counter()
    for text in texts:
        unigram_counts.update(text.split())
    return unigram_counts

unigram_counts = get_unigram_counts(df["text"])
unigram_counts

def get_unigram_count_matrix(texts, unigram_counts):
    matrix = np.zeros((len(texts), len(unigram_counts)))
    for i, text in enumerate(texts):
        counts = Counter(text.split())
        for j, word in enumerate(unigram_counts):
            matrix[i, j] = counts[word]
    return matrix

unigram_count_matrix = get_unigram_count_matrix(df["text"], unigram_counts)
unigram_count_matrix

unigram_count_df = pd.DataFrame(unigram_count_matrix, columns=unigram_counts.keys())
unigram_count_df

Unnamed: 0,2026,fifa,world,cup,market,262,23rd,quadrenni,intern,men,...,blog,download,100000,scrape,analysi,data,pattern,100,attribut,variances78
0,10.0,39.0,31.0,24.0,1.0,1.0,1.0,1.0,1.0,4.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,13.0,10.0,0.0,0.0,0.0,0.0,4.0,6.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [6]:
unigram_counts

Counter({'2026': 10,
         'fifa': 39,
         'world': 47,
         'cup': 34,
         'market': 1,
         '262': 1,
         '23rd': 1,
         'quadrenni': 1,
         'intern': 9,
         'men': 10,
         'soccer': 3,
         'championship': 1,
         'contest': 3,
         'nation': 30,
         'team': 69,
         'member': 2,
         'associ': 3,
         'tournament': 36,
         'take': 10,
         'place': 14,
         'june': 10,
         '11': 5,
         'juli': 11,
         '19': 5,
         'jointli': 1,
         'host': 55,
         '16': 7,
         'citi': 30,
         'three': 17,
         'north': 4,
         'american': 4,
         'countri': 25,
         'canada': 14,
         'mexico': 12,
         'unit': 24,
         'state': 38,
         'first': 24,
         'sinc': 6,
         '199434': 1,
         'argentina': 1,
         'defend': 5,
         'championthi': 1,
         'includ': 18,
         '48': 4,
         'expand': 5,
         '325':

In [7]:
def get_bigram_counts(texts):
    bigram_counts = Counter()
    for text in texts:
        words = text.split()
        bigrams = [(words[i], words[i+1]) for i in range(len(words)-1)]
        bigram_counts.update(bigrams)
    return bigram_counts

bigram_counts = get_bigram_counts(df["text"])
bigram_counts

def get_bigram_prob_matrix(texts, bigram_counts, unigram_counts):
    matrix = np.zeros((len(texts), len(bigram_counts)))
    for i, text in enumerate(texts):
        words = text.split()
        bigrams = [(words[i], words[i+1]) for i in range(len(words)-1)]
        for j, bigram in enumerate(bigram_counts):
            matrix[i, j] = bigram_counts[bigram]/unigram_counts[bigram[0]]
    return matrix

bigram_prob_matrix = get_bigram_prob_matrix(df["text"], bigram_counts, unigram_counts)
bigram_prob_matrix

bigram_prob_df = pd.DataFrame(bigram_prob_matrix, columns=bigram_counts.keys())
bigram_counts

Counter({('2026', 'fifa'): 3,
         ('fifa', 'world'): 14,
         ('world', 'cup'): 34,
         ('cup', 'market'): 1,
         ('market', 'fifa'): 1,
         ('cup', '262'): 1,
         ('262', '23rd'): 1,
         ('23rd', 'fifa'): 1,
         ('cup', 'quadrenni'): 1,
         ('quadrenni', 'intern'): 1,
         ('intern', 'men'): 1,
         ('men', 'soccer'): 1,
         ('soccer', 'championship'): 1,
         ('championship', 'contest'): 1,
         ('contest', 'nation'): 1,
         ('nation', 'team'): 3,
         ('team', 'member'): 1,
         ('member', 'associ'): 2,
         ('associ', 'fifa'): 1,
         ('fifa', 'tournament'): 1,
         ('tournament', 'take'): 1,
         ('take', 'place'): 5,
         ('place', 'june'): 4,
         ('june', '11'): 2,
         ('11', 'juli'): 1,
         ('juli', '19'): 2,
         ('19', '2026'): 1,
         ('2026', 'jointli'): 1,
         ('jointli', 'host'): 1,
         ('host', '16'): 1,
         ('16', 'citi'): 1,
         (

In [8]:
bigram_prob_df.shape

(6, 4706)

In [9]:
def get_tf_matrix(texts, unigram_counts):
    matrix = np.zeros((len(texts), len(unigram_counts)))
    for i, text in enumerate(texts):
        counts = Counter(text.split())
        for j, word in enumerate(unigram_counts):
            # print(f"{j} {word}")
            matrix[i, j] = counts[word]
    return matrix

tf_matrix = get_tf_matrix(df["text"], unigram_counts)
tf_matrix

def get_idf_vector(texts, unigram_counts):
    idf_vector = np.zeros(len(unigram_counts))
    for j, word in enumerate(unigram_counts):
        idf_vector[j] = np.log(len(texts)/sum([1 for text in texts if word in text]))
    return idf_vector

idf_vector = get_idf_vector(df["text"], unigram_counts)
idf_vector

def get_tfidf_matrix(tf_matrix, idf_vector):
    return tf_matrix*idf_vector

tfidf_matrix = get_tfidf_matrix(tf_matrix, idf_vector)
tfidf_matrix

tfidf_df = pd.DataFrame(tfidf_matrix, columns=unigram_counts.keys())
tfidf_df

Unnamed: 0,2026,fifa,world,cup,market,262,23rd,quadrenni,intern,men,...,blog,download,100000,scrape,analysi,data,pattern,100,attribut,variances78
0,17.917595,69.878619,12.569418,26.366695,1.791759,1.098612,1.791759,1.791759,0.405465,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,5.271046,10.986123,0.0,0.0,0.0,0.0,1.62186,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.81093,0.0,0.0,0.0,0.0,0.0,0.81093,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.405465,0.0,0.0,0.0,0.0,0.0,0.81093,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.791759,1.791759,1.791759,1.791759,1.791759,1.791759,1.791759,1.791759,1.791759,1.791759


In [10]:
tf_matrix.shape

(6, 1924)

In [11]:
import numpy as np

class CustomMultinomialNB:
    def __init__(self, alpha=1.0):
        self.alpha = alpha
        self.class_probs = None
        self.feature_probs = None
        self.classes = None

    def _calculate_feature_probs(self, X, y):
        num_classes = len(self.classes)
        num_features = X.shape[1]

        feature_counts = np.zeros((num_classes, num_features))
        class_counts = np.zeros(num_classes)

        for i in range(num_classes):
            class_mask = (y == self.classes[i])
            class_counts[i] = np.sum(class_mask)
            feature_counts[i, :] = np.sum(X[class_mask, :], axis=0)

        # Add alpha for smoothing
        feature_probs = (feature_counts + self.alpha) / (
            np.sum(feature_counts, axis=1)[:, np.newaxis] + self.alpha * num_features
        )

        return feature_probs, class_counts / len(y)

    def fit(self, X, y):
        self.classes, y_indices = np.unique(y, return_inverse=True)
        num_classes = len(self.classes)
        num_features = X.shape[1]

        self.class_probs = np.zeros(num_classes)
        self.feature_probs = np.zeros((num_classes, num_features))

        self.feature_probs, self.class_probs = self._calculate_feature_probs(X, y)

    def predict_proba(self, X):
        if self.feature_probs is None or self.class_probs is None:
            raise ValueError("Model not fitted")

        log_probs = np.log(self.feature_probs)
        class_log_probs = np.log(self.class_probs)

        # Calculate the log likelihood of each class for each sample
        log_likelihood = X @ log_probs.T + class_log_probs

        # Calculate the probabilities using the log-sum-exp trick
        exp_log_likelihood = np.exp(log_likelihood - np.max(log_likelihood, axis=1)[:, np.newaxis])
        probs = exp_log_likelihood / np.sum(exp_log_likelihood, axis=1)[:, np.newaxis]

        return probs

    def predict(self, X):
        probas = self.predict_proba(X)
        return self.classes[np.argmax(probas, axis=1)]


In [12]:
def custom_accuracy_score(y_true, y_pred):
    """
    Calculate accuracy score.

    Parameters:
    - y_true: true class labels
    - y_pred: predicted class labels

    Returns:
    - Accuracy score
    """
    correct_predictions = np.sum(y_true == y_pred)
    total_samples = len(y_true)
    
    accuracy = correct_predictions / total_samples
    return accuracy

In [13]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

X_train, X_test, y_train, y_test = train_test_split(unigram_count_matrix, df["category"], test_size=0.2, random_state=42)

model = CustomMultinomialNB()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
accuracy_score(y_test, y_pred)

1.0

In [20]:
X_train, X_test, y_train, y_test = train_test_split(bigram_prob_matrix, df["category"], test_size=0.2, random_state=42)

model = CustomMultinomialNB()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
accuracy_score(y_test, y_pred)

0.0

In [16]:
import numpy as np
from scipy.stats import norm

class CustomGaussianNB:
    def __init__(self):
        self.class_probs = None
        self.class_params = None
        self.classes = None

    def fit(self, X, y):
        self.classes, y_indices = np.unique(y, return_inverse=True)
        num_classes = len(self.classes)
        num_features = X.shape[1]

        self.class_probs = np.zeros(num_classes)
        self.class_params = np.zeros((num_classes, 2, num_features))  # 2 parameters for mean and variance

        for i in range(num_classes):
            class_mask = (y == self.classes[i])
            self.class_probs[i] = np.mean(class_mask)
            self.class_params[i, 0, :] = np.mean(X[class_mask, :], axis=0)
            self.class_params[i, 1, :] = np.var(X[class_mask, :], axis=0)
    def predict_proba(self, X):
        if self.class_probs is None or self.class_params is None:
            raise ValueError("Model not fitted")

        num_samples, num_features = X.shape
        num_classes = len(self.classes)

        log_probs = np.zeros((num_samples, num_classes))

        for i in range(num_classes):
            log_probs[:, i] = (
                np.log(self.class_probs[i])
                - 0.5 * np.sum(np.log(2 * np.pi * (self.class_params[i, 1, :] + 1e-10)))  # Add a small constant
                - 0.5 * np.sum(((X - self.class_params[i, 0, :]) ** 2) / (self.class_params[i, 1, :] + 1e-10), axis=1)
            )

        # Calculate the probabilities using the log-sum-exp trick
        exp_log_probs = np.exp(log_probs - np.max(log_probs, axis=1)[:, np.newaxis])
        probs = exp_log_probs / np.sum(exp_log_probs, axis=1)[:, np.newaxis]

        return probs

    def predict(self, X):
        probas = self.predict_proba(X)
        return self.classes[np.argmax(probas, axis=1)]


In [17]:
X_train, X_test, y_train, y_test = train_test_split(tfidf_matrix, df["category"], test_size=0.2, random_state=42)

from sklearn.naive_bayes import GaussianNB
model = CustomGaussianNB()
model.fit(X_train, y_train)
y_pred = model.predict(X_train)
accuracy_score(y_train, y_pred)

1.0