Library Imports

In [1]:
import numpy as np
import regex as re
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

Dataset Loading

In [2]:
news_data = pd.read_csv('combined_articles.csv')

Data set cleaning

In [32]:
urdu_stop_words = ['ہے', 'اور', 'کو', 'میں', 'یہ', 'سے', 'کہ', 'کا', 'کی', 'ایک']

def clean_urdu_data(sentence):
    urls = r'http\S+|www\S+|https\S+' 
    non_urdu = r'[^\u0600-\u06FF\s]' 
    whitespaces = r'\s+'           
    sentence = re.sub(urls, '', sentence)
    sentence = re.sub(non_urdu, '', sentence)
    sentence = re.sub(whitespaces, ' ', sentence).strip()
    filtered_words = [word for word in sentence.split() if word not in urdu_stop_words]
    return ' '.join(filtered_words)

news_data['cleaned_content'] = news_data['content'].apply(lambda x: clean_urdu_data(x) if pd.notnull(x) else x)
news_data = news_data[news_data['cleaned_content'] != '']

Extracting X and Y and splitting test and train


In [33]:
news_data = news_data.dropna(subset=['cleaned_content'])
X = news_data['cleaned_content']
y = news_data['gold_label']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

Bag of Words class to vectorize the content

In [34]:
class BagOfWords:
    def __init__(self):
        self.vocabulary = [] 
        self.word_dic = {}   

    def fit(self, sentences):
        word_set = set()
        for sentence in sentences:
            words = sentence.split()
            word_set.update(words)
        self.vocabulary = sorted(word_set)
        self.word_dic = {word: i for i, word in enumerate(self.vocabulary)}

    def vectorize(self, sentence):
        vector = [0] * len(self.vocabulary) 
        words = sentence.split()
        for word in words:
            if word in self.word_dic: 
                index = self.word_dic[word]
                vector[index] += 1 
        return vector

    def transform(self, sentences):
        return [self.vectorize(sentence) for sentence in sentences]

Multinomial Class implementation

In [35]:
class MultinomialNB:
    def __init__(self):
        self.priors = {}
        self.conditional_probabilities = {}
        self.classes = []
        self.vocab_size = 0

    def calculate_priors(self, y_train):
        total = len(y_train)
        self.classes, counts = np.unique(y_train, return_counts=True)
        self.priors = {c: count / total for c, count in zip(self.classes, counts)}

    def calculate_word_counts_per_class(self, X_train, y_train):
        y_train = np.array(y_train)
        word_counts = {c: np.zeros(self.vocab_size) for c in self.classes}
        class_totals = {c: 0 for c in self.classes}
        for i, vector in enumerate(X_train):
            label = y_train[i]
            word_counts[label] += vector
            class_totals[label] += sum(vector)
        return word_counts, class_totals

    def calculate_conditional_probabilities(self, word_counts, total_word_counts):
        for c in self.classes:
            counts = word_counts[c]
            total = total_word_counts[c]
            self.conditional_probabilities[c] = (counts + 1) / (total + self.vocab_size)

    def fit(self, X_train, y_train, vocab_size):
        self.vocab_size = vocab_size
        self.calculate_priors(y_train)
        word_counts, total_counts = self.calculate_word_counts_per_class(X_train, y_train)
        self.calculate_conditional_probabilities(word_counts, total_counts)

    def predict(self, vector):
        log_probs = {}
        for c in self.classes:
            log_prob = np.log(self.priors[c])
            for i in range(self.vocab_size):
                if vector[i] != 0:
                    log_prob += vector[i] * np.log(self.conditional_probabilities[c][i])
            log_probs[c] = log_prob
        return max(log_probs, key=log_probs.get)

    def predict_batch(self, X_data):
        return [self.predict(vector) for vector in X_data]


Applying Bag-of-Words and fitting Multinomial model to train data and then predicting the test dataset

In [36]:
bow = BagOfWords()
bow.fit(X_train)
X_train_bow = bow.transform(X_train)
X_test_bow = bow.transform(X_test)

nb = MultinomialNB()
nb.fit(X_train_bow, y_train, len(bow.vocabulary))
predictions = nb.predict_batch(X_test_bow)
print(predictions)

['science-technology', 'business', 'science-technology', 'world', 'science-technology', 'science-technology', 'entertainment', 'world', 'business', 'world', 'business', 'business', 'entertainment', 'sports', 'world', 'world', 'business', 'entertainment', 'entertainment', 'business', 'business', 'business', 'entertainment', 'world', 'world', 'business', 'science-technology', 'entertainment', 'business', 'sports', 'science-technology', 'entertainment', 'business', 'world', 'entertainment', 'business', 'world', 'sports', 'science-technology', 'business', 'sports', 'business', 'world', 'sports', 'entertainment', 'business', 'science-technology', 'business', 'business', 'entertainment', 'business', 'science-technology', 'world', 'world', 'entertainment', 'world', 'science-technology', 'business', 'sports', 'world', 'science-technology', 'sports', 'world', 'sports', 'science-technology', 'world', 'entertainment', 'sports', 'entertainment', 'sports', 'sports', 'entertainment', 'entertainment'

Model accuracy and metrics

In [37]:
print("Accuracy:", accuracy_score(y_test, predictions))
print("\nClassification Report:")
print(classification_report(y_test, predictions))

print("\nConfusion Matrix:")
print(confusion_matrix(y_test, predictions))

Accuracy: 0.9642857142857143

Classification Report:
                    precision    recall  f1-score   support

          business       1.00      0.97      0.99        73
     entertainment       0.95      0.99      0.97        72
science-technology       0.95      0.97      0.96        60
            sports       1.00      0.98      0.99        66
             world       0.92      0.91      0.91        65

          accuracy                           0.96       336
         macro avg       0.96      0.96      0.96       336
      weighted avg       0.96      0.96      0.96       336


Confusion Matrix:
[[71  0  0  0  2]
 [ 0 71  0  0  1]
 [ 0  1 58  0  1]
 [ 0  0  0 65  1]
 [ 0  3  3  0 59]]
