This notebook aims is to build a Naïve Bayes classification model that is able to predict the class (category) of an unseen document correctly out of 91 classes.

Importing Libraries

In [1]:
import os
import re
import numpy as np
import nltk
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
from sklearn.metrics import f1_score
from collections import defaultdict

Loading data

In [2]:
data_path = "D:/4th year 2nd semester/NLP/Assignment-1 Text Classification/training"

# Function to load data from folders
def load_data(data_path):
    documents = []
    classes = []
    
    for label in os.listdir(data_path):
        label_path = os.path.join(data_path, label)
        if os.path.isdir(label_path):
            for file_name in os.listdir(label_path):
                file_path = os.path.join(label_path, file_name)
                with open(file_path, 'r', encoding='utf-8') as file:
                    content = file.read()
                    documents.append(content)
                    classes.append(label) 
    
    return documents, classes

Spliitting Data

In [7]:
from sklearn.model_selection import train_test_split

# Load data
documents, classes = load_data(data_path)

# Identify classes with only one file
unique_classes = set(classes)
classes_with_one_file = [label for label in unique_classes if classes.count(label) == 1]

# Split data into training and testing sets excluding classes with one file
remaining_documents, remaining_classes = zip(*[
    (doc, cls) for doc, cls in zip(documents, classes) if cls not in classes_with_one_file
])

# Split remaining data into training and testing sets
train_documents, test_documents, train_classes, test_classes = train_test_split(
    remaining_documents, remaining_classes, test_size=0.2, stratify=remaining_classes
)

# Include all instances of classes with one file in the training set
for label in classes_with_one_file:
    index = [i for i, cls in enumerate(classes) if cls == label][0]
    train_documents.append(documents[index])
    train_classes.append(label)
    test_documents = [doc for doc, cls in zip(test_documents, test_classes) if cls != label]
    test_classes = [cls for cls in test_classes if cls != label]


In [8]:
print(len(documents),len(set(classes)))


11413 91


Text preprocessing

In [9]:
def preprocess_and_extract_vocabulary_ngrams(docs, stop_words=None, stemmer=None, ngram=1):
    if stop_words is None:
        stop_words = set(stopwords.words('english'))
    
    preprocessed_documents = []
    vocabulary = set()

    if stemmer is None:
        stemmer = PorterStemmer()

    for doc in docs:
        # Remove non-alphanumeric characters
        text = re.sub(r'\W', ' ', doc)
        # Tokenization with n-grams
        tokens = nltk.word_tokenize(text)

        if ngram > 1:
            tokens = list(nltk.ngrams(tokens, ngram))
        
        # Remove non-alphabetic tokens, remove stop words, and apply stemming
        tokens = [stemmer.stem(word.lower()) if isinstance(word, str) else ' '.join(map(stemmer.stem, word)) for word in tokens if (isinstance(word, str) and word.isalpha()) or ngram > 1 and all(w.isalpha() for w in word) and ' '.join(word).lower() not in stop_words]
        preprocessed_documents.append(tokens)
        vocabulary.update(tokens)
   
    return preprocessed_documents, vocabulary
train_preprocessed_documents, train_vocabulary = preprocess_and_extract_vocabulary_ngrams(train_documents)
test_preprocessed_documents, test_vocabulary = preprocess_and_extract_vocabulary_ngrams(test_documents)

In [10]:
print(len(train_preprocessed_documents), len(train_vocabulary))
print(len(test_preprocessed_documents), len(test_vocabulary))


9131 18875
2282 10324


Estimate model parameters:

In [11]:
def estimate_probabilities(preprocessed_documents, classes, vocabulary, alpha=1, ngrams=1):
    prior_probs = {}
    total_documents = len(classes)
    
    for label in set(classes):
        prior_probs[label] = classes.count(label) / total_documents

    likelihood_probs = {}

    for label in set(classes):
        class_docs = [preprocessed_documents[i] for i in range(total_documents) if classes[i] == label]
        word_count = sum(len(doc) for doc in class_docs)
        
        word_probs = {}
        if ngrams == 1:
            for word in vocabulary:
                # Numerator
                word_freq = sum(doc.count(word) for doc in class_docs)
                # Prob (w | c):
                word_probs[word] = (word_freq + alpha) / (word_count + alpha * len(vocabulary))
        else:
            # For n-grams
            ngram_counts = defaultdict(int)
            for doc in class_docs:
                for ngram in doc:
                    ngram_counts[ngram] += 1

            for ngram in vocabulary:
                # Numerator
                ngram_freq = ngram_counts[ngram]
                # Prob (w | c):
                word_probs[ngram] = (ngram_freq + alpha) / (word_count + alpha * len(vocabulary))

        likelihood_probs[label] = word_probs

    return prior_probs, likelihood_probs


In [12]:

train_prior_probs, train_likelihood_probs = estimate_probabilities(train_preprocessed_documents, train_classes, train_vocabulary)

Document Classification

In [13]:
#Function to classify a document using Naïve Bayes
def classify_document(document):
    class_scores = {}
    
    for label in set(classes):
        prior_prob = np.log(train_prior_probs[label])
        likelihood_prob = sum(np.log(train_likelihood_probs[label].get(word, 1)) for word in document)
        class_scores[label] = prior_prob + likelihood_prob
    return max(class_scores, key=class_scores.get)

Model evaluation:

unigram

In [14]:
# Predict labels for test set
predicted_labels = [classify_document(doc) for doc in test_preprocessed_documents]

# Calculate F1-score
macro_f1 = f1_score(test_classes, predicted_labels, average='macro')

# Print the Macro-averaged F1-score
print("Macro-averaged F1-score:", macro_f1)


Macro-averaged F1-score: 0.16504060408199062


ngrams (bi + tri)

In [15]:
train_preprocessed_documents_tri, train_vocabulary_tri = preprocess_and_extract_vocabulary_ngrams(train_documents, ngram=3)
test_preprocessed_documents_tri, test_vocabulary_tri = preprocess_and_extract_vocabulary_ngrams(test_documents, ngram=3)

train_preprocessed_documents_bi, train_vocabulary_bi = preprocess_and_extract_vocabulary_ngrams(train_documents, ngram=2)
test_preprocessed_documents_bi, test_vocabulary_bi = preprocess_and_extract_vocabulary_ngrams(test_documents, ngram=2)

In [17]:
train_prior_probs_trigrams, train_likelihood_probs_trigrams = estimate_probabilities(train_preprocessed_documents_tri, train_classes, train_vocabulary_tri, ngrams=3)
train_prior_probs_bigrams, train_likelihood_probs_bigrams = estimate_probabilities(train_preprocessed_documents_bi, train_classes, train_vocabulary_bi, ngrams=2)


In [18]:
# Function to classify a document using Naïve Bayes
def classify_document_ngrams(document, ngram):
    class_scores = {}
    
    if ngram ==2:
        for label in set(classes):
            prior_prob = np.log(train_prior_probs_bigrams[label])
            train_likelihood_probs_bi = sum(np.log(train_likelihood_probs_bigrams[label].get(word, 1)) for word in document)
            class_scores[label] = prior_prob + train_likelihood_probs_bi
        return max(class_scores, key=class_scores.get)
    else:
        for label in set(classes):
            prior_prob = np.log(train_prior_probs_trigrams[label])
            train_likelihood_probs_tri = sum(np.log(train_likelihood_probs_trigrams[label].get(word, 1)) for word in document)
            class_scores[label] = prior_prob + train_likelihood_probs_tri
        return max(class_scores, key=class_scores.get)

# Predict labels for the test set
predicted_labels_bi = [classify_document_ngrams(doc, 2) for doc in test_preprocessed_documents_bi]

# Calculate F1-score
macro_f1_bi = f1_score(test_classes, predicted_labels_bi, average='macro')

# Print the Macro-averaged F1-score
print("Macro-averaged F1-score using bigrams:", macro_f1_bi)


# Predict labels for the test set
predicted_labels_tri = [classify_document_ngrams(doc, 3) for doc in test_preprocessed_documents_tri]

# Calculate F1-score
macro_f1_tri = f1_score(test_classes, predicted_labels_tri, average='macro')

# Print the Macro-averaged F1-score
print("Macro-averaged F1-score using trigrams:", macro_f1_tri)



Macro-averaged F1-score using bigrams: 0.17587406643090847
Macro-averaged F1-score using trigrams: 0.19623392513154234
