<a href="https://colab.research.google.com/github/Ansh-agrawal12/IIIT-HBD-MODULES/blob/main/Module_8_Project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Install necessary libraries
!pip install numpy pandas pdfreader
import typing

# Import libraries
import re
import numpy as np
import pandas as pd
from collections import defaultdict
from pdfreader import PDFDocument, SimplePDFViewer, document

# Define functions

# Function to clean text
def clean_text(text: str, min_length: typing.Optional[int] = None):
    """
    Lower-case the text, replace every character outside a-z with a space,
    tokenize on whitespace, and drop the tokens that are too short.

    :param text: The text to be cleaned
    :param min_length: Minimum number of characters a token needs in order to
        be kept. When omitted, the module-level ``WORD_LENGTH_THRESHOLD`` is
        used (looked up at call time).
    :return: The list of surviving lower-case tokens, in their original order
    """
    if min_length is None:
        # Resolved lazily: the constant is assigned further down the file,
        # so it cannot be used as a default value in the signature.
        min_length = WORD_LENGTH_THRESHOLD
    # Digits, punctuation and non-ASCII letters all act as token separators.
    letters_only = re.sub(r"[^a-z]", " ", text.lower())
    return [token for token in letters_only.split() if len(token) >= min_length]

# Function to parse text from PDF
def parse_pdf(filename: str):
    """
    Read text from a PDF file.
    Clean the text, tokenize it, and return as a list of tokens.

    :param filename: Path of the PDF file to read
    :return: The tokens returned by ``clean_text`` for the concatenated text
        strings of every page
    """
    output_strings = []
    # The context manager closes the file even when parsing raises.
    with open(filename, "rb") as fd:
        pdf = PDFDocument(fd)
        viewer = SimplePDFViewer(fd)
        page_count = sum(1 for _ in pdf.pages())
        # Viewer page numbers start at 1; visit each page in turn instead of
        # rendering the first page over and over.
        for page_number in range(1, page_count + 1):
            viewer.navigate(page_number)
            viewer.render()
            output_strings.extend(viewer.canvas.strings)
    file_contents = " ".join(output_strings)
    return clean_text(file_contents)

# Function to parse resume DataFrame
def parse_resume_df(csv_path: str = "/content/resume-dataset.csv"):
    """
    Load the resume dataset and tokenize every resume.

    :param csv_path: Location of the CSV file. It must provide a ``Resume``
        column (passed to ``clean_text``) and a ``Category`` column. Defaults
        to the Colab path this notebook was written against.
    :return: A ``(keywords, categories)`` pair of arrays: the ``clean_text``
        result for each resume, and the matching ``Category`` values
    """
    resume_df = pd.read_csv(csv_path)
    resume_df["Keywords"] = resume_df["Resume"].apply(clean_text)
    return resume_df["Keywords"].values, resume_df["Category"].values

# Define classes

# Bag of Words class
class BagOfWords:
    """
    Vocabulary-based encoder for tokenized text.

    The vocabulary is the sorted set of distinct words seen at construction
    time. Tokens can then be mapped to vocabulary indices (``encode_data``),
    mapped back to words (``decode_data``), or turned into per-word count
    vectors (``get_counts``, also available by calling the instance).
    """

    def __init__(self, data: typing.Iterable) -> None:
        """
        Generate the bag of words
        :param data: an array of words, or an iterable containing arrays of words
        """
        data = np.array(self.__linearize_array(data))
        # np.unique returns the distinct words in sorted order, which fixes
        # the index assigned to each word.
        self.index_to_words = np.unique(data)
        self.words_to_index = {w: i for i, w in enumerate(self.index_to_words)}

    @classmethod
    def __linearize_array(cls, text):
        """
        Flatten an arbitrarily nested iterable of words into one flat list.
        """
        x = []
        for item in text:
            if isinstance(item, str):
                x.append(item)
            else:
                x.extend(cls.__linearize_array(item))
        return x

    def __call__(self, text: typing.Iterable[str]) -> np.ndarray:
        """Shorthand for ``get_counts(text)``."""
        return self.get_counts(text)

    def __len__(self) -> int:
        """Number of distinct words in the vocabulary."""
        return len(self.index_to_words)

    def encode_data(
        self,
        text: typing.Union[typing.Iterable[str], typing.Iterable[typing.Iterable[str]]],
    ) -> list:
        """
        Compute the encodings of words in a new input tokenized string

        :param text: a sequence of words, or a (nested) sequence of such
            sequences
        :return: a list of vocabulary indices with the same nesting as the
            input; words missing from the vocabulary are skipped
        """
        x = []
        for item in text:
            if isinstance(item, str):
                if item in self.words_to_index:
                    x.append(self.words_to_index[item])
            else:
                x.append(self.encode_data(item))
        return x

    def decode_data(self, encoded_text):
        """
        Map vocabulary indices back to words.

        :param encoded_text: a single index (Python ``int`` or any NumPy
            integer scalar) or a (nested) iterable of indices
        :return: the word for a single index, otherwise a list with the same
            nesting as the input
        """
        # np.integer covers every NumPy integer width, not only int64.
        if isinstance(encoded_text, (int, np.integer)):
            return self.index_to_words[encoded_text]
        return [self.decode_data(index) for index in encoded_text]

    def get_counts(
        self,
        text: typing.Union[typing.Iterable[str], typing.Iterable[typing.Iterable[str]]],
    ) -> np.ndarray:
        """
        Computes the counts of words in a new input tokenized string

        :param text: a sequence of words, or a sequence of such sequences;
            it must support ``len`` and indexing
        :return: for a flat sequence, a vector of length ``len(self)`` holding
            how often each vocabulary word occurs (unknown words are
            ignored); for a nested sequence, the per-item vectors stacked
            along a new first axis
        """
        if len(text) == 0 or isinstance(text[0], str):
            x = np.zeros(shape=len(self))
            for word in text:
                if word in self.words_to_index:
                    x[self.words_to_index[word]] += 1
            return x
        return np.stack([self.get_counts(item) for item in text], axis=0)

# Label Encoder class
class LabelEncoder:
    """
    Label encode a series of labels

    Each distinct label is assigned an integer index in order of first
    appearance, so the mapping is the same on every run for the same data.
    """

    def __init__(self, data) -> None:
        """
        :param data: iterable of hashable labels used to build the mapping;
            it is kept so that ``encoded_data`` can encode it later
        """
        self.__training_data = data
        # dict.fromkeys de-duplicates while keeping first-seen order; a set
        # would give string labels a different order on every interpreter run.
        self.index_to_token = list(dict.fromkeys(data))
        self.token_to_index = {
            token: index for index, token in enumerate(self.index_to_token)
        }

    def __len__(self):
        """Number of distinct labels."""
        return len(self.token_to_index)

    @property
    def encoded_data(self):
        """Indices of the labels passed to the constructor, as an array."""
        return np.array([self.token_to_index[token] for token in self.__training_data])

    def encode(self, data):
        """
        Convert labels to their indices.

        :param data: iterable of labels seen at construction time
        :return: array of indices
        :raises KeyError: if a label was not seen at construction time
        """
        return np.array([self.token_to_index[token] for token in data])

    def decode(self, data):
        """
        Convert indices back to labels.

        :param data: a single index (Python ``int`` or any NumPy integer
            scalar) or an iterable of indices
        :return: the label for a single index, otherwise an array of labels
        """
        # np.integer covers every NumPy integer width, not only int64.
        if isinstance(data, (int, np.integer)):
            return self.index_to_token[data]
        return np.array([self.index_to_token[index] for index in data])

# Bayesian Multiclass Model class
class BayesianMulticlassModel:
    """
    Multi-class Bayesian classifier that works on word-count vectors.
    """

    def __init__(self, num_classes, num_tokens) -> None:
        # Row c accumulates the count vectors of all samples labelled c.
        self.counts = np.zeros(shape=(num_classes, num_tokens))

    def fit(self, x_train: typing.Iterable[np.ndarray], y_train: typing.Iterable[int]):
        """
        Add every sample's count vector to the row of its class.

        :param x_train: iterable of count vectors, one per sample
        :param y_train: iterable of class indices, aligned with ``x_train``
        """
        for sample_counts, label in zip(x_train, y_train):
            self.counts[label] += sample_counts

    def predict(self, counts_vector):
        """
        Rank all classes for one count vector.

        :param counts_vector: per-word counts of the text to classify
        :return: array of class indices, ordered from highest to lowest score
        """
        per_class_total = self.counts.sum(axis=1)
        per_word_total = self.counts.sum(axis=0)

        # p(label), kept as a column so it broadcasts across the word axis
        class_prior = (per_class_total / per_class_total.sum())[:, np.newaxis]
        # p(word|label)
        word_given_class = self.counts / per_class_total[:, np.newaxis]
        # p(word)
        word_evidence = per_word_total / per_word_total.sum()

        # Weight each word's likelihood by how often it occurs in the input.
        weighted = word_given_class * counts_vector

        # The small constant keeps the logarithm finite for zero entries.
        per_word_posterior = class_prior * weighted / word_evidence + 0.00001
        scores = np.log(per_word_posterior).sum(axis=1)
        return np.argsort(scores)[::-1]

# Bayesian Model Explainer class
class BayesianModelExplainer(BayesianMulticlassModel):
    """
    Explainer of the decision made by the base model
    """

    def __init__(self, label_encoder: LabelEncoder, bag_of_words: BagOfWords) -> None:
        """
        :param label_encoder: encoder used to turn class indices into labels
        :param bag_of_words: vocabulary used to turn word indices into words
        """
        super().__init__(len(label_encoder), len(bag_of_words))
        self.bag_of_words = bag_of_words
        self.label_encoder = label_encoder

    def explain(self, text=None, label_filter=None, top_k=10):
        """
        Print, for each class in order of decreasing prior probability, the
        words that contribute most to the likelihood of that class.

        :param text: optional tokenized text; when given, each word's
            likelihood is weighted by its count in this text
        :param label_filter: optional container of labels; when given, only
            classes whose label is in it are printed
        :param top_k: number of words printed per class
        """
        class_frequencies = np.sum(self.counts, axis=1)
        word_frequencies = np.sum(self.counts, axis=0)

        prior = class_frequencies / np.sum(class_frequencies)  # p(label)
        likelihood = self.counts / np.expand_dims(
            class_frequencies, axis=1
        )  # p(word|label)
        evidence = word_frequencies / np.sum(word_frequencies)  # p(word)

        if text is not None:
            counts_vector = self.bag_of_words.get_counts(text)
            likelihood = np.multiply(likelihood, counts_vector)

        # Normalise by the evidence exactly once, before the loop. Dividing
        # inside the loop would re-divide the whole matrix on every iteration,
        # so classes printed later would be ranked with a compounded divisor.
        relevance = likelihood / (evidence + 0.00001)

        for item in np.flip(np.argsort(prior)):
            label = self.label_encoder.decode(item)
            if label_filter is not None and label not in label_filter:
                continue
            word_ids = np.flip(np.argsort(relevance[item]))[:top_k]
            print(f"{label}: {' '.join(self.bag_of_words.decode_data(word_ids))}")

# Main code

# Set hyperparameters
# Tokens shorter than this many characters are dropped by clean_text.
WORD_LENGTH_THRESHOLD = 3

# Parse resume data
x_train, y_train = parse_resume_df()

# Create Bag of Words representation
bag_of_words = BagOfWords(x_train)

# Create Label Encoder
label_encoder = LabelEncoder(y_train)

# Encode training data
# fit() adds each sample to a row of width len(bag_of_words), so the samples
# must be fixed-length count vectors; the variable-length index lists from
# encode_data would not broadcast.
x_train_encoded = bag_of_words.get_counts(x_train)
y_train_encoded = label_encoder.encode(y_train)

# Initialize and train Bayesian Multiclass Model
model = BayesianMulticlassModel(len(label_encoder), len(bag_of_words))
model.fit(x_train_encoded, y_train_encoded)

# Parse PDF file
pdf_filename = "data/resumes/computers_2.pdf"
x_test = parse_pdf(pdf_filename)

# Encode test data
x_test_encoded = bag_of_words.get_counts(x_test)

# Make predictions (class indices, best match first)
predictions = model.predict(x_test_encoded)

# Decode predictions
decoded_predictions = label_encoder.decode(predictions)

# Print top predicted job categories
for job in decoded_predictions[:5]:
    print(job)

# Initialize and fit Bayesian Model Explainer
explainable_model = BayesianModelExplainer(label_encoder, bag_of_words)
explainable_model.fit(x_train=x_train_encoded, y_train=y_train_encoded)

# Explain trained prior
print("\nANALYSIS OF TRAINED PRIOR")
print("-------------------------")
explainable_model.explain()

# Explain trained evidence
print("\nANALYSIS OF TRAINED EVIDENCE")
print("----------------------------")
explainable_model.explain(x_test)



Collecting pdfreader
  Downloading pdfreader-0.1.15-py3-none-any.whl (135 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m135.6/135.6 kB[0m [31m2.4 MB/s[0m eta [36m0:00:00[0m
Collecting bitarray>=1.1.0 (from pdfreader)
  Downloading bitarray-2.9.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (288 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m288.3/288.3 kB[0m [31m7.2 MB/s[0m eta [36m0:00:00[0m
Collecting pycryptodome>=3.9.9 (from pdfreader)
  Downloading pycryptodome-3.20.0-cp35-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (2.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.1/2.1 MB[0m [31m13.7 MB/s[0m eta [36m0:00:00[0m
[?25h[31mERROR: Operation cancelled by user[0m[31m
[0m

ModuleNotFoundError: No module named 'pdfreader'