In [161]:
from os import listdir
import numpy as np
import pandas as pd

# Preparing the train and test document paths for reading the texts

In [162]:
# Root folder of the corpus; the two splits live in sub-folders directly below it.
# Every path keeps its trailing slash because later code joins paths by plain
# string concatenation.
dataset_path = 'Dataset/Classification-Train And Test/'
train_path, test_path = (dataset_path + split_dir for split_dir in ('Train/', 'Test/'))

In [163]:
def get_files_list(directory_path):
    """Index every document stored under ``<directory_path><class>/<file>``.

    Parameters
    ----------
    directory_path : str
        Folder whose immediate sub-folders are named after the classes.
        It must end with a path separator, because paths are built by
        plain string concatenation.

    Returns
    -------
    pandas.DataFrame
        One row per document with the columns ``path`` (location of the
        file) and ``class`` (name of the sub-folder it was found in).
        Rows are ordered by class name, then by file name. When nothing
        is found the frame is empty but still has both columns.
    """
    from os.path import isdir  # local import: only this helper needs it

    records = []
    # listdir() returns entries in arbitrary order; sorting makes the row
    # order (and everything derived from it) reproducible across machines.
    for class_path_name in sorted(listdir(directory_path)):
        class_directory = directory_path + class_path_name
        if not isdir(class_directory):
            # A stray file next to the class folders is not a class and
            # would make the inner listdir() raise.
            continue
        for data_file in sorted(listdir(class_directory)):
            records.append({
                'path': class_directory + '/' + data_file,
                'class': class_path_name,
            })

    # Build the frame once from the collected records instead of calling
    # pd.concat for every file, which copies the whole frame each time.
    return pd.DataFrame(records, columns=['path', 'class'])

In [164]:
# Index both splits with the same helper: one row per document (path + class label).
doc_train, doc_test = (get_files_list(split_path) for split_path in (train_path, test_path))

In [165]:
# Show the training index: one row per document with its path and class label.
doc_train

Unnamed: 0,path,class
0,Dataset/Classification-Train And Test/Train/Co...,Comp.graphics
1,Dataset/Classification-Train And Test/Train/Co...,Comp.graphics
2,Dataset/Classification-Train And Test/Train/Co...,Comp.graphics
3,Dataset/Classification-Train And Test/Train/Co...,Comp.graphics
4,Dataset/Classification-Train And Test/Train/Co...,Comp.graphics
...,...,...
89,Dataset/Classification-Train And Test/Train/ta...,talk.politics.mideast
90,Dataset/Classification-Train And Test/Train/ta...,talk.politics.mideast
91,Dataset/Classification-Train And Test/Train/ta...,talk.politics.mideast
92,Dataset/Classification-Train And Test/Train/ta...,talk.politics.mideast


In [166]:
# Show the test index: one row per document with its path and class label.
doc_test

Unnamed: 0,path,class
0,Dataset/Classification-Train And Test/Test/Com...,Comp.graphics
1,Dataset/Classification-Train And Test/Test/Com...,Comp.graphics
2,Dataset/Classification-Train And Test/Test/rec...,rec.autos
3,Dataset/Classification-Train And Test/Test/rec...,rec.autos
4,Dataset/Classification-Train And Test/Test/rec...,rec.autos
5,Dataset/Classification-Train And Test/Test/sci...,sci.electronics
6,Dataset/Classification-Train And Test/Test/sci...,sci.electronics
7,Dataset/Classification-Train And Test/Test/sci...,sci.electronics
8,Dataset/Classification-Train And Test/Test/soc...,soc.religion.christian
9,Dataset/Classification-Train And Test/Test/soc...,soc.religion.christian


# Reading documents and converting them to a list of words for each class

## Functions to convert a sentence into a list of words (with stop words removed)

In [167]:
# English stop words (plus spelled-out numbers and ordinals) that are removed
# from every tokenized line before counting.
# Every entry must be followed by a comma: adjacent string literals are
# silently concatenated by Python, which previously turned 'ours' and
# 'ourselves' into the single bogus entry 'oursourselves'.
stopwords = ['a', 'about', 'above', 'after', 'again', 'against', 'all', 'am', 'an', 'and', 'any', 'are', "aren't", 'as',
             'at',
             'be', 'because', 'been', 'before', 'being', 'below', 'between', 'both', 'but', 'by',
             'can', "can't", 'cannot', 'could', "couldn't", 'did', "didn't", 'do', 'does', "doesn't", 'doing', "don't",
             'down', 'during',
             'each', 'few', 'for', 'from', 'further',
             'had', "hadn't", 'has', "hasn't", 'have', "haven't", 'having', 'he', "he'd", "he'll", "he's", 'her',
             'here', "here's",
             'hers', 'herself', 'him', 'himself', 'his', 'how', "how's",
             'i', "i'd", "i'll", "i'm", "i've", 'if', 'in', 'into', 'is', "isn't", 'it', "it's", 'its', 'itself',
             "let's", 'me', 'more', 'most', "mustn't", 'my', 'myself',
             'no', 'nor', 'not', 'of', 'off', 'on', 'once', 'only', 'or', 'other', 'ought', 'our', 'ours', 'ourselves',
             'out', 'over', 'own',
             'same', "shan't", 'she', "she'd", "she'll", "she's", 'should', "shouldn't", 'so', 'some', 'such',
             'than', 'that', "that's", 'the', 'their', 'theirs', 'them', 'themselves', 'then', 'there', "there's",
             'these', 'they', "they'd",
             "they'll", "they're", "they've", 'this', 'those', 'through', 'to', 'too', 'under', 'until', 'up', 'very',
             'was', "wasn't", 'we', "we'd", "we'll", "we're", "we've", 'were', "weren't", 'what', "what's", 'when',
             "when's", 'where',
             "where's", 'which', 'while', 'who', "who's", 'whom', 'why', "why's", 'will', 'with', "won't", 'would',
             "wouldn't",
             'you', "you'd", "you'll", "you're", "you've", 'your', 'yours', 'yourself', 'yourselves',
             'one', 'two', 'three', 'four', 'five', 'six', 'seven', 'eight', 'nine', 'ten', 'hundred', 'thousand',
             '1st', '2nd', '3rd',
             '4th', '5th', '6th', '7th', '8th', '9th', '10th']

In [168]:
def remove_stopwords(words, stop_words=None):
    """Return the tokens of ``words`` that are not stop words, in their original order.

    Parameters
    ----------
    words : iterable of str
        Tokens to filter.
    stop_words : iterable of str, optional
        Tokens to drop. When omitted, the notebook-level ``stopwords``
        list is used, so existing one-argument calls behave as before.

    Returns
    -------
    list of str
        The tokens that survive the filter.
    """
    # A set gives constant-time membership tests instead of scanning the
    # whole stop-word list once per token.
    excluded = set(stopwords if stop_words is None else stop_words)
    return [word for word in words if word not in excluded]

In [169]:
def tokenize_sentence(line):
    """Split one line of text into tokens and drop the stop words.

    Parameters
    ----------
    line : str
        A single line of text; leading and trailing whitespace is ignored.

    Returns
    -------
    list of str
        The space-separated tokens of ``line`` with stop words removed.
        A blank line yields an empty list.
    """
    # Splitting on a single space produces '' for blank lines and for runs of
    # spaces; those empty tokens are dropped so they never reach the vocabulary.
    # NOTE(review): only the space character separates tokens, so text joined
    # by a tab stays one token (the displayed data has first tokens such as
    # 'comp.graphics\tdmpeg'). Confirm whether the label prefix should be
    # stripped before tokenizing.
    tokens = [token for token in line.strip().split(" ") if token]
    return remove_stopwords(tokens)

## Function to open and read a document and convert it into a list of words (only the first line of the file is tokenized)

In [170]:
def tokenize(document_path):
    """Read a document and return the tokens of its first line.

    Parameters
    ----------
    document_path : str
        Path of the text file to read.

    Returns
    -------
    list of str
        Tokens produced by ``tokenize_sentence`` for the first line of the
        file, or an empty list when the file is empty.
    """
    # The context manager closes the file even if reading raises.
    with open(document_path, 'r') as document_file:
        # Only the first line is ever returned, so only that line is read.
        # NOTE(review): any further lines are ignored, as before - presumably
        # each file stores its whole document on one line; verify against the data.
        first_line = document_file.readline()

    if not first_line:
        # Empty file: there is no first line to tokenize.
        return []
    return tokenize_sentence(first_line)

## Build the per-class dataset (the list of words and the bag of words of all training documents in each class)

In [171]:
def get_bag_of_words(words):
    """Count how often each distinct token occurs in ``words``.

    Returns a dict that maps every distinct token to its number of
    occurrences; the keys follow the sorted order produced by ``np.unique``.
    """
    distinct_tokens, occurrences = np.unique(np.array(words), return_counts=True)
    bag = {}
    for token, occurrence in zip(distinct_tokens, occurrences):
        bag[token] = occurrence
    return bag

In [172]:
# Gather, for every class, all tokens found in its training documents.
class_words_dict = {}
for class_name in doc_train['class'].unique():
    # Paths of the training documents that belong to the current class.
    class_paths = doc_train.loc[doc_train['class'] == class_name, 'path']

    # tokenize() returns the word list of one file; flatten the lists of all
    # files of this class into a single list.
    class_words_dict[class_name] = [
        word for file_path in class_paths for word in tokenize(file_path)
    ]

# One row per class. The word lists are passed as a plain list of lists:
# np.asarray() on a dict view would wrap the view in a 0-d object array
# rather than producing one element per class.
result_df = pd.DataFrame({
    'class': list(class_words_dict.keys()),
    'words': list(class_words_dict.values()),
})

# Token -> frequency mapping for every class.
result_df['bag_of_words'] = result_df['words'].apply(get_bag_of_words)
result_df

Unnamed: 0,class,words,bag_of_words
0,Comp.graphics,"[comp.graphics\tdmpeg, zip, info, another, dos...","{'abc': 1, 'able': 8, 'abstracts': 1, 'accept'..."
1,rec.autos,"[rec.autos\tre, integra, gsr, article, apr, ns...","{'aardvark': 1, 'abbreviations': 2, 'able': 7,..."
2,sci.electronics,"[sci.electronics\tradar, detector, detectors, ...","{'aaahh': 1, 'aalternate': 1, 'ab': 2, 'abando..."
3,soc.religion.christian,"[soc.religion.christian\tweirdness, early, chr...","{'abandon': 1, 'abandoned': 2, 'ability': 4, '..."
4,talk.politics.mideast,"[talk.politics.mideast\tit, sickening, think, ...","{'ababs': 1, 'abduction': 1, 'abdulhamid': 1, ..."


In [173]:
# Number of training documents per class, listed in the order in which the
# classes first appear in doc_train (the same order used to build result_df).
count_of_documents = [
    doc_train.loc[doc_train['class'] == class_name].count()['class']
    for class_name in doc_train['class'].unique()
]
result_df['count_of_documents'] = count_of_documents
result_df

Unnamed: 0,class,words,bag_of_words,count_of_documents
0,Comp.graphics,"[comp.graphics\tdmpeg, zip, info, another, dos...","{'abc': 1, 'able': 8, 'abstracts': 1, 'accept'...",8
1,rec.autos,"[rec.autos\tre, integra, gsr, article, apr, ns...","{'aardvark': 1, 'abbreviations': 2, 'able': 7,...",15
2,sci.electronics,"[sci.electronics\tradar, detector, detectors, ...","{'aaahh': 1, 'aalternate': 1, 'ab': 2, 'abando...",14
3,soc.religion.christian,"[soc.religion.christian\tweirdness, early, chr...","{'abandon': 1, 'abandoned': 2, 'ability': 4, '...",27
4,talk.politics.mideast,"[talk.politics.mideast\tit, sickening, think, ...","{'ababs': 1, 'abduction': 1, 'abdulhamid': 1, ...",30


In [174]:
# Vocabulary of the model: every distinct token seen in any training class.
total_types = set().union(*(bag.keys() for bag in result_df['bag_of_words']))
len(total_types)

12815

In [175]:
def predict(words_in_a_test_document, show_probabilities=False):
    """Return the most likely class for a tokenized document (Naive Bayes).

    For every class the score is ``log P(class)`` plus the sum of
    ``log P(word | class)`` over the document's words, where
    ``P(word | class) = (count of word in class + 1) /
    (total tokens in class + vocabulary size)`` (add-one smoothing).
    Words that never occurred in the training data are ignored.

    Parameters
    ----------
    words_in_a_test_document : iterable of str
        Tokens of the document to classify.
    show_probabilities : bool, default False
        When true, display the per-class log scores before returning.

    Returns
    -------
    str
        Name of the class with the highest log score.

    Notes
    -----
    Reads the notebook-level ``doc_train``, ``result_df`` and ``total_types``.
    """
    count_of_all_train_documents = doc_train.count()['class']
    vocabulary_size = len(total_types)

    # The vocabulary filter does not depend on the class, so it is applied
    # once up front instead of once per class.
    # NOTE(review): each distinct word contributes to the score only once, no
    # matter how often it occurs in the document. This matches the previous
    # behaviour, where repeated words overwrote the same dictionary entry.
    # Confirm whether every occurrence should count instead.
    known_words = [
        word for word in dict.fromkeys(words_in_a_test_document)
        if word in total_types
    ]

    final_probabilities = {}
    for class_name in doc_train['class'].unique():
        class_properties = result_df.loc[result_df['class'] == class_name].iloc[0]
        bag_of_words_of_this_class = class_properties['bag_of_words']

        # Prior: share of the training documents that belong to this class.
        probability_of_this_class = (
            class_properties['count_of_documents'] / count_of_all_train_documents
        )

        # The smoothed denominator is the same for every word of a class, so
        # it is computed once per class rather than once per test word.
        smoothed_denominator = (
            np.sum(list(bag_of_words_of_this_class.values())) + vocabulary_size
        )

        probabilities = [probability_of_this_class]
        probabilities.extend(
            (bag_of_words_of_this_class.get(word, 0) + 1) / smoothed_denominator
            for word in known_words
        )

        # Sum logs instead of multiplying probabilities to avoid underflow.
        final_probabilities[class_name] = np.sum(np.log(probabilities))

    if show_probabilities:
        display(final_probabilities)
    arg_max = np.argmax(list(final_probabilities.values()))
    return list(final_probabilities.keys())[arg_max]
# Quick manual check on a hand-written token list; show_probabilities=True
# also displays the per-class log scores before the predicted class.
predict(['predictable', 'with', 'no', 'fun'], show_probabilities=True)

{'Comp.graphics': -12.166081035312091,
 'rec.autos': -11.162523187611859,
 'sci.electronics': -11.999212757101072,
 'soc.religion.christian': -11.556843816381527,
 'talk.politics.mideast': -11.353271838333436}

'rec.autos'

In [176]:
# Tokenize every test document with the same function used for the training files.
doc_test['words'] = doc_test['path'].apply(tokenize)
doc_test

Unnamed: 0,path,class,words
0,Dataset/Classification-Train And Test/Test/Com...,Comp.graphics,"[comp.graphics\tcall, presentations, navy, sci..."
1,Dataset/Classification-Train And Test/Test/Com...,Comp.graphics,"[comp.graphics\tre, gray, levels, screen, arti..."
2,Dataset/Classification-Train And Test/Test/rec...,rec.autos,"[rec.autos\tre, saturn, s, pricing, policy, ar..."
3,Dataset/Classification-Train And Test/Test/rec...,rec.autos,"[rec.autos\tre, integra, gsr, article, mwf, b,..."
4,Dataset/Classification-Train And Test/Test/rec...,rec.autos,"[rec.autos\tre, hard, change, springs, f, truc..."
5,Dataset/Classification-Train And Test/Test/sci...,sci.electronics,"[sci.electronics\tre, oscilloscope, triggering..."
6,Dataset/Classification-Train And Test/Test/sci...,sci.electronics,"[sci.electronics\tre, help, ultra, long, timin..."
7,Dataset/Classification-Train And Test/Test/sci...,sci.electronics,"[sci.electronics\tre, info, nec, d, multisync,..."
8,Dataset/Classification-Train And Test/Test/soc...,soc.religion.christian,"[soc.religion.christian\tdoes, god, love, come..."
9,Dataset/Classification-Train And Test/Test/soc...,soc.religion.christian,"[soc.religion.christian\tquality, catholic, li..."


In [177]:
# Classify each test document from its token list.
doc_test['predicted'] = doc_test['words'].apply(predict)

In [178]:
# True label next to the predicted label for every test document.
# Plain column selection raises a KeyError if 'predicted' has not been
# computed yet, whereas DataFrame.get() would silently return None.
doc_test[['class', 'predicted']]

Unnamed: 0,class,predicted
0,Comp.graphics,Comp.graphics
1,Comp.graphics,rec.autos
2,rec.autos,rec.autos
3,rec.autos,rec.autos
4,rec.autos,rec.autos
5,sci.electronics,sci.electronics
6,sci.electronics,sci.electronics
7,sci.electronics,sci.electronics
8,soc.religion.christian,soc.religion.christian
9,soc.religion.christian,soc.religion.christian


In [179]:
# NOTE(review): this import sits in a late cell, while the other imports are
# in the first cell of the notebook; consider moving it there.
from sklearn.metrics import classification_report
# Per-class precision / recall / F1 of the predictions against the true
# labels; print() is used so the report renders as an aligned text table.
print(classification_report(doc_test['class'], doc_test['predicted']))

                        precision    recall  f1-score   support

         Comp.graphics       1.00      0.50      0.67         2
             rec.autos       0.75      1.00      0.86         3
       sci.electronics       1.00      1.00      1.00         3
soc.religion.christian       1.00      1.00      1.00         4
 talk.politics.mideast       1.00      1.00      1.00         5

              accuracy                           0.94        17
             macro avg       0.95      0.90      0.90        17
          weighted avg       0.96      0.94      0.94        17

