In [1]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

# Toy corpus: each entry pairs a short document with its binary label
# (1 = technology-related, 0 = anything else). Keeping text and label
# side by side prevents the two lists from drifting out of alignment.
labeled_documents = [
    ("This is a technology article about artificial intelligence.", 1),
    ("A recipe for a delicious chocolate cake.", 0),
    ("Tips for effective time management.", 0),
    ("Latest fashion trends for the season.", 0),
    ("Overview of sustainable energy sources.", 1),
    ("How to improve your programming skills.", 1),
    ("Healthy lifestyle habits for longevity.", 0),
    ("Travel guide to exotic destinations.", 0),
]

# Unzip the pairs into the two parallel lists used by the rest of the notebook.
documents = [text for text, _ in labeled_documents]
labels = [label for _, label in labeled_documents]



In [2]:
# Build the bag-of-words representation of the corpus.
# CountVectorizer (scikit-learn) learns a vocabulary from the documents and
# turns each document into a row of token counts; with the default settings
# used here the text is lowercased and split into word tokens.
vectorizer = CountVectorizer()

# fit_transform learns the vocabulary and returns the document-term matrix
# in a single pass (rows = documents, columns = vocabulary terms).
X = vectorizer.fit_transform(raw_documents=documents)

In [9]:
# Display the vectorizer to inspect its configuration (it was built with default arguments).
vectorizer

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
                lowercase=True, max_df=1.0, max_features=None, min_df=1,
                ngram_range=(1, 1), preprocessor=None, stop_words=None,
                strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, vocabulary=None)

In [3]:
# Display the sparse document-term count matrix returned by fit_transform.
X

<8x40 sparse matrix of type '<class 'numpy.int64'>'
	with 44 stored elements in Compressed Sparse Row format>

In [4]:
# Hold out 20% of the documents for evaluation. With only 8 documents this
# leaves 6 for training and 2 for testing; the fixed random_state makes the
# shuffle (and therefore the split) reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    labels,
    test_size=0.2,
    random_state=42,
)


In [10]:
# Display the training portion of the feature matrix returned by train_test_split.
X_train

<6x40 sparse matrix of type '<class 'numpy.int64'>'
	with 33 stored elements in Compressed Sparse Row format>

In [11]:
# Display the training labels returned by train_test_split.
y_train

[1, 0, 0, 1, 0, 0]

In [5]:
# Fit a multinomial Naive Bayes classifier on the training word counts.
# The estimator is created with its default hyperparameters.
mnb = MultinomialNB()

# fit() is left as the cell's last expression so the fitted estimator is displayed.
mnb.fit(X=X_train, y=y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [12]:
# Evaluate the model on the held-out documents.
# NOTE: the test split contains only 2 documents, so the accuracy can only be
# 0.00, 0.50 or 1.00 — treat it as a smoke test, not a performance estimate.
y_pred = mnb.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
# Label spelling fixed ("Acurracy" -> "Accuracy").
print(f'Accuracy: {accuracy:.2f}')

Acurracy: 0.50


In [7]:
# Per-class precision / recall / F1 on the held-out documents.
# zero_division=1 tells scikit-learn to report 1 for any metric whose
# denominator is zero instead of warning. Caveat: a class that is never
# predicted therefore shows precision 1.00 even when its recall is 0.00,
# so read that column with care (zero_division=0 is the conservative choice).
print("Classification report:")
report_text = classification_report(y_test, y_pred, zero_division=1)
print(report_text)

Classification report:
              precision    recall  f1-score   support

           0       0.50      1.00      0.67         1
           1       1.00      0.00      0.00         1

    accuracy                           0.50         2
   macro avg       0.75      0.50      0.33         2
weighted avg       0.75      0.50      0.33         2

