### Necesasry importations and resourses

In [1]:
import numpy as np
import spacy
import pandas as pd
import nltk
import tensorflow as tf
from sklearn import metrics
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from nltk.corpus import brown
from nltk.corpus import stopwords
from nltk import SnowballStemmer
from DocuMetrics import DocuMetrics as DM
from sklearn.feature_extraction.text import TfidfVectorizer
# Download neccesary resourses
nltk.download('brown')
nltk.download('stopwords')

# Config
en_stops = set(stopwords.words('english'))
englishstemmer = SnowballStemmer('english')
nlp = spacy.load('en_core_web_sm')

# Charge documents from Brown corpus
lore = brown.fileids(categories='lore')
learned = brown.fileids(categories='learned')
belles_letters = brown.fileids(categories='belles_lettres')

[nltk_data] Downloading package brown to
[nltk_data]     C:\Users\luisa\AppData\Roaming\nltk_data...
[nltk_data]   Package brown is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\luisa\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


### Create main doc

In [2]:
# Create and filter documents
main_doc = []
for category in [lore, learned, belles_letters]:
    for fileid in category:
        aux = brown.words(fileids=fileid)
        main_doc.append(' '.join(aux))

# Processing function
def normalize(text):
    doc = nlp(text)
    return ' '.join([token.lower_ for token in doc if not token.is_punct and not token.is_stop and len(token) > 3 and token.is_alpha])


main_doc = [normalize(doc) for doc in main_doc]


### Text vectorization using TF-IDF from DocuMetrics (a fusion between corpy and tfidf projects)

In [3]:
# Text vectorization using TF-IDF
dm = DM(main_doc)
TF_IDF = dm.TF_IDF
vocabulary = dm.get_vocabulary()
vocabulary.append("categoria")

### create and define tags correctly according to the number of categories

In [4]:
# Crate tags
labels = np.concatenate([np.zeros(len(lore)), 
                          np.ones(len(learned)), 
                          2 * np.ones(len(belles_letters))])

TF_IDF2 = np.insert(TF_IDF, TF_IDF.shape[1], labels, axis=1)
df = pd.DataFrame(TF_IDF2, columns=vocabulary)

### Here the elements are arranged and the data set is divided for testing, can modify test_size

In [5]:
# Define X and Y
X = df.iloc[:, 1:(len(vocabulary)-1)].values
y = df.iloc[:, (len(vocabulary)-1)].values

# Codifie tags
encoder = LabelEncoder()
y_encoded = encoder.fit_transform(y)
Y = pd.get_dummies(y_encoded).values

# Split the data set
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=0)

### Here Define and compile the model, can modify capes and activition 

In [6]:
# Define and compile the model
model = tf.keras.models.Sequential()
model.add(tf.keras.layers.Dense(10, input_shape=(X.shape[1],), activation='relu'))
model.add(tf.keras.layers.Dropout(0.3))
model.add(tf.keras.layers.Dense(6, activation='relu'))
model.add(tf.keras.layers.Dropout(0.3))
model.add(tf.keras.layers.Dense(3, activation='softmax'))  

opt = tf.keras.optimizers.Adam(learning_rate=0.01)
model.compile(loss='categorical_crossentropy', optimizer=opt, metrics=['accuracy'])


### Fit model and predictions

In [7]:
# Fit model
model.fit(X_train, y_train, epochs=50)

# Predictions
y_pred = model.predict(X_test)
y_pred_class = np.argmax(y_pred, axis=1)
y_test_class = np.argmax(y_test, axis=1)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


### Show results

In [8]:
# print results
print("y_pred_class:", y_pred_class)
print("y_test_class:", y_test_class)

y_pred_class: [2 2 1 2 2 2 2 1 0 2 1 2 2 2 2 2 2 2 0 0 2 2 1 2 0 2 2 2 2 2 2 1 2 2 0 2 0
 1 2 2 1]
y_test_class: [0 0 0 0 1 1 0 1 0 2 1 2 2 2 2 0 2 2 1 1 0 2 1 2 1 2 2 2 2 1 2 1 0 1 1 0 1
 1 0 2 1]


In [9]:
# Calculate and display F1 Score
f1 = metrics.f1_score(y_test_class, y_pred_class, average='weighted', zero_division=0)
print("F1 Score:", f1)

F1 Score: 0.48636712458406767


In [10]:

# Show the classification report and the confusion matrix
print(metrics.classification_report(y_test_class, y_pred_class, zero_division=0))
print(metrics.confusion_matrix(y_test_class, y_pred_class))


              precision    recall  f1-score   support

           0       0.17      0.09      0.12        11
           1       0.86      0.40      0.55        15
           2       0.54      1.00      0.70        15

    accuracy                           0.54        41
   macro avg       0.52      0.50      0.45        41
weighted avg       0.55      0.54      0.49        41

[[ 1  1  9]
 [ 5  6  4]
 [ 0  0 15]]
