# Poem: Logistic Regression


# 🎓 Library

In [20]:
# Stopwords
import nltk
nltk.download("stopwords")


[nltk_data] Downloading package stopwords to /home/tofeha/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [40]:
# Misc
import os
import numpy as np
import pandas as pd
from nltk.corpus import stopwords

# Training
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse import csr_array

DATA = "data/"

if not os.path.exists(DATA):
    raise FileNotFoundError(f"Data directory {DATA} does not exist. Please create it and add the data files.")

# Step 1: Loading

In [17]:
poems_df = pd.read_parquet(DATA + "de_poems.parquet")

Our feature is `text`, and the label is `creation`.

We encode only the features.

In [18]:
poems_df.head(3)["text"].values

array(['Gebohrn, und wiederumb, o Mensch, gebohren werden,\nErrettet dich vom Tod und hilfft dir in Beschwerden.',
       'Es suchte niemand Gott, liegt er gleich unterm Leben,\nWär er nicht Lieb und hätt es nicht mit Lieb umbgeben.',
       'Die ist in Gott, und Gott der ist zugleich in Ihr,\nIn der ich leb, und die auch wieder lebt in mir.'],
      dtype=object)

# Step 2: Preprocessing

In [None]:
german_stop_words = stopwords.words("german")

vectorizer = TfidfVectorizer(stop_words=german_stop_words, max_features=5000)

x = vectorizer.fit_transform(poems_df["text"])
y = poems_df["creation"].values

In [None]:
print(f"Currently have {len(poems_df)} poems with {len(vectorizer.get_feature_names_out())} features.")

Currently have 71570 poems with 5000


# Step 3: Data splitting and model training

In [26]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

In [27]:
x_train.shape, x_test.shape, y_train.shape, y_test.shape

((57256, 5000), (14314, 5000), (57256,), (14314,))

In [28]:
model = LogisticRegression(random_state=42, solver="saga", verbose=10)
model.fit(x_train, y_train)

Epoch 1, change: 1
Epoch 2, change: 0.17352792
Epoch 3, change: 0.096579706
Epoch 4, change: 0.080212948
Epoch 5, change: 0.066735796
Epoch 6, change: 0.061685523
Epoch 7, change: 0.05319287
Epoch 8, change: 0.043009003
Epoch 9, change: 0.039846444
Epoch 10, change: 0.028294431
Epoch 11, change: 0.014313097
Epoch 12, change: 0.0043109217
Epoch 13, change: 0.0038413858
Epoch 14, change: 0.028889383
Epoch 15, change: 0.0036193264
Epoch 16, change: 0.0013738998
Epoch 17, change: 0.000682556
Epoch 18, change: 0.00035554009
Epoch 19, change: 0.00029116061
convergence after 20 epochs took 43 seconds


[Parallel(n_jobs=1)]: Done   1 tasks      | elapsed:   43.1s
[Parallel(n_jobs=1)]: Done   1 tasks      | elapsed:   43.1s


array([[4.63269721e-03, 2.06929269e-04, 6.60478406e-04, ...,
        1.02312614e-04, 1.03789047e-04, 7.34274764e-05],
       [3.10618580e-03, 1.49528630e-04, 1.57731738e-03, ...,
        9.04036533e-05, 8.54277762e-05, 5.88958737e-05],
       [4.92271525e-03, 3.17192484e-04, 1.27234381e-03, ...,
        1.43850664e-04, 1.38696834e-04, 9.72331230e-05],
       [7.64630310e-03, 2.60357625e-04, 1.31805127e-03, ...,
        1.40981047e-04, 1.32026359e-04, 9.29805469e-05],
       [9.93328550e-04, 9.35907728e-05, 3.63838524e-04, ...,
        3.80547797e-05, 3.89922574e-05, 2.62096542e-05]])

# Step 4: Evaluation and finetuning

In [35]:
y_found = model.predict(x_test)

In [47]:
print(classification_report(y_test,y_found, zero_division=0))

              precision    recall  f1-score   support

        1095       0.46      0.31      0.37        55
        1200       1.00      0.25      0.40         4
        1240       0.00      0.00      0.00        14
        1339       0.75      0.56      0.64        59
        1357       0.77      0.85      0.81       130
        1503       1.00      1.00      1.00         7
        1506       1.00      0.85      0.92        13
        1514       0.00      0.00      0.00         4
        1515       0.00      0.00      0.00         1
        1518       0.00      0.00      0.00         1
        1520       0.00      0.00      0.00         2
        1523       0.80      0.95      0.87        86
        1524       0.00      0.00      0.00         2
        1525       0.00      0.00      0.00         1
        1528       0.00      0.00      0.00         1
        1531       0.00      0.00      0.00         1
        1534       0.00      0.00      0.00         1
        1535       0.00    

In [42]:
# Compare the results with the actual y values
accuracy = accuracy_score(y_test, y_found)
print(f"Accuracy: {accuracy:.2f}")

Accuracy: 0.44
