In [1]:
import text_dataset_loader as tdl
import text_processing_utils as tpu
from markov_model import MarkovModelClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, roc_auc_score

In [2]:
# Load the full dataset through the project loader. Later cells read the
# 'poem_line' and 'author' columns from it, so the result is presumably a
# pandas DataFrame with at least those two columns -- TODO confirm against
# text_dataset_loader.
df = tdl.load_dataset()

In [12]:
# Class balance: share of rows per author, expressed as a percentage
# (fractions are rounded to 3 decimals before scaling to percent).
df['author'].value_counts(normalize=True).round(3).mul(100)

robert frost       66.5
edgar allan poe    33.5
Name: author, dtype: float64

In [3]:
# Words handed to the tokenizer as stopwords.
stopwords = 'a the and'.split()

In [4]:
# Hold out 25% of the rows for evaluation; the fixed seed keeps the split
# reproducible across runs.
# NOTE(review): the split is not stratified although the classes are imbalanced;
# consider whether `stratify=` on the author column is wanted.
features = df[['poem_line']]
labels = df['author']
X_train, X_test, y_train, y_test = train_test_split(
    features,
    labels,
    test_size=0.25,
    random_state=42,
)

In [5]:
# Fit the tokenizer on the training split only, then reuse the fitted tokenizer
# on the test split.
# NOTE(review): this cell rebinds X_train / X_test, so re-running it feeds
# already-transformed data back into the tokenizer; re-run the split cell first.
tokenizer = tpu.Tokenizer(
    document_column='poem_line',
    stopwords=stopwords,
)
X_train = tokenizer.fit_transform(X_train)
X_test = tokenizer.transform(X_test)

In [6]:
# Build the classifier with the vocabulary size reported by the fitted
# tokenizer, then train it on the tokenized training split.
model = MarkovModelClassifier(
    vocabulary_size=tokenizer.vocab_size,
    document_column='poem_line',
)
# Left as the trailing expression so the cell still displays the fit() result.
model.fit(X_train, y_train)

<markov_model.MarkovModelClassifier at 0x7fdb99d885f8>

In [7]:
# Encode the true author names as their position in model.classes so they can
# be compared with the output of model.predict, then print the per-class report.
expected_idx = y_test.map(model.classes.index).values
predicted_idx = model.predict(X_test)
print(classification_report(expected_idx, predicted_idx))

              precision    recall  f1-score   support

           0       0.78      0.98      0.87       352
           1       0.94      0.48      0.64       188

    accuracy                           0.81       540
   macro avg       0.86      0.73      0.75       540
weighted avg       0.84      0.81      0.79       540



In [8]:
# ROC AUC on the TRAINING split. The original message said "teste" (test) even
# though y_train / X_train are scored here, which made the training score read
# like a held-out result; the label now says "treino" (training).
# True labels are encoded as their position in model.classes; column 1 of
# predict_proba is used as the score for the class at index 1.
train_truth = y_train.apply(lambda x: model.classes.index(x)).values
train_scores = model.predict_proba(X_train)[:, 1]
print("ROC AUC no conjunto de treino foi de %.1f%%" % (100 * roc_auc_score(train_truth, train_scores)))

ROC AUC no conjunto de teste foi de 100.0%


In [10]:
# ROC AUC on the held-out test split: true labels are encoded as their position
# in model.classes and scored against column 1 of predict_proba.
test_truth = y_test.map(model.classes.index).values
test_scores = model.predict_proba(X_test)[:, 1]
test_auc_pct = 100 * roc_auc_score(test_truth, test_scores)
print("ROC AUC no conjunto de teste foi de %.1f%%" % test_auc_pct)

ROC AUC no conjunto de teste foi de 85.9%
