In [3]:
import numpy as np
import sqlite3
from sklearn.model_selection import train_test_split, StratifiedShuffleSplit
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
from stop_words import get_stop_words
import shap
import warnings

# Silence every Python warning for the rest of the session to keep cell output
# short. NOTE(review): this is a blanket filter, so any warning raised by the
# libraries imported above is hidden as well — consider narrowing it.
warnings.filterwarnings("ignore")

In [4]:
# Load article texts from news.db into two dicts that map the value of
# column 4 to the list of texts found in column 5.
# NOTE(review): the queries use SELECT * with positional indexes; row[4] is
# presumably the source/newspaper and row[5] the article body — confirm
# against the table schema.
romanian_texts = {}
moldavian_texts = {}

# Moldavian texts are cut to this many characters; Romanian texts are not.
MAX_MOLDAVIAN_CHARS = 10000

conn = sqlite3.connect('news.db')
try:
    c = conn.cursor()

    # Romanian texts are stored with surrounding whitespace stripped.
    c.execute('SELECT * FROM romania')
    rows = c.fetchall()
    for row in rows:
        romanian_texts.setdefault(row[4], []).append(row[5].strip())

    # The printed number is the count of distinct column-4 groups, not articles.
    print('Romanian texts:', len(romanian_texts))

    # Bind the excluded newspaper as a parameter. A double-quoted "zugo" is an
    # identifier in standard SQL and is only read as a string through SQLite's
    # legacy fallback, which can be disabled or shadowed by a column name.
    c.execute('SELECT * FROM moldova WHERE newspaper != ?', ('zugo',))
    rows = c.fetchall()
    for row in rows:
        # Slicing is a no-op for texts that are already short enough.
        text = row[5][:MAX_MOLDAVIAN_CHARS]
        moldavian_texts.setdefault(row[4], []).append(text)

    print('Moldavian texts:', len(moldavian_texts))
finally:
    # Release the database handle even if a query or a row access fails.
    conn.close()

Romanian texts: 15
Moldavian texts: 10


In [5]:
import random

# Seed for the class-balancing sample, so the dataset is identical on every
# run. The value mirrors the random_state given to StratifiedShuffleSplit.
SAMPLING_SEED = 11

# Flatten the per-group lists into one list of texts per class label.
all_texts = {
    "romana": [text for texts in romanian_texts.values() for text in texts],
    "moldova": [text for texts in moldavian_texts.values() for text in texts],
}

# Balance the classes by downsampling whichever one is larger. Sampling only
# when a class exceeds the target size avoids the ValueError random.sample
# raises when asked for more items than the population holds.
class_size = min(len(texts) for texts in all_texts.values())
rng = random.Random(SAMPLING_SEED)
all_texts = {
    label: rng.sample(texts, class_size) if len(texts) > class_size else texts
    for label, texts in all_texts.items()
}

# Number of Romanian articles that go into the dataset (equals class_size).
num_romanian_articles = len(all_texts["romana"])

# X holds the raw texts and y the matching class label, in the same order.
X = np.array([text for texts in all_texts.values() for text in texts])
y = np.array([label for label, texts in all_texts.items() for _ in texts])

assert len(X) == len(y), "every text needs exactly one label"

In [6]:
from sklearn.linear_model import LogisticRegression

# Evaluation scheme: 5 stratified shuffle splits, each holding out 10% of the
# data, with a fixed random_state.
sss = StratifiedShuffleSplit(n_splits=5, test_size=0.1, random_state=11)

# Feature extraction: TF-IDF restricted by document-frequency bounds and a
# vocabulary cap, using the 'ro' stop-word list.
tfidf_step = TfidfVectorizer(
    min_df=3,
    max_df=0.7,
    max_features=10000,
    stop_words=get_stop_words('ro'),
)
# Classifier: logistic regression with an L2 penalty.
clf_step = LogisticRegression(penalty='l2')

text_clf = Pipeline(
    steps=[('tfidf', tfidf_step), ('clf', clf_step)],
    verbose=True,
)

# Hyper-parameter grid; keys use the '<step name>__<parameter>' form.
parameters = dict(
    tfidf__ngram_range=[(1, 4)],
    tfidf__use_idf=(True, False),
    clf__C=(0.1, 1, 10),
)

# Grid search over the pipeline, cross-validated with the same splitter.
gs_clf = GridSearchCV(text_clf, parameters, cv=sss, n_jobs=-1, verbose=1)

In [7]:
# Sanity check: for every split produced by `sss`, print how many samples of
# each class end up in the training part and in the test part.
for train_index, test_index in sss.split(X, y):
    X_train = X[train_index]
    X_test = X[test_index]
    y_train = y[train_index]
    y_test = y[test_index]

    train_romana = np.count_nonzero(y_train == 'romana')
    train_moldova = np.count_nonzero(y_train == 'moldova')
    test_romana = np.count_nonzero(y_test == 'romana')
    test_moldova = np.count_nonzero(y_test == 'moldova')

    print('Train:', ' romana = ', train_romana, ' moldova = ', train_moldova)
    print('Test:', test_romana, test_moldova)

Train:  romana =  916  moldova =  916
Test: 102 102
Train:  romana =  916  moldova =  916
Test: 102 102
Train:  romana =  916  moldova =  916
Test: 102 102


In [8]:
# Per-split test scores: `scores` for the pipeline with its default settings,
# `gs_scores` for the grid-searched pipeline.
scores = []
gs_scores = []

for train_index, test_index in sss.split(X, y):
    X_train = X[train_index]
    X_test = X[test_index]
    y_train = y[train_index]
    y_test = y[test_index]

    # fit() returns the estimator itself, so fitting in place leaves the same
    # object bound to the name as re-assigning the return value would.
    text_clf.fit(X_train, y_train)
    baseline_score = text_clf.score(X_test, y_test)
    scores.append(baseline_score)

    # The grid search is re-run from scratch on this split's training data.
    gs_clf.fit(X_train, y_train)
    tuned_score = gs_clf.score(X_test, y_test)
    gs_scores.append(tuned_score)

[Pipeline] ............. (step 1 of 2) Processing tfidf, total=   0.4s
[Pipeline] ............... (step 2 of 2) Processing clf, total=   0.1s
Fitting 5 folds for each of 6 candidates, totalling 30 fits
[Pipeline] ............. (step 1 of 2) Processing tfidf, total=   4.0s
[Pipeline] ............... (step 2 of 2) Processing clf, total=   0.1s
[Pipeline] ............. (step 1 of 2) Processing tfidf, total=   0.4s
[Pipeline] ............... (step 2 of 2) Processing clf, total=   0.1s
Fitting 5 folds for each of 6 candidates, totalling 30 fits
[Pipeline] ............. (step 1 of 2) Processing tfidf, total=   4.1s
[Pipeline] ............... (step 2 of 2) Processing clf, total=   0.1s
[Pipeline] ............. (step 1 of 2) Processing tfidf, total=   0.4s
[Pipeline] ............... (step 2 of 2) Processing clf, total=   0.1s
Fitting 5 folds for each of 6 candidates, totalling 30 fits
[Pipeline] ............. (step 1 of 2) Processing tfidf, total=   4.2s
[Pipeline] ............... (step 2 of 2

In [9]:
# Summary of the evaluation. The two means cover every split collected in
# `scores` / `gs_scores`; the best parameters, best score and the report below
# describe only the current state of `gs_clf`, `X_test` and `y_test`.
print("Mean score: ", np.mean(scores))
print("Mean grid search score: ", np.mean(gs_scores))
print("Best parameters: ", gs_clf.best_params_)
print("Best score: ", gs_clf.best_score_)
# The report is a multi-line table: print it below its label, otherwise the
# header row is pushed right and no longer lines up with the columns.
print("Classification report:")
print(classification_report(y_test, gs_clf.predict(X_test)))

Mean score:  0.988562091503268
Mean grid search score:  0.9934640522875817
Best parameters:  {'clf__C': 10, 'tfidf__ngram_range': (1, 4), 'tfidf__use_idf': False}
Best score:  0.9983621446970712
Classification report:                precision    recall  f1-score   support

     moldova       0.98      1.00      0.99       102
      romana       1.00      0.98      0.99       102

    accuracy                           0.99       204
   macro avg       0.99      0.99      0.99       204
weighted avg       0.99      0.99      0.99       204



In [95]:
# Collect and print up to MAX_EXAMPLES test texts that the grid-searched
# model labels incorrectly.
MAX_EXAMPLES = 5

# Predict the whole test set once; calling predict inside the loop would
# re-run the full pipeline over X_test for every sample inspected.
test_predictions = gs_clf.predict(X_test)
wrong_positions = np.where(y_test != test_predictions)[0][:MAX_EXAMPLES]

missclassified = [
    (X_test[i], y_test[i], test_predictions[i]) for i in wrong_positions
]

for text, real, predicted in missclassified:
    print("Text: ", text)
    print("Real: ", real)
    print("Predicted: ", predicted)
    # Looks the text up by value, so duplicated texts report every position.
    print("Index of the text: ", np.where(X_test == text), " Out of ", len(X_test))

Text:  "Ți-am pregătit un program special si te așteptăm cu familia să descoperi expozițiile și programele dedicate! 13.00- 14.30- Ateliere de creație penru copii “Lacul codrilor albaștri”- coordonator Luminița Voica 14.00- 14.10- Vernisajul expoziției de fotografie “Patria mea, limba română”. Expoziția aparține editurii Princeps din Ploiești. Vor fi expuse 80 de fotografii ale poetului Nichita Stănescu- Foaier Sala Victor Ioan Popa 14.10- 14.20- vizionarea filmului realizat la Ploiești, la Teatrul Toma Caragiu, În Memoriam Nichita Stănescu 14.20- 14.30- “Pe lângă plopii fără soț”- moment artistic realizat de studenții UNATC         14.30-14.50- “O, rămâi!” recital de cântec și poezie în interpretarea Ansamblului Fluierasii de la Merisani 14.50-15.00- “Pe lângă plopii fără soț”- moment artistic realizat de studenții UNATC 15.00-15.20- “Stelele-n cer”- recital al reprezentantilor Asociației ArtPulse, studenți la Universitatea Națională de Muzică din București la clasele de canto conf.un

In [93]:
# Print the true label, predicted label and class probabilities for the first
# N_PREVIEW test samples, with a separator line after every wrong prediction.
N_PREVIEW = 100

# Run the model once over the whole preview slice instead of twice per sample.
preview_texts = X_test[:N_PREVIEW]
preview_predictions = gs_clf.predict(preview_texts)
preview_probabilities = gs_clf.predict_proba(preview_texts)

for real, prediction, probability in zip(
    y_test[:N_PREVIEW], preview_predictions, preview_probabilities
):
    print("Real: ", real, " Predicted: ", prediction, " Probability: ", probability)
    if real != prediction:
        print("_____________________")


Real:  moldova  Predicted:  moldova  Probability:  [0.99155768 0.00844232]
Real:  romana  Predicted:  romana  Probability:  [4.15249373e-04 9.99584751e-01]
Real:  romana  Predicted:  romana  Probability:  [0.03309088 0.96690912]
Real:  moldova  Predicted:  moldova  Probability:  [0.99105281 0.00894719]
Real:  moldova  Predicted:  moldova  Probability:  [0.99710053 0.00289947]
Real:  romana  Predicted:  romana  Probability:  [5.03690747e-05 9.99949631e-01]
Real:  moldova  Predicted:  moldova  Probability:  [0.99414968 0.00585032]
Real:  romana  Predicted:  romana  Probability:  [3.56981095e-04 9.99643019e-01]
Real:  moldova  Predicted:  moldova  Probability:  [0.99410219 0.00589781]
Real:  romana  Predicted:  romana  Probability:  [1.39824879e-04 9.99860175e-01]
Real:  romana  Predicted:  romana  Probability:  [3.13818981e-04 9.99686181e-01]
Real:  moldova  Predicted:  moldova  Probability:  [0.99125134 0.00874866]
Real:  moldova  Predicted:  moldova  Probability:  [0.98364773 0.0163522