In [2]:
import numpy as np
import sqlite3
from sklearn.model_selection import train_test_split, StratifiedShuffleSplit
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
from stop_words import get_stop_words
import shap
import warnings

warnings.filterwarnings("ignore")

In [3]:
romanian_texts = {}
moldavian_texts = {}

conn = sqlite3.connect('news.db')
c = conn.cursor()

c.execute('SELECT * FROM romania')
rows = c.fetchall()
for row in rows:
    if row[4] not in romanian_texts:
        romanian_texts[row[4]] = []
    romanian_texts[row[4]].append(row[5].strip())
    
print('Romanian texts:', len(romanian_texts))

c.execute('SELECT * FROM moldova WHERE newspaper != "zugo"')
rows = c.fetchall()
for row in rows:
    text = ''
    if len(row[5]) > 10000:
        text = row[5][:10000]
    else:
        text = row[5]
    if row[4] not in moldavian_texts:
        moldavian_texts[row[4]] = []
        
    moldavian_texts[row[4]].append(text)

print('Moldavian texts:', len(moldavian_texts))

conn.close()

Romanian texts: 15
Moldavian texts: 10


In [4]:
# De aici: https://en.wiktionary.org/wiki/Category:Romanian_prefixes
romanian_prefixes = [
    # A
    "agro", "alt", "ante", "anti", "aorto", "arhi", "astro",

    # B
    "balano",

    # C
    "cardio", "carpo", "cosmo",

    # D
    "demono", "des", "dez",

    # F
    "franco",

    # G
    "gastro", "germano", "greco",

    # H
    "hecto", "hiper",

    # I
    "în",

    # K
    "kilo",

    # L
    "lexico",

    # M
    "mili", "muzico",

    # N
    "nano", "ne",

    # O
    "ori", "ornito",

    # P
    "pneumo", "pre", "prea", "proto", "pseudo", "psiho",

    # R
    "răs", "re", "rino", "ruso",

    # S
    "stră", "sub",

    # T
    "tehno", "teo", "termo",

    # V
    "vice"
]


In [5]:
# Transform î -> â, ș -> ş, ț -> ţ

def normalize_text(text):
    text = text.replace("â", "a")
    text = text.replace("Â", "A")
    text = text.replace("ș", "s")
    text = text.replace("ş", "s")
    text = text.replace("Ș", "S")
    text = text.replace("Ş", "S")
    text = text.replace("ț", "t")
    text = text.replace("ţ", "t")
    text = text.replace("Ț", "T")
    text = text.replace("Ţ", "T")
    text = text.replace("î", "i")
    text = text.replace("Î", "I")
    text = text.replace("ă", "a")
    text = text.replace("Ă", "A")
    return text

for key in moldavian_texts:
    for i in range(len(moldavian_texts[key])):
        moldavian_texts[key][i] = normalize_text(moldavian_texts[key][i])

for key in romanian_texts:
    for i in range(len(romanian_texts[key])):
        romanian_texts[key][i] = normalize_text(romanian_texts[key][i])

print(moldavian_texts["Sport"][0])
print(romanian_texts['Stiri'][0])

 Real Madrid a castigat Supercupa Spaniei, dupa ce in finala a invins-o pe Barcelona, scor 4-1.Madrilenii au avut un start de meci excelent, cu doua goluri marcate in doar trei minute de Vinicius. Brazilianul a reusit hat-trick-ul in minutul 39, dupa ce a transformat un penalty. Partida din Supercupa Spaniei este cel de-al 15-lea "El Clasico" pentru Vinicius. Brazilianul a reusit o performanta importanta pentru cariera sa. Starul lui Real Madrid este cel de-al 16-lea jucator din istorie care a marcat de trei ori intr-un "El Clasico". Pe aceasta lista se mai afla: Jaime Lazcano, Joan Ramon i Pera, Ventora, Jesus Narro, Cesar, Evaristo de Macedo, Amancio, Ferenc Puskas, Ivan Zamorano, Fernando Sanudo, Gary Lineker, Romario, Luis Suarez, Karim Benzema si Lionel Messi. Este important de precizat ca Messi este singurul fotbalist dintre cei enumerati care a reusit aceasta performanta de doua ori in cariera, potrivit Fanatik. Partida din Supercupa Spaniei este cel de-al 15-lea "El Clasico" pe

In [7]:
def replace_i_prefix(word, prefixes):
    for prefix in prefixes:
      if word.lower().startswith(prefix):
        word = word[len(prefix) - 1:].replace("î", "a")
        word = word[len(prefix) - 1:].replace("Î", "a")
        word = prefix + word

    word = word.replace("î", "i").replace("Î", "I")

    return word

def no_diacritics(text, prefixes):

  if ("î" in text or "Î" in text):
    text = replace_i_prefix(text, prefixes)


  text = text.replace("â", "i")
  text = text.replace("Â", "I")
  text = text.replace("ș", "s")
  text = text.replace("ş", "s")
  text = text.replace("Ș", "S")
  text = text.replace("Ş", "S")
  text = text.replace("ț", "t")
  text = text.replace("ţ", "t")
  text = text.replace("Ț", "T")
  text = text.replace("Ţ", "T")

  # If î is the first letter of the word, replace it with i
  if text.startswith("î"):
    text = text.replace("î", "i")
  if text.startswith("Î"):
    text = text.replace("Î", "I")
  # If the last letter of the word is î, replace it with i
  if text.endswith("î"):
    text = text.replace("î", "i")
  if text.endswith("Î"):
    text = text.replace("Î", "I")
  # Else replace î with a
  if "î" in text:
    text = text.replace("î", "a")     
  # text = text.replace("î", "i")
  # text = text.replace("Î", "I")
  text = text.replace("ă", "a")
  text = text.replace("Ă", "A")

  return text


for key in moldavian_texts:
    for i in range(len(moldavian_texts[key])):
        moldavian_texts[key][i] = no_diacritics(moldavian_texts[key][i], romanian_prefixes)

for key in romanian_texts:
    for i in range(len(romanian_texts[key])):
        romanian_texts[key][i] = no_diacritics(romanian_texts[key][i], romanian_prefixes)

print(moldavian_texts["Sport"][0])
print(romanian_texts['Stiri'][12])

 Real Madrid a castigat Supercupa Spaniei, dupa ce in finala a invins-o pe Barcelona, scor 4-1.Madrilenii au avut un start de meci excelent, cu doua goluri marcate in doar trei minute de Vinicius. Brazilianul a reusit hat-trick-ul in minutul 39, dupa ce a transformat un penalty. Partida din Supercupa Spaniei este cel de-al 15-lea "El Clasico" pentru Vinicius. Brazilianul a reusit o performanta importanta pentru cariera sa. Starul lui Real Madrid este cel de-al 16-lea jucator din istorie care a marcat de trei ori intr-un "El Clasico". Pe aceasta lista se mai afla: Jaime Lazcano, Joan Ramon i Pera, Ventora, Jesus Narro, Cesar, Evaristo de Macedo, Amancio, Ferenc Puskas, Ivan Zamorano, Fernando Sanudo, Gary Lineker, Romario, Luis Suarez, Karim Benzema si Lionel Messi. Este important de precizat ca Messi este singurul fotbalist dintre cei enumerati care a reusit aceasta performanta de doua ori in cariera, potrivit Fanatik. Partida din Supercupa Spaniei este cel de-al 15-lea "El Clasico" pe

In [12]:
# Get all the words from the stop words list and apply the same transformation
stop_words = get_stop_words('romanian')
for i in range(len(stop_words)):
    stop_words[i] = no_diacritics(stop_words[i], romanian_prefixes)

stop_words = list(set(stop_words))

In [8]:
import random

all_texts = {"romana": [], "moldova": []}

for key in romanian_texts:
    all_texts["romana"].extend(romanian_texts[key])

for key in moldavian_texts:
    all_texts["moldova"].extend(moldavian_texts[key])

# Get the number of Romanian articles
num_romanian_articles = len(all_texts["romana"])

# Randomly sample the same number of Moldavian articles
all_texts["moldova"] = random.sample(all_texts["moldova"], num_romanian_articles)

X = []
y = []
for key in all_texts:
    X.extend(all_texts[key])
    y.extend([key]*len(all_texts[key]))
    
X = np.array(X)
y = np.array(y)

In [9]:
from sklearn.linear_model import LogisticRegression

sss = StratifiedShuffleSplit(n_splits=5, test_size=0.1, random_state=11)
text_clf = Pipeline(steps=[
        ('tfidf', TfidfVectorizer(min_df=3, max_df=0.7, max_features=10000, vocabulary=stop_words)),
        ('clf', LogisticRegression(penalty='l2'))
    ], verbose=True)
parameters = {
    'tfidf__ngram_range': [(1, 1), (2, 2), (3, 3), (4, 4)],
    'tfidf__use_idf': (True, False),
    'clf__C': (0.1, 1, 10),
}
gs_clf = GridSearchCV(text_clf, parameters, cv=sss, n_jobs=-1, verbose=1)

In [10]:
# plot the first 10 elements of the dataset
for train_index, test_index in sss.split(X, y):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    # print how many samples from each class are in the training and testing set
    print('Train:', ' romana = ', np.count_nonzero(y_train == 'romana'), ' moldova = ', np.count_nonzero(y_train == 'moldova'))
    print('Test:', np.count_nonzero(y_test == 'romana'), np.count_nonzero(y_test == 'moldova'))

Train:  romana =  916  moldova =  916
Test: 102 102
Train:  romana =  916  moldova =  916
Test: 102 102
Train:  romana =  916  moldova =  916
Test: 102 102
Train:  romana =  916  moldova =  916
Test: 102 102
Train:  romana =  916  moldova =  916
Test: 102 102


In [13]:
scores = []
gs_scores = []

for train_index, test_index in sss.split(X, y):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]   
    
    text_clf = text_clf.fit(X_train, y_train)
    scores.append(text_clf.score(X_test, y_test))
    
    gs_clf = gs_clf.fit(X_train, y_train)
    gs_scores.append(gs_clf.score(X_test, y_test))

[Pipeline] ............. (step 1 of 2) Processing tfidf, total=   0.2s
[Pipeline] ............... (step 2 of 2) Processing clf, total=   0.0s
Fitting 5 folds for each of 24 candidates, totalling 120 fits
[Pipeline] ............. (step 1 of 2) Processing tfidf, total=   0.3s
[Pipeline] ............... (step 2 of 2) Processing clf, total=   0.0s
[Pipeline] ............. (step 1 of 2) Processing tfidf, total=   0.3s
[Pipeline] ............. (step 1 of 2) Processing tfidf, total=   0.3s
[Pipeline] ............... (step 2 of 2) Processing clf, total=   0.0s
[Pipeline] ............... (step 2 of 2) Processing clf, total=   0.0s
[Pipeline] ............. (step 1 of 2) Processing tfidf, total=   0.4s
[Pipeline] ............. (step 1 of 2) Processing tfidf, total=   0.3s
[Pipeline] ............... (step 2 of 2) Processing clf, total=   0.0s
[Pipeline] ............... (step 2 of 2) Processing clf, total=   0.0s
[Pipeline] ............. (step 1 of 2) Processing tfidf, total=   0.3s
[Pipeline] ....

In [14]:
print("Mean score: ", np.mean(scores))
print("Mean grid search score: ", np.mean(gs_scores))
print("Best parameters: ", gs_clf.best_params_)
print("Best score: ", gs_clf.best_score_)
print("Classification report: ", classification_report(y_test, gs_clf.predict(X_test), ))

Mean score:  0.7362745098039216
Mean grid search score:  0.7529411764705882
Best parameters:  {'clf__C': 10, 'tfidf__ngram_range': (1, 1), 'tfidf__use_idf': True}
Best score:  0.7576086956521739
Classification report:                precision    recall  f1-score   support

     moldova       0.73      0.75      0.74       102
      romana       0.75      0.73      0.74       102

    accuracy                           0.74       204
   macro avg       0.74      0.74      0.74       204
weighted avg       0.74      0.74      0.74       204



In [56]:
missclassified = []
for i in range(len(y_test)):
    if y_test[i] != gs_clf.predict(X_test)[i]:
        missclassified.append((X_test[i], y_test[i], gs_clf.predict(X_test)[i]))
        if len(missclassified) == 5:
            break
        
for text, real, predicted in missclassified:
    print("Text: ", text)
    print("Real: ", real)
    print("Predicted: ", predicted)
    print("Index of the text: ", np.where(X_test == text), " Out of ", len(X_test))

Text:  Social Trei sute de mii de moldoveni cu cetatenie romana ar risca sa ramana fara buletine romanesti, sustin reprezentantii unei companii juridice. Potrivit acestora, autoritatile romane verifica adresele de domiciliu ale detinatorilor de carti de identitate romanesti, iar daca persoanele nu sunt identificate la acea adresa, documentele le-ar putea fi confiscate.Compania sustine ca aceste lucruri nu sunt justificate, in timp ce autoritatile romane vor sa se asigure ca elimina situatiile in care un numar mare de persoane isi stabilesc, in mod fictiv, domiciliul la o adresa.,,Mai mult noi, cei din Republica Moldova, se afirma ca abuziv detinem acest buletin romanesc si hai sa ne stopeze, sa ne ridice spre anulare cartea de identitate si ori ne dezicem de ea, ori ne conformam legii noi. Sa respectam conditiile cu zece persoane la domiciliu, sa respectam deplasarea si lipsa de la domiciliu pana la 15 zile. Sa declaram daca lipsim din Romania mai mult de 15 zile", opineaza Ion Gustiuc

In [57]:
# Predict first 10 elements of the test set
for i in range(len(y_test[:100])):
    prediction = gs_clf.predict([X_test[i]])[0]
    print("Real: ", y_test[i], " Predicted: ", prediction, " Probability: ", gs_clf.predict_proba([X_test[i]])[0])
    if y_test[i] != prediction:
        print("_____________________")


Real:  moldova  Predicted:  moldova  Probability:  [0.86579404 0.13420596]
Real:  moldova  Predicted:  romana  Probability:  [0.24321791 0.75678209]
_____________________
Real:  romana  Predicted:  romana  Probability:  [0.04552513 0.95447487]
Real:  romana  Predicted:  romana  Probability:  [0.47528714 0.52471286]
Real:  romana  Predicted:  romana  Probability:  [0.27754786 0.72245214]
Real:  romana  Predicted:  moldova  Probability:  [0.65871061 0.34128939]
_____________________
Real:  moldova  Predicted:  moldova  Probability:  [0.91431567 0.08568433]
Real:  moldova  Predicted:  moldova  Probability:  [0.89258263 0.10741737]
Real:  moldova  Predicted:  moldova  Probability:  [0.93774403 0.06225597]
Real:  romana  Predicted:  moldova  Probability:  [0.94420499 0.05579501]
_____________________
Real:  moldova  Predicted:  romana  Probability:  [0.27045041 0.72954959]
_____________________
Real:  moldova  Predicted:  moldova  Probability:  [0.93230916 0.06769084]
Real:  romana  Predict