In [1]:
import numpy as np
import sqlite3
from sklearn.model_selection import train_test_split, StratifiedShuffleSplit
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
from stop_words import get_stop_words
import shap
import warnings

warnings.filterwarnings("ignore")

In [2]:
romanian_texts = {}
moldavian_texts = {}

conn = sqlite3.connect('news.db')
c = conn.cursor()

c.execute('SELECT * FROM romania')
rows = c.fetchall()
for row in rows:
    if row[4] not in romanian_texts:
        romanian_texts[row[4]] = []
    romanian_texts[row[4]].append(row[5].strip())
    
print('Romanian texts:', len(romanian_texts))

c.execute('SELECT * FROM moldova WHERE newspaper != "zugo"')
rows = c.fetchall()
for row in rows:
    text = ''
    if len(row[5]) > 10000:
        text = row[5][:10000]
    else:
        text = row[5]
    if row[4] not in moldavian_texts:
        moldavian_texts[row[4]] = []
        
    moldavian_texts[row[4]].append(text)

print('Moldavian texts:', len(moldavian_texts))

conn.close()

Romanian texts: 15
Moldavian texts: 10


In [4]:
# De aici: https://en.wiktionary.org/wiki/Category:Romanian_prefixes
romanian_prefixes = [
    # A
    "agro", "alt", "ante", "anti", "aorto", "arhi", "astro",

    # B
    "balano",

    # C
    "cardio", "carpo", "cosmo",

    # D
    "demono", "des", "dez",

    # F
    "franco",

    # G
    "gastro", "germano", "greco",

    # H
    "hecto", "hiper",

    # I
    "în",

    # K
    "kilo",

    # L
    "lexico",

    # M
    "mili", "muzico",

    # N
    "nano", "ne",

    # O
    "ori", "ornito",

    # P
    "pneumo", "pre", "prea", "proto", "pseudo", "psiho",

    # R
    "răs", "re", "rino", "ruso",

    # S
    "stră", "sub",

    # T
    "tehno", "teo", "termo",

    # V
    "vice"
]


In [9]:
# # Transform î -> â, ș -> ş, ț -> ţ

# def normalize_text(text):
#     text = text.replace("â", "a")
#     text = text.replace("Â", "A")
#     text = text.replace("ș", "s")
#     text = text.replace("ş", "s")
#     text = text.replace("Ș", "S")
#     text = text.replace("Ş", "S")
#     text = text.replace("ț", "t")
#     text = text.replace("ţ", "t")
#     text = text.replace("Ț", "T")
#     text = text.replace("Ţ", "T")
#     text = text.replace("î", "i")
#     text = text.replace("Î", "I")
#     text = text.replace("ă", "a")
#     text = text.replace("Ă", "A")
#     return text

# for key in moldavian_texts:
#     for i in range(len(moldavian_texts[key])):
#         moldavian_texts[key][i] = normalize_text(moldavian_texts[key][i])

# for key in romanian_texts:
#     for i in range(len(romanian_texts[key])):
#         romanian_texts[key][i] = normalize_text(romanian_texts[key][i])

# print(moldavian_texts["Sport"][0])
# print(romanian_texts['Stiri'][0])

In [21]:
def replace_i_prefix(word, prefixes):
  for prefix in prefixes:
    try:
      if word.lower().startswith(prefix) and len(word) > len(prefix) and word[len(prefix):][0] in ["î", "Î"]:
        first_letter = word[len(prefix):][0]
        first_letter = "i" if first_letter == "î" else ("I" if first_letter == "Î" else first_letter)
        word = prefix + first_letter + word[len(prefix) + 1:]

    except:
      print(word)
    
  word = word.replace("î", "a").replace("Î", "A")

  return word

def no_diacritics(text, prefixes):

  text = replace_i_prefix(text, prefixes)


  text = text.replace("â", "i")
  text = text.replace("Â", "I")
  text = text.replace("ș", "s")
  text = text.replace("ş", "s")
  text = text.replace("Ș", "S")
  text = text.replace("Ş", "S")
  text = text.replace("ț", "t")
  text = text.replace("ţ", "t")
  text = text.replace("Ț", "T")
  text = text.replace("Ţ", "T")

  # If î is the first letter of the word, replace it with i
  if text.startswith("î"):
    text = text.replace("î", "i")
  if text.startswith("Î"):
    text = text.replace("Î", "I")
  # If the last letter of the word is î, replace it with i
  if text.endswith("î"):
    text = text.replace("î", "i")
  if text.endswith("Î"):
    text = text.replace("Î", "I")
  # Else replace î with a
  if "î" in text:
    text = text.replace("î", "a")     
  # text = text.replace("î", "i")
  # text = text.replace("Î", "I")
  text = text.replace("ă", "a")
  text = text.replace("Ă", "A")

  return text


# for key in moldavian_texts:
#     for i in range(len(moldavian_texts[key])):
#         moldavian_texts[key][i] = no_diacritics(moldavian_texts[key][i], romanian_prefixes)

# for key in romanian_texts:
#     for i in range(len(romanian_texts[key])):
#         romanian_texts[key][i] = no_diacritics(romanian_texts[key][i], romanian_prefixes)

# print(moldavian_texts["Sport"][0])
# print(romanian_texts['Stiri'][12])

print(no_diacritics("cîțiva", romanian_prefixes))

cativa


In [6]:
romanian=[
    "a", "abia", "acea", "aceasta", "această", "aceea", "aceeasi", "acei",
    "aceia", "acel", "acela", "acelasi", "acele", "acelea", "acest", "acesta",
    "aceste", "acestea", "acestei", "acestia", "acestui", "aceşti", "aceştia",
    "acești", "aceștia", "acolo", "acord", "acum", "adica", "ai", "aia",
    "aibă", "aici", "aiurea", "al", "ala", "alaturi", "ale", "alea", "alt",
    "alta", "altceva", "altcineva", "alte", "altfel", "alti", "altii", "altul",
    "alături", "am", "anume", "apoi", "ar", "are", "as", "asa", "asemenea",
    "asta", "astazi", "astea", "astfel", "astăzi", "asupra", "atare", "atat",
    "atata", "atatea", "atatia", "ati", "atit", "atita", "atitea", "atitia",
    "atunci", "au", "avea", "avem", "aveţi", "aveți", "avut", "azi", "aş",
    "aşadar", "aţi", "aș", "așadar", "ați", "b", "ba", "bine", "bucur", "bună",
    "c", "ca", "cam", "cand", "capat", "care", "careia", "carora", "caruia",
    "cat", "catre", "caut", "ce", "cea", "ceea", "cei", "ceilalti", "cel",
    "cele", "celor", "ceva", "chiar", "ci", "cinci", "cind", "cine", "cineva",
    "cit", "cita", "cite", "citeva", "citi", "câțiva", "conform", "contra",
    "cu", "cui", "cum", "cumva", "curând", "curînd", "când", "cât", "câte",
    "câtva", "câţi", "câți", "cînd", "cît", "cîte", "cîtva", "cîţi", "cîți",
    "că", "căci", "cărei", "căror", "cărui", "către", "d", "da", "daca",
    "dacă", "dar", "dat", "datorită", "dată", "dau", "de", "deasupra", "deci",
    "decit", "degraba", "deja", "deoarece", "departe", "desi", "despre",
    "deşi", "deși", "din", "dinaintea", "dintr", "dintr-", "dintre", "doar",
    "doi", "doilea", "două", "drept", "dupa", "după", "dă", "e", "ea", "ei",
    "el", "ele", "era", "eram", "este", "eu", "exact", "eşti", "ești", "f",
    "face", "fara", "fata", "fel", "fi", "fie", "fiecare", "fii", "fim", "fiu",
    "fiţi", "fiți", "foarte", "fost", "frumos", "fără", "g", "geaba", "graţie",
    "grație", "h", "halbă", "i", "ia", "iar", "ieri", "ii", "il", "imi", "in",
    "inainte", "inapoi", "inca", "incit", "insa", "intr", "intre", "isi",
    "iti", "j", "k", "l", "la", "le", "li", "lor", "lui", "lângă", "lîngă",
    "m", "ma", "mai", "mare", "mea", "mei", "mele", "mereu", "meu", "mi",
    "mie", "mine", "mod", "mult", "multa", "multe", "multi", "multă", "mulţi",
    "mulţumesc", "mulți", "mulțumesc", "mâine", "mîine", "mă", "n", "ne",
    "nevoie", "ni", "nici", "niciodata", "nicăieri", "nimeni", "nimeri",
    "nimic", "niste", "nişte", "niște", "noastre", "noastră", "noi", "noroc",
    "nostri", "nostru", "nou", "noua", "nouă", "noştri", "noștri", "nu",
    "numai", "o", "opt", "or", "ori", "oricare", "orice", "oricine", "oricum",
    "oricând", "oricât", "oricînd", "oricît", "oriunde", "p", "pai", "parca",
    "patra", "patru", "patrulea", "pe", "pentru", "peste", "pic", "pina",
    "plus", "poate", "pot", "prea", "prima", "primul", "prin", "printr-",
    "putini", "puţin", "puţina", "puţină", "puțin", "puțina", "puțină", "până",
    "pînă", "r", "rog", "s", "sa", "sa-mi", "sa-ti", "sai", "sale", "sau",
    "se", "si", "sint", "sintem", "spate", "spre", "sub", "sunt", "suntem",
    "sunteţi", "sunteți", "sus", "sută", "sînt", "sîntem", "sînteţi",
    "sînteți", "să", "săi", "său", "t", "ta", "tale", "te", "ti", "timp",
    "tine", "toata", "toate", "toată", "tocmai", "tot", "toti", "totul",
    "totusi", "totuşi", "totuși", "toţi", "toți", "trei", "treia", "treilea",
    "tu", "tuturor", "tăi", "tău", "u", "ul", "ului", "un", "una", "unde",
    "undeva", "unei", "uneia", "unele", "uneori", "unii", "unor", "unora",
    "unu", "unui", "unuia", "unul", "v", "va", "vi", "voastre", "voastră",
    "voi", "vom", "vor", "vostru", "vouă", "voştri", "voștri", "vreme", "vreo",
    "vreun", "vă", "x", "z", "zece", "zero", "zi", "zice", "îi", "îl", "îmi",
    "împotriva", "în", "înainte", "înaintea", "încotro", "încât", "încît",
    "între", "întrucât", "întrucît", "îţi", "îți", "ăla", "ălea", "ăsta",
    "ăstea", "ăştia", "ăștia", "şapte", "şase", "şi", "ştiu", "ţi", "ţie",
    "șapte", "șase", "și", "știu", "ți", "ție"
]

In [17]:
# # Get all the words from the stop words list and apply the same transformation
stop_words = romanian
for i in range(len(stop_words)):
    stop_words[i] = no_diacritics(stop_words[i], romanian_prefixes)

stop_words = list(set(stop_words))

In [20]:
print(sorted(stop_words))

['a', 'abia', 'acea', 'aceasta', 'aceea', 'aceeasi', 'acei', 'aceia', 'acel', 'acela', 'acelasi', 'acele', 'acelea', 'acest', 'acesta', 'aceste', 'acestea', 'acestei', 'acesti', 'acestia', 'acestui', 'acolo', 'acord', 'acum', 'adica', 'ai', 'aia', 'aiba', 'aici', 'aiurea', 'al', 'ala', 'alaturi', 'ale', 'alea', 'alt', 'alta', 'altceva', 'altcineva', 'alte', 'altfel', 'alti', 'altii', 'altul', 'am', 'ami', 'ampotriva', 'an', 'anainte', 'anaintea', 'ancat', 'ancit', 'ancotro', 'antre', 'antrucat', 'antrucit', 'anume', 'apoi', 'ar', 'are', 'as', 'asa', 'asadar', 'asemenea', 'asta', 'astazi', 'astea', 'astfel', 'astia', 'asupra', 'atare', 'atat', 'atata', 'atatea', 'atatia', 'ati', 'atit', 'atita', 'atitea', 'atitia', 'atunci', 'au', 'avea', 'avem', 'aveti', 'avut', 'azi', 'b', 'ba', 'bine', 'bucur', 'buna', 'c', 'ca', 'caci', 'cam', 'cand', 'capat', 'care', 'carei', 'careia', 'caror', 'carora', 'carui', 'caruia', 'cat', 'cate', 'cati', 'catre', 'catva', 'caut', 'ce', 'cea', 'ceea', 'cei',

In [8]:
import random

all_texts = {"romana": [], "moldova": []}

for key in romanian_texts:
    all_texts["romana"].extend(romanian_texts[key])

for key in moldavian_texts:
    all_texts["moldova"].extend(moldavian_texts[key])

# Get the number of Romanian articles
num_romanian_articles = len(all_texts["romana"])

# Randomly sample the same number of Moldavian articles
all_texts["moldova"] = random.sample(all_texts["moldova"], num_romanian_articles)

X = []
y = []
for key in all_texts:
    X.extend(all_texts[key])
    y.extend([key]*len(all_texts[key]))
    
X = np.array(X)
y = np.array(y)

In [9]:
from sklearn.linear_model import LogisticRegression

sss = StratifiedShuffleSplit(n_splits=5, test_size=0.1, random_state=11)
text_clf = Pipeline(steps=[
        ('tfidf', TfidfVectorizer(min_df=3, max_df=0.7, max_features=10000, vocabulary=stop_words)),
        ('clf', LogisticRegression(penalty='l2'))
    ], verbose=True)
parameters = {
    'tfidf__ngram_range': [(1, 1), (2, 2), (3, 3), (4, 4)],
    'tfidf__use_idf': (True, False),
    'clf__C': (0.1, 1, 10),
}
gs_clf = GridSearchCV(text_clf, parameters, cv=sss, n_jobs=-1, verbose=1)

In [10]:
# plot the first 10 elements of the dataset
for train_index, test_index in sss.split(X, y):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    # print how many samples from each class are in the training and testing set
    print('Train:', ' romana = ', np.count_nonzero(y_train == 'romana'), ' moldova = ', np.count_nonzero(y_train == 'moldova'))
    print('Test:', np.count_nonzero(y_test == 'romana'), np.count_nonzero(y_test == 'moldova'))

Train:  romana =  916  moldova =  916
Test: 102 102
Train:  romana =  916  moldova =  916
Test: 102 102
Train:  romana =  916  moldova =  916
Test: 102 102
Train:  romana =  916  moldova =  916
Test: 102 102
Train:  romana =  916  moldova =  916
Test: 102 102


In [15]:
scores = []
gs_scores = []

for train_index, test_index in sss.split(X, y):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]   
    
    text_clf = text_clf.fit(X_train, y_train)
    scores.append(text_clf.score(X_test, y_test))
    
    gs_clf = gs_clf.fit(X_train, y_train)
    gs_scores.append(gs_clf.score(X_test, y_test))

[Pipeline] ............. (step 1 of 2) Processing tfidf, total=   0.2s
[Pipeline] ............... (step 2 of 2) Processing clf, total=   0.0s
Fitting 5 folds for each of 24 candidates, totalling 120 fits
[Pipeline] ............. (step 1 of 2) Processing tfidf, total=   0.4s
[Pipeline] ............... (step 2 of 2) Processing clf, total=   0.0s
[Pipeline] ............. (step 1 of 2) Processing tfidf, total=   0.4s
[Pipeline] ............. (step 1 of 2) Processing tfidf, total=   0.4s
[Pipeline] ............. (step 1 of 2) Processing tfidf, total=   0.4s
[Pipeline] ............... (step 2 of 2) Processing clf, total=   0.0s
[Pipeline] ............... (step 2 of 2) Processing clf, total=   0.0s
[Pipeline] ............... (step 2 of 2) Processing clf, total=   0.0s
[Pipeline] ............. (step 1 of 2) Processing tfidf, total=   0.4s
[Pipeline] ............... (step 2 of 2) Processing clf, total=   0.0s
[Pipeline] ............. (step 1 of 2) Processing tfidf, total=   0.4s
[Pipeline] ....

In [16]:
print("Mean score: ", np.mean(scores))
print("Mean grid search score: ", np.mean(gs_scores))
print("Best parameters: ", gs_clf.best_params_)
print("Best score: ", gs_clf.best_score_)
print("Classification report: ", classification_report(y_test, gs_clf.predict(X_test), ))

Mean score:  0.9980392156862745
Mean grid search score:  0.9980392156862745
Best parameters:  {'clf__C': 1, 'tfidf__ngram_range': (1, 1), 'tfidf__use_idf': True}
Best score:  0.9989130434782609
Classification report:                precision    recall  f1-score   support

     moldova       0.99      1.00      1.00       102
      romana       1.00      0.99      1.00       102

    accuracy                           1.00       204
   macro avg       1.00      1.00      1.00       204
weighted avg       1.00      1.00      1.00       204



In [17]:
missclassified = []
for i in range(len(y_test)):
    if y_test[i] != gs_clf.predict(X_test)[i]:
        missclassified.append((X_test[i], y_test[i], gs_clf.predict(X_test)[i]))
        if len(missclassified) == 5:
            break
        
for text, real, predicted in missclassified:
    print("Text: ", text)
    print("Real: ", real)
    print("Predicted: ", predicted)
    print("Index of the text: ", np.where(X_test == text), " Out of ", len(X_test))

Text:  Update 14:36. "La interventia de la zona de triaj din Gara Basarab, se redimensioneaza dispozitivul, ramin 7 autospeciale de stingere cu apa si spuma. Se lucreaza la stingerea ultimelor 3 vagoane", transmite ISU B-IF. Update 13:23: La interventia de la zona de triaj din Gara Basarab, an momentul de fata incendiul este localizat. Au fost afectate 15 vagoane dezafectate. Update 12:47: "Avind in vedere modul de manifestare a incendiului, dispozitivul se suplimenteaza cu 5 cisterne de mare capacitate pentru a realiza rezerva de apa. An total actioneaza 9 autospesciale de stingere cu apa si spuma si 5 cisterne", precizeaza ISU-B-IF. Reprezentantii ISU Bucuresti-Ilfov au anuntat ca pompierii intervin pentru stingerea unui incendiu produs la doua vagoane dezafectate, an zona de triaj dintre Gara Basarab si Podul Grant. An GALERIA FOTO puteti gasi imagini din timpul interventiei pompierilor         ››› Vezi galeria foto ‹‹‹ "Intervenim pentru stingerea unui incendiu produs la doua vagoa

In [18]:
# Predict first 10 elements of the test set
for i in range(len(y_test[:100])):
    prediction = gs_clf.predict([X_test[i]])[0]
    print("Real: ", y_test[i], " Predicted: ", prediction, " Probability: ", gs_clf.predict_proba([X_test[i]])[0])
    if y_test[i] != prediction:
        print("_____________________")


Real:  moldova  Predicted:  moldova  Probability:  [0.99544689 0.00455311]
Real:  moldova  Predicted:  moldova  Probability:  [9.99967272e-01 3.27281458e-05]
Real:  romana  Predicted:  romana  Probability:  [0.02135334 0.97864666]
Real:  romana  Predicted:  romana  Probability:  [0.00709559 0.99290441]
Real:  romana  Predicted:  romana  Probability:  [0.02806754 0.97193246]
Real:  romana  Predicted:  romana  Probability:  [0.03888786 0.96111214]
Real:  moldova  Predicted:  moldova  Probability:  [0.9940769 0.0059231]
Real:  moldova  Predicted:  moldova  Probability:  [9.99967612e-01 3.23876247e-05]
Real:  moldova  Predicted:  moldova  Probability:  [9.99370151e-01 6.29848830e-04]
Real:  romana  Predicted:  romana  Probability:  [0.10092349 0.89907651]
Real:  moldova  Predicted:  moldova  Probability:  [9.99995264e-01 4.73612281e-06]
Real:  moldova  Predicted:  moldova  Probability:  [9.99976670e-01 2.33304056e-05]
Real:  romana  Predicted:  romana  Probability:  [0.01728282 0.98271718]