In [7]:
import numpy as np
import sqlite3
from sklearn.model_selection import train_test_split, StratifiedShuffleSplit
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
from stop_words import get_stop_words
import shap
import warnings

warnings.filterwarnings("ignore")

In [8]:
romanian_texts = {}
moldavian_texts = {}

conn = sqlite3.connect('news_diacritics_final.db')
c = conn.cursor()

c.execute('SELECT * FROM romania')
rows = c.fetchall()
for row in rows:
    if row[4] not in romanian_texts:
        romanian_texts[row[4]] = []
    romanian_texts[row[4]].append(row[5].strip())
    
print('Romanian categories:', len(romanian_texts))

c.execute('SELECT * FROM moldova')
rows = c.fetchall()
for row in rows:
    text = ''
    if len(row[5]) > 10000:
        text = row[5][:10000]
    else:
        text = row[5]
    if row[4] not in moldavian_texts:
        moldavian_texts[row[4]] = []
        
    moldavian_texts[row[4]].append(text)

print('Moldavian categories:', len(moldavian_texts))

conn.close()

Romanian categories: 16
Moldavian categories: 14


In [9]:
# De aici: https://en.wiktionary.org/wiki/Category:Romanian_prefixes
romanian_prefixes = [
    # A
    "agro", "alt", "ante", "anti", "aorto", "arhi", "astro",

    # B
    "balano",

    # C
    "cardio", "carpo", "cosmo",

    # D
    "demono", "des", "dez",

    # F
    "franco",

    # G
    "gastro", "germano", "greco",

    # H
    "hecto", "hiper",

    # I
    "în",

    # K
    "kilo",

    # L
    "lexico",

    # M
    "mili", "muzico",

    # N
    "nano", "ne",

    # O
    "ori", "ornito",

    # P
    "pneumo", "pre", "prea", "proto", "pseudo", "psiho",

    # R
    "răs", "re", "rino", "ruso",

    # S
    "stră", "sub",

    # T
    "tehno", "teo", "termo",

    # V
    "vice"
]


In [12]:
def replace_i_prefix(word, prefixes):
  for prefix in prefixes:
    try:
      if word.lower().startswith(prefix) and len(word) > len(prefix) and word[len(prefix):][0] in ["î", "Î"]:
        first_letter = word[len(prefix):][0]
        first_letter = "i" if first_letter == "î" else ("I" if first_letter == "Î" else first_letter)
        word = prefix + first_letter + word[len(prefix) + 1:]

    except:
      print(word)
    
  word = word.replace("î", "a").replace("Î", "A")

  return word

def no_diacritics(text, prefixes):

  text = replace_i_prefix(text, prefixes)


  text = text.replace("â", "i")
  text = text.replace("Â", "I")
  text = text.replace("ș", "s")
  text = text.replace("ş", "s")
  text = text.replace("Ș", "S")
  text = text.replace("Ş", "S")
  text = text.replace("ț", "t")
  text = text.replace("ţ", "t")
  text = text.replace("Ț", "T")
  text = text.replace("Ţ", "T")

  # If î is the first letter of the word, replace it with i
  if text.startswith("î"):
    text = text.replace("î", "i")
  if text.startswith("Î"):
    text = text.replace("Î", "I")
  # If the last letter of the word is î, replace it with i
  if text.endswith("î"):
    text = text.replace("î", "i")
  if text.endswith("Î"):
    text = text.replace("Î", "I")
  # Else replace î with a
  if "î" in text:
    text = text.replace("î", "a")     
  # text = text.replace("î", "i")
  # text = text.replace("Î", "I")
  text = text.replace("ă", "a")
  text = text.replace("Ă", "A")

  return text


# for key in moldavian_texts:
#     for i in range(len(moldavian_texts[key])):
#         moldavian_texts[key][i] = no_diacritics(moldavian_texts[key][i], romanian_prefixes)

# for key in romanian_texts:
#     for i in range(len(romanian_texts[key])):
#         romanian_texts[key][i] = no_diacritics(romanian_texts[key][i], romanian_prefixes)

# print(moldavian_texts["Sport"][0])
# print(romanian_texts['Stiri'][12])

print(no_diacritics("cîțiva", romanian_prefixes))

cativa


In [10]:
romanian=[
    "a", "abia", "acea", "aceasta", "această", "aceea", "aceeasi", "acei",
    "aceia", "acel", "acela", "acelasi", "acele", "acelea", "acest", "acesta",
    "aceste", "acestea", "acestei", "acestia", "acestui", "aceşti", "aceştia",
    "acești", "aceștia", "acolo", "acord", "acum", "adica", "ai", "aia",
    "aibă", "aici", "aiurea", "al", "ala", "alaturi", "ale", "alea", "alt",
    "alta", "altceva", "altcineva", "alte", "altfel", "alti", "altii", "altul",
    "alături", "am", "anume", "apoi", "ar", "are", "as", "asa", "asemenea",
    "asta", "astazi", "astea", "astfel", "astăzi", "asupra", "atare", "atat",
    "atata", "atatea", "atatia", "ati", "atit", "atita", "atitea", "atitia",
    "atunci", "au", "avea", "avem", "aveţi", "aveți", "avut", "azi", "aş",
    "aşadar", "aţi", "aș", "așadar", "ați", "b", "ba", "bine", "bucur", "bună",
    "c", "ca", "cam", "cand", "capat", "care", "careia", "carora", "caruia",
    "cat", "catre", "caut", "ce", "cea", "ceea", "cei", "ceilalti", "cel",
    "cele", "celor", "ceva", "chiar", "ci", "cinci", "cind", "cine", "cineva",
    "cit", "cita", "cite", "citeva", "citi", "câțiva", "conform", "contra",
    "cu", "cui", "cum", "cumva", "curând", "curînd", "când", "cât", "câte",
    "câtva", "câţi", "câți", "cînd", "cît", "cîte", "cîtva", "cîţi", "cîți",
    "că", "căci", "cărei", "căror", "cărui", "către", "d", "da", "daca",
    "dacă", "dar", "dat", "datorită", "dată", "dau", "de", "deasupra", "deci",
    "decit", "degraba", "deja", "deoarece", "departe", "desi", "despre",
    "deşi", "deși", "din", "dinaintea", "dintr", "dintr-", "dintre", "doar",
    "doi", "doilea", "două", "drept", "dupa", "după", "dă", "e", "ea", "ei",
    "el", "ele", "era", "eram", "este", "eu", "exact", "eşti", "ești", "f",
    "face", "fara", "fata", "fel", "fi", "fie", "fiecare", "fii", "fim", "fiu",
    "fiţi", "fiți", "foarte", "fost", "frumos", "fără", "g", "geaba", "graţie",
    "grație", "h", "halbă", "i", "ia", "iar", "ieri", "ii", "il", "imi", "in",
    "inainte", "inapoi", "inca", "incit", "insa", "intr", "intre", "isi",
    "iti", "j", "k", "l", "la", "le", "li", "lor", "lui", "lângă", "lîngă",
    "m", "ma", "mai", "mare", "mea", "mei", "mele", "mereu", "meu", "mi",
    "mie", "mine", "mod", "mult", "multa", "multe", "multi", "multă", "mulţi",
    "mulţumesc", "mulți", "mulțumesc", "mâine", "mîine", "mă", "n", "ne",
    "nevoie", "ni", "nici", "niciodata", "nicăieri", "nimeni", "nimeri",
    "nimic", "niste", "nişte", "niște", "noastre", "noastră", "noi", "noroc",
    "nostri", "nostru", "nou", "noua", "nouă", "noştri", "noștri", "nu",
    "numai", "o", "opt", "or", "ori", "oricare", "orice", "oricine", "oricum",
    "oricând", "oricât", "oricînd", "oricît", "oriunde", "p", "pai", "parca",
    "patra", "patru", "patrulea", "pe", "pentru", "peste", "pic", "pina",
    "plus", "poate", "pot", "prea", "prima", "primul", "prin", "printr-",
    "putini", "puţin", "puţina", "puţină", "puțin", "puțina", "puțină", "până",
    "pînă", "r", "rog", "s", "sa", "sa-mi", "sa-ti", "sai", "sale", "sau",
    "se", "si", "sint", "sintem", "spate", "spre", "sub", "sunt", "suntem",
    "sunteţi", "sunteți", "sus", "sută", "sînt", "sîntem", "sînteţi",
    "sînteți", "să", "săi", "său", "t", "ta", "tale", "te", "ti", "timp",
    "tine", "toata", "toate", "toată", "tocmai", "tot", "toti", "totul",
    "totusi", "totuşi", "totuși", "toţi", "toți", "trei", "treia", "treilea",
    "tu", "tuturor", "tăi", "tău", "u", "ul", "ului", "un", "una", "unde",
    "undeva", "unei", "uneia", "unele", "uneori", "unii", "unor", "unora",
    "unu", "unui", "unuia", "unul", "v", "va", "vi", "voastre", "voastră",
    "voi", "vom", "vor", "vostru", "vouă", "voştri", "voștri", "vreme", "vreo",
    "vreun", "vă", "x", "z", "zece", "zero", "zi", "zice", "îi", "îl", "îmi",
    "împotriva", "în", "înainte", "înaintea", "încotro", "încât", "încît",
    "între", "întrucât", "întrucît", "îţi", "îți", "ăla", "ălea", "ăsta",
    "ăstea", "ăştia", "ăștia", "şapte", "şase", "şi", "ştiu", "ţi", "ţie",
    "șapte", "șase", "și", "știu", "ți", "ție"
]

In [13]:
# # Get all the words from the stop words list and apply the same transformation
stop_words = romanian
for i in range(len(stop_words)):
    stop_words[i] = no_diacritics(stop_words[i], romanian_prefixes)

stop_words = list(set(stop_words))

In [13]:
print(sorted(stop_words))

['a', 'abia', 'acea', 'aceasta', 'aceea', 'aceeasi', 'acei', 'aceia', 'acel', 'acela', 'acelasi', 'acele', 'acelea', 'acest', 'acesta', 'aceste', 'acestea', 'acestei', 'acesti', 'acestia', 'acestui', 'acolo', 'acord', 'acum', 'adica', 'ai', 'aia', 'aiba', 'aici', 'aiurea', 'al', 'ala', 'alaturi', 'ale', 'alea', 'alt', 'alta', 'altceva', 'altcineva', 'alte', 'altfel', 'alti', 'altii', 'altul', 'am', 'ami', 'ampotriva', 'an', 'anainte', 'anaintea', 'ancat', 'ancit', 'ancotro', 'antre', 'antrucat', 'antrucit', 'anume', 'apoi', 'ar', 'are', 'as', 'asa', 'asadar', 'asemenea', 'asta', 'astazi', 'astea', 'astfel', 'astia', 'asupra', 'atare', 'atat', 'atata', 'atatea', 'atatia', 'ati', 'atit', 'atita', 'atitea', 'atitia', 'atunci', 'au', 'avea', 'avem', 'aveti', 'avut', 'azi', 'b', 'ba', 'bine', 'bucur', 'buna', 'c', 'ca', 'caci', 'cam', 'cand', 'capat', 'care', 'carei', 'careia', 'caror', 'carora', 'carui', 'caruia', 'cat', 'cate', 'cati', 'catre', 'catva', 'caut', 'ce', 'cea', 'ceea', 'cei',

In [14]:
import random

all_texts = {"romana": [], "moldova": []}

for key in romanian_texts:
    all_texts["romana"].extend(romanian_texts[key])

for key in moldavian_texts:
    all_texts["moldova"].extend(moldavian_texts[key])


# Get the number of Romanian articles
num_moldavian_articles = len(all_texts["moldova"])

# Randomly sample the same number of Moldavian articles
all_texts["romana"] = random.sample(all_texts["romana"], num_moldavian_articles)

print(len(all_texts["romana"]))
print(len(all_texts["moldova"]))
# X = []
# y = []
# for key in all_texts:
#     X.extend(all_texts[key])
#     y.extend([key]*len(all_texts[key]))
    
# X = np.array(X)
# y = np.array(y)

X_list = all_texts["romana"] + all_texts["moldova"]
y_list = ["romana"]*num_moldavian_articles + ["moldova"]*num_moldavian_articles

25381
25381


In [15]:
import spacy

nlp = spacy.load('ro_core_news_md')

def tokenize(text):
  text = nlp(text.lower())
  # print ([ w for w in text ])
  return ([w.lemma_.strip() for w in text if w.lemma_.strip() in stop_words])
  # text = [ word.lemma_.lower().strip() if word.lemma_ != "-PRON-" else word.lower_ for word in text ]
  # text = [ word for word in text if word in stop_words]
  # return [ word for word in [ word.lemma_.lower().strip() for word in text ] if word in stop_words]

tokenize('Salut, sunt Ionel acest haiduc cu cai putere                   ')

['fi', 'acest', 'cu']

In [16]:
from tqdm import tqdm

print(X_list[0])

for i in tqdm(range(len(X_list))):
  X_list[i] = tokenize(X_list[i])

print(len(X_list))

România a omis din PNRR investiţiile în irigaţii, însă nici Europa nu are o politică de combatere a schimbărilor climatice, a spus ministrul Investiţiilor, , marţi, la conferinţa România inteligentă, organizată de Antena 3 CNN.  „Această dezbatere poate duce și la anumite concluzii care pot marca schimbarea unei paradigme la nivel european. Nu numai ai noștri au fost netoți. Inclusiv Europa nu a avut o politică de combatere a schimbărilor climatice. Vom discuta acum și poate vă șochez, dar noi avem măsuri de sprijin pentru combaterea efectelor schimbărilor climatice”, a spus ministrul Investiţiilor şi proiectelor europene, Adrian Câciu, marţi, la conferinţa România inteligentă, organizată de Antena 3 CNN. Ministrul Agriculturii a amintit la conferinţa România inteligentă de cei care au „omis” din PNRR tocmai investițiile în irigații.  Din acest motiv, România trebuie să recurgă acum la credite cu dobândă mai avantajoasă de la BEI, dar 1,5 miliarde pentru sistemul de irigații puteau fi 

  6%|▌         | 3132/50762 [02:04<31:32, 25.17it/s]


KeyboardInterrupt: 

In [1]:
# Pickle the data
import pickle

with open("X_list_tokenized.pkl", "wb") as f:
    pickle.dump(X_list, f)

with open("y_list_tokenized.pkl", "wb") as f:
    pickle.dump(y_list, f)

NameError: name 'X_list' is not defined

In [24]:
import pickle

X_list_tokenized = []
y_list_tokenized = []

# Load X_list_tokenized
with open('X_list_tokenized.pkl', 'rb') as file_X_list:
    X_list_tokenized = pickle.load(file_X_list)

# Load y_list_tokenized
with open('y_list_tokenized.pkl', 'rb') as file_y_list:
    y_list_tokenized = pickle.load(file_y_list)

In [25]:
print(len(X_list_tokenized))

50762


In [22]:
import pickle
X_list = []
y_list = []
file_X_list = open('X_list.pkl', 'rb')
X_list = pickle.load(file_X_list)
file_X_list.close()

file_y_list = open('y_list.pkl', 'rb')
y_list = pickle.load(file_y_list)
file_y_list.close()

In [23]:
print(len(X_list))

50762


In [21]:
import numpy as np
import plotly.graph_objects as go
import plotly.offline as pyo
pyo.init_notebook_mode(connected=True)

# Filter out articles with more than 2000 tokens
filtered_tokens = [len(article) for article in X_list_tokenized if len(article) <= 1500]

print("Number of tokens in the articles")
print(sorted(filtered_tokens))

# Calculate the maximum number of tokens
max_tokens = max(filtered_tokens)

# Calculate the minimum number of tokens
min_tokens = min(filtered_tokens)

# Calculate the mean number of tokens
mean_tokens = np.mean(filtered_tokens)

# Calculate the median number of tokens
median_tokens = np.median(filtered_tokens)

# Calculate the standard deviation of the number of tokens
std_tokens = np.std(filtered_tokens)

print("Max tokens:", max_tokens)
print("Min tokens:", min_tokens)
print("Mean tokens:", mean_tokens)
print("Median tokens:", median_tokens)
print("Standard deviation tokens:", std_tokens)

fig = go.Figure(data=[go.Histogram(x=filtered_tokens, nbinsx=100)])
fig.update_layout(
    title='Number of tokens in the articles distributed over 100 bins',
    xaxis_title='Number of tokens',
    yaxis_title='Number of articles',
    width=900,
    height=600
)

pyo.iplot(fig)

Number of tokens in the articles
[0, 0, 0, 1, 1, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 15, 15, 15, 15, 15, 1

In [35]:
import numpy as np
import plotly.graph_objects as go
import plotly.offline as pyo
pyo.init_notebook_mode(connected=True)

# Assuming y_list contains the language labels for each article
# "romana" for Romanian and "moldova" for Moldovan
romana_articles = [article for article, label in zip(X_list_tokenized, y_list) if label == "romana"]
moldova_articles = [article for article, label in zip(X_list_tokenized, y_list) if label == "moldova"]

# Filter out articles with more than 1500 tokens
filtered_romana_tokens = [len(article) for article in romana_articles if len(article) <= 1500]
filtered_moldova_tokens = [len(article) for article in moldova_articles if len(article) <= 1500]

def calculate_statistics(tokens):
    max_tokens = max(tokens)
    min_tokens = min(tokens)
    mean_tokens = np.mean(tokens)
    median_tokens = np.median(tokens)
    std_tokens = np.std(tokens)
    return max_tokens, min_tokens, mean_tokens, median_tokens, std_tokens

# Calculate statistics for Romana articles
romana_stats = calculate_statistics(filtered_romana_tokens)
print("Romana - Max tokens:", romana_stats[0])
print("Romana - Min tokens:", romana_stats[1])
print("Romana - Mean tokens:", romana_stats[2])
print("Romana - Median tokens:", romana_stats[3])
print("Romana - Standard deviation tokens:", romana_stats[4])

# Calculate statistics for Moldovan articles
moldova_stats = calculate_statistics(filtered_moldova_tokens)
print("Moldova - Max tokens:", moldova_stats[0])
print("Moldova - Min tokens:", moldova_stats[1])
print("Moldova - Mean tokens:", moldova_stats[2])
print("Moldova - Median tokens:", moldova_stats[3])
print("Moldova - Standard deviation tokens:", moldova_stats[4])

# Calculate percentage of texts with fewer than 256 tokens
romana_under_256 = len([article for article in romana_articles if len(article) < 256])
moldova_under_256 = len([article for article in moldova_articles if len(article) < 256])

romana_percentage_under_256 = (romana_under_256 / len(filtered_romana_tokens)) * 100
moldova_percentage_under_256 = (moldova_under_256 / len(filtered_moldova_tokens)) * 100

print(f"Percentage of Romanian texts with fewer than 256 tokens: {romana_percentage_under_256:.2f}%")
print(f"Percentage of Moldovan texts with fewer than 256 tokens: {moldova_percentage_under_256:.2f}%")

# Plot histograms for Romana articles
fig_romana = go.Figure(data=[go.Histogram(x=filtered_romana_tokens, nbinsx=100)])
fig_romana.update_layout(
    title='Number of tokens in Romanian articles distributed over 100 bins',
    xaxis_title='Number of tokens',
    yaxis_title='Number of articles',
    width=900,
    height=600
)

# Plot histograms for Moldovan articles
fig_moldova = go.Figure(data=[go.Histogram(x=filtered_moldova_tokens, nbinsx=100)])
fig_moldova.update_layout(
    title='Number of tokens in Moldovan articles distributed over 100 bins',
    xaxis_title='Number of tokens',
    yaxis_title='Number of articles',
    width=900,
    height=600
)

# Display the plots
pyo.iplot(fig_romana)
pyo.iplot(fig_moldova)

Romana - Max tokens: 1377
Romana - Min tokens: 1
Romana - Mean tokens: 183.51056199259085
Romana - Median tokens: 141.0
Romana - Standard deviation tokens: 146.69679686894924
Moldova - Max tokens: 981
Moldova - Min tokens: 0
Moldova - Mean tokens: 227.30207635632954
Moldova - Median tokens: 153.0
Moldova - Standard deviation tokens: 191.1792560353874
Percentage of Romanian texts with fewer than 256 tokens: 79.14%
Percentage of Moldovan texts with fewer than 256 tokens: 66.83%


In [49]:
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np

def stopwordtokenizer(text):
  return text

X = [sa for sa in X_list_tokenized if len(sa) >= 32] # remove short articles
print(len(X))

y = np.array([0 if y == "romana" else 1 for y in y_list_tokenized])
print(y[:10])


47969
[0 0 0 0 0 0 0 0 0 0]


In [52]:
new_X = []
new_y = []

limit_length = 256

from tqdm import tqdm

for j in tqdm(range(len(X))):
  text = X[j]
  # print(text)
  label = y[j]
  if(len(text) <= limit_length):
    while len(text) < limit_length:
      text.append('')
    new_X.append([str(t) for t in text])
    new_y.append(label)
    continue

  iters = len(text) // limit_length
  for i in range(iters):
    # print(type(text[0]))
    new_X.append([str(t) for t in text[i*limit_length:(i+1)*limit_length]])
    new_y.append(label)

new_X = np.array(new_X)
new_y = np.array(new_y)

100%|██████████| 47969/47969 [00:00<00:00, 53524.82it/s]


In [42]:
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler

sss = StratifiedShuffleSplit(n_splits=3, test_size=0.2, random_state=42)
text_clf = Pipeline(steps=[
        ('tfidf', TfidfVectorizer(min_df=3, max_df=0.7, max_features=10000, lowercase=False, tokenizer=stopwordtokenizer, vocabulary=stop_words)),
        ('scaler', StandardScaler(with_mean=False)),
        ('clf', LogisticRegression(penalty='l2'))
    ], verbose=True)
parameters = {
    'tfidf__ngram_range': [(1, 1), (2, 2), (3, 3), (4, 4)],
    'tfidf__use_idf': (True, False),
    'clf__C': (0.1, 1, 10),
}
gs_clf = GridSearchCV(text_clf, parameters, cv=sss, n_jobs=1, verbose=10, error_score='raise')

In [53]:
print(len(new_X))
print(len(new_y))
print(new_X[:2])

52537
52537
[['la' 'al' 'de' 'an' 'avea' 'fi' 'unul' 'cu' 'mult' 'timp' 'avea' 'un'
  'de' 'prin' 'mod' 'care' 'avea' 'pentru' 'fi' 'care' 'avea' 'la' 'ba'
  'chiar' 'un' 'pentru' 'pentru' 'avea' 'pe' 'fost' 'mai' 'un' 'mult'
  'prea' 'mare' 'pentru' 'nevoie' 'al' 'care' 'un' 'de' 'de' 'acesta'
  'pentru' 'fi' 'doar' 'de' 'din' 'fi' 'care' 'un' 'nevoie' 'pentru' 'un'
  'ca' 'avea' 'fi' 'un' 'de' 'al' 'meu' 'de' 'foarte' 'mult' 'vreme' 'fi'
  'un' 'mare' 'la' 'al' 'de' 'an' 'de' 'la' 'de' 'un' 'mai' 'cu' 'un'
  'de' 'cu' 'mult' 'mai' 'la' 'mai' 'un' 'de' 'de' 'ce' 'nu' 'fi' 'un'
  'pentru' 'fi' 'eu' 'pentru' 'acest' 'al' 'dar' 'la' 'un' 'de' 'fi' 'un'
  'pe' 'care' 'avea' 'el' 'cu' 'care' 'avea' 'care' 'avea' 'de' 'avea'
  'al' 'de' 'cu' 'cu' 'de' 'nou' 'din' 'timp' 'ce' 'de' 'pe' 'fi' 'avea'
  'fi' 'pentru' 'mult' 'mai' 'din' 'de' 'al' 'al' 'de' 'cu' 'la' 'de'
  'al' 'un' 'de' 'de' 'patru' 'de' 'peste' 'an' 'acela' 'de' 'fi' 'iar'
  'de' 'fi' 'mult' 'mai' 'fi' 'un' 'de' 'fi' 'un' 'pent

In [54]:
from sklearn.linear_model import LogisticRegression

sss = StratifiedShuffleSplit(n_splits=3, test_size=0.2, random_state=42)
text_clf = Pipeline(steps=[
        ('tfidf', TfidfVectorizer(min_df=3, max_df=0.7, max_features=10000, lowercase=False, tokenizer=stopwordtokenizer)),
        # ('pca', PCA(svd_solver='arpack')),
        ('scaler', StandardScaler(with_mean=False)),
        ('clf', LogisticRegression(penalty='l2'))
    ], verbose=True)
parameters = {
    'tfidf__ngram_range': [(2, 2), (3, 3)],
    'tfidf__use_idf': [True],
    # 'pca__n_components': [0.95, 0.99],
    'clf__C': (0.1, 1, 10),
}
gs_clf = GridSearchCV(text_clf, parameters, cv=sss, n_jobs=1, verbose=10, error_score='raise')

In [55]:
scores = []
gs_scores = []


X_train, X_test, y_train, y_test = train_test_split(new_X, new_y, test_size=0.2, random_state=42)

gs_clf.fit(X_train,y_train)


y_pred = gs_clf.predict(X_test)

print("Best parameters: ", gs_clf.best_params_)
print("Best score: ", gs_clf.best_score_)
print(classification_report(y_test, y_pred))

Fitting 3 folds for each of 6 candidates, totalling 18 fits
[CV 1/3; 1/6] START clf__C=0.1, tfidf__ngram_range=(2, 2), tfidf__use_idf=True..
[Pipeline] ............. (step 1 of 3) Processing tfidf, total=   6.8s
[Pipeline] ............ (step 2 of 3) Processing scaler, total=   0.0s
[Pipeline] ............... (step 3 of 3) Processing clf, total=   0.9s
[CV 1/3; 1/6] END clf__C=0.1, tfidf__ngram_range=(2, 2), tfidf__use_idf=True;, score=0.815 total time=   9.6s
[CV 2/3; 1/6] START clf__C=0.1, tfidf__ngram_range=(2, 2), tfidf__use_idf=True..
[Pipeline] ............. (step 1 of 3) Processing tfidf, total=   6.6s
[Pipeline] ............ (step 2 of 3) Processing scaler, total=   0.0s
[Pipeline] ............... (step 3 of 3) Processing clf, total=   0.8s
[CV 2/3; 1/6] END clf__C=0.1, tfidf__ngram_range=(2, 2), tfidf__use_idf=True;, score=0.802 total time=   9.3s
[CV 3/3; 1/6] START clf__C=0.1, tfidf__ngram_range=(2, 2), tfidf__use_idf=True..
[Pipeline] ............. (step 1 of 3) Processing t

In [None]:
missclassified = []
for i in range(len(y_test)):
    if y_test[i] != gs_clf.predict(X_test)[i]:
        missclassified.append((X_test[i], y_test[i], gs_clf.predict(X_test)[i]))
        if len(missclassified) == 5:
            break
        
for text, real, predicted in missclassified:
    print("Text: ", text)
    print("Real: ", real)
    print("Predicted: ", predicted)
    print("Index of the text: ", np.where(X_test == text), " Out of ", len(X_test))

Text:  Update 14:36. "La interventia de la zona de triaj din Gara Basarab, se redimensioneaza dispozitivul, ramin 7 autospeciale de stingere cu apa si spuma. Se lucreaza la stingerea ultimelor 3 vagoane", transmite ISU B-IF. Update 13:23: La interventia de la zona de triaj din Gara Basarab, an momentul de fata incendiul este localizat. Au fost afectate 15 vagoane dezafectate. Update 12:47: "Avind in vedere modul de manifestare a incendiului, dispozitivul se suplimenteaza cu 5 cisterne de mare capacitate pentru a realiza rezerva de apa. An total actioneaza 9 autospesciale de stingere cu apa si spuma si 5 cisterne", precizeaza ISU-B-IF. Reprezentantii ISU Bucuresti-Ilfov au anuntat ca pompierii intervin pentru stingerea unui incendiu produs la doua vagoane dezafectate, an zona de triaj dintre Gara Basarab si Podul Grant. An GALERIA FOTO puteti gasi imagini din timpul interventiei pompierilor         ››› Vezi galeria foto ‹‹‹ "Intervenim pentru stingerea unui incendiu produs la doua vagoa

In [None]:
# Predict first 10 elements of the test set
for i in range(len(y_test[:100])):
    prediction = gs_clf.predict([X_test[i]])[0]
    print("Real: ", y_test[i], " Predicted: ", prediction, " Probability: ", gs_clf.predict_proba([X_test[i]])[0])
    if y_test[i] != prediction:
        print("_____________________")


Real:  moldova  Predicted:  moldova  Probability:  [0.99544689 0.00455311]
Real:  moldova  Predicted:  moldova  Probability:  [9.99967272e-01 3.27281458e-05]
Real:  romana  Predicted:  romana  Probability:  [0.02135334 0.97864666]
Real:  romana  Predicted:  romana  Probability:  [0.00709559 0.99290441]
Real:  romana  Predicted:  romana  Probability:  [0.02806754 0.97193246]
Real:  romana  Predicted:  romana  Probability:  [0.03888786 0.96111214]
Real:  moldova  Predicted:  moldova  Probability:  [0.9940769 0.0059231]
Real:  moldova  Predicted:  moldova  Probability:  [9.99967612e-01 3.23876247e-05]
Real:  moldova  Predicted:  moldova  Probability:  [9.99370151e-01 6.29848830e-04]
Real:  romana  Predicted:  romana  Probability:  [0.10092349 0.89907651]
Real:  moldova  Predicted:  moldova  Probability:  [9.99995264e-01 4.73612281e-06]
Real:  moldova  Predicted:  moldova  Probability:  [9.99976670e-01 2.33304056e-05]
Real:  romana  Predicted:  romana  Probability:  [0.01728282 0.98271718]

# SVM

In [69]:
pip install --extra-index-url=https://pypi.nvidia.com cuml-cu12==24.6.*

/bin/bash: /home/alex/miniconda3/envs/tf/lib/libtinfo.so.6: no version information available (required by /bin/bash)
Looking in indexes: https://pypi.org/simple, https://pypi.nvidia.com
Collecting cuml-cu12==24.6.*
  Downloading https://pypi.nvidia.com/cuml-cu12/cuml_cu12-24.6.1-cp39-cp39-manylinux_2_28_x86_64.whl (1207.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.2/1.2 GB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:02[0mm
[?25hCollecting cudf-cu12==24.6.* (from cuml-cu12==24.6.*)
  Downloading https://pypi.nvidia.com/cudf-cu12/cudf_cu12-24.6.1-cp39-cp39-manylinux_2_28_x86_64.whl (478.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m478.0/478.0 MB[0m [31m7.9 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hCollecting cupy-cuda12x>=12.0.0 (from cuml-cu12==24.6.*)
  Downloading cupy_cuda12x-13.3.0-cp39-cp39-manylinux2014_x86_64.whl.metadata (2.7 kB)
Collecting dask-cuda==24.6.* (from cuml-cu12==24.6.*)
  Downloading https

In [71]:
import os
import cuml
from cuml.svm import SVC
from sklearn.model_selection import StratifiedShuffleSplit, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler

# Set the LD_LIBRARY_PATH environment variable
os.environ['LD_LIBRARY_PATH'] = '/usr/local/cuda-12.2/lib64:' + os.environ.get('LD_LIBRARY_PATH', '')

sss2 = StratifiedShuffleSplit(n_splits=3, test_size=0.2, random_state=42)
text2_clf = Pipeline(steps=[
        ('tfidf', TfidfVectorizer(min_df=0.05, max_df=0.95, max_features=1000, lowercase=False, tokenizer=stopwordtokenizer)),
        ('scaler', StandardScaler(with_mean=False)),
        ('clf', SVC(kernel='rbf', probability=True))  # cuML SVC with RBF kernel
    ], verbose=True)
parameters2 = {
    'tfidf__ngram_range': [(2, 2), (3, 3)],
    'tfidf__use_idf': [True],
    'clf__gamma': ['auto']
}
gs2_clf = GridSearchCV(text2_clf, parameters2, cv=sss2, n_jobs=1, verbose=10, error_score='raise')

# Assuming X_train and y_train are your training data
gs2_clf.fit(X_train, y_train)

Fitting 3 folds for each of 2 candidates, totalling 6 fits
[CV 1/3; 1/2] START clf__gamma=auto, tfidf__ngram_range=(2, 2), tfidf__use_idf=True
[Pipeline] ............. (step 1 of 3) Processing tfidf, total=   6.8s
[Pipeline] ............ (step 2 of 3) Processing scaler, total=   0.0s
[Pipeline] ............... (step 3 of 3) Processing clf, total=  25.9s
[CV 1/3; 1/2] END clf__gamma=auto, tfidf__ngram_range=(2, 2), tfidf__use_idf=True;, score=0.844 total time=  38.7s
[CV 2/3; 1/2] START clf__gamma=auto, tfidf__ngram_range=(2, 2), tfidf__use_idf=True
[Pipeline] ............. (step 1 of 3) Processing tfidf, total=   6.7s
[Pipeline] ............ (step 2 of 3) Processing scaler, total=   0.0s
[Pipeline] ............... (step 3 of 3) Processing clf, total=  22.0s
[CV 2/3; 1/2] END clf__gamma=auto, tfidf__ngram_range=(2, 2), tfidf__use_idf=True;, score=0.849 total time=  33.9s
[CV 3/3; 1/2] START clf__gamma=auto, tfidf__ngram_range=(2, 2), tfidf__use_idf=True
[Pipeline] ............. (step 1 

In [72]:
# Show the best parameters and the best score
print("Best parameters: ", gs2_clf.best_params_)
print("Best score: ", gs2_clf.best_score_)
print(classification_report(y_test, gs2_clf.predict(X_test)))


Best parameters:  {'clf__gamma': 'auto', 'tfidf__ngram_range': (2, 2), 'tfidf__use_idf': True}
Best score:  0.84840194384257
              precision    recall  f1-score   support

           0       0.85      0.86      0.86      5315
           1       0.86      0.85      0.85      5193

    accuracy                           0.85     10508
   macro avg       0.85      0.85      0.85     10508
weighted avg       0.85      0.85      0.85     10508



In [73]:
!pip install catboost

/bin/bash: /home/alex/miniconda3/envs/tf/lib/libtinfo.so.6: no version information available (required by /bin/bash)
Collecting catboost
  Downloading catboost-1.2.5-cp39-cp39-manylinux2014_x86_64.whl.metadata (1.2 kB)
Collecting graphviz (from catboost)
  Downloading graphviz-0.20.3-py3-none-any.whl.metadata (12 kB)
Downloading catboost-1.2.5-cp39-cp39-manylinux2014_x86_64.whl (98.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m98.2/98.2 MB[0m [31m27.1 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[?25hDownloading graphviz-0.20.3-py3-none-any.whl (47 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m47.1/47.1 kB[0m [31m3.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: graphviz, catboost
Successfully installed catboost-1.2.5 graphviz-0.20.3

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.0[0m[39;49m -> [0m[32;49m24.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[

In [74]:
from catboost import CatBoostClassifier
# from sklearn.decomposition import TruncatedSVD

sss3 = StratifiedShuffleSplit(n_splits=3, test_size=0.2, random_state=42)
text3_clf = Pipeline(steps=[
        ('tfidf', TfidfVectorizer(max_features=256, lowercase=False, tokenizer=stopwordtokenizer)),
        # ('pca', TruncatedSVD()),
        # ('scaler', StandardScaler(with_mean=False)),
        ('clf', CatBoostClassifier(verbose=5))
    ], verbose=True)
parameters3 = {
    'tfidf__ngram_range': [(2, 2)],
    'tfidf__use_idf': [True],
    # 'pca__n_components': [128],
    # 'pca__n_iter': [16],
    # 'clf__gamma': ['auto', 0.1]
    'clf__iterations': [64],
    'clf__learning_rate': [0.01, 1]
}
gs3_clf = GridSearchCV(text3_clf, parameters3, cv=sss3, n_jobs=1, verbose=10, error_score='raise')

In [75]:
# for train_index, test_index in sss.split(X,y):
#   X_s_train, X_s_test = X[train_index], X[test_index]
#   y_s_train, y_s_test = y[train_index], y[test_index]

  # gs_clf.fit()

X_train, X_test, y_train, y_test = train_test_split(new_X, new_y, test_size=0.2, random_state=42)

gs3_clf.fit(X_train,y_train)


y_pred = gs3_clf.predict(X_test)

print("Best parameters: ", gs3_clf.best_params_)
print("Best score: ", gs3_clf.best_score_)
print(classification_report(y_test, y_pred))

Fitting 3 folds for each of 2 candidates, totalling 6 fits
[CV 1/3; 1/2] START clf__iterations=64, clf__learning_rate=0.01, tfidf__ngram_range=(2, 2), tfidf__use_idf=True
[Pipeline] ............. (step 1 of 2) Processing tfidf, total=   6.8s
0:	learn: 0.6876920	total: 72.4ms	remaining: 4.56s
5:	learn: 0.6599737	total: 141ms	remaining: 1.36s
10:	learn: 0.6361211	total: 200ms	remaining: 965ms
15:	learn: 0.6163777	total: 258ms	remaining: 775ms
20:	learn: 0.5985137	total: 314ms	remaining: 642ms
25:	learn: 0.5835658	total: 370ms	remaining: 541ms
30:	learn: 0.5697002	total: 431ms	remaining: 459ms
35:	learn: 0.5583183	total: 484ms	remaining: 376ms
40:	learn: 0.5480704	total: 538ms	remaining: 302ms
45:	learn: 0.5375815	total: 593ms	remaining: 232ms
50:	learn: 0.5300097	total: 656ms	remaining: 167ms
55:	learn: 0.5229630	total: 715ms	remaining: 102ms
60:	learn: 0.5167751	total: 770ms	remaining: 37.9ms
63:	learn: 0.5132290	total: 803ms	remaining: 0us
[Pipeline] ............... (step 2 of 2) Proce