In [1]:
import numpy as np
import sqlite3
from sklearn.model_selection import train_test_split, StratifiedShuffleSplit
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
from stop_words import get_stop_words
import shap
import warnings

warnings.filterwarnings("ignore")

IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html


In [2]:
romanian_texts = {}
moldavian_texts = {}

conn = sqlite3.connect('news.db')
c = conn.cursor()

c.execute('SELECT * FROM romania')
rows = c.fetchall()
for row in rows:
    if row[4] not in romanian_texts:
        romanian_texts[row[4]] = []
    romanian_texts[row[4]].append(row[5])
    
c.execute('SELECT * FROM moldova WHERE newspaper != "zugo"')
rows = c.fetchall()
for row in rows:
    text = ''
    if len(row[5]) > 10000:
        text = row[5][:10000]
    else:
        text = row[5]
    if row[4] not in moldavian_texts:
        moldavian_texts[row[4]] = []
        
    moldavian_texts[row[4]].append(text)

conn.close()

In [3]:
all_texts = {"romana": [], "moldova": []}

for key in romanian_texts:
    all_texts["romana"].extend(romanian_texts[key])

    
for key in moldavian_texts:
    all_texts["moldova"].extend(moldavian_texts[key])
    

In [4]:
!python -m spacy download ro_core_news_md

Collecting ro-core-news-md==3.7.0
  Downloading https://github.com/explosion/spacy-models/releases/download/ro_core_news_md-3.7.0/ro_core_news_md-3.7.0-py3-none-any.whl (42.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.5/42.5 MB[0m [31m3.7 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('ro_core_news_md')


In [5]:
# Extract most common parts of speech with spacy
import spacy

nlp = spacy.load('ro_core_news_md')

def get_most_common_pos(texts, n=10):
    pos = {}
    tokenized_texts = []
    for text in texts:
        doc = nlp(text)
        for token in doc:
            if token.pos_ not in pos:
                pos[token.pos_] = 0
            pos[token.pos_] += 1
            
        tokenized_texts.append([token.pos_ for token in doc])
        
    return [sorted(pos.items(), key=lambda x: x[1], reverse=True)[:n], tokenized_texts]

mc_romana, tk_romana = get_most_common_pos(all_texts["romana"])
mc_moldova, tk_moldova = get_most_common_pos(all_texts["moldova"])

In [6]:
X = tk_romana + tk_moldova
y = ["romana"] * len(tk_romana) + ["moldova"] * len(tk_moldova)

X = np.array([' '.join(tag) for tag in X])
y = np.array(y)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

sss = StratifiedShuffleSplit(n_splits=3, test_size=0.1, random_state=11)

text_clf = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('clf', MultinomialNB())
])

parameters = {
    'tfidf__use_idf': (True, False),
    'clf__alpha': (0.01, 0.001)
}

gs_clf = GridSearchCV(text_clf, parameters, cv=sss, n_jobs=-1, verbose=1)

scores = []
gs_scores = []

for train_index, test_index in sss.split(X_train, y_train):
    X_train_split, X_test_split = X_train[train_index], X_train[test_index]
    y_train_split, y_test_split = y_train[train_index], y_train[test_index]
    gs_clf = gs_clf.fit(X_train_split, y_train_split)
    
    scores.append(gs_clf.score(X_test_split, y_test_split))
    gs_scores.append(gs_clf.score(X_test, y_test))

Fitting 3 folds for each of 4 candidates, totalling 12 fits
Fitting 3 folds for each of 4 candidates, totalling 12 fits
Fitting 3 folds for each of 4 candidates, totalling 12 fits


In [7]:
print("Mean score: ", np.mean(scores))
print("Mean grid search score: ", np.mean(gs_scores))
print("Best parameters: ", gs_clf.best_params_)
print("Best score: ", gs_clf.best_score_)
print("Classification report:\n", classification_report(y_test, gs_clf.predict(X_test), ))

Mean score:  0.7624750499001997
Mean grid search score:  0.7685851318944844
Best parameters:  {'clf__alpha': 0.01, 'tfidf__use_idf': True}
Best score:  0.7641196013289037
Classification report:
               precision    recall  f1-score   support

     moldova       0.77      1.00      0.87       631
      romana       1.00      0.05      0.09       203

    accuracy                           0.77       834
   macro avg       0.88      0.52      0.48       834
weighted avg       0.82      0.77      0.68       834



In [8]:
from cube.api import Cube

cube_nlp = Cube(verbose=True)

cube_nlp.load("ro")



Some weights of the model checkpoint at xlm-roberta-base were not used when initializing XLMRobertaModel: ['lm_head.dense.weight', 'lm_head.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.bias']
- This IS expected if you are initializing XLMRobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of the model checkpoint at xlm-roberta-base were not used when initializing XLMRobertaModel: ['lm_head.dense.weight', 'lm_head.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.bias']
- This IS expected if you are initializing XLMRobertaModel from the checkpoint of a m

In [13]:
def get_most_common_pos_cube(texts, n=10):
    pos = {}
    tokenized_texts = []
    for text in texts:
        if len(text) > 512:
            try:
                small_text = ''
                text = text.split(' ')
                while len(small_text) + text[0] <= 512:
                    small_text += text[0]
                    text = text[1:]
                text = ' '.join(text)
                doc = cube_nlp(small_text)
            except:
                continue
            
        try:
            doc = cube_nlp(text)
        except:
            continue
        
        print(type(doc))
        break
            
        tokenized_texts.append([token.pos_ for token in doc])
    return [sorted(pos.items(), key=lambda x: x[1], reverse=True)[:n], tokenized_texts]

mc_romana_cube, tk_romana_cube = get_most_common_pos_cube(all_texts["romana"])
mc_moldova_cube, tk_moldova_cube = get_most_common_pos_cube(all_texts["moldova"])

print(tk_romana_cube[:10])
print(tk_moldova_cube[:10])

<class 'cube.io_utils.objects.Document'>
<class 'cube.io_utils.objects.Document'>
[]
[]


In [None]:
# Train a classifier using the most common parts of speech
X = tk_romana_cube + tk_moldova_cube
y = ["romana"] * len(tk_romana_cube["romana"]) + ["moldova"] * len(tk_moldova_cube["moldova"])

X = np.array([' '.join(tag) for tag in X])
y = np.array(y)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

sss = StratifiedShuffleSplit(n_splits=3, test_size=0.1, random_state=11)

text_clf = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('clf', MultinomialNB())
])

parameters = {
    'tfidf__use_idf': (True, False),
    'clf__alpha': (0.01, 0.001)
}

gs_clf = GridSearchCV(text_clf, parameters, cv=sss, n_jobs=-1, verbose=1)

scores = []
gs_scores = []

for train_index, test_index in sss.split(X_train, y_train):
    X_train_split, X_test_split = X_train[train_index], X_train[test_index]
    y_train_split, y_test_split = y_train[train_index], y_train[test_index]
    gs_clf = gs_clf.fit(X_train_split, y_train_split)
    
    scores.append(gs_clf.score(X_test_split, y_test_split))
    gs_scores.append(gs_clf.score(X_test, y_test))

In [None]:
print("Mean score: ", np.mean(scores))
print("Mean grid search score: ", np.mean(gs_scores))
print("Best parameters: ", gs_clf.best_params_)
print("Best score: ", gs_clf.best_score_)
print("Classification report: ", classification_report(y_test, gs_clf.predict(X_test), ))