In [16]:
import pandas as pd
import numpy as np
import nltk
import re
import string
import torch
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from transformers import AutoTokenizer, AutoModel
from sentence_transformers import SentenceTransformer
from nltk.corpus import wordnet

nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')

def load_dataset(file_path, sample_size):
    try:
        data = pd.read_csv(file_path, delimiter='\t', encoding='utf-8', header=None, names=['tag', 'sentence'], error_bad_lines=False, warn_bad_lines=True)
        data = data.sample(n=sample_size, random_state=42)
        print("Dataset loaded successfully.")
        return data
    except pd.errors.ParserError:
        print("Error loading dataset.")
        return None

def preprocess_text(text):
    #text = re.sub(r'\d', '', text)
    text = text.translate(str.maketrans("", "", string.punctuation))
    words = nltk.word_tokenize(text)
    stop_words = set(nltk.corpus.stopwords.words('spanish'))
    words = [word for word in words if word not in stop_words]
    lemmatizer = nltk.stem.WordNetLemmatizer()
    words = [lemmatizer.lemmatize(word) for word in words]
    return ' '.join(words)

def get_bert_embeddings(text, tokenizer, model):
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=128)
    with torch.no_grad():
        outputs = model(**inputs, output_hidden_states=True)
    hidden_states = outputs.hidden_states
    concatenated_hidden_states = torch.cat(tuple(hidden_states[-4:]), dim=-1)
    return concatenated_hidden_states.mean(dim=1).squeeze().numpy()

def get_sentence_transformer_embeddings(text, model):
    return model.encode(text)

def tune_hyperparameters(model, param_grid, X_train, y_train):
    grid_search = GridSearchCV(model, param_grid, cv=3, scoring='accuracy')
    grid_search.fit(X_train, y_train)
    print(f"Best parameters for {model.__class__.__name__}: {grid_search.best_params_}")
    return grid_search.best_estimator_

def evaluate_model(model, X_train, X_test, y_train, y_test):
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average='weighted')
    recall = recall_score(y_test, y_pred, average='weighted')
    f1 = f1_score(y_test, y_pred, average='weighted')
    print(f'Model: {model.__class__.__name__}')
    print(f'Accuracy: {accuracy}')
    print(f'Precision: {precision}')
    print(f'Recall: {recall}')
    print(f'F1-Score: {f1}')

def main():
    sample_size = 3000
    data = load_dataset(r'C:\Users\adria\Documents\GitHub\project-nlp\TRAINING_DATA.txt', sample_size)
    if data is None:
        return

    data['preprocessed_text'] = data['sentence'].apply(preprocess_text)
    
    bert_model_name = "bert-base-multilingual-cased"
    bert_tokenizer = AutoTokenizer.from_pretrained(bert_model_name)
    bert_model = AutoModel.from_pretrained(bert_model_name, output_hidden_states=True)
    
    sentence_transformer_model = SentenceTransformer('distiluse-base-multilingual-cased')
    
    bert_embeddings = np.array([get_bert_embeddings(sentence, bert_tokenizer, bert_model) for sentence in data['preprocessed_text']])
    st_embeddings = np.array([get_sentence_transformer_embeddings(sentence, sentence_transformer_model) for sentence in data['preprocessed_text']])
    
    # Concatenate BERT embeddings with Sentence Transformer embeddings
    combined_embeddings = np.hstack((bert_embeddings, st_embeddings))
    
    y = data['tag'].values
    X_train, X_test, y_train, y_test = train_test_split(combined_embeddings, y, test_size=0.2, random_state=42)

    # Train and evaluate Logistic Regression
    print("Logistic Regression")
    log_reg = LogisticRegression(max_iter=1000)
    param_grid_lr = {'C': [0.1, 1.0, 10.0], 'solver': ['newton-cg', 'lbfgs', 'liblinear']}
    tuned_lr = tune_hyperparameters(log_reg, param_grid_lr, X_train, y_train)
    evaluate_model(tuned_lr, X_train, X_test, y_train, y_test)

    # Train and evaluate Gaussian Naive Bayes
    print("\nGaussian Naive Bayes Classifier")
    gaussian_nb = GaussianNB()
    param_grid_gnb = {'var_smoothing': [1e-7, 1e-8, 1e-9]}
    tuned_gnb = tune_hyperparameters(gaussian_nb, param_grid_gnb, X_train, y_train)
    evaluate_model(tuned_gnb, X_train, X_test, y_train, y_test)

if __name__ == "__main__":
    main()


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\adria\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\adria\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\adria\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


  data = pd.read_csv(file_path, delimiter='\t', encoding='utf-8', header=None, names=['tag', 'sentence'], error_bad_lines=False, warn_bad_lines=True)


  data = pd.read_csv(file_path, delimiter='\t', encoding='utf-8', header=None, names=['tag', 'sentence'], error_bad_lines=False, warn_bad_lines=True)


Dataset loaded successfully.


modules.json:   0%|          | 0.00/341 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


config_sentence_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/2.39k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]



config.json:   0%|          | 0.00/607 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/539M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/528 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/996k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.96M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

2_Dense/config.json:   0%|          | 0.00/114 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.58M [00:00<?, ?B/s]

rust_model.ot:   0%|          | 0.00/1.58M [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.58M [00:00<?, ?B/s]

Logistic Regression


  ret = line_search_wolfe2(
  ret = line_search_wolfe2(
  ret = line_search_wolfe2(
  ret = line_search_wolfe2(
  ret = line_search_wolfe2(
  ret = line_search_wolfe2(
  ret = line_search_wolfe2(
  ret = line_search_wolfe2(
  ret = line_search_wolfe2(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result

Best parameters for LogisticRegression: {'C': 0.1, 'solver': 'newton-cg'}


  ret = line_search_wolfe2(


Model: LogisticRegression
Accuracy: 0.55
Precision: 0.550168918918919
Recall: 0.55
F1-Score: 0.5500600240096039

Gaussian Naive Bayes Classifier
Best parameters for GaussianNB: {'var_smoothing': 1e-07}
Model: GaussianNB
Accuracy: 0.5383333333333333
Precision: 0.5385491882521586
Recall: 0.5383333333333333
F1-Score: 0.5384038894554374


In [19]:
import pandas as pd
import numpy as np
import nltk
import re
import string
import torch
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from sentence_transformers import SentenceTransformer
from nltk.corpus import wordnet

nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')

def load_dataset(file_path, sample_size):
    try:
        data = pd.read_csv(file_path, delimiter='\t', encoding='utf-8', header=None, names=['tag', 'sentence'], error_bad_lines=False, warn_bad_lines=True)
        data = data.sample(n=sample_size, random_state=42)
        print("Dataset loaded successfully.")
        return data
    except pd.errors.ParserError:
        print("Error loading dataset.")
        return None

def preprocess_text(text):
    #text = re.sub(r'\d', '', text)
    text = text.translate(str.maketrans("", "", string.punctuation))
    words = nltk.word_tokenize(text)
    stop_words = set(nltk.corpus.stopwords.words('spanish'))
    words = [word for word in words if word not in stop_words]
    lemmatizer = nltk.stem.WordNetLemmatizer()
    words = [lemmatizer.lemmatize(word) for word in words]
    return ' '.join(words)

def get_bert_embeddings(text, tokenizer, model):
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=128)
    with torch.no_grad():
        outputs = model(**inputs, output_hidden_states=True)
    hidden_states = outputs.hidden_states
    concatenated_hidden_states = torch.cat(tuple(hidden_states[-4:]), dim=-1)
    return concatenated_hidden_states.mean(dim=1).squeeze().numpy()

def get_sentence_transformer_embeddings(text, model):
    return model.encode(text)

def tune_hyperparameters(model, param_grid, X_train, y_train):
    grid_search = GridSearchCV(model, param_grid, cv=3, scoring='accuracy')
    grid_search.fit(X_train, y_train)
    print(f"Best parameters for {model.__class__.__name__}: {grid_search.best_params_}")
    return grid_search.best_estimator_

def evaluate_model(model, X_train, X_test, y_train, y_test):
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average='weighted')
    recall = recall_score(y_test, y_pred, average='weighted')
    f1 = f1_score(y_test, y_pred, average='weighted')
    print(f'Model: {model.__class__.__name__}')
    print(f'Accuracy: {accuracy}')
    print(f'Precision: {precision}')
    print(f'Recall: {recall}')
    print(f'F1-Score: {f1}')

def main():
    sample_size = 3000
    data = load_dataset(r'C:\Users\adria\Documents\GitHub\project-nlp\TRAINING_DATA.txt', sample_size)
    if data is None:
        return

    data['preprocessed_text'] = data['sentence'].apply(preprocess_text)
    
    bert_model_name = "bert-base-multilingual-cased"
    bert_tokenizer = AutoTokenizer.from_pretrained(bert_model_name)
    bert_model = AutoModelForSequenceClassification.from_pretrained(bert_model_name, output_hidden_states=True)
    
    sentence_transformer_model = SentenceTransformer('distiluse-base-multilingual-cased')
    
    bert_embeddings = np.array([get_bert_embeddings(sentence, bert_tokenizer, bert_model) for sentence in data['preprocessed_text']])
    st_embeddings = np.array([get_sentence_transformer_embeddings(sentence, sentence_transformer_model) for sentence in data['preprocessed_text']])
    
    # Concatenate BERT embeddings with Sentence Transformer embeddings
    combined_embeddings = np.hstack((bert_embeddings, st_embeddings))
    
    y = data['tag'].values
    X_train, X_test, y_train, y_test = train_test_split(combined_embeddings, y, test_size=0.2, random_state=42)

    # Train and evaluate Logistic Regression
    print("Logistic Regression")
    log_reg = LogisticRegression(max_iter=1000)
    param_grid_lr = {'C': [0.1, 1.0, 10.0], 'solver': ['newton-cg', 'lbfgs', 'liblinear']}
    tuned_lr = tune_hyperparameters(log_reg, param_grid_lr, X_train, y_train)
    evaluate_model(tuned_lr, X_train, X_test, y_train, y_test)

    # Train and evaluate Gaussian Naive Bayes
    print("\nGaussian Naive Bayes Classifier")
    gaussian_nb = GaussianNB()
    param_grid_gnb = {'var_smoothing': [1e-7, 1e-8, 1e-9]}
    tuned_gnb = tune_hyperparameters(gaussian_nb, param_grid_gnb, X_train, y_train)
    evaluate_model(tuned_gnb, X_train, X_test, y_train, y_test)

    # Train and evaluate SVM
    print("\nSupport Vector Machine")
    svm = SVC()
    param_grid_svm = {'C': [0.1, 1, 10], 'kernel': ['linear', 'poly', 'rbf']}
    tuned_svm = tune_hyperparameters(svm, param_grid_svm, X_train, y_train)
    evaluate_model(tuned_svm, X_train, X_test, y_train, y_test)


    # Voting Classifier
    print("\nVoting Classifier")
    voting_clf = VotingClassifier(estimators=[
        ('lr', tuned_lr), 
        ('gnb', tuned_gnb), 
        ('svm', tuned_svm), 
        ('rf', tuned_rf)], voting='hard')
    voting_clf.fit(X_train, y_train)
    y_pred = voting_clf.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average='weighted')
    recall = recall_score(y_test, y_pred, average='weighted')
    f1 = f1_score(y_test, y_pred, average='weighted')
    print(f'Model: Voting Classifier')
    print(f'Accuracy: {accuracy}')
    print(f'Precision: {precision}')
    print(f'Recall: {recall}')
    print(f'F1-Score: {f1}')

if __name__ == "__main__":
    main()


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\adria\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\adria\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\adria\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


  data = pd.read_csv(file_path, delimiter='\t', encoding='utf-8', header=None, names=['tag', 'sentence'], error_bad_lines=False, warn_bad_lines=True)


  data = pd.read_csv(file_path, delimiter='\t', encoding='utf-8', header=None, names=['tag', 'sentence'], error_bad_lines=False, warn_bad_lines=True)


Dataset loaded successfully.


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Logistic Regression


  ret = line_search_wolfe2(
  ret = line_search_wolfe2(
  ret = line_search_wolfe2(
  ret = line_search_wolfe2(
  ret = line_search_wolfe2(
  ret = line_search_wolfe2(
  ret = line_search_wolfe2(
  ret = line_search_wolfe2(
  ret = line_search_wolfe2(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result

Best parameters for LogisticRegression: {'C': 0.1, 'solver': 'newton-cg'}


  ret = line_search_wolfe2(


Model: LogisticRegression
Accuracy: 0.5566666666666666
Precision: 0.5563909193557973
Recall: 0.5566666666666666
F1-Score: 0.5564298433048434

Gaussian Naive Bayes Classifier
Best parameters for GaussianNB: {'var_smoothing': 1e-08}
Model: GaussianNB
Accuracy: 0.5483333333333333
Precision: 0.548145666121198
Recall: 0.5483333333333333
F1-Score: 0.5482014356475571

Support Vector Machine
Best parameters for SVC: {'C': 1, 'kernel': 'rbf'}
Model: SVC
Accuracy: 0.5516666666666666
Precision: 0.5514152046783626
Recall: 0.5516666666666666
F1-Score: 0.5514658669730428

Random Forest Classifier


  warn(
  warn(


KeyboardInterrupt: 

In [7]:
# Define the paths for the input and output files
input_file = r'C:\Users\adria\Documents\GitHub\project-nlp\Rea_data_with_predictions.txt'
output_file = '/mnt/data/Rea_data_predictions_and_sentences.txt'

# Read the input file
with open(input_file, 'r', encoding='utf-8') as file:
    lines = file.readlines()

# Create and write to the output file
with open(output_file, 'w', encoding='utf-8') as file:
    for line in lines[1:]:  # Skip the header line
        columns = line.strip().split('\t')
        if len(columns) >= 4:
            prediction = columns[3]
            sentence = columns[1]
            file.write(f'{prediction}\t{sentence}\n')

print(f'Predictions and sentences have been written to {output_file}')


FileNotFoundError: [Errno 2] No such file or directory: '/mnt/data/Rea_data_predictions_and_sentences.txt'