In [47]:
import pandas as pd
import numpy as np
import nltk
import re
import string
import torch
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from transformers import AutoTokenizer, AutoModel
from joblib import dump, load

# Fetch the NLTK resources that preprocess_text relies on: the stop-word lists,
# WordNet (for the lemmatizer) and the punkt tokenizer models.
for _resource in ('stopwords', 'wordnet', 'punkt'):
    nltk.download(_resource)

def load_dataset(file_path, sample_size):
    """Read a tab-separated ``tag<TAB>sentence`` file and return a random sample of it.

    Args:
        file_path: Path of the UTF-8 text file to read. The file has no header row.
        sample_size: Number of rows to draw. Values larger than the number of
            rows in the file are clamped so that the whole file is returned
            (shuffled) instead of raising.

    Returns:
        A DataFrame with the columns ``tag`` and ``sentence``, or ``None`` when
        pandas raises a ``ParserError`` while reading the file.
    """
    # quoting=3 is csv.QUOTE_NONE: quote characters are kept as ordinary text.
    read_options = dict(delimiter='\t', encoding='utf-8', header=None,
                        names=['tag', 'sentence'], quoting=3)
    try:
        try:
            # pandas >= 1.3 spelling; malformed lines are skipped with a warning.
            data = pd.read_csv(file_path, on_bad_lines='warn', **read_options)
        except TypeError:
            # Older pandas does not know `on_bad_lines`; use the legacy flags there.
            data = pd.read_csv(file_path, error_bad_lines=False,
                               warn_bad_lines=True, **read_options)
    except pd.errors.ParserError:
        print("Error loading dataset.")
        return None

    # Never ask for more rows than exist: DataFrame.sample would raise ValueError.
    data = data.sample(n=min(sample_size, len(data)), random_state=42)
    print("Dataset loaded successfully.")
    return data

_STOP_WORDS_CACHE = {}


def _stop_words(language):
    """Return the NLTK stop-word list for `language` as a set, reading the corpus only once."""
    if language not in _STOP_WORDS_CACHE:
        _STOP_WORDS_CACHE[language] = frozenset(nltk.corpus.stopwords.words(language))
    return _STOP_WORDS_CACHE[language]


def preprocess_text(text):
    """Strip punctuation and Spanish stop words from `text`, then lemmatize the rest.

    Args:
        text: The raw sentence. ``None`` and NaN (what pandas yields for a
            missing cell) produce an empty string; any other non-string value
            is converted with ``str()`` first.

    Returns:
        The remaining tokens joined by single spaces.
    """
    # Missing cells arrive as None/NaN; NaN is the only float unequal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return ''
    text = str(text)

    # string.punctuation is ASCII-only, so the Spanish opening marks are added by hand.
    text = text.translate(str.maketrans("", "", string.punctuation + "¿¡"))
    words = nltk.word_tokenize(text)

    # The stop-word list is lowercase; compare case-insensitively so that
    # capitalised stop words are dropped too, while kept tokens retain their
    # original casing.
    stop_words = _stop_words('spanish')
    words = [word for word in words if word.lower() not in stop_words]

    # NOTE(review): WordNetLemmatizer is backed by the English WordNet, so its
    # effect on Spanish tokens is presumably small or arbitrary — confirm it is wanted.
    lemmatizer = nltk.stem.WordNetLemmatizer()
    words = [lemmatizer.lemmatize(word) for word in words]

    return ' '.join(words)

def get_embeddings(text, tokenizer, model):
    """Embed `text` by mean-pooling the concatenated last four hidden layers.

    Args:
        text: A single string, or a list of strings to embed as one batch.
        tokenizer: Callable that accepts the text plus the keyword arguments
            used below and returns a mapping of PyTorch tensors.
        model: Callable that accepts the tokenizer output as keyword arguments
            and returns an object exposing ``hidden_states``.

    Returns:
        A NumPy array: one vector for a single string, or one row per text
        when a list is passed.
    """
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=128)
    with torch.no_grad():
        outputs = model(**inputs, output_hidden_states=True)

    # Concatenate the last four layers along the feature axis.
    stacked = torch.cat(tuple(outputs.hidden_states[-4:]), dim=-1)

    attention_mask = inputs["attention_mask"] if "attention_mask" in inputs else None
    if attention_mask is None:
        pooled = stacked.mean(dim=1)
    else:
        # Average over real tokens only, so padding added for batched input
        # does not dilute the sentence vector.
        weights = attention_mask.unsqueeze(-1).to(stacked.dtype)
        pooled = (stacked * weights).sum(dim=1) / weights.sum(dim=1).clamp(min=1.0)

    # Drop the batch axis only for a single string, so the output rank does
    # not depend on how many texts were passed in a list.
    if isinstance(text, str):
        pooled = pooled.squeeze(0)
    return pooled.numpy()

def tune_hyperparameters(model, param_grid, X_train, y_train):
    """Run a 3-fold, accuracy-scored grid search and return the best estimator.

    Args:
        model: The estimator to tune.
        param_grid: Parameter grid handed to ``GridSearchCV``.
        X_train: Training features.
        y_train: Training labels.

    Returns:
        The search's ``best_estimator_``. The winning parameters are printed.
    """
    searcher = GridSearchCV(estimator=model, param_grid=param_grid,
                            scoring='accuracy', cv=3)
    searcher.fit(X_train, y_train)

    estimator_name = model.__class__.__name__
    print(f"Best parameters for {estimator_name}: {searcher.best_params_}")
    return searcher.best_estimator_

def evaluate_model(model, X_train, X_test, y_train, y_test):
    """Fit `model` on the training split, score it on the test split, print and return the metrics.

    Args:
        model: Estimator exposing ``fit`` and ``predict``. It is (re)fitted here.
        X_train: Training features.
        X_test: Test features.
        y_train: Training labels.
        y_test: Test labels.

    Returns:
        A dict with the keys ``accuracy``, ``precision``, ``recall`` and ``f1``.
        Precision, recall and F1 are weighted averages over the classes.
    """
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    # zero_division=0 scores a class that is never predicted as 0 without
    # emitting an undefined-metric warning for it.
    metrics = {
        'accuracy': accuracy_score(y_test, y_pred),
        'precision': precision_score(y_test, y_pred, average='weighted', zero_division=0),
        'recall': recall_score(y_test, y_pred, average='weighted', zero_division=0),
        'f1': f1_score(y_test, y_pred, average='weighted', zero_division=0),
    }

    print(f'Model: {model.__class__.__name__}')
    print(f'Accuracy: {metrics["accuracy"]}')
    print(f'Precision: {metrics["precision"]}')
    print(f'Recall: {metrics["recall"]}')
    print(f'F1-Score: {metrics["f1"]}')
    return metrics

def save_model(model, model_path):
    """Serialize `model` to `model_path` with joblib and report where it went."""
    dump(value=model, filename=model_path)
    print(f"Model saved to {model_path}")

def load_model(model_path):
    """Deserialize and return the joblib file stored at `model_path`.

    joblib files are pickle-based, so only load files from a trusted source.
    """
    restored = load(filename=model_path)
    print(f"Model loaded from {model_path}")
    return restored

import pandas as pd
import numpy as np

def predict_new_data(file_path, model_path, tokenizer, model, output_file='predictions.txt'):
    """Classify every sentence of a tab-separated file and write the predictions.

    Args:
        file_path: UTF-8 ``tag<TAB>sentence`` file without a header row.
        model_path: joblib file holding the trained classifier.
        tokenizer: Tokenizer passed on to ``get_embeddings``.
        model: Encoder passed on to ``get_embeddings``.
        output_file: Destination for the tab-separated ``prediction<TAB>sentence`` rows.

    Returns:
        The DataFrame with the added ``preprocessed_text`` and ``prediction``
        columns, or ``None`` when the input file cannot be parsed.
    """
    try:
        # quoting=3 is csv.QUOTE_NONE: quote characters are kept as ordinary text.
        new_data = pd.read_csv(file_path, delimiter='\t', encoding='utf-8', quoting=3,
                               header=None, names=['tag', 'sentence'])
    except pd.errors.ParserError as e:
        print(f"Error loading data: {e}")
        return None

    # Load the classifier first so that a bad model path fails before the slow
    # embedding pass rather than after it.
    trained_model = load_model(model_path)

    # A row without a sentence is read as NaN; treat it as an empty sentence so
    # preprocessing does not crash and the row count stays unchanged.
    sentences = new_data['sentence'].fillna('').astype(str)
    new_data['preprocessed_text'] = sentences.apply(preprocess_text)

    embeddings = np.array([get_embeddings(sentence, tokenizer, model)
                           for sentence in new_data['preprocessed_text']])

    new_data['prediction'] = trained_model.predict(embeddings)

    # Only the prediction and the original sentence are written out.
    # NOTE(review): the input is read with quoting=3 but written with the default
    # quoting, so sentences containing '"' may come back wrapped in quotes —
    # confirm whether that matters downstream.
    new_data[['prediction', 'sentence']].to_csv(output_file, sep='\t', index=False, header=False)

    print(f"Predictions saved to '{output_file}'")
    return new_data



def inspect_file(file_path, num_lines=10):
    """Print the first `num_lines` lines of a UTF-8 text file for a quick look.

    Args:
        file_path: Path of the file to preview.
        num_lines: How many leading lines to print (default 10).

    Any error raised while opening or reading the file is reported on stdout
    instead of being propagated.
    """
    try:
        with open(file_path, 'r', encoding='utf-8') as file:
            # Iterate lazily and stop early so a large file is never read in full.
            for line_number, line in enumerate(file, start=1):
                if line_number > num_lines:
                    break
                print(f"Line {line_number}: {line.strip()}")
    except Exception as e:
        print(f"Error reading file: {e}")

def main(data_path=r'C:\Users\adria\Documents\GitHub\project-nlp\TRAINING_DATA.txt', sample_size=3000):
    """Train, evaluate and save the Logistic Regression and Gaussian NB classifiers.

    The steps are: sample the dataset, preprocess the sentences, embed them
    with ``bert-base-multilingual-cased``, make an 80/20 split, grid-search
    both classifiers, print their test metrics and save them with joblib.

    Args:
        data_path: Tab-separated training file. Defaults to the path
            originally hard-coded in this notebook.
        sample_size: Number of rows sampled from the training file.

    Returns:
        ``None`` when the dataset could not be loaded. Otherwise a dict with
        the keys ``tuned_lr``, ``tuned_gnb``, ``X_train``, ``X_test``,
        ``y_train`` and ``y_test``, so that other cells can reuse the fitted
        models and the exact split instead of relying on leftover kernel state.
    """
    data = load_dataset(data_path, sample_size)
    if data is None:
        return None

    # Feature engineering: preprocess the text.
    data['preprocessed_text'] = data['sentence'].apply(preprocess_text)

    model_name = "bert-base-multilingual-cased"
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModel.from_pretrained(model_name, output_hidden_states=True)

    # One embedding per sentence, built from the last four hidden layers.
    embeddings = np.array([get_embeddings(sentence, tokenizer, model)
                           for sentence in data['preprocessed_text']])

    y = data['tag'].values
    X_train, X_test, y_train, y_test = train_test_split(embeddings, y, test_size=0.2, random_state=42)

    # Train and evaluate Logistic Regression.
    print("Logistic Regression")
    log_reg = LogisticRegression(max_iter=1000)  # Increase max_iter for convergence
    param_grid_lr = {'C': [0.1, 1.0, 10.0], 'solver': ['newton-cg', 'lbfgs', 'liblinear']}
    tuned_lr = tune_hyperparameters(log_reg, param_grid_lr, X_train, y_train)
    evaluate_model(tuned_lr, X_train, X_test, y_train, y_test)
    save_model(tuned_lr, 'logistic_regression_model.joblib')

    # Train and evaluate Gaussian Naive Bayes.
    print("\nGaussian Naive Bayes Classifier")
    gaussian_nb = GaussianNB()
    param_grid_gnb = {'var_smoothing': [1e-7, 1e-8, 1e-9]}
    tuned_gnb = tune_hyperparameters(gaussian_nb, param_grid_gnb, X_train, y_train)
    evaluate_model(tuned_gnb, X_train, X_test, y_train, y_test)
    save_model(tuned_gnb, 'gaussian_nb_model.joblib')

    return {
        'tuned_lr': tuned_lr,
        'tuned_gnb': tuned_gnb,
        'X_train': X_train,
        'X_test': X_test,
        'y_train': y_train,
        'y_test': y_test,
    }


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\adria\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\adria\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\adria\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [48]:
if __name__ == "__main__":

    # One checkpoint id shared by the tokenizer and the encoder so the two
    # cannot drift apart.
    checkpoint = "bert-base-multilingual-cased"
    tokenizer = AutoTokenizer.from_pretrained(checkpoint)
    model = AutoModel.from_pretrained(checkpoint, output_hidden_states=True)

    # Classify the unlabeled file with the saved Logistic Regression model.
    predict_new_data(
        r'C:\Users\adria\Documents\GitHub\project-nlp\REAL_DATA.txt',
        'logistic_regression_model.joblib',
        tokenizer,
        model,
    )


Model loaded from logistic_regression_model.joblib
Predictions saved to 'predictions.txt'


In [50]:
from sklearn.metrics import roc_curve, auc
from sklearn.preprocessing import label_binarize
from itertools import cycle

def plot_roc_curve(model, X_test, y_test):
    """Plot one one-vs-rest ROC curve per class for a fitted probabilistic classifier.

    Args:
        model: Fitted classifier exposing ``predict_proba`` and ``classes_``.
        X_test: Numeric test features in the form the classifier was trained on.
        y_test: True labels for `X_test`.
    """
    # Imported locally because no visible cell imports pyplot, yet the body needs it.
    import matplotlib.pyplot as plt

    classes = model.classes_
    y_score = model.predict_proba(X_test)
    y_test_binarized = label_binarize(y_test, classes=classes)

    # For two classes label_binarize returns a single column (the indicator of
    # classes[1]); expand it to one column per class so that column i lines up
    # with y_score[:, i] and indexing column 1 does not raise IndexError.
    if len(classes) == 2 and y_test_binarized.shape[1] == 1:
        y_test_binarized = np.hstack([1 - y_test_binarized, y_test_binarized])

    fpr = dict()
    tpr = dict()
    roc_auc = dict()
    for i in range(len(classes)):
        fpr[i], tpr[i], _ = roc_curve(y_test_binarized[:, i], y_score[:, i])
        roc_auc[i] = auc(fpr[i], tpr[i])

    plt.figure()
    colors = cycle(['aqua', 'darkorange', 'cornflowerblue'])
    for i, color in zip(range(len(classes)), colors):
        plt.plot(fpr[i], tpr[i], color=color, lw=2,
                 label=f'ROC curve of class {classes[i]} (area = {roc_auc[i]:0.2f})')

    # Diagonal reference line: the ROC of a random classifier.
    plt.plot([0, 1], [0, 1], 'k--', lw=2)
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title(f'Receiver Operating Characteristic: {model.__class__.__name__}')
    plt.legend(loc="lower right")
    plt.show()

# NOTE(review): in the visible source `tuned_lr`, `tuned_gnb`, `X_test` and `y_test`
# are only ever assigned as locals inside main(), so these calls depend on values
# left in the kernel by some other cell. The recorded ValueError ("could not convert
# string to float: ...") suggests X_test held raw sentences rather than numeric
# embeddings when this cell ran — TODO confirm and pass the embedded test split.
plot_roc_curve(tuned_lr, X_test, y_test)
plot_roc_curve(tuned_gnb, X_test, y_test)


ValueError: could not convert string to float: 'Ambos productos serán lanzados en todo el mundo en marzo .'