In [None]:
"""from sklearn.model_selection import GridSearchCV
import pandas as pd
import numpy as np
import re
import nltk
import string
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer, WordNetLemmatizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, VotingClassifier, StackingClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from xgboost import XGBClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import GridSearchCV
import gensim.downloader as api

# Download necessary NLTK resources
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')

# Download necessary NLTK resources
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')

def load_data(file_path):
    try:
        data = pd.read_csv(file_path, delimiter='\t', encoding='utf-8', header=None, names=['tag', 'sentence'], error_bad_lines=False, warn_bad_lines=True)
        print("Dataset loaded successfully.")
        return data
    except pd.errors.ParserError:
        print("Error loading dataset.")
        return None

def preprocess_text(text):
    text = text.lower()
    text = re.sub(r'\d', '', text)
    text = text.translate(str.maketrans("", "", string.punctuation))
    text = re.sub(r'\s+[a-zA-Z]\s+', ' ', text)
    text = re.sub(r'\^[a-zA-Z]\s+', ' ', text)
    text = re.sub(r'\s+', ' ', text, flags=re.I)
    
    words = nltk.word_tokenize(text)
    stop_words = set(stopwords.words('spanish'))
    words = [word for word in words if word not in stop_words]
    
    lemmatizer = WordNetLemmatizer()
    words = [lemmatizer.lemmatize(word) for word in words]
    return ' '.join(words)

def prepare_data_tfidf(data_sample):
    data_sample['preprocessed_text'] = data_sample['sentence'].apply(preprocess_text)
    vectorizer = TfidfVectorizer(max_features=5000, ngram_range=(1, 2))
    X = vectorizer.fit_transform(data_sample['preprocessed_text']).toarray()
    y = data_sample['tag']
    return X, y

def tune_hyperparameters(model, param_grid, X_train, y_train):
    grid_search = GridSearchCV(model, param_grid, cv=5, scoring='accuracy', n_jobs=-1)
    grid_search.fit(X_train, y_train)
    print(f"Best parameters for {model.__class__.__name__}: {grid_search.best_params_}")
    return grid_search.best_estimator_

def evaluate_model(model, X_train, X_test, y_train, y_test):
    try:
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        accuracy = accuracy_score(y_test, y_pred)
        precision = precision_score(y_test, y_pred, average='macro')
        recall = recall_score(y_test, y_pred, average='macro')
        f1 = f1_score(y_test, y_pred, average='macro')
        print(f'Accuracy: {accuracy}')
        print(f'Precision: {precision}')
        print(f'Recall: {recall}')
        print(f'F1-Score: {f1}')
        return accuracy, precision, recall, f1
    except Exception as e:
        print(f"Error evaluating model {model.__class__.__name__}: {e}")
        return None, None, None, None
    
def main():
    # Load data
    data = load_data(r'C:\Users\adria\Documents\GitHub\project-nlp\TRAINING_DATA.txt')
    if data is None:
        return

    # Sample data for quick testing
    data_sample = data.sample(n=2000, random_state=42)

    # Prepare data with TF-IDF
    X, y = prepare_data_tfidf(data_sample)

    # Split data into train and test sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    
    # Gaussian Naive Bayes
    print("\nGaussian Naive Bayes Classifier")
    gaussian_nb = GaussianNB()
    evaluate_model(gaussian_nb, X_train, X_test, y_train, y_test)
    
    # Hyperparameter tuning for Gaussian Naive Bayes
    print("\nTuning Gaussian Naive Bayes")
    param_grid_gnb = {
        'var_smoothing': [1e-9, 1e-8, 1e-7, 1e-6, 1e-5]
    }
    tuned_gnb = tune_hyperparameters(gaussian_nb, param_grid_gnb, X_train, y_train)
    print("Tuned Gaussian Naive Bayes")
    evaluate_model(tuned_gnb, X_train, X_test, y_train, y_test)

    # Hyperparameter tuning for Logistic Regression
    param_grid_lr = {
        'C': [0.1, 1, 10, 100],
        'solver': ['newton-cg', 'lbfgs', 'liblinear']
    }
    tuned_lr = tune_hyperparameters(LogisticRegression(), param_grid_lr, X_train, y_train)
    
    # Hyperparameter tuning for Random Forest
    param_grid_rf = {
        'n_estimators': [50, 100, 200],
        'max_depth': [None, 10, 20, 30],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4]
    }
    tuned_rf = tune_hyperparameters(RandomForestClassifier(), param_grid_rf, X_train, y_train)

    # Hyperparameter tuning for Gradient Boosting
    param_grid_gb = {
        'n_estimators': [50, 100, 200],
        'learning_rate': [0.01, 0.1, 0.2],
        'max_depth': [3, 5, 7],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4],
        'subsample': [0.8, 0.9, 1.0]
    }
    tuned_gb = tune_hyperparameters(GradientBoostingClassifier(), param_grid_gb, X_train, y_train)

    # Hyperparameter tuning for Support Vector Machine
    param_grid_svc = {
        'C': [0.1, 1, 10, 100],
        'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
        'gamma': ['scale', 'auto']
    }
    tuned_svc = tune_hyperparameters(make_pipeline(StandardScaler(), SVC()), param_grid_svc, X_train, y_train)

    # Hyperparameter tuning for XGBoost
    param_grid_xgb = {
        'n_estimators': [50, 100, 200],
        'learning_rate': [0.01, 0.1, 0.2],
        'max_depth': [3, 5, 7],
        'subsample': [0.8, 0.9, 1.0],
        'colsample_bytree': [0.8, 0.9, 1.0]
    }
    tuned_xgb = tune_hyperparameters(XGBClassifier(), param_grid_xgb, X_train, y_train)

    # Hyperparameter tuning for Decision Tree
    param_grid_dt = {
        'max_depth': [None, 10, 20, 30],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4],
        'max_features': [None, 'auto', 'sqrt', 'log2']
    }
    tuned_dt = tune_hyperparameters(DecisionTreeClassifier(), param_grid_dt, X_train, y_train)

    # Hyperparameter tuning for Multi-layer Perceptron
    param_grid_mlp = {
        'mlpclassifier__hidden_layer_sizes': [(50,), (100,), (50, 50)],
        'mlpclassifier__activation': ['identity', 'logistic', 'tanh', 'relu'],
        'mlpclassifier__solver': ['lbfgs', 'sgd', 'adam'],
        'mlpclassifier__alpha': [0.0001, 0.001, 0.01],
        'mlpclassifier__learning_rate': ['constant', 'invscaling', 'adaptive']
    }
    tuned_mlp = tune_hyperparameters(make_pipeline(StandardScaler(), MLPClassifier()), param_grid_mlp, X_train, y_train)

    # Evaluate tuned models
    tuned_models = {
        "Logistic Regression": tuned_lr,
        "Random Forest": tuned_rf,
        "Gradient Boosting": tuned_gb,
        "Support Vector Machine": tuned_svc,
        "XGBoost": tuned_xgb,
        "Decision Tree": tuned_dt,
        "Multi-layer Perceptron": tuned_mlp
    }

    for name, model in tuned_models.items():
        print(f"\n{name} (Tuned)")
        evaluate_model(model, X_train, X_test, y_train, y_test)

  # Ensemble: Voting Classifier with tuned models
    print("\nEnsemble: Voting Classifier (Tuned)")
    voting_classifier = VotingClassifier(estimators=[(name, model) for name, model in tuned_models.items()])
    evaluate_model(voting_classifier, X_train, X_test, y_train, y_test)

    # Ensemble: Stacking Classifier with tuned models
    print("\nEnsemble: Stacking Classifier (Tuned)")
    stacking_classifier = StackingClassifier(estimators=[(name, model) for name, model in tuned_models.items()], final_estimator=LogisticRegression())
    evaluate_model(stacking_classifier, X_train, X_test, y_train, y_test)

if __name__ == "__main__":
    main()

"""

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import nltk
import re
import string
import gensim.downloader as api

# Fetch the NLTK resources that preprocess_text relies on. Newer NLTK
# releases look up 'punkt_tab' (rather than 'punkt') when word_tokenize
# runs, so both are requested to keep the cell working on either version.
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')
nltk.download('punkt_tab')

# Read the tab-separated training file into two columns: tag, sentence.
# on_bad_lines='warn' is the current spelling of the old
# error_bad_lines=False / warn_bad_lines=True pair, which pandas 2.0 removed:
# malformed rows are skipped and a warning is emitted for each of them.
try:
    data = pd.read_csv(
        r'C:\Users\adria\Documents\GitHub\project-nlp\TRAINING_DATA.txt',
        delimiter='\t',
        encoding='utf-8',
        header=None,
        names=['tag', 'sentence'],
        on_bad_lines='warn',
    )
except pd.errors.ParserError:
    print("Error loading dataset.")
    # Re-raise: everything below needs `data`, so continuing would only
    # replace this error with a less helpful NameError.
    raise
else:
    print("Dataset loaded successfully.")

# Function to preprocess text
def preprocess_text(text):
    """Turn one raw sentence into a list of cleaned tokens.

    Steps, in order: lowercase, drop digits, drop ASCII punctuation,
    tokenize with NLTK, remove Spanish stop words, and lemmatize each
    remaining token with NLTK's WordNetLemmatizer.

    Args:
        text: The raw sentence. Anything that is not a ``str`` (for
            example a missing cell, which pandas presumably stores as NaN)
            is treated as an empty sentence.

    Returns:
        list[str]: The cleaned tokens; ``[]`` for non-string input.
    """
    if not isinstance(text, str):
        # Missing/non-text cells have no .lower(); treat them as empty
        # instead of aborting the whole DataFrame.apply().
        return []

    # The stop-word list is read from the NLTK corpus; cache it on the
    # function object so it is loaded once rather than once per sentence.
    stop_words = getattr(preprocess_text, '_stop_words', None)
    if stop_words is None:
        stop_words = frozenset(nltk.corpus.stopwords.words('spanish'))
        preprocess_text._stop_words = stop_words

    text = text.lower()
    text = re.sub(r'\d', '', text)
    text = text.translate(str.maketrans("", "", string.punctuation))

    # NOTE(review): stop words are Spanish while WordNetLemmatizer is an
    # English lemmatizer -- confirm which language the sentences are in.
    lemmatizer = nltk.stem.WordNetLemmatizer()
    return [
        lemmatizer.lemmatize(word)
        for word in nltk.word_tokenize(text)
        if word not in stop_words
    ]

# Fetch the pre-trained FastText vectors through the gensim downloader
fasttext_vectors = api.load('fasttext-wiki-news-subwords-300')

# Run every sentence through preprocess_text and store the token lists
# in a new 'preprocessed_text' column
data['preprocessed_text'] = data['sentence'].apply(preprocess_text)

# Draw a fixed-seed sample of 2000 rows for the experiments below
# NOTE(review): the whole dataset is preprocessed although only the
# sampled rows are used afterwards in this cell.
data_sample = data.sample(random_state=42, n=2000)

# Sentence embedding helper: mean of the known word vectors
def average_word_vector(words, model):
    """Average the vectors of the words that ``model`` knows.

    Args:
        words: Iterable of tokens.
        model: Object exposing ``key_to_index`` (membership test),
            ``model[word]`` (vector lookup) and ``vector_size``.

    Returns:
        The element-wise mean of the vectors found, or a zero vector of
        length ``model.vector_size`` when none of the words is known.
    """
    vocabulary = model.key_to_index
    found = []
    for token in words:
        if token in vocabulary:
            found.append(model[token])

    # Guard clause: nothing in vocabulary -> all-zero embedding
    if not found:
        return np.zeros(model.vector_size)
    return np.mean(found, axis=0)

# Labels of the sampled rows
y = data_sample['tag']

# One averaged FastText vector per sampled sentence, collected into an array
X = np.array([
    average_word_vector(tokens, fasttext_vectors)
    for tokens in data_sample['preprocessed_text']
])

# Hold out 20% of the sample for testing (fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)

# Grid-search helper used by the model runs below
def tune_hyperparameters(model, param_grid, X_train, y_train):
    """Grid-search ``param_grid`` for ``model`` and return the best estimator.

    Fits a 3-fold cross-validated ``GridSearchCV`` scored on accuracy,
    prints the winning parameter combination, and returns the search's
    ``best_estimator_``.
    """
    search = GridSearchCV(
        estimator=model,
        param_grid=param_grid,
        scoring='accuracy',
        cv=3,
    )
    search.fit(X_train, y_train)

    estimator_name = model.__class__.__name__
    print(f"Best parameters for {estimator_name}: {search.best_params_}")
    return search.best_estimator_

# Fit-and-report helper
def evaluate_model(model, X_train, X_test, y_train, y_test):
    """Fit ``model`` on the training split and print its test metrics.

    Prints accuracy plus weighted-average precision, recall and F1 for the
    predictions on ``X_test``. Returns nothing.
    """
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)

    # Compute every score first, then print them in a fixed order
    report = [
        ('Accuracy', accuracy_score(y_test, predictions)),
        ('Precision', precision_score(y_test, predictions, average='weighted')),
        ('Recall', recall_score(y_test, predictions, average='weighted')),
        ('F1-Score', f1_score(y_test, predictions, average='weighted')),
    ]
    for label, value in report:
        print(f'{label}: {value}')

# Logistic Regression: single-point grid (C=1.0, newton-cg), then evaluate
log_reg = LogisticRegression()
param_grid_lr = dict(C=[1.0], solver=['newton-cg'])
print("Logistic Regression")
tuned_lr = tune_hyperparameters(log_reg, param_grid_lr, X_train, y_train)
evaluate_model(tuned_lr, X_train, X_test, y_train, y_test)

# Gradient Boosting with default settings
gradient_boosting = GradientBoostingClassifier()
print("\nGradient Boosting")
evaluate_model(gradient_boosting, X_train, X_test, y_train, y_test)

# Support Vector Machine with default settings
svc = SVC()
print("\nSupport Vector Machine")
evaluate_model(svc, X_train, X_test, y_train, y_test)

# Gaussian Naive Bayes: single-point grid (var_smoothing=1e-7), then evaluate
gaussian_nb = GaussianNB()
param_grid_gnb = dict(var_smoothing=[1e-7])
print("\nGaussian Naive Bayes Classifier")
tuned_gnb = tune_hyperparameters(gaussian_nb, param_grid_gnb, X_train, y_train)
evaluate_model(tuned_gnb, X_train, X_test, y_train, y_test)


In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.linear_model import LogisticRegression
import nltk
import re
import string
import gensim.downloader as api

# Fetch the NLTK resources that preprocess_text relies on. Newer NLTK
# releases look up 'punkt_tab' (rather than 'punkt') when word_tokenize
# runs, so both are requested to keep the cell working on either version.
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')
nltk.download('punkt_tab')

# Load data
# on_bad_lines='warn' is the current spelling of the old
# error_bad_lines=False / warn_bad_lines=True pair, which pandas 2.0 removed:
# malformed rows are skipped and a warning is emitted for each of them.
try:
    data = pd.read_csv(
        r'C:\Users\adria\Documents\GitHub\project-nlp\TRAINING_DATA.txt',
        delimiter='\t',
        encoding='utf-8',
        header=None,
        names=['tag', 'sentence'],
        on_bad_lines='warn',
    )
except pd.errors.ParserError:
    print("Error loading dataset.")
    # Re-raise: everything below needs `data`, so continuing would only
    # replace this error with a less helpful NameError.
    raise
else:
    print("Dataset loaded successfully.")

# Function to preprocess text
def preprocess_text(text):
    """Turn one raw sentence into a cleaned, space-joined string.

    Steps, in order: lowercase, drop digits, drop ASCII punctuation,
    tokenize with NLTK, remove Spanish stop words, lemmatize each
    remaining token with NLTK's WordNetLemmatizer, and join the tokens
    with single spaces.

    Args:
        text: The raw sentence. Anything that is not a ``str`` (for
            example a missing cell, which pandas presumably stores as NaN)
            is treated as an empty sentence.

    Returns:
        str: The cleaned sentence; ``''`` for non-string input.
    """
    if not isinstance(text, str):
        # Missing/non-text cells have no .lower(); treat them as empty
        # instead of aborting the whole DataFrame.apply().
        return ''

    # The stop-word list is read from the NLTK corpus; cache it on the
    # function object so it is loaded once rather than once per sentence.
    stop_words = getattr(preprocess_text, '_stop_words', None)
    if stop_words is None:
        stop_words = frozenset(nltk.corpus.stopwords.words('spanish'))
        preprocess_text._stop_words = stop_words

    text = text.lower()
    text = re.sub(r'\d', '', text)
    text = text.translate(str.maketrans("", "", string.punctuation))

    # NOTE(review): stop words are Spanish while WordNetLemmatizer is an
    # English lemmatizer -- confirm which language the sentences are in.
    lemmatizer = nltk.stem.WordNetLemmatizer()
    return ' '.join(
        lemmatizer.lemmatize(word)
        for word in nltk.word_tokenize(text)
        if word not in stop_words
    )

# Clean every sentence and keep the result in a 'preprocessed_text' column
data['preprocessed_text'] = data['sentence'].apply(preprocess_text)

# Fixed-seed sample of 2000 rows
data_sample = data.sample(random_state=42, n=2000)

# 80/20 train/test split of the cleaned text and its tags
X_train, X_test, y_train, y_test = train_test_split(
    data_sample['preprocessed_text'],
    data_sample['tag'],
    random_state=42,
    test_size=0.2,
)

# Pre-trained FastText vectors, fetched through the gensim downloader
model = api.load('fasttext-wiki-news-subwords-300')


# Sentence embedding helper: mean of the known word vectors of a text
def get_average_vector(text, model):
    """Average the vectors of the whitespace-separated words of ``text``.

    Args:
        text: A string; it is split on whitespace.
        model: Object supporting ``word in model``, ``model[word]`` and
            ``vector_size``.

    Returns:
        The element-wise mean of the vectors found, or a zero vector of
        length ``model.vector_size`` when no word is known to ``model``.
    """
    found = []
    for token in text.split():
        if token in model:
            found.append(model[token])

    # Guard clause: nothing in vocabulary -> all-zero embedding
    if not found:
        return np.zeros(model.vector_size)
    return np.mean(found, axis=0)

# Embed both splits: one averaged FastText vector per sentence
X_train_embeddings, X_test_embeddings = (
    np.array([get_average_vector(text, model) for text in split])
    for split in (X_train, X_test)
)

# Fit the logistic regression classifier (fit returns the estimator itself)
classifier = LogisticRegression(max_iter=1000).fit(X_train_embeddings, y_train)

# Score the held-out split; precision/recall/F1 use weighted averaging
y_pred = classifier.predict(X_test_embeddings)
accuracy = accuracy_score(y_test, y_pred)
precision, recall, f1 = (
    score(y_test, y_pred, average='weighted')
    for score in (precision_score, recall_score, f1_score)
)

# One line per metric, same text as separate print calls would produce
print(
    "Evaluation Metrics for Logistic Regression Classifier with FastText Embeddings:",
    f'Accuracy: {accuracy}',
    f'Precision: {precision}',
    f'Recall: {recall}',
    f'F1-Score: {f1}',
    sep='\n',
)



In [None]:
import re
import string

import nltk
import numpy as np
import pandas as pd
import torch
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer
from transformers import AutoTokenizer, AutoModel

# Download NLTK resources
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')

# Load data
# on_bad_lines='warn' is the current spelling of the old
# error_bad_lines=False / warn_bad_lines=True pair, which pandas 2.0 removed:
# malformed rows are skipped and a warning is emitted for each of them.
try:
    data = pd.read_csv(
        r'C:\Users\adria\Documents\GitHub\project-nlp\TRAINING_DATA.txt',
        delimiter='\t',
        encoding='utf-8',
        header=None,
        names=['tag', 'sentence'],
        on_bad_lines='warn',
    )
except pd.errors.ParserError:
    print("Error loading dataset.")
    # Re-raise: everything below needs `data`, so continuing would only
    # replace this error with a less helpful NameError.
    raise
else:
    print("Dataset loaded successfully.")

# Fixed-seed sample of 2000 rows
data_sample = data.sample(random_state=42, n=2000)

# Raw sentences are the features, tags are the labels; hold out 20% for testing
X, y = data_sample['sentence'], data_sample['tag']
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)

# XLM-R checkpoint: tokenizer and encoder are loaded from the same name
model_name = "xlm-roberta-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)

# Tokenize each text separately with the XLM-R tokenizer
def preprocess_text(text_list):
    """Return one ``input_ids`` tensor per text in ``text_list``.

    Each text is passed to the module-level ``tokenizer`` on its own
    (truncated to at most 128 tokens, PyTorch tensors requested) and only
    the ``'input_ids'`` entry of the result is kept.
    """
    encoded_ids = []
    for text in text_list:
        encoding = tokenizer(
            text,
            padding=True,
            truncation=True,
            max_length=128,
            return_tensors="pt",
        )
        encoded_ids.append(encoding['input_ids'])
    return encoded_ids

# Custom transformer to extract XLM-R embeddings
class XlmREmbeddingExtractor(BaseEstimator, TransformerMixin):
    """Scikit-learn transformer that mean-pools XLM-R hidden states.

    Stateless: ``fit`` does nothing, and ``transform`` runs the
    module-level ``model`` on every tokenized sample it receives.
    """

    def __init__(self):
        pass

    def fit(self, X, y=None):
        """No-op; present so the class can be used inside a Pipeline."""
        return self

    def transform(self, X, y=None):
        """Embed every item of ``X`` and return one row per sentence.

        Args:
            X: Iterable of tokenized samples. Each item is either an
                ``input_ids`` tensor (what ``preprocess_text`` produces)
                or a mapping of keyword arguments for the model, such as
                a full tokenizer output.
            y: Ignored.

        Returns:
            list[list[float]]: Mean of ``last_hidden_state`` over the
            token dimension, one inner list per sentence.
        """
        embeddings = []
        with torch.no_grad():
            # Every sample must be embedded; handling only X[0] would
            # return a single row and no longer line up with the labels.
            for encoded in X:
                if isinstance(encoded, torch.Tensor):
                    # A bare tensor cannot be **-unpacked; pass it by name.
                    outputs = model(input_ids=encoded)
                else:
                    outputs = model(**encoded)
                # Plain mean over token positions.
                # NOTE(review): for padded batches a mask-aware mean would
                # be more accurate; per-sentence inputs carry no padding.
                pooled = outputs.last_hidden_state.mean(dim=1)
                embeddings.extend(pooled.tolist())
        return embeddings

def _build_xlmr_pipeline(classifier):
    """Return tokenizer -> XLM-R embedding -> ``classifier`` as a Pipeline."""
    return Pipeline([
        ('preprocessor', FunctionTransformer(preprocess_text)),
        ('embedding_extractor', XlmREmbeddingExtractor()),
        ('classifier', classifier),
    ])


def _fit_and_score(pipeline):
    """Fit ``pipeline`` on the training split and score it on the test split.

    Returns ``(y_pred, accuracy, precision, recall, f1)``; precision,
    recall and F1 use weighted averaging.
    """
    pipeline.fit(X_train, y_train)
    y_pred = pipeline.predict(X_test)
    return (
        y_pred,
        accuracy_score(y_test, y_pred),
        precision_score(y_test, y_pred, average='weighted'),
        recall_score(y_test, y_pred, average='weighted'),
        f1_score(y_test, y_pred, average='weighted'),
    )


def _print_report(title, accuracy, precision, recall, f1):
    """Print ``title`` followed by one line per metric."""
    print(title)
    print(f'Accuracy: {accuracy}')
    print(f'Precision: {precision}')
    print(f'Recall: {recall}')
    print(f'F1-Score: {f1}')


# NOTE(review): each pipeline re-embeds the train and test sentences with
# XLM-R, so the encoder runs twice over the same data in this cell.

# Pipeline for feature extraction and model training with Gaussian Naive Bayes
pipeline_nb = _build_xlmr_pipeline(GaussianNB())
y_pred_nb, accuracy_nb, precision_nb, recall_nb, f1_nb = _fit_and_score(pipeline_nb)
_print_report(
    "Pipeline Classifier with Gaussian Naive Bayes:",
    accuracy_nb, precision_nb, recall_nb, f1_nb,
)

# Pipeline for feature extraction and model training with Logistic Regression.
# max_iter is raised to 1000, matching the FastText logistic-regression cell,
# so the solver is not stopped by the default iteration cap.
pipeline_lr = _build_xlmr_pipeline(LogisticRegression(random_state=42, max_iter=1000))
y_pred_lr, accuracy_lr, precision_lr, recall_lr, f1_lr = _fit_and_score(pipeline_lr)
_print_report(
    "\nPipeline Classifier with Logistic Regression:",
    accuracy_lr, precision_lr, recall_lr, f1_lr,
)
