In [5]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import string
import nltk

import re


# Load the dataset
df = pd.read_csv('TRAINING_DATA.txt', delimiter='\t')

# Rename columns for easier reference
df.columns = ['label', 'sentence']

# Define functions for preprocessing
def remove_punctuation(text):
  return text.translate(str.maketrans('', '', string.punctuation))

def remove_numbers(text):
  return re.sub(r'\d+', '', text)

def to_lowercase(text):
  return text.lower()

def stopword_removal(text):
  spanish_stopwords = stopwords.words('spanish')
  return ' '.join([word for word in text.split() if word not in spanish_stopwords])

def lemmatization(text):
  nltk.download('wordnet')  # Download WordNet if not already downloaded
  lemmatizer = WordNetLemmatizer()
  return ' '.join([lemmatizer.lemmatize(word) for word in text.split()])

def preprocess_text(text):
  text = to_lowercase(text)
  text = remove_punctuation(text)
  text = remove_numbers(text)
  text = stopword_removal(text)
  text = lemmatization(text)
  return text

# Preprocess text
df['sentence'] = df['sentence'].apply(preprocess_text)

# Load pre-trained Spanish word embeddings (replace with your preferred model)
# Options: cc.ca.es/en/corpus/ancor/ or Ines Montani's Spanish embeddings (https://huggingface.co/dccuchile/bert-base-spanish-wwm-cased)
from gensim.models import KeyedVectors
word_vectors_file = 'https://huggingface.co/dccuchile/bert-base-spanish-wwm-uncased'  # Replace with your file path
word_model = KeyedVectors.load_word2vec_format(word_vectors_file, binary=True)

# Tokenize the sentences
tokenizer = Tokenizer()
tokenizer.fit_on_texts(df['sentence'])
sequences = tokenizer.texts_to_sequences(df['sentence'])

# Pad sequences to ensure equal length
max_sequence_length = max(len(seq) for seq in sequences)
X = pad_sequences(sequences, maxlen=max_sequence_length)

# Convert labels to numpy array
y = df['label'].values

# Create embedding matrix (consider using word_model.get_vector for OOV words)
embedding_dim = word_model.vector_size  # Get dimension from the loaded model
word_index = tokenizer.word_index
embedding_matrix = np.zeros((len(word_index) + 1, embedding_dim))

for word, i in word_index.items():
  if word in word_model.vocab:
    embedding_matrix[i] = word_model[word]
  else:
    # Handle out-of-vocabulary words (e.g., assign zero vector or random vector)
    embedding_matrix[i] = np.random.rand(embedding_dim)  # Example for random vector

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Build the neural network
model = Sequential([
  Embedding(input_dim=len(word_index) + 1,
            output_dim=embedding_dim,
            weights=[embedding_matrix],
            input_length=max_sequence_length,
            trainable=False),
  LSTM(128, return_sequences=True),
  LSTM(64),
  Dropout(0.5),
  Dense(1, activation='sigmoid')
])

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])


[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\emin.sen\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\emin.sen\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\emin.sen\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\emin.sen\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\emin.sen\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\emin.sen\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package wordnet to
[nl

ValueError: invalid literal for int() with base 10: '<!doctype'