In [2]:
import nltk

# Download every NLTK resource this notebook relies on, not only the stop-word lists:
# word_tokenize (used in the preprocessing cell) needs the Punkt tokenizer models and
# raises LookupError on a fresh environment when they are missing.
# 'punkt' is the resource name used by older NLTK releases, 'punkt_tab' by newer ones;
# requesting both keeps the notebook runnable on either. nltk.download() skips
# packages that are already up to date.
for resource in ('stopwords', 'punkt', 'punkt_tab'):
    nltk.download(resource)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\loque\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


True

In [4]:
import pandas as pd
from sklearn.model_selection import train_test_split
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import string
import json
# Load the dataset from the JSON Lines file 'converted_data.jsonl' (one JSON object per line).
# The encoding is passed explicitly because open() otherwise falls back to the platform
# default, which is not UTF-8 on every system (e.g. Windows). Blank lines are skipped so
# that an empty or trailing line does not make json.loads() raise.
with open('../scripts/converted_data.jsonl', 'r', encoding='utf-8') as file:
    data = [json.loads(line) for line in file if line.strip()]

# Build a pandas DataFrame with one row per parsed record
df = pd.DataFrame(data)

In [5]:
# Preview the first five rows of the loaded frame
df.head(n=5)

Unnamed: 0,sentence,token,complexity
0,"Behold, there came up out of the river seven c...",river,0.0
1,I am a fellow bondservant with you and with yo...,brothers,0.0
2,"The man, the lord of the land, said to us, 'By...",brothers,0.05
3,Shimei had sixteen sons and six daughters; but...,brothers,0.15
4,"""He has put my brothers far from me.",brothers,0.263889


In [6]:
# Drop any rows that contain missing values
df.dropna(inplace=True)

# Tokens to discard while cleaning the sentences: stop words plus ASCII punctuation.
# The sentences in this dataset are English, so the English stop-word list is used;
# the Spanish list does not contain English function words such as "the" or "of",
# which would therefore survive the filtering.
stop_words = set(stopwords.words('english') + list(string.punctuation))

def preprocess(sentence):
    """Return a cleaned, space-joined version of ``sentence``.

    The sentence is lower-cased and split with ``word_tokenize``; every token that
    appears in the module-level ``stop_words`` collection is discarded and the
    remaining tokens are joined with single spaces, in their original order.

    Args:
        sentence: The text to clean (must support ``.lower()``).

    Returns:
        A string containing the kept tokens separated by single spaces.
    """
    kept = []
    for tok in word_tokenize(sentence.lower()):
        if tok in stop_words:
            continue
        kept.append(tok)
    return ' '.join(kept)

# Clean every sentence and store the result in a new column
df['oracion_procesada'] = df['sentence'].map(preprocess)

# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    df[['oracion_procesada', 'token']],
    df['complexity'],
    test_size=0.2,
    random_state=42,
)

In [8]:
# Give both target Series a fresh 0..n-1 index, discarding the previous index labels
y_train = y_train.reset_index(drop=True)
y_test = y_test.reset_index(drop=True)

In [9]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Learn a TF-IDF vocabulary (capped at 1000 terms) from the training sentences only,
# then encode both splits with it and convert the sparse results to dense arrays.
tfidf_vectorizer = TfidfVectorizer(max_features=1000)

train_sparse = tfidf_vectorizer.fit_transform(X_train['oracion_procesada'])
X_train_tfidf = train_sparse.toarray()

test_sparse = tfidf_vectorizer.transform(X_test['oracion_procesada'])
X_test_tfidf = test_sparse.toarray()

In [11]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout

# Small feed-forward network that maps a TF-IDF vector to a single complexity score.
model = Sequential([
    Dense(128, activation='relu', input_shape=(X_train_tfidf.shape[1],)),
    Dropout(0.5),
    Dense(64, activation='relu'),
    Dense(1, activation='sigmoid')  # Output between 0 and 1 for the complexity score
])

# 'complexity' is a continuous target, so this is a regression problem. Classification
# accuracy is not informative here; mean absolute error is reported alongside the MSE
# loss instead, giving an error in the same units as the target.
model.compile(optimizer='adam', loss='mean_squared_error', metrics=['mae'])

# validation_split=0.2 makes Keras hold out part of the training data for validation
model.fit(X_train_tfidf, y_train, epochs=10, batch_size=4, validation_split=0.2)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x2508c96fee0>

In [None]:
# Score the trained model on the held-out test split (returns the loss and compiled metrics)
model.evaluate(x=X_test_tfidf, y=y_test)