🛠 **Carga de Librerías**

In [None]:
import os

import numpy as np
import sparknlp
from pyspark.ml.linalg import SparseVector, DenseVector
from pyspark.sql.functions import col, udf
from pyspark.sql.functions import when, lit
from pyspark.sql.types import ArrayType, FloatType

from app_spark_processor import *
from app_text_pprocessor import *
from app_hyperparams import *

🚀 **Inicio del Entrenamiento del Modelo** 🎯

In [None]:
# Start Spark NLP, wrap the session in the project processor, and re-bind
# `spark` to whatever session the processor exposes.
spark = sparknlp.start()
processor = SparkNLPProcessor(spark_session=spark)
spark = processor.get_session()

# Read the training CSV and keep only the rows whose sentiment is 0 or 1.
datasete = DatasetLoader(spark)
test = datasete.load_csv('/datasets/train_dataset.csv')
binary_sentiment = col("sentiment").isin([0, 1])
df_filtered = test.filter(binary_sentiment)

# Run the project text cleaner; later cells read the `tfidf_features` and
# `bert_embeddings` columns from the resulting DataFrame.
df_ = TextCleaner(
    spark,
    use_lemma=True,
    use_stop_words=True,
    expand_contractions=True,
).clean_dataframe(df_filtered)

In [None]:
def vector_to_dense(vec, size=5000):
    """Convert a Spark ML vector into a plain Python list of floats.

    Args:
        vec: An object exposing ``toArray()`` (for example ``SparseVector``
            or ``DenseVector``). Anything else, including ``None``, is
            treated as a missing vector.
        size: Length of the all-zero list returned for a missing vector.
            Defaults to 5000, the length that was previously hard-coded.

    Returns:
        list[float]: The vector's values as built-in Python floats, or
        ``size`` zeros when ``vec`` cannot be converted.
    """
    to_array = getattr(vec, "toArray", None)
    if callable(to_array):
        # ndarray.tolist() produces built-in floats. Iterating a DenseVector
        # directly (list(vec)) yields NumPy scalars instead, which can fail
        # to serialize when returned from a Spark UDF.
        return to_array().tolist()
    return [0.0] * size

# Wrap the converter as a UDF returning array<float> and materialize the
# TF-IDF vectors as a plain array column.
vector_to_dense_udf = udf(vector_to_dense, ArrayType(FloatType()))
df_dense = df_.withColumn("tfidf_dense", vector_to_dense_udf(col("tfidf_features")))

# Build one float32 array per row inside each partition, collect them to the
# driver, and stack them into a single 2-D matrix.
tfidf_numpy = np.vstack(
    df_dense.select("tfidf_dense").rdd.mapPartitions(
        lambda rows: (np.array(row["tfidf_dense"], dtype=np.float32) for row in rows)
    ).collect()
)

# np.save does not create missing parent directories; create the folder first
# so the collected result is not lost to a FileNotFoundError.
os.makedirs("./embeddings", exist_ok=True)
np.save("./embeddings/tfidf_features.npy", tfidf_numpy)

In [None]:
# Collect one embedding vector per row and stack them into a 2-D matrix.
# Only the first annotation of `bert_embeddings` is used for each row.
# NOTE(review): assumes every row has at least one annotation and that the
# first one represents the whole text - confirm against the cleaner pipeline.
bert_numpy = np.vstack(
    df_.select("bert_embeddings").rdd.mapPartitions(
        # Yield lazily, as the TF-IDF cell does, instead of building a list
        # for the whole partition.
        lambda rows: (np.array(row["bert_embeddings"][0].embeddings) for row in rows)
    ).collect()
)

# np.save does not create missing parent directories; create the folder first
# so the collected result is not lost to a FileNotFoundError.
os.makedirs("./embeddings", exist_ok=True)
np.save("./embeddings/bert_numpy.npy", bert_numpy)

In [None]:
# Cast sentiment to int, replacing values that cannot be cast with -1.
sentiment_int = df_.sentiment.cast("int")
df_ = df_.withColumn(
    "sentiment",
    when(sentiment_int.isNull(), lit(-1)).otherwise(sentiment_int),
)

# Flatten the single-column rows into a 1-D int64 label array.
# NOTE(review): labels are collected separately from the feature matrices;
# this relies on Spark returning rows in the same order each time - confirm.
labels = np.array(
    df_.select("sentiment").rdd.flatMap(lambda x: x).collect()
).astype(np.int64)

# np.save does not create missing parent directories; create the folder first
# so the collected result is not lost to a FileNotFoundError.
os.makedirs("./embeddings", exist_ok=True)
np.save("./embeddings/labels.npy", labels)

📂 **Cargar los embeddings si existen**

In [None]:
# Load the three saved arrays, in order: BERT embeddings, TF-IDF features,
# labels.
# NOTE(review): the cells above save to ./embeddings/, while this cell reads
# from a Drive folder - presumably the files are copied there first; confirm.
bert_numpy, tfidf_numpy, labels = (
    np.load(npy_path)
    for npy_path in (
        "/content/drive/MyDrive/NeoNexus/bert_numpy.npy",
        "/content/drive/MyDrive/NeoNexus/tfidf_features.npy",
        "/content/drive/MyDrive/NeoNexus/labels.npy",
    )
)

In [None]:
# Show the shape of each array, one per line, in the order BERT, TF-IDF, labels.
print(bert_numpy.shape, tfidf_numpy.shape, labels.shape, sep="\n")

In [None]:
# Build the project's HyperparameterOptimization object from the three arrays
# and call its optimize() method.
study = HyperparameterOptimization(
    bert_numpy=bert_numpy,
    tfidf_numpy=tfidf_numpy,
    labels=labels,
)
study.optimize()