In [1]:
USER_FLAG = "--user"

In [2]:
!pip3 install {USER_FLAG} google-cloud-aiplatform>=1.7.0 --upgrade
!pip3 install {USER_FLAG} kfp>=1.8.9 google-cloud-pipeline-components>=0.2.0

In [3]:
import os

if not os.getenv("IS_TESTING"):
    # Automatically restart kernel after installs
    import IPython

    app = IPython.Application.instance()
    app.kernel.do_shutdown(True)

In [1]:
!python3 -c "import kfp; print('KFP SDK version: {}'.format(kfp.__version__))"
!python3 -c "import google_cloud_pipeline_components; print('google_cloud_pipeline_components version: {}'.format(google_cloud_pipeline_components.__version__))"

KFP SDK version: 1.8.22
google_cloud_pipeline_components version: 0.2.0


In [2]:
import os
PROJECT_ID = ""

# Get your Google Cloud project ID from gcloud
if not os.getenv("IS_TESTING"):
    shell_output=!gcloud config list --format 'value(core.project)' 2>/dev/null
    PROJECT_ID = shell_output[0]
    print("Project ID: ", PROJECT_ID)

Project ID:  oa-suarez-prueba


In [3]:
BUCKET_NAME="gs://" + PROJECT_ID + "-oa-suarez-merchan"

In [4]:
import kfp

from kfp.v2 import compiler, dsl
from kfp.v2.dsl import component, pipeline, Artifact, ClassificationMetrics, Input, Output, Model, Metrics

from google.cloud import aiplatform
from google_cloud_pipeline_components import aiplatform as gcc_aip
from typing import NamedTuple

In [5]:
PATH=%env PATH
%env PATH={PATH}:/home/jupyter/.local/bin
REGION="us-central1"

PIPELINE_ROOT = f"{BUCKET_NAME}/pipeline_root/"
PIPELINE_ROOT

env: PATH=/usr/local/cuda/bin:/opt/conda/bin:/opt/conda/condabin:/usr/local/bin:/usr/bin:/bin:/usr/local/games:/usr/games:/home/jupyter/.local/bin


'gs://oa-suarez-prueba-oa-suarez-merchan/pipeline_root/'

In [6]:
@component(base_image="python:3.9", output_component_file="first-component.yaml")
def product_name(text: str) -> str:
    return text

In [78]:
from kfp.v2.dsl import component, Output, Dataset


@component(packages_to_install=["pandas", "gcsfs"])
def load_data_from_gcs(
    bucket_name: str,
    file_path: str,
    output_data: Output[Dataset]
):
    import gcsfs
    import pandas as pd

    # Carga los datos del archivo CSV en Google Cloud Storage
    fs = gcsfs.GCSFileSystem()
    gcs_file_path = f'gs://{bucket_name}/{file_path}'

    df_nuevo = pd.read_csv(gcs_file_path)

    # Añade una columna `sentimiento` al DataFrame
    df_nuevo['sentimiento'] = pd.NaT
    
    #Guarda el data frame en un CSV
    df_nuevo.to_csv(output_data.path, index=False)

    print(f"Datos cargados y guardados en {output_data.path}")
   


In [79]:
from kfp.v2.dsl import component, Input

@component(packages_to_install=["pandas", "scikit-learn","nltk"])
def train_and_evaluate_model(
    dataset: Input[Dataset],
    model_output: Output[Model]
):
    import pandas as pd
    import sklearn 
    from sklearn.model_selection import train_test_split
    import nltk
    from nltk.sentiment.vader import SentimentIntensityAnalyzer
    import joblib
    
    df_nuevo = pd.read_csv(dataset.path)
    # Divide el DataFrame en conjuntos de entrenamiento, validación y prueba
    X_train, X_test, y_train, y_test = train_test_split(df_nuevo['text'], df_nuevo['sentimiento'], test_size=0.25, random_state=42)

    # Entrena un modelo de machine learning para predecir el sentimiento de las reseñas
    nltk.download('vader_lexicon')
    model = SentimentIntensityAnalyzer()

    joblib.dump(model, model_output.path) 

    


In [85]:
from kfp.v2.dsl import component, Input, Output, Model, Dataset

@component(packages_to_install=["pandas", "scikit-learn", "gcsfs", "joblib", "nltk"])
def generate_predictions(
    dataset: Input[Dataset],
    model: Input[Model],
    predictions_output: Output[Dataset]
):
    import pandas as pd
    import joblib
    import nltk
    from nltk.sentiment.vader import SentimentIntensityAnalyzer
    
    # Cargar el dataset preprocesado
    df = pd.read_csv(dataset.path)
    
    # Generar predicciones
    predicciones = {"texto": [], "prediccion_sentimiento": []}  
    
    # Cargar el modelo entrenado
    sia = joblib.load(model.path)
    
    for i in df['text']:
        score = sia.polarity_scores(i)['pos']
        predicciones['prediccion_sentimiento'].append(score)
        predicciones['texto'].append(i)
    
    df_prediccion = pd.DataFrame(predicciones)
     
    
    # Convertir las predicciones a un DataFrame y guardarlo como CSV
    
    df_prediccion.to_csv(predictions_output.path, index=False)

    print(f"Predicciones guardadas en {predictions_output.path}")

In [86]:
@pipeline(
    name="data-loading-pipeline",
    description="A pipeline that loads data from GCS."
)
def data_loading_pipeline(
    bucket_name: str = "analisis-de-sentimiento",
    business_file_path: str = "texto.csv",
):

    load_business_data_task = load_data_from_gcs(
        bucket_name=bucket_name,
        file_path=business_file_path
    )
    
    train_and_evaluate_model_task = train_and_evaluate_model(
        dataset=load_business_data_task.outputs['output_data']
    )
    generate_predictions_task = generate_predictions(
        dataset=load_business_data_task.outputs['output_data'],
        model=train_and_evaluate_model_task.outputs['model_output']  
    )

In [87]:
from kfp.v2 import compiler

compiler.Compiler().compile(
    pipeline_func=data_loading_pipeline,
    package_path="data_loading_pipeline.json"
)

In [88]:
from datetime import datetime
import re

TIMESTAMP = datetime.now().strftime("%Y%m%d%H%M%S").lower()
job_id = "time-series-pipeline-{0}".format(TIMESTAMP)
job_id = re.sub(r"[^-a-z0-9]+", "-", job_id)

job = aiplatform.PipelineJob(
    display_name="time-series-pipeline",
    template_path="data_loading_pipeline.json",
    job_id=job_id,
    enable_caching=True
)

In [89]:
job.submit()

Creating PipelineJob
PipelineJob created. Resource name: projects/414169182204/locations/us-central1/pipelineJobs/time-series-pipeline-20231110044312
To use this PipelineJob in another session:
pipeline_job = aiplatform.PipelineJob.get('projects/414169182204/locations/us-central1/pipelineJobs/time-series-pipeline-20231110044312')
View Pipeline Job:
https://console.cloud.google.com/vertex-ai/locations/us-central1/pipelines/runs/time-series-pipeline-20231110044312?project=414169182204
