<a target="_blank" href="https://colab.research.google.com/github/Deanis/MLEngineering_Capstone_Group3/blob/main/notebooks/pipeline.ipynb">
  <img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/>
</a>

## Setup

In [None]:
!pip install kfp pandas transformers google-cloud-storage google-cloud-aiplatform




In [None]:
import kfp
from kfp import dsl
from kfp.v2 import compiler
from kfp.v2.dsl import component
import pandas as pd
from google.cloud import storage
from transformers import DistilBertTokenizer, TFDistilBertForSequenceClassification
from sklearn.model_selection import train_test_split
import tensorflow as tf
from google.cloud import aiplatform
import os
from typing import NamedTuple


# Constants
# Notebook-level configuration. Three of these names match the string
# parameters of `training_pipeline`; presumably they are the values meant to be
# supplied to it at run time -- confirm before relying on that.
bucket_name = 'blank-to-bard'  # presumably the Cloud Storage bucket holding the dataset
file_name = 'english-dataset.csv'  # presumably the dataset object inside that bucket
project_id = 'ml-class-group-3-capstone'  # passed to aiplatform.init(project=...) in the Run section
model_name = 'blank-to-bard'  # only referenced by the not-yet-implemented deploy step


## Components

In [None]:
@dsl.component(packages_to_install=['pandas', 'google-cloud-storage'])
def get_data(bucket_name: str, file_name: str) -> str:
    """Fetch a CSV object from Cloud Storage and return it as CSV text.

    The object is downloaded as text, parsed with pandas and serialised
    again without the index column, so the result is the pandas-normalised
    form of the stored file.

    Args:
        bucket_name: Name of the Cloud Storage bucket to read from.
        file_name: Name of the CSV object inside that bucket.

    Returns:
        The parsed table written back out by ``DataFrame.to_csv(index=False)``.
    """
    # Imports are function-local so the component body is self-contained.
    from io import BytesIO

    import pandas as pd
    from google.cloud import storage

    # Resolve the bucket (this performs a lookup) and point at the object.
    csv_blob = storage.Client().get_bucket(bucket_name).blob(file_name)

    # Pull the object down as text, then hand pandas a UTF-8 byte stream.
    raw_text = csv_blob.download_as_text()
    frame = pd.read_csv(BytesIO(raw_text.encode('utf-8')))

    return frame.to_csv(index=False)


# 'scikit-learn' is the real PyPI distribution; the 'sklearn' alias is deprecated.
# TensorFlow is no longer installed because the body now works on NumPy arrays.
@dsl.component(packages_to_install=['transformers', 'scikit-learn', 'pandas'])
def preprocess_data(csv_data: str) -> NamedTuple('Outputs', [('train_inputs', str), ('validation_inputs', str), ('train_labels', str), ('validation_labels', str)]):
    """Tokenize the text column and split it into train/validation sets.

    Reads a CSV string with ``text`` and ``label`` columns, encodes every
    text with the ``distilbert-base-uncased`` tokenizer (padded/truncated to
    128 tokens) and performs an 80/20 split with a fixed random seed.

    Args:
        csv_data: CSV content containing ``text`` and ``label`` columns.

    Returns:
        A named tuple of four strings: ``train_inputs``,
        ``validation_inputs``, ``train_labels`` and ``validation_labels``.
        Each string is a JSON array (nested lists of token ids for the
        inputs, a flat list for the labels) and can be restored with
        ``json.loads``.

    Raises:
        KeyError: If the ``text`` or ``label`` column is missing.
    """
    # Imports are function-local so the component body is self-contained.
    import json
    from collections import namedtuple
    from io import StringIO

    import pandas as pd
    from sklearn.model_selection import train_test_split
    from transformers import DistilBertTokenizer

    # Read the data from the CSV string
    data = pd.read_csv(StringIO(csv_data))

    tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

    # Encode the whole column in one call. `padding`/`truncation` replace the
    # deprecated `pad_to_max_length` flag and guarantee every row has exactly
    # 128 ids, so the result is a rectangular array.
    # Attention masks are not requested: the previous code computed them but
    # never split or returned them, and the output tuple has no field for them.
    encoded = tokenizer(
        data['text'].tolist(),
        add_special_tokens=True,
        max_length=128,
        padding='max_length',
        truncation=True,
        return_attention_mask=False,
        return_tensors='np',
    )
    input_ids = encoded['input_ids']

    # Labels stay in row order, aligned with the encoded inputs.
    labels = data['label'].to_numpy()

    # Split the data
    train_inputs, validation_inputs, train_labels, validation_labels = train_test_split(
        input_ids, labels, random_state=2023, test_size=0.2)

    # Serialise as JSON rather than `np.array2string`: the latter abbreviates
    # arrays above 1000 elements with "...", silently dropping data and
    # producing text that cannot be parsed back.
    Outputs = namedtuple('Outputs', ['train_inputs', 'validation_inputs', 'train_labels', 'validation_labels'])
    splits = (train_inputs, validation_inputs, train_labels, validation_labels)

    return Outputs(*(json.dumps(part.tolist()) for part in splits))


##@component
# def train_model(data):
#     # Similar code to your model training logic
#     pass

##@component
# def save_model(bucket_name, model):
#     # Similar code to your model saving logic
#     pass

##@component
# def deploy_model(bucket_name, project_id, model_name):
#     # Similar code to your model deployment logic
#     pass


## Pipeline

In [11]:
@dsl.pipeline(
    name='Training pipeline',
    description='A pipeline that downloads, pre-processes data, trains a model, saves it and deploys it.'
)
def training_pipeline(bucket_name: str, file_name: str, project_id: str, model_name: str):
    """Chain the data-download component into the pre-processing component.

    Only these two stages are wired up; the training, saving and deployment
    stages exist solely as commented-out placeholders below, so `project_id`
    and `model_name` are accepted but not yet consumed.
    """
    download_step = get_data(bucket_name=bucket_name, file_name=file_name)
    raw_csv = download_step.output

    # Kept in a variable so the placeholder stages below can consume it later.
    preprocess_step = preprocess_data(csv_data=raw_csv)

    # Placeholder stages (components not implemented yet):
    # train_model_task = train_model(preprocess_step.output)
    # save_model_task = save_model(bucket_name, train_model_task.output)
    # deploy_model_task = deploy_model(bucket_name, project_id, model_name)


## Compile

In [12]:
# Serialise the pipeline definition to a local file; the Run section submits
# this same file as the job template.
pipeline_compiler = compiler.Compiler()
pipeline_compiler.compile(pipeline_func=training_pipeline, package_path='training_pipeline.json')


## Run

In [None]:
from google.cloud import aiplatform

# Region in which the pipeline job is created
region = 'us-central1'

# Point the Vertex AI SDK at the project/region used by this notebook
aiplatform.init(project=project_id, location=region)

# Pipeline root (a Cloud Storage location for pipeline artifacts). Built from
# the notebook's bucket constant instead of the former literal placeholder
# 'gs://YOUR_BUCKET_NAME/PIPELINE_ROOT', which is not a real bucket.
pipeline_root = f'gs://{bucket_name}/pipeline_root'

# Display name shown for the pipeline job
display_name = 'My Training Pipeline'

# `training_pipeline` declares four parameters without defaults, so their
# values must be supplied here; without `parameter_values` the job has no
# inputs to run with.
pipeline_job = aiplatform.PipelineJob(
    display_name=display_name,
    template_path='training_pipeline.json',
    pipeline_root=pipeline_root,
    parameter_values={
        'bucket_name': bucket_name,
        'file_name': file_name,
        'project_id': project_id,
        'model_name': model_name,
    },
)

# Submit the job and block until it finishes
pipeline_job.run()
