# Snowpark Machine Learning

In this section we will cover topics related to:

1. Outline Snowpark Architecture:

    Types of Libraries used for Machine Learning

2. Operationalize Snowpark stored procedures

    Use Snowpark Python stored procedures to run workloads

Follow the link below for more information:

[Getting Started with ML Development in Snowflake](https://quickstarts.snowflake.com/guide/intro_to_machine_learning_with_snowpark_ml_for_python/#0)

In [None]:
# Import libraries:
import numpy as np
from snowflake.ml.modeling.preprocessing import *
from snowflake.snowpark.types import *
from snowflake.snowpark.functions import *
from snowflake.ml.modeling.pipeline import Pipeline
from snowflake.ml.modeling.xgboost import XGBRegressor
from snowflake.ml.registry import Registry
from snowflake.snowpark.context import get_active_session

In [None]:
# Session setup: pick up the Snowpark session that is currently active.
session = get_active_session()

# Label the queries issued through this session with a recognizable tag.
session.query_tag = 'ml-example'

In [None]:
# Point a Snowpark DataFrame at the diamonds source table and display it.
source_table = 'snowpark_db.works.diamonds'
df = session.table(source_table)
df

In [None]:
# Data cleaning

# 1. Upper-case every column name.
for original_name in df.columns:
    df = df.with_column_renamed(original_name, original_name.upper())

# 2. Normalize CUT: collapse each run of non-alphanumeric characters into a
#    single underscore, then upper-case the result.
cut_normalized = upper(regexp_replace("CUT", '[^a-zA-Z0-9]+', '_'))
df = df.with_column("CUT", cut_normalized)

# 3. Cast the measurement columns to double precision.
for numeric_name in ("CARAT", "X", "Y", "Z", "DEPTH", "TBL"):
    as_double = df[numeric_name].cast(DoubleType())
    df = df.with_column(numeric_name, as_double)

df

In [None]:
# Min-max scale CARAT into a new CARAT_NORM column.
scaler = MinMaxScaler(
    input_cols=["CARAT"],
    output_cols=["CARAT_NORM"],
)
scaler.fit(df)
df = scaler.transform(df)

# Trim the scaled values to a fixed precision: DecimalType(7, 6).
df = df.with_column("CARAT_NORM", df.col("CARAT_NORM").cast(DecimalType(7, 6)))
df

In [None]:
# Ordinal-encode CUT and CLARITY using an explicit category order per column.
categories = {
    "CUT": np.array(["IDEAL", "PREMIUM", "VERY_GOOD", "GOOD", "FAIR"]),
    "CLARITY": np.array(["IF", "VVS1", "VVS2", "VS1", "VS2", "SI1", "SI2", "I1", "I2", "I3"]),
}

# Inputs are the dictionary keys (in insertion order); each encoded column is
# named after its input with an "_OE" suffix: CUT_OE, CLARITY_OE.
encoder = OrdinalEncoder(
    input_cols=list(categories),
    output_cols=[f"{name}_OE" for name in categories],
    categories=categories,
)
encoder.fit(df)
df = encoder.transform(df)

# Show the fitted encoder state.
# NOTE(review): `_state_pandas` is a private attribute and may change between
# library versions - confirm whether a public accessor exists.
print(encoder._state_pandas)
df

In [None]:
# One-hot encode the categorical columns; each output column name is the input
# name with an "_OHE" suffix.
ohe_inputs = ["CUT", "COLOR", "CLARITY"]
encoder = OneHotEncoder(
    input_cols=ohe_inputs,
    output_cols=[f"{name}_OHE" for name in ohe_inputs],
)
encoder.fit(df)
df = encoder.transform(df)

# List the resulting column names.
np.array(df.columns)

In [None]:
# Build the preprocessing pipeline: ordinal-encode the categorical columns,
# then min-max scale the numerical columns in place.
CATEGORICAL_COLUMNS = ["CUT", "COLOR", "CLARITY"]
CATEGORICAL_COLUMNS_OE = ["CUT_OE", "COLOR_OE", "CLARITY_OE"]
NUMERICAL_COLUMNS = ["CARAT", "DEPTH", "TBL", "X", "Y", "Z"]

# Explicit category order for each categorical column.
categories = {
    "CUT": np.array(["IDEAL", "PREMIUM", "VERY_GOOD", "GOOD", "FAIR"]),
    "CLARITY": np.array(["IF", "VVS1", "VVS2", "VS1", "VS2", "SI1", "SI2", "I1", "I2", "I3"]),
    "COLOR": np.array(['D', 'E', 'F', 'G', 'H', 'I', 'J']),
}

pipeline = Pipeline(steps=[(
        "OE", OrdinalEncoder(
            input_cols=CATEGORICAL_COLUMNS,
            output_cols=CATEGORICAL_COLUMNS_OE,
            categories=categories,
        )
    ),(
        "MMS", MinMaxScaler(
            clip=True,
            input_cols=NUMERICAL_COLUMNS,
            output_cols=NUMERICAL_COLUMNS,
        )
    )])

# Preview the pipeline output WITHOUT overwriting `df`.
# Why: the next step splits `df` into train/test and fits this pipeline on the
# training split. Assigning the transformed frame back to `df` here would
# (a) let test rows influence the scaler state before the split (leakage), and
# (b) min-max scale the numerical columns a second time after the split.
pipeline_preview_df = pipeline.fit(df).transform(df)
pipeline_preview_df

In [None]:
# Split 90/10 with a fixed seed, fit the pipeline on the training split only,
# then apply the fitted pipeline to both splits.
train_split, test_split = df.random_split(weights=[0.9, 0.1], seed=0)

pipeline.fit(train_split)
train_df = pipeline.transform(train_split)
test_df = pipeline.transform(test_split)
test_df

In [None]:
# Train an XGBoost regressor on the encoded + scaled features, then predict
# PRICE for the held-out rows.
feature_columns = CATEGORICAL_COLUMNS_OE + NUMERICAL_COLUMNS

regressor = XGBRegressor(
    input_cols=feature_columns,
    label_cols=['PRICE'],
    output_cols=['PREDICTED_PRICE'],
)
regressor.fit(train_df)

# Predictions come back as a frame with a PREDICTED_PRICE column.
df = regressor.predict(test_df)
df

In [None]:
# Log the trained regressor as version V0 in the model registry that lives in
# SNOWPARK_DB.WORKS, then list the versions stored under that model name.
reg = Registry(
    session=session,
    database_name="SNOWPARK_DB",
    schema_name="WORKS",
)

# NOTE(review): re-running this cell presumably fails once version V0 already
# exists; reg.delete_model("DIAMONDS_PRICE_PREDICTION") beforehand may be
# needed - confirm against the registry documentation.
model = reg.log_model(
    model=regressor,
    model_name="DIAMONDS_PRICE_PREDICTION",
    version_name='V0',
    options={'relax_version': True},
)

reg.get_model("DIAMONDS_PRICE_PREDICTION").show_versions()

In [None]:
# Fetch version V0 of the registered model and run its "predict" function on
# the test split.
registered_model = reg.get_model("DIAMONDS_PRICE_PREDICTION")
model = registered_model.version('V0')

df = model.run(test_df, function_name="predict")
df