In [1]:
import numpy as np
import pandas as pd
from snowflake.ml.modeling.xgboost import XGBClassifier, XGBRegressor
from snowflake.ml.modeling.linear_model import LinearRegression
import snowflake.snowpark.functions as F
from snowflake.snowpark.types import StructType, StructField, DoubleType, StringType, DecimalType
from snowflake.ml.modeling.pipeline import Pipeline
from snowflake.ml.modeling.metrics.correlation import correlation

In [2]:
from fosforml.model_manager.snowflakesession import get_session
session = get_session()
session

<snowflake.snowpark.session.Session at 0x7f528881d360>

In [None]:
diamonds = pd.read_csv("Notebooks/Notebooks/Notebook_Upload_test/diamonds.csv")

FileNotFoundError: [Errno 2] No such file or directory: 'Notebooks/Notebooks/Notebook_Upload_test/diamonds.csv'

In [4]:
diamonds

Unnamed: 0.1,Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
0,1,0.23,Ideal,E,SI2,61.5,55.0,326,3.95,3.98,2.43
1,2,0.21,Premium,E,SI1,59.8,61.0,326,3.89,3.84,2.31
2,3,0.23,Good,E,VS1,56.9,65.0,327,4.05,4.07,2.31
3,4,0.29,Premium,I,VS2,62.4,58.0,334,4.20,4.23,2.63
4,5,0.31,Good,J,SI2,63.3,58.0,335,4.34,4.35,2.75
...,...,...,...,...,...,...,...,...,...,...,...
53935,53936,0.72,Ideal,D,SI1,60.8,57.0,2757,5.75,5.76,3.50
53936,53937,0.72,Good,D,SI1,63.1,55.0,2757,5.69,5.75,3.61
53937,53938,0.70,Very Good,D,SI1,62.8,60.0,2757,5.66,5.68,3.56
53938,53939,0.86,Premium,H,SI2,61.0,58.0,2757,6.15,6.12,3.74


In [5]:
diamonds_df = session.create_dataframe(diamonds)

In [10]:
for colname in diamonds_df.columns:
    if colname == "TABLE":
        new_colname = "TABLE_PCT"
    else:
        new_colname = str.upper(colname)
    diamonds_df = diamonds_df.with_column_renamed(colname, new_colname)

In [11]:
def fix_values(column):
    return F.upper(F.regexp_replace(F.col(column), '[^a-zA-Z0-9]+', '_'))

In [12]:
for col in ["CUT"]:
    diamonds_df = diamonds_df.with_column(col, fix_values(col))

In [13]:
diamonds_df.show()

-------------------------------------------------------------------------------------------------------------------
|"UNNAMED: 0"  |"CARAT"  |"COLOR"  |"CLARITY"  |"DEPTH"  |"TABLE_PCT"  |"PRICE"  |"X"   |"Y"   |"Z"   |"CUT"      |
-------------------------------------------------------------------------------------------------------------------
|1             |0.23     |E        |SI2        |61.5     |55.0         |326      |3.95  |3.98  |2.43  |IDEAL      |
|2             |0.21     |E        |SI1        |59.8     |61.0         |326      |3.89  |3.84  |2.31  |PREMIUM    |
|3             |0.23     |E        |VS1        |56.9     |65.0         |327      |4.05  |4.07  |2.31  |GOOD       |
|4             |0.29     |I        |VS2        |62.4     |58.0         |334      |4.2   |4.23  |2.63  |PREMIUM    |
|5             |0.31     |J        |SI2        |63.3     |58.0         |335      |4.34  |4.35  |2.75  |GOOD       |
|6             |0.24     |J        |VVS2       |62.8     |57.0         |

In [14]:
diamonds_df = diamonds_df.drop("UNNAMED: 0")

In [15]:
diamonds_df.show()

----------------------------------------------------------------------------------------------------
|"CARAT"  |"COLOR"  |"CLARITY"  |"DEPTH"  |"TABLE_PCT"  |"PRICE"  |"X"   |"Y"   |"Z"   |"CUT"      |
----------------------------------------------------------------------------------------------------
|0.23     |E        |SI2        |61.5     |55.0         |326      |3.95  |3.98  |2.43  |IDEAL      |
|0.21     |E        |SI1        |59.8     |61.0         |326      |3.89  |3.84  |2.31  |PREMIUM    |
|0.23     |E        |VS1        |56.9     |65.0         |327      |4.05  |4.07  |2.31  |GOOD       |
|0.29     |I        |VS2        |62.4     |58.0         |334      |4.2   |4.23  |2.63  |PREMIUM    |
|0.31     |J        |SI2        |63.3     |58.0         |335      |4.34  |4.35  |2.75  |GOOD       |
|0.24     |J        |VVS2       |62.8     |57.0         |336      |3.94  |3.96  |2.48  |VERY_GOOD  |
|0.24     |I        |VVS1       |62.3     |57.0         |336      |3.95  |3.98  |2.47  |VER

In [16]:
for colname in ["CARAT", "DEPTH", "TABLE_PCT", "X", "Y", "Z"]:
    diamonds_df = diamonds_df.with_column(colname, diamonds_df[colname].cast(DoubleType()))

In [17]:
diamonds_df.show()

----------------------------------------------------------------------------------------------------
|"COLOR"  |"CLARITY"  |"PRICE"  |"CUT"      |"CARAT"  |"DEPTH"  |"TABLE_PCT"  |"X"   |"Y"   |"Z"   |
----------------------------------------------------------------------------------------------------
|E        |SI2        |326      |IDEAL      |0.23     |61.5     |55.0         |3.95  |3.98  |2.43  |
|E        |SI1        |326      |PREMIUM    |0.21     |59.8     |61.0         |3.89  |3.84  |2.31  |
|E        |VS1        |327      |GOOD       |0.23     |56.9     |65.0         |4.05  |4.07  |2.31  |
|I        |VS2        |334      |PREMIUM    |0.29     |62.4     |58.0         |4.2   |4.23  |2.63  |
|J        |SI2        |335      |GOOD       |0.31     |63.3     |58.0         |4.34  |4.35  |2.75  |
|J        |VVS2       |336      |VERY_GOOD  |0.24     |62.8     |57.0         |3.94  |3.96  |2.48  |
|I        |VVS1       |336      |VERY_GOOD  |0.24     |62.3     |57.0         |3.95  |3.98 

In [18]:
diamonds_df.write.save_as_table('DIAMONDS_CLEAN_DATA', mode='overwrite')

In [19]:
import snowflake.ml.modeling.preprocessing as snowml
from snowflake.ml.modeling.pipeline import Pipeline
from snowflake.ml.modeling.metrics.correlation import correlation
import matplotlib.pyplot as plt

In [20]:
#Normalize carat column
snowml_mns = snowml.MinMaxScaler(input_cols=["CARAT"], output_cols=["CARAT_NORM"])
normalized_diamonds_df = snowml_mns.fit(diamonds_df).transform(diamonds_df)

In [21]:
#Reduce number of decimals
new_col = normalized_diamonds_df.col("CARAT_NORM").cast(DecimalType(7, 6))
normalized_diamonds_df = normalized_diamonds_df.with_column("CARAT_NORM", new_col)
normalized_diamonds_df.show()

-------------------------------------------------------------------------------------------------------------------
|"COLOR"  |"CLARITY"  |"PRICE"  |"CUT"      |"CARAT"  |"DEPTH"  |"TABLE_PCT"  |"X"   |"Y"   |"Z"   |"CARAT_NORM"  |
-------------------------------------------------------------------------------------------------------------------
|E        |SI2        |326      |IDEAL      |0.23     |61.5     |55.0         |3.95  |3.98  |2.43  |0.006237      |
|E        |SI1        |326      |PREMIUM    |0.21     |59.8     |61.0         |3.89  |3.84  |2.31  |0.002079      |
|E        |VS1        |327      |GOOD       |0.23     |56.9     |65.0         |4.05  |4.07  |2.31  |0.006237      |
|I        |VS2        |334      |PREMIUM    |0.29     |62.4     |58.0         |4.2   |4.23  |2.63  |0.018711      |
|J        |SI2        |335      |GOOD       |0.31     |63.3     |58.0         |4.34  |4.35  |2.75  |0.022869      |
|J        |VVS2       |336      |VERY_GOOD  |0.24     |62.8     |57.0   

In [22]:
#Using ordinal encoder to transform color and clarity columns from categorical to numerical values
categories = {
    "CUT": np.array(["IDEAL", "PREMIUM", "VERY_GOOD", "GOOD", "FAIR"]),
    "CLARITY": np.array(["IF", "VVS1", "VVS2", "VS1", "VS2", "SI1", "SI2", "I1", "I2", "I3"]),
}
snowml_oe = snowml.OrdinalEncoder(input_cols=["CUT", "CLARITY"], output_cols=["CUT_OE", "CLARITY_OE"], categories=categories)
ord_encoded_diamonds_df = snowml_oe.fit(normalized_diamonds_df).transform(normalized_diamonds_df)
print(snowml_oe._state_pandas)
ord_encoded_diamonds_df.show()

   _COLUMN_NAME  _CATEGORY  _INDEX
0           CUT      IDEAL       0
1           CUT    PREMIUM       1
2           CUT  VERY_GOOD       2
3           CUT       GOOD       3
4           CUT       FAIR       4
5       CLARITY         IF       0
6       CLARITY       VVS1       1
7       CLARITY       VVS2       2
8       CLARITY        VS1       3
9       CLARITY        VS2       4
10      CLARITY        SI1       5
11      CLARITY        SI2       6
12      CLARITY         I1       7
13      CLARITY         I2       8
14      CLARITY         I3       9
---------------------------------------------------------------------------------------------------------------------------------------------
|"CUT_OE"  |"CLARITY_OE"  |"COLOR"  |"CLARITY"  |"PRICE"  |"CUT"      |"CARAT"  |"DEPTH"  |"TABLE_PCT"  |"X"   |"Y"   |"Z"   |"CARAT_NORM"  |
---------------------------------------------------------------------------------------------------------------------------------------------
|0.0       |6.

In [23]:
#Using OneHotEncoding for categorical columns
snowml_ohe = snowml.OneHotEncoder(input_cols=["CUT", "COLOR", "CLARITY"], output_cols=["CUT_OHE", "COLOR_OHE", "CLARITY_OHE"])
transformed_diamonds_df = snowml_ohe.fit(ord_encoded_diamonds_df).transform(ord_encoded_diamonds_df)
np.array(transformed_diamonds_df.columns)

array(['CUT_OHE_FAIR', 'CUT_OHE_GOOD', 'CUT_OHE_IDEAL', 'CUT_OHE_PREMIUM',
       'CUT_OHE_VERY_GOOD', 'COLOR_OHE_D', 'COLOR_OHE_E', 'COLOR_OHE_F',
       'COLOR_OHE_G', 'COLOR_OHE_H', 'COLOR_OHE_I', 'COLOR_OHE_J',
       'CLARITY_OHE_I1', 'CLARITY_OHE_IF', 'CLARITY_OHE_SI1',
       'CLARITY_OHE_SI2', 'CLARITY_OHE_VS1', 'CLARITY_OHE_VS2',
       'CLARITY_OHE_VVS1', 'CLARITY_OHE_VVS2', 'CUT_OE', 'CLARITY_OE',
       'COLOR', 'CLARITY', 'PRICE', 'CUT', 'CARAT', 'DEPTH', 'TABLE_PCT',
       'X', 'Y', 'Z', 'CARAT_NORM'], dtype='<U17')

In [24]:
#Categorize all the features for processing
CATEGORICAL_COLUMNS = ["CUT", "COLOR", "CLARITY"]
CATEGORICAL_COLUMNS_OE = ["CUT_OE", "COLOR_OE", "CLARITY_OE"]
NUMERICAL_COLUMNS = ["CARAT", "DEPTH", "TABLE_PCT", "X", "Y", "Z"]

In [25]:
categories = {
    "CUT": np.array(["IDEAL", "PREMIUM", "VERY_GOOD", "GOOD", "FAIR"]),
    "CLARITY": np.array(["IF", "VVS1", "VVS2", "VS1", "VS2", "SI1", "SI2", "I1", "I2", "I3"]),
    "COLOR": np.array(['D', 'E', 'F', 'G', 'H', 'I', 'J']),
}

In [34]:
preprocessing_pipeline=Pipeline(
    steps=[
            (
                "OE",
                snowml.OrdinalEncoder(
                    input_cols=CATEGORICAL_COLUMNS,
                    output_cols=CATEGORICAL_COLUMNS_OE,
                    categories=categories,
                )
            ),
            (
                "MMS",
                snowml.MinMaxScaler(
                    clip=True,
                    input_cols=NUMERICAL_COLUMNS,
                    output_cols=NUMERICAL_COLUMNS,
                    )
            )    
        ]
)

In [35]:
transformed_diamonds_df = preprocessing_pipeline.fit(diamonds_df).transform(diamonds_df)
transformed_diamonds_df.show()

-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
|"CARAT"               |"DEPTH"              |"TABLE_PCT"          |"X"                  |"Y"                  |"Z"                  |"CUT_OE"  |"COLOR_OE"  |"CLARITY_OE"  |"COLOR"  |"CLARITY"  |"PRICE"  |"CUT"      |
-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
|0.006237006237006237  |0.5138888888888888   |0.23076923076923073  |0.3677839851024209   |0.06757215619694397  |0.07641509433962265  |0.0       |1.0         |6.0           |E        |SI2        |326      |IDEAL      |
|0.002079002079002079  |0.46666666666666656  |0.34615384615384615  |0.36219739292364994  |0.06519524617996604  |0.07264150943396

In [39]:
from snowflake.ml.modeling.xgboost import XGBRegressor
from snowflake.ml.modeling.model_selection import GridSearchCV
from snowflake.ml.modeling.metrics import mean_absolute_percentage_error

In [40]:
#Categorize all the features for processing
CATEGORICAL_COLUMNS = ["CUT", "COLOR", "CLARITY"]
CATEGORICAL_COLUMNS_OE = ["CUT_OE", "COLOR_OE", "CLARITY_OE"]
NUMERICAL_COLUMNS = ["CARAT", "DEPTH", "TABLE_PCT", "X", "Y", "Z"]

LABEL_COLUMNS = ['PRICE']
OUTPUT_COLUMNS = ['PREDICTED_PRICE']

In [41]:
#Split training and testing datasets
diamonds_train_df, diamonds_test_df = diamonds_df.random_split(weights=[0.9,0.1], seed=0)

#Run train and test datasets through preprocessing pipeline
train_df = preprocessing_pipeline.fit(diamonds_train_df).transform(diamonds_train_df)
test_df = preprocessing_pipeline.transform(diamonds_test_df)

In [53]:
#Define XGBRegressor
# regressor = XGBRegressor(
#     input_cols=CATEGORICAL_COLUMNS_OE+NUMERICAL_COLUMNS,
#     label_cols=LABEL_COLUMNS,
#     output_cols=OUTPUT_COLUMNS
# )

regressor = LinearRegression(
    input_cols=CATEGORICAL_COLUMNS_OE+NUMERICAL_COLUMNS,
    label_cols=LABEL_COLUMNS,
    output_cols=OUTPUT_COLUMNS
)

In [45]:
!pip install scikit-learn

You should consider upgrading via the 'pip install --upgrade pip' command.[0m


In [54]:
#Model training
regressor.fit(train_df)

<snowflake.ml.modeling.linear_model.linear_regression.LinearRegression at 0x7fd914793430>

In [55]:
#Model Prediction
result = regressor.predict(test_df)

The version of package 'scikit-learn' in the local environment is 1.3.2, which does not fit the criteria for the requirement 'scikit-learn==1.3.0'. Your UDF might not work when the package version is different between the server and your local environment.


In [56]:
#Passing pandas dataframe to snowflake's predict function
regressor.predict(test_df[CATEGORICAL_COLUMNS_OE+NUMERICAL_COLUMNS].to_pandas())

Unnamed: 0,CUT_OE,COLOR_OE,CLARITY_OE,CARAT,DEPTH,TABLE_PCT,X,Y,Z,PREDICTED_PRICE
0,2.0,6.0,5.0,0.020790,0.547222,0.307692,0.391993,0.072496,0.083648,-1985.185007
1,2.0,6.0,5.0,0.022869,0.455556,0.365385,0.408752,0.075212,0.082390,-1845.470888
2,1.0,5.0,3.0,0.008316,0.541667,0.269231,0.369646,0.066893,0.077673,-908.831621
3,2.0,0.0,4.0,0.006237,0.486111,0.346154,0.368715,0.067402,0.075472,39.567270
4,1.0,4.0,5.0,0.020790,0.552778,0.307692,0.398510,0.071986,0.084277,-1296.692120
...,...,...,...,...,...,...,...,...,...,...
5407,2.0,1.0,5.0,0.103950,0.505556,0.269231,0.528864,0.097284,0.109748,2864.664407
5408,1.0,5.0,3.0,0.116424,0.452778,0.365385,0.552142,0.099321,0.109748,3140.751336
5409,2.0,1.0,4.0,0.103950,0.550000,0.326923,0.520484,0.095925,0.111006,3229.459362
5410,0.0,3.0,3.0,0.106029,0.511111,0.250000,0.536313,0.097284,0.111006,3511.058215


In [58]:
mape = mean_absolute_percentage_error(
    df=result,
    y_true_col_names="PRICE",
    y_pred_col_names="PREDICTED_PRICE"
)
result.select("PRICE","PREDICTED_PRICE").show()
print(f"Mean absolute percentage error is: {mape}")

---------------------------------
|"PRICE"  |"PREDICTED_PRICE"    |
---------------------------------
|351      |-1985.1850069018783  |
|353      |-1845.4708883064613  |
|355      |-908.8316211673255   |
|357      |39.56727003622473    |
|554      |-1296.6921202874037  |
|554      |-48.12114064960588   |
|2757     |2934.9474331954298   |
|2759     |3377.154320464458    |
|2759     |3526.044279321387    |
|2762     |3424.3370867229837   |
---------------------------------

Mean absolute percentage error is: 0.44709608200892326


In [59]:
from fosforml import register_model

In [61]:
register_model(
  model_obj=regressor,
  session=session,
  source="Notebook",
  name="Linear_Regression_Model",
  snowflake_df=result,
  dataset_name="Diamond_Regression_dataset",
  dataset_source="SnowflakeDataset",
  description="This is a test model for Linear Regression",
  flavour="snowflake",
  model_type="regression",
  conda_dependencies=["scikit-learn==1.3.2"]
)

Got error object of type 'NoneType' has no len() when trying to read default values from function: <function explained_variance_score.<locals>.explained_variance_score_anon_sproc at 0x7fd9145ac550>. Proceeding without creating optional arguments


Calculating build time metrics

Progress: ██████████████████                                                     25.0%


The version of package 'scikit-learn' in the local environment is 1.3.2, which does not fit the criteria for the requirement 'scikit-learn==1.3.*'. Your UDF might not work when the package version is different between the server and your local environment.
The version of package 'snowflake-snowpark-python' in the local environment is 1.22.1, which does not fit the criteria for the requirement 'snowflake-snowpark-python'. Your UDF might not work when the package version is different between the server and your local environment.


Calculating build time metrics

Progress: ███████████████████████████████████                                    50.0%
Calculating build time metrics

Progress: ████████████████████████████████████████████████████                   75.0%
Calculating build time metrics

Progress: ██████████████████████████████████████████████████████████████████████ 100.0%


"Model 'MODEL_B29E775A_4228_4D47_ADBB_0840C0AE011E_FDC_LINEAR_REGRESSION_MODEL' registered successfully."

In [62]:
session.sql("SELECT TO_CHAR(CURRENT_TIMESTAMP,'YYYY-MM-DD HH12:MI:SS PM')").to_pandas().squeeze()

'2024-09-27 01:21:34 PM'

In [63]:
current_data = regressor.predict(test_df)
predict_col = list(regressor.predict(test_df).to_pandas()['PREDICTED_PRICE'].to_numpy())
from snowflake.snowpark.functions import lit
current = None
for i in range(len(predict_col)):
    current = current_data.with_column('PREDICTED_PRICE',lit(predict_col[i]))



In [64]:
current_pandas = current.to_pandas()
current_pandas.head()

Unnamed: 0,CARAT,DEPTH,TABLE_PCT,X,Y,Z,CUT_OE,COLOR_OE,CLARITY_OE,COLOR,CLARITY,PRICE,CUT,PREDICTED_PRICE
0,0.02079,0.547222,0.307692,0.391993,0.072496,0.083648,2.0,6.0,5.0,J,SI1,351,VERY_GOOD,3511.058215
1,0.022869,0.455556,0.365385,0.408752,0.075212,0.08239,2.0,6.0,5.0,J,SI1,353,VERY_GOOD,3511.058215
2,0.008316,0.541667,0.269231,0.369646,0.066893,0.077673,1.0,5.0,3.0,I,VS1,355,PREMIUM,3511.058215
3,0.006237,0.486111,0.346154,0.368715,0.067402,0.075472,2.0,0.0,4.0,D,VS2,357,VERY_GOOD,3511.058215
4,0.02079,0.552778,0.307692,0.39851,0.071986,0.084277,1.0,4.0,5.0,H,SI1,554,PREMIUM,3511.058215


In [66]:
reference_data = regressor.predict(test_df)
predict_col = list(regressor.predict(test_df).to_pandas()['PREDICTED_PRICE'].to_numpy())
from snowflake.snowpark.functions import lit
reference = None
for i in range(len(predict_col)):
    reference = reference_data.with_column('PREDICTED_PRICE',lit(predict_col[i]))



In [67]:
reference_pandas = reference.to_pandas()
reference_pandas.head()

Unnamed: 0,CARAT,DEPTH,TABLE_PCT,X,Y,Z,CUT_OE,COLOR_OE,CLARITY_OE,COLOR,CLARITY,PRICE,CUT,PREDICTED_PRICE
0,0.02079,0.547222,0.307692,0.391993,0.072496,0.083648,2.0,6.0,5.0,J,SI1,351,VERY_GOOD,4314.199339
1,0.022869,0.455556,0.365385,0.408752,0.075212,0.08239,2.0,6.0,5.0,J,SI1,353,VERY_GOOD,4314.199339
2,0.008316,0.541667,0.269231,0.369646,0.066893,0.077673,1.0,5.0,3.0,I,VS1,355,PREMIUM,4314.199339
3,0.006237,0.486111,0.346154,0.368715,0.067402,0.075472,2.0,0.0,4.0,D,VS2,357,VERY_GOOD,4314.199339
4,0.02079,0.552778,0.307692,0.39851,0.071986,0.084277,1.0,4.0,5.0,H,SI1,554,PREMIUM,4314.199339


In [68]:
inference_pandas_data = test_df.to_pandas()

In [69]:
session.write_pandas(reference_pandas,table_name="LINEAR_REG_REFERENCE_DATA",overwrite=True).show()

---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
|"CARAT"               |"DEPTH"              |"TABLE_PCT"          |"X"                  |"Y"                  |"Z"                  |"CUT_OE"  |"COLOR_OE"  |"CLARITY_OE"  |"COLOR"  |"CLARITY"  |"PRICE"  |"CUT"      |"PREDICTED_PRICE"  |
---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
|0.02079002079002079   |0.5472222222222223   |0.3076923076923076   |0.3919925512104283   |0.07249575551782682  |0.08364779874213837  |2.0       |6.0         |5.0           |J        |SI1        |351      |VERY_GOOD  |4314.199339116703  |
|0.02286902286902287   |0.4555555555555555   |0.

In [70]:
session.write_pandas(current_pandas,table_name="LINEAR_REG_CURRENT_DATA",overwrite=True).show()

---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
|"CARAT"               |"DEPTH"              |"TABLE_PCT"          |"X"                  |"Y"                  |"Z"                  |"CUT_OE"  |"COLOR_OE"  |"CLARITY_OE"  |"COLOR"  |"CLARITY"  |"PRICE"  |"CUT"      |"PREDICTED_PRICE"  |
---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
|0.02079002079002079   |0.5472222222222223   |0.3076923076923076   |0.3919925512104283   |0.07249575551782682  |0.08364779874213837  |2.0       |6.0         |5.0           |J        |SI1        |351      |VERY_GOOD  |3511.05821533032   |
|0.02286902286902287   |0.4555555555555555   |0.

In [71]:
session.write_pandas(inference_pandas_data,table_name="LINEAR_REG_INFERENCE_DATA",overwrite=True).show()

-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
|"CARAT"               |"DEPTH"              |"TABLE_PCT"          |"X"                  |"Y"                  |"Z"                  |"CUT_OE"  |"COLOR_OE"  |"CLARITY_OE"  |"COLOR"  |"CLARITY"  |"PRICE"  |"CUT"      |
-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
|0.02079002079002079   |0.5472222222222223   |0.3076923076923076   |0.3919925512104283   |0.07249575551782682  |0.08364779874213837  |2.0       |6.0         |5.0           |J        |SI1        |351      |VERY_GOOD  |
|0.02286902286902287   |0.4555555555555555   |0.3653846153846153   |0.40875232774674114  |0.07521222410865874  |0.08238993710691