## Migrate schema from Snowflake to Synapse SQL Dedicated Pool

---

### This notebook demonstrates how to connect to Snowflake, read metadata from the INFORMATION_SCHEMA schema, gather the list of tables for a given schema, and move those tables to a Synapse SQL dedicated pool.

<ul>
<li> Define connection source </li>
<li> Specify connection options for the Snowflake instance </li>
<li> Read Snowflake Information_schema.tables to compile list of tables for our schema
    <ul>
    <li> Read each Snowflake table into a Spark DataFrame </li>
    <li> Write the DataFrame to a table in the Synapse SQL dedicated pool </li>
</ul>
    </li>
</ul>    






In [1]:
// To use Snowflake as a data source in Spark, pass the Snowflake connector class name to the .format option.
// Prerequisite: add spark-snowflake_2.12-2.9.0-spark_3.1.jar and snowflake-jdbc-3.13.6.jar to the workspace packages as well as to the cluster/session packages.
// You can download those jar files from https://mvnrepository.com/artifact/net.snowflake/spark-snowflake?repo=snowflakedb and https://repo1.maven.org/maven2/net/snowflake/snowflake-jdbc respectively.
// Instructions for adding custom jars to cluster/session packages: https://docs.microsoft.com/en-us/azure/synapse-analytics/spark/apache-spark-manage-scala-packages

// The connector's Utils object already provides SNOWFLAKE_SOURCE_NAME ("net.snowflake.spark.snowflake"),
// so the constant is imported rather than re-declared: a second `val` with the same name would only
// shadow the import with an identical value.
import net.snowflake.spark.snowflake.Utils.SNOWFLAKE_SOURCE_NAME

StatementMeta(DemoCluster, 17, 1, Finished, Available)

In [None]:
// Default parameter: the schema name used when building the target table names
// in the Synapse SQL dedicated pool. Replace the placeholder with a real schema.

val sfschema: String = "existing schema"

In [79]:
// To avoid exposing Snowflake credentials, keep the user, password and account as secrets in an Azure Key Vault (AKV) in your subscription.
// See https://docs.microsoft.com/en-us/azure/data-factory/store-credentials-in-key-vault for how to set up secrets in Azure Key Vault.
// Note that the Key Vault has to be linked to your Synapse workspace.
// The mssparkutils package lets you retrieve your secrets from AKV.

val (user, password, account) = {
  // Placeholders: replace with your Key Vault name and the name of its linked service.
  val keyVault = "Azure Key Vault name "
  val linkedService = "linked service name"

  // Looks up one secret by name in the Key Vault above.
  def secret(name: String): String =
    mssparkutils.credentials.getSecret(keyVault, name, linkedService)

  // Tuple elements are evaluated left to right: user, then password, then account.
  (
    secret("secret name for user"),
    secret("secret name for password"),
    secret("secret name for account")
  )
}

// Snowflake endpoint built from the account identifier retrieved above.
val account_URL = s"https://$account.azure.snowflakecomputing.com"

StatementMeta(TestS3Cluster, 18, 2, Finished, Available)

In [80]:
// Connector options for the PUBLIC schema of the TESTDB database in Snowflake.
// These options are used when reading the tables that are being migrated.

val sfoptions: Map[String, String] = Map(
  "sfUrl"       -> account_URL,
  "sfUser"      -> user,
  "sfPassword"  -> password,
  "sfDatabase"  -> "TESTDB",
  "sfSchema"    -> "PUBLIC",
  "sfWarehouse" -> "COMPUTE_WH"
)

StatementMeta(TestS3Cluster, 18, 3, Finished, Available)

sfoptions: scala.collection.immutable.Map[String,String] = Map(sfUrl -> https://yg58220.east-us-2.azure.snowflakecomputing.com, sfSchema -> PUBLIC, sfPassword -> <password>, sfUser -> <user>, sfWarehouse -> <warehouse>, sfDatabase -> TESTDB)


In [81]:
// Connector options for the INFORMATION_SCHEMA schema, which holds the database metadata in Snowflake.
// Everything except the schema (URL, user, password, database, warehouse) is the same as in `sfoptions`,
// so the map is derived from it: only "sfSchema" is overridden and the two configurations cannot drift apart.

val sfoptions1: Map[String, String] = sfoptions.updated("sfSchema", "INFORMATION_SCHEMA")


StatementMeta(TestS3Cluster, 18, 4, Finished, Available)

sfoptions1: scala.collection.immutable.Map[String,String] = Map(sfUrl -> https://yg58220.east-us-2.azure.snowflakecomputing.com, sfSchema -> INFORMATION_SCHEMA, sfPassword -> <password>, sfUser -> <user>, sfWarehouse -> <warehouse>, sfDatabase -> TESTDB)


In [82]:
// Load INFORMATION_SCHEMA.TABLES into a DataFrame; it is the source for the list of tables in our schema.

val df_tl = spark.read
  .format(SNOWFLAKE_SOURCE_NAME)
  .options(sfoptions1)
  .option("dbtable", "TABLES")
  .load()
//display(df_tl)

StatementMeta(TestS3Cluster, 18, 5, Finished, Available)

StructuredStream-spark package version: 3.0.0-2.1.1
df_tl: org.apache.spark.sql.DataFrame = [TABLE_CATALOG: string, TABLE_SCHEMA: string ... 20 more fields]


In [86]:
// Collect (table_schema, table_name) pairs for the PUBLIC schema into a local array so they are easy to iterate over.
// NOTE(review): INFORMATION_SCHEMA.TABLES presumably lists views as well as base tables - confirm whether views should be migrated too.
val df_tab_list = df_tl
  .select("table_schema", "table_name")
  .where("table_schema='PUBLIC'")
  .collect()
//println(df_tab_list)

StatementMeta(TestS3Cluster, 18, 9, Finished, Available)

df_tab_list: Array[org.apache.spark.sql.Row] = Array([PUBLIC,CUSTOMERT], [PUBLIC,CUSTOMER_TEST], [PUBLIC,NATIONT], [PUBLIC,REGIONT])
[Lorg.apache.spark.sql.Row;@6d6b91a4


### Note: 
We are using the df.write.synapsesql method to populate tables in the SQL dedicated pool. If your target schema is anything other than "dbo", it needs to exist beforehand.
At the same time, the target schema must not already contain a table with the name passed to this method. Here is a stored procedure you can run to make sure that both requirements are met:

```sql
CREATE PROCEDURE set_sfschema  @schemaname sysname
AS BEGIN
    -- Prepares the schema @schemaname as a target for df.write.synapsesql:
    --   * if the schema does not exist, it is created;
    --   * otherwise every base table currently in the schema is dropped.
    -- WARNING: destructive - all existing tables in @schemaname are removed.

    -- QUOTENAME brackets the name so that schema names containing spaces or special
    -- characters are handled; NVARCHAR(300) leaves room for the quoted sysname.
    DECLARE @cr_stmt NVARCHAR(300) = N'CREATE SCHEMA ' + QUOTENAME(@schemaname);

    -- Temp tables are session-scoped, so a previous run that failed part-way can leave
    -- #temp_tbl behind; remove it so the CREATE TABLE below does not fail.
    IF OBJECT_ID('tempdb..#temp_tbl') IS NOT NULL
    BEGIN
        DROP TABLE #temp_tbl;
    END

    -- To emulate cursor processing: number the DROP statements so they can be
    -- executed one by one in the WHILE loop below.
    CREATE TABLE #temp_tbl
    WITH
     ( DISTRIBUTION = ROUND_ROBIN
      )
       AS
           SELECT  ROW_NUMBER() OVER(ORDER BY (SELECT NULL)) AS Sequence,
              table_schema , table_name ,
              'DROP TABLE ' +  quotename(table_schema) + '.' + quotename(table_name) as sql_code
              from information_schema.tables
              WHERE table_schema = @schemaname
                -- views are listed in information_schema.tables too, but DROP TABLE fails on them
                AND table_type = 'BASE TABLE';

    DECLARE @nbr_statements INT = (SELECT COUNT(*) FROM #temp_tbl)
    ,       @i INT = 1;

    IF (0 = (SELECT COUNT(*) FROM sys.schemas WHERE name = @schemaname))
      BEGIN
          EXEC sp_executesql @cr_stmt;
      END
    ELSE
      BEGIN
          WHILE   @i <= @nbr_statements
          BEGIN
              -- NVARCHAR(4000): 'DROP TABLE ' plus two quoted identifiers can be far longer
              -- than 60 characters, and a truncated statement would fail or hit the wrong object.
              DECLARE @sql_code NVARCHAR(4000) = (SELECT sql_code FROM #temp_tbl WHERE  Sequence = @i);
              EXEC sp_executesql @sql_code;
              SET     @i += 1;
          END
      END

    DROP TABLE #temp_tbl;
END
GO
```



In [87]:
// Migrate every table listed in df_tab_list: read it from Snowflake into a DataFrame,
// then write that DataFrame to the Synapse SQL dedicated pool.

for (row <- df_tab_list) {
  // Index 1 is table_name (index 0 is table_schema), matching the column order of the select that built df_tab_list.
  val tname = row.getString(1)
  //println(tname)

  val sourceDf = spark.read
    .format(SNOWFLAKE_SOURCE_NAME)
    .options(sfoptions)
    .option("dbtable", tname)
    .load()

  // Three-part target name: <dedicated pool>.<schema>.<table>
  val target_table = s"SQLdedpool1.$sfschema.$tname"
  println(target_table)

  sourceDf.write.synapsesql(target_table, Constants.INTERNAL)
}

StatementMeta(TestS3Cluster, 18, 10, Finished, Available)

CUSTOMERT
CUSTOMER_TEST
NATIONT
REGIONT
