# Establish Secure Connection to Snowflake

In [1]:
# Snowpark for Python
import snowflake.snowpark
from snowflake.snowpark.session import Session
from snowflake.snowpark.functions import udf, col, call_udf
from snowflake.snowpark.types import *

# Other
import sys, string, io, os, math
import zipfile
import pickle
import pandas as pd
import json
from cachetools import cached

# Show up to 50 columns before pandas switches to the truncated view (default is 20).
pd.options.display.max_columns = 50
# Allow up to 150 characters per cell so more of the raw text is visible (default is 50).
pd.options.display.max_colwidth = 150

In [2]:
# Connection settings for the Snowflake session. Each user- or
# environment-specific value can be overridden through an environment
# variable, so the notebook does not have to be edited per user; the defaults
# reproduce the original hard-coded settings.
connection_parameters = {
    "account": os.environ.get("SNOWFLAKE_ACCOUNT", 'wne'),
    "user": os.environ.get("SNOWFLAKE_USER", 'an.jiang@imaginelearning.com'),
    # Browser-based SSO login: no password is stored in the notebook.
    "authenticator": 'externalbrowser',
    "warehouse": os.environ.get("SNOWFLAKE_WAREHOUSE", 'SQL_WH'),
    "database": os.environ.get("SNOWFLAKE_DATABASE", 'weld_north_test'),
    "schema": os.environ.get("SNOWFLAKE_SCHEMA", 'zendesk'),
}

# The role is optional: it is only passed when SNOWFLAKE_ROLE is set, which
# matches the original configuration where no role was supplied.
if "SNOWFLAKE_ROLE" in os.environ:
    connection_parameters["role"] = os.environ["SNOWFLAKE_ROLE"]

In [3]:
# Open the Snowpark session from the connection settings, then print the
# warehouse / database / schema the session actually landed in.
session = Session.builder.configs(connection_parameters).create()
print(
    session.sql(
        "select current_warehouse(), current_database(), current_schema()"
    ).collect()
)

Initiating login request with your identity provider. A browser window should have opened for you to complete the login. If you can't see it, check existing browser windows, or your OS settings. Press CTRL+C to abort and try again...
[Row(CURRENT_WAREHOUSE()='SQL_WH', CURRENT_DATABASE()='WELD_NORTH_TEST', CURRENT_SCHEMA()='ZENDESK')]


In [4]:
# Reference the TICKETS table in the session's current schema and pull a
# two-row sample into pandas for a quick look.
df = session.table('tickets')
df.limit(2).to_pandas()

Unnamed: 0,ID,URL,EXTERNAL_ID,TYPE,SUBJECT,RAW_SUBJECT,DESCRIPTION,PRIORITY,STATUS,RECIPIENT,REQUESTER_ID,SUBMITTER_ID,ASSIGNEE_ID,ORGANIZATION_ID,GROUP_ID,COLLABORATOR_IDS,FOLLOWER_IDS,EMAIL_CC_IDS,FORUM_TOPIC_ID,PROBLEM_ID,HAS_INCIDENTS,DUE_AT,TAGS,VIA,CUSTOM_FIELDS,SATISFACTION_RATING,SHARING_AGREEMENT_IDS,FOLLOWUP_IDS,TICKET_FORM_ID,BRAND_ID,ALLOW_CHANNELBACK,ALLOW_ATTACHMENTS,IS_PUBLIC,CREATED_AT,UPDATED_AT,ROW_CHECKSUM
0,2056793,https://edgenuity.zendesk.com/api/v2/tickets/2056793.json,,question,Chat with Visitor 51763701,Chat with Visitor 51763701,Chat started: 2022-05-03 03:07 PM UTC\nServed by: Tutor-Ms. Jordan\n\nIP: 73.56.221.88\nUser Agent: Mozilla/5.0 (X11; CrOS x86_64 14526.57.0) Appl...,normal,closed,,5843480983575,5843480983575,1523684259441,,43851087,[],[],[],,,False,NaT,"[\n ""alliance_partners"",\n ""chat_alliancetutors"",\n ""cleanup_inactive_chats_ran"",\n ""concept_coaching_tutoring"",\n ""no_csat"",\n ""zopim_chat""...","{\n ""channel"": ""chat"",\n ""source"": {\n ""from"": null,\n ""to"": null\n }\n}","[\n {\n ""id"": 360053997313\n },\n {\n ""id"": 360053495554\n },\n {\n ""id"": 360046824194\n },\n {\n ""id"": 360052996354\n },\n {...","{\n ""score"": ""unoffered""\n}",[],[],360001090393,360004307694,False,True,True,2022-05-03 15:10:30,2022-05-07 16:02:16,bff18e65220e4ace4b4727da92136f98e44ba6433111a44b161a6c2b2707fd8b
1,2056806,https://edgenuity.zendesk.com/api/v2/tickets/2056806.json,,question,Chat with YASMINE TORRES,Chat with YASMINE TORRES,Chat started: 2022-05-03 03:14 PM UTC\nServed by: Tutor- Ms. Reasha \n\nIP: 204.108.105.10\nUser Agent: Mozilla/5.0 (X11; CrOS x86_64 14526.89.0) ...,normal,closed,,5843578529687,5843578529687,1523315607122,,43851087,[],[],[],,,False,NaT,"[\n ""activitynamewarm-up"",\n ""cc_end_1"",\n ""cc_ended_chat_solved"",\n ""concept_coaching_tutoring"",\n ""contact_type_student"",\n ""courseid24851...","{\n ""channel"": ""chat"",\n ""source"": {\n ""from"": null,\n ""to"": null\n }\n}","[\n {\n ""id"": 360053997313\n },\n {\n ""id"": 360053495554\n },\n {\n ""id"": 360046824194\n },\n {\n ""id"": 360052996354\n },\n {...","{\n ""score"": ""unoffered""\n}",[],[],360001090393,360002067674,False,True,True,2022-05-03 15:14:50,2022-05-07 17:02:33,fe8d178e161074b1e5d1aa2649969eeaa3063189861574a760c04283147446b7


In [5]:
# df.toPandas().to_csv('zendesk_tickets.csv.gz', header=True, index=False, compression='gzip')

In [12]:
# Load a local CSV copy of the Zendesk tickets into pandas.
# NOTE(review): this is an absolute, user-specific Windows path, so the cell
# only runs on the original author's machine — consider a configurable data
# directory. The file name also differs from the gzip export commented out
# earlier in the notebook (zendesk_tickets.csv.gz); confirm which file is intended.
df_zendesk = pd.read_csv(r"C:\Users\an.jiang\src\snowpark_demo\zendesk_tickets\zendesk_tickets.csv", low_memory=False)

In [17]:
# Widen text columns further (500 characters) so long ticket descriptions are readable.
pd.options.display.max_colwidth = 500

In [18]:
# Preview the first 50 rows of the locally loaded tickets.
# NOTE(review): the rendered output contains real names, e-mail addresses,
# phone numbers and IP addresses — clear or redact it before sharing.
df_zendesk.head(50)

Unnamed: 0,ID,URL,EXTERNAL_ID,TYPE,SUBJECT,RAW_SUBJECT,DESCRIPTION,PRIORITY,STATUS,RECIPIENT,REQUESTER_ID,SUBMITTER_ID,ASSIGNEE_ID,ORGANIZATION_ID,GROUP_ID,COLLABORATOR_IDS,FOLLOWER_IDS,EMAIL_CC_IDS,FORUM_TOPIC_ID,PROBLEM_ID,HAS_INCIDENTS,DUE_AT,TAGS,VIA,CUSTOM_FIELDS,SATISFACTION_RATING,SHARING_AGREEMENT_IDS,FOLLOWUP_IDS,TICKET_FORM_ID,BRAND_ID,ALLOW_CHANNELBACK,ALLOW_ATTACHMENTS,IS_PUBLIC,CREATED_AT,UPDATED_AT,ROW_CHECKSUM
0,422372,https://edgenuity.zendesk.com/api/v2/tickets/422372.json,,,Chat with Shelly Wendt,Chat with Shelly Wendt,"Chat started: 2020-05-14 10:00 PM UTC\nServed by: Colby Beavers\n\nIP: 47.38.249.213\nUser Agent: Mozilla/5.0 (X11; CrOS x86_64 12871.76.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.103 Safari/537.36\nCountry: United States\nCity: Denton\nURL: https://r11.core.learn.edgenuity.com/Educator/home/main.aspx\n\nChat ID: 2005.1801676.Rz0DP8CcPBNhQ\n\nThe chat transcript will be appended when the agent or visitor leaves the chat.",normal,closed,,410601979913,410601979913,384912700000.0,370288100000.0,41177550.0,[],[],[],,,False,,"[\n ""cleanup_inactive_chats_ran"",\n ""did_24196"",\n ""districtid-24196"",\n ""edgenuity"",\n ""fullname-shelly_wendt"",\n ""lessonname-"",\n ""no_csat"",\n ""ow_route_1"",\n ""ps_chat_rescue"",\n ""ps_route_1"",\n ""realm_11"",\n ""schoolid-75296"",\n ""schoolname-lake_dallas_middle_school"",\n ""sessionid-27292732"",\n ""standard_school"",\n ""type-teacher"",\n ""userid-125492374"",\n ""zopim_chat"",\n ""zopim_chat_ended""\n]","{\n ""channel"": ""chat"",\n ""source"": {\n ""from"": null,\n ""to"": null\n }\n}","[\n {\n ""id"": 360031704834\n },\n {\n ""id"": 360033966853\n },\n {\n ""id"": 360000070407\n },\n {\n ""id"": 360012625933\n },\n {\n ""id"": 81005327\n },\n {\n ""id"": 360007237394\n },\n {\n ""id"": 360033430294\n },\n {\n ""id"": 360042051353\n },\n {\n ""id"": 360037053733\n },\n {\n ""id"": 360029176873\n },\n {\n ""id"": 360037354033\n },\n {\n ""id"": 360006135094\n },\n {\n ""id"": 360036061754\n },\n {\n ""id"": 360035985733\n },\n {\n ...","{\n ""score"": ""unoffered""\n}",[],[],765848,6738268,False,True,True,2020-05-14 22:34:56,2020-05-18 23:03:48,063e380963209baee129380dc9a59bfd37682ff28c0d1b81b6550330bc4ccf74
1,422475,https://edgenuity.zendesk.com/api/v2/tickets/422475.json,,,Missed Chat with Visitor 44314600,Missed Chat with Visitor 44314600,"Chat started: 2020-05-15 01:03 AM UTC\nServed by: -\n\nIP: 99.33.66.116\nUser Agent: Mozilla/5.0 (Windows NT 6.2; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.138 Safari/537.36\nCountry: United States\nCity: Carson\nURL: https://www.edgenuity.com/lausd/\n\nChat ID: 2005.1801676.Rz0xP64W6WYjq",normal,closed,,410639757713,410639757713,,,41177550.0,[],[],[],,,False,,"[\n ""assign_ps_group"",\n ""edgenuity"",\n ""no_org_email_present"",\n ""no_org_missed_ps_chat"",\n ""ow_route_1"",\n ""ps_chat_rescue"",\n ""zopim_chat"",\n ""zopim_chat_ended"",\n ""zopim_chat_missed""\n]","{\n ""channel"": ""chat"",\n ""source"": {\n ""from"": null,\n ""to"": null\n }\n}","[\n {\n ""id"": 360031704834\n },\n {\n ""id"": 360033966853\n },\n {\n ""id"": 360000070407\n },\n {\n ""id"": 360012625933\n },\n {\n ""id"": 81005327\n },\n {\n ""id"": 360007237394\n },\n {\n ""id"": 360033430294\n },\n {\n ""id"": 360042051353\n },\n {\n ""id"": 360037053733\n },\n {\n ""id"": 360029176873\n },\n {\n ""id"": 360037354033\n },\n {\n ""id"": 360006135094\n },\n {\n ""id"": 360036061754\n },\n {\n ""id"": 360035985733\n },\n {\n ...","{\n ""score"": ""offered""\n}",[],[],765848,6738268,False,True,True,2020-05-15 01:47:42,2020-05-19 04:06:10,cf10b74df7274af64224a808172cf6d75cf4f254b72da9884027d975999d7380
2,422616,https://edgenuity.zendesk.com/api/v2/tickets/422616.json,,question,reports,reports,"Chat started: 2020-05-15 01:40 PM UTC\nServed by: Luke Broesder\n\nIP: 71.92.48.162\nUser Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.138 Safari/537.36\nCountry: United States\nCity: Atlanta\nURL: https://henrycounty.owschools.com/owsoo/home?fromLogin=true\n\nChat ID: 2005.1801676.Rz41uy5pura2k\n\nThe chat transcript will be appended when the agent or visitor leaves the chat.",normal,closed,,405039840033,405039840033,378288900000.0,369922400000.0,360005600000.0,[],[],[],,,False,,"[\n ""appuserid-2609601"",\n ""appuseruuid-3ff41c86-c90e-46b0-b555-e73a0c3bcafa"",\n ""customerid-4467"",\n ""fullname-shannon_busey"",\n ""odysseyware"",\n ""odysseyware_general_inquiry"",\n ""ow"",\n ""ow_route_1"",\n ""owsoo3"",\n ""producttype-odysseyware"",\n ""schoolid-henrycountyowschoolscom"",\n ""schoolname-impact_academy"",\n ""schooluuid-dd728db2-69c0-44d5-83f0-108c12c22c33"",\n ""sessionid-6c24fa90-8f72-4b28-986e-422663bf534a"",\n ""type-teacher"",\n ""username-shannonbusey"",\n ""zopim_chat"",\n...","{\n ""channel"": ""chat"",\n ""source"": {\n ""from"": null,\n ""to"": null\n }\n}","[\n {\n ""id"": 360031704834\n },\n {\n ""id"": 360033966853\n },\n {\n ""id"": 360000070407\n },\n {\n ""id"": 360012625933\n },\n {\n ""id"": 81005327\n },\n {\n ""id"": 360007237394\n },\n {\n ""id"": 360033430294\n },\n {\n ""id"": 360042051353\n },\n {\n ""id"": 360037053733\n },\n {\n ""id"": 360029176873\n },\n {\n ""id"": 360037354033\n },\n {\n ""id"": 360006135094\n },\n {\n ""id"": 360036061754\n },\n {\n ""id"": 360035985733\n },\n {\n ...","{\n ""comment"": ""Luke was very helpful. 
He answered my questions quickly and succinctly."",\n ""id"": 364947315374,\n ""reason"": ""No reason provided"",\n ""reason_id"": 2543568,\n ""score"": ""good""\n}",[],[],360000961374,360002350134,False,True,True,2020-05-15 13:41:29,2020-05-19 19:06:06,5ddd3012d4413519cf3cd0f31b4ac5f282a1699197bba036d6cdbd5adf8510e9
3,422785,https://edgenuity.zendesk.com/api/v2/tickets/422785.json,,,Missed Chat with John Luvera,Missed Chat with John Luvera,"Chat started: 2020-05-15 03:30 PM UTC\nServed by: -\n\nIP: 73.239.73.96\nUser Agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/12.1.2 Safari/605.1.15\nCountry: United States\nCity: Bellingham\nURL: https://coupeville.owschools.com/owsoo/home?fromLogin=true\n\nChat ID: 2005.1801676.Rz4Tex9VqjuOn",normal,closed,,410680048413,410680048413,,,360005600000.0,[],[],[],,,False,,"[\n ""appuserid-1352801"",\n ""appuseruuid-e90538a2-b5d7-4a21-98c7-5e90015a89fe"",\n ""assign_ps_group"",\n ""customerid-3849"",\n ""fullname-john_luvera"",\n ""no_csat"",\n ""odysseyware"",\n ""ow_chat_rescuer"",\n ""ow_route_1"",\n ""producttype-odysseyware"",\n ""schoolid-coupevilleowschoolscom"",\n ""schoolname-coupeville_school_district_204"",\n ""schooluuid-abf10c9e-f3fe-4371-9fc8-7b97b289cb6f"",\n ""sessionid-f6df5442-3612-4bf9-93d3-4493968b872a"",\n ""solved_missed_chat"",\n ""type-admin"",\n ""user...","{\n ""channel"": ""chat"",\n ""source"": {\n ""from"": null,\n ""to"": null\n }\n}","[\n {\n ""id"": 360031704834\n },\n {\n ""id"": 360033966853\n },\n {\n ""id"": 360000070407\n },\n {\n ""id"": 360012625933\n },\n {\n ""id"": 81005327\n },\n {\n ""id"": 360007237394\n },\n {\n ""id"": 360033430294\n },\n {\n ""id"": 360042051353\n },\n {\n ""id"": 360037053733\n },\n {\n ""id"": 360029176873\n },\n {\n ""id"": 360037354033\n },\n {\n ""id"": 360006135094\n },\n {\n ""id"": 360036061754\n },\n {\n ""id"": 360035985733\n },\n {\n ...","{\n ""score"": ""unoffered""\n}",[],[],360000961374,360002350134,False,True,True,2020-05-15 15:34:17,2020-05-19 17:06:54,0cc6328620c8cd05adcca181563e131c00ca0452b5b86cf0c4b629f8449fc2f4
4,422789,https://edgenuity.zendesk.com/api/v2/tickets/422789.json,,task,"CHEROKEE CENTRAL HIGH SCHOOL, NC , End of Year Check In","CHEROKEE CENTRAL HIGH SCHOOL, NC , End of Year Check In","\n**Purpose:**\nThis touchpoint is to assist customers during end of year clean-up and discussions. This will help them get ready for summer school, and/or back to school. This is a great time to get contact information on who may be running the summer school program, and to schedule some time to reconnect when school starts up in the fall.\n\nDo the research below, fill out your findings, and pull the reports that are applicable. You can use this information to help drive your conversation....",normal,closed,,400604056393,404515983634,404516000000.0,360002500000.0,360000000000.0,"[\n ""361387076528""\n]","[\n ""361387076528""\n]","[\n ""361387076528""\n]",,,False,2020-06-10 17:00:00,"[\n ""am_courseware"",\n ""am_mathew.cantrell"",\n ""am_proactive__success_review"",\n ""amact_success_review"",\n ""dg-macro-360064712313"",\n ""dg-macro-360090314754"",\n ""did_8843"",\n ""end_of_year/summer_school"",\n ""macro-360064712313"",\n ""realm_02"",\n ""solved_to_open"",\n ""standard_school""\n]","{\n ""channel"": ""web"",\n ""source"": {\n ""from"": null,\n ""to"": null\n }\n}","[\n {\n ""id"": 360031704834\n },\n {\n ""id"": 360033966853\n },\n {\n ""id"": 360000070407\n },\n {\n ""id"": 360012625933\n },\n {\n ""id"": 81005327\n },\n {\n ""id"": 360007237394\n },\n {\n ""id"": 360033430294\n },\n {\n ""id"": 360042051353\n },\n {\n ""id"": 360037053733\n },\n {\n ""id"": 360029176873\n },\n {\n ""id"": 360037354033\n },\n {\n ""id"": 360006135094\n },\n {\n ""id"": 360036061754,\n ""value"": ""[\""amact_success_review\""]""\n },...","{\n ""score"": ""unoffered""\n}",[],[],360000483473,6738268,False,True,True,2020-05-15 15:36:30,2020-06-14 16:03:06,71061c865493ad2970c119197d7e54bbfee4050014d2127c152533123200eae5
5,422911,https://edgenuity.zendesk.com/api/v2/tickets/422911.json,,,Re: Edgenuity Customization Permissions,Re: Edgenuity Customization Permissions,"It works!! Thank you!\n\nOn Fri, May 15, 2020 at 10:04 AM Megan Snader <Megan.Snader@edgenuity.com> wrote:\n\nHi Christi,\n\n \n\nI just wanted to give you an update that we made a change on the back end that should allow you to search the lesson libraries now when customizing. I impersonated your account to check that it was fixed and it looked correct on my side, but please let me know if you have any additional questions or concerns. Thank you!\n\n \n\n \n\nMegan Snader | Account Manager...",normal,closed,megan.snader@edgenuity.zendesk.com,409780299393,409780299393,365886600000.0,370358500000.0,360000000000.0,"[\n ""409518086694""\n]",[],"[\n ""409518086694""\n]",,,False,,"[\n ""am_inbound_email"",\n ""assign_ps_group"",\n ""closed_by_merge"",\n ""did_25335"",\n ""msnader_zendesk"",\n ""msnader_zendesk_sent"",\n ""realm_15"",\n ""standard_school""\n]","{\n ""channel"": ""email"",\n ""source"": {\n ""from"": {\n ""address"": ""christi.ng@cowetaps.org"",\n ""name"": ""CHRISTI NG""\n },\n ""to"": {\n ""address"": ""megan.snader@edgenuity.zendesk.com"",\n ""name"": ""Edgenuity""\n }\n }\n}","[\n {\n ""id"": 360031704834\n },\n {\n ""id"": 360033966853\n },\n {\n ""id"": 360000070407\n },\n {\n ""id"": 360012625933\n },\n {\n ""id"": 81005327\n },\n {\n ""id"": 360007237394\n },\n {\n ""id"": 360033430294\n },\n {\n ""id"": 360042051353\n },\n {\n ""id"": 360037053733\n },\n {\n ""id"": 360029176873\n },\n {\n ""id"": 360037354033\n },\n {\n ""id"": 360006135094\n },\n {\n ""id"": 360036061754\n },\n {\n ""id"": 360035985733\n },\n {\n ...","{\n ""score"": ""unoffered""\n}",[],[],360000483473,6738268,False,True,True,2020-05-15 16:52:32,2020-05-18 19:21:06,68b0a3982e6a4342820ac781a918b5141ed3f321db1b30c9f0cca133e7822961
6,423275,https://edgenuity.zendesk.com/api/v2/tickets/423275.json,,question,stuck in activity,stuck in activity,"Call from: +1 (817) 992-7583\nCall to: +1 (877) 202-0338\nTime of call: May 15, 2020 at 8:32:54 PM\nAnswered by: Greg Friese",normal,closed,,410784818234,395003390113,395003400000.0,360017300000.0,41177550.0,[],[],[],,,False,,"[\n ""courseware_technical_issues"",\n ""csat_phone_call""\n]","{\n ""channel"": ""voice"",\n ""source"": {\n ""from"": {\n ""formatted_phone"": ""+1 (817) 992-7583"",\n ""name"": ""Caller +1 (817) 992-7583"",\n ""phone"": ""+18179927583""\n },\n ""rel"": ""inbound"",\n ""to"": {\n ""formatted_phone"": ""+1 (877) 202-0338"",\n ""name"": ""Edgenuity"",\n ""phone"": ""+18772020338""\n }\n }\n}","[\n {\n ""id"": 360031704834\n },\n {\n ""id"": 360033966853\n },\n {\n ""id"": 360000070407\n },\n {\n ""id"": 360012625933\n },\n {\n ""id"": 81005327\n },\n {\n ""id"": 360007237394\n },\n {\n ""id"": 360033430294\n },\n {\n ""id"": 360042051353\n },\n {\n ""id"": 360037053733\n },\n {\n ""id"": 360029176873\n },\n {\n ""id"": 360037354033\n },\n {\n ""id"": 360006135094\n },\n {\n ""id"": 360036061754\n },\n {\n ""id"": 360035985733\n },\n {\n ...","{\n ""score"": ""offered""\n}",[],[],765848,6738268,False,True,False,2020-05-15 20:49:27,2020-05-19 21:03:06,0871aee64599e40ac556fa58d76f4c1ef94757a2ce6f4a3a454e743cc0a3ac25
7,423310,https://edgenuity.zendesk.com/api/v2/tickets/423310.json,,question,student stuck in course,student stuck in course,"Chat started: 2020-05-15 09:13 PM UTC\nServed by: Matthew Corning\n\nIP: 70.119.99.9\nUser Agent: Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.100 Safari/537.36\nCountry: United States\nCity: Allen\nURL: https://r19.core.learn.edgenuity.com/Educator/home/main.aspx\n\nChat ID: 2005.1801676.Rz5s0b7WpmjjN\n\nThe chat transcript will be appended when the agent or visitor leaves the chat.",high,closed,,405940477174,405940477174,392628700000.0,370317000000.0,41177550.0,[],[],[],,,False,,"[\n ""courseware_technical_issues"",\n ""did_24295"",\n ""districtid-24295"",\n ""edgenuity"",\n ""fullname-laurence_mallory"",\n ""lessonname-"",\n ""lessonname-unit_test"",\n ""ow_route_1"",\n ""ps_chat_rescue"",\n ""ps_route_1"",\n ""realm_19"",\n ""schoolid-73929"",\n ""schoolname-mcmillen_high_school"",\n ""sessionid-21971566"",\n ""standard_school"",\n ""type-teacher"",\n ""userid-205577080"",\n ""vip"",\n ""zopim_chat"",\n ""zopim_chat_ended""\n]","{\n ""channel"": ""chat"",\n ""source"": {\n ""from"": null,\n ""to"": null\n }\n}","[\n {\n ""id"": 360031704834\n },\n {\n ""id"": 360033966853\n },\n {\n ""id"": 360000070407\n },\n {\n ""id"": 360012625933\n },\n {\n ""id"": 81005327\n },\n {\n ""id"": 360007237394\n },\n {\n ""id"": 360033430294\n },\n {\n ""id"": 360042051353\n },\n {\n ""id"": 360037053733\n },\n {\n ""id"": 360029176873\n },\n {\n ""id"": 360037354033\n },\n {\n ""id"": 360006135094\n },\n {\n ""id"": 360036061754\n },\n {\n ""id"": 360035985733\n },\n {\n ...","{\n ""score"": ""offered""\n}",[],[],765848,6738268,False,True,True,2020-05-15 21:33:34,2020-05-20 00:03:37,81464ed17254d84063b420b841c46998eafb2ec90105fd2c4e516f4db431d6d8
8,423341,https://edgenuity.zendesk.com/api/v2/tickets/423341.json,,task,Woodsboro High School - TX - Customize Course,Woodsboro High School - TX - Customize Course,"**Mariel Sobol**​ asked if I could call Chantel Schulz at 361-779-5737 to help her customize a Spanish course for a student who did not pass.\n\nI called Chantel and we reviewed customizing a course, editing course options, creating a new course, and where to find Course Structure. We also learned that we cannot do pretesting with WL courses. We customized just Cheyann Craighead's Spanish II enrollment.",normal,closed,,388949898913,403433001254,403433000000.0,360002600000.0,360000000000.0,"[\n ""360477622167"",\n ""378322749754""\n]","[\n ""360477622167"",\n ""378322749754""\n]",[],,,False,,"[\n ""am_courseware"",\n ""am_rachel.talbot"",\n ""am_reactive__pd_team_request"",\n ""amact_basic_training"",\n ""amact_courses_add_courses"",\n ""amact_courses_update_course_options"",\n ""did_16098"",\n ""jordan_leeann_wolf"",\n ""realm_23"",\n ""standard_school""\n]","{\n ""channel"": ""web"",\n ""source"": {\n ""from"": null,\n ""to"": null\n }\n}","[\n {\n ""id"": 360031704834\n },\n {\n ""id"": 360033966853\n },\n {\n ""id"": 360000070407\n },\n {\n ""id"": 360012625933\n },\n {\n ""id"": 81005327\n },\n {\n ""id"": 360007237394\n },\n {\n ""id"": 360033430294\n },\n {\n ""id"": 360042051353\n },\n {\n ""id"": 360037053733\n },\n {\n ""id"": 360029176873\n },\n {\n ""id"": 360037354033\n },\n {\n ""id"": 360006135094\n },\n {\n ""id"": 360036061754,\n ""value"": ""[\""amact_basic_training\"",\""amact...","{\n ""score"": ""unoffered""\n}",[],[],360000483473,6738268,False,True,True,2020-05-15 22:07:38,2020-05-25 15:02:44,8d36f33118a46ad9936cc9ff43799f7ce250d2938b236b7356e48c806e15b265
9,423355,https://edgenuity.zendesk.com/api/v2/tickets/423355.json,,,Chat with CHRISTAL COOPER,Chat with CHRISTAL COOPER,"Chat started: 2020-05-15 10:20 PM UTC\nServed by: Colby Beavers\n\nIP: 172.249.89.1\nUser Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.138 Safari/537.36\nCountry: United States\nCity: Wilmington\nURL: https://r21.core.learn.edgenuity.com/Educator/home/main.aspx\n\nChat ID: 2005.1801676.Rz68poOuUISt8\n\nThe chat transcript will be appended when the agent or visitor leaves the chat.",high,closed,,360471851268,360471851268,384912700000.0,360004100000.0,41177550.0,[],[],[],,,False,,"[\n ""autosolve_open_chat"",\n ""did_9733"",\n ""districtid-9733"",\n ""edgenuity"",\n ""fullname-christal_cooper"",\n ""lessonname-"",\n ""lessonname-case_study_proposing_education_policy_solutions"",\n ""ow_route_1"",\n ""ps_chat_rescue"",\n ""ps_route_1"",\n ""realm_21"",\n ""sam_elliot.munro"",\n ""schoolid-33296"",\n ""schoolname-harbor_tchr_prep_acd"",\n ""sessionid-28340921"",\n ""standard_school"",\n ""thank_you_chat"",\n ""type-teacher"",\n ""userid-9617619"",\n ""vip"",\n ""vip_lausd"",\n ""zopim_chat""...","{\n ""channel"": ""chat"",\n ""source"": {\n ""from"": null,\n ""to"": null\n }\n}","[\n {\n ""id"": 360031704834\n },\n {\n ""id"": 360033966853\n },\n {\n ""id"": 360000070407\n },\n {\n ""id"": 360012625933\n },\n {\n ""id"": 81005327\n },\n {\n ""id"": 360007237394\n },\n {\n ""id"": 360033430294\n },\n {\n ""id"": 360042051353\n },\n {\n ""id"": 360037053733\n },\n {\n ""id"": 360029176873\n },\n {\n ""id"": 360037354033\n },\n {\n ""id"": 360006135094\n },\n {\n ""id"": 360036061754\n },\n {\n ""id"": 360035985733\n },\n {\n ...","{\n ""score"": ""offered""\n}",[],[],765848,6738268,False,True,True,2020-05-15 22:26:29,2020-05-20 00:03:40,fefd090f5ba5f566608a7bdcb6bc1c7de2f4650365e2bcc361a4a9439a5fbb30


--------------------

# Explore Historical Data
Let's look at the REVIEWS_ANNOTATED table, which has the manually annotated sentiment labels for each product review.

In [None]:
# Reference the manually labelled reviews table and show a ten-row sample.
reviews_annotated_df = session.table('reviews_annotated')
reviews_annotated_df.limit(10).to_pandas()

# Feature Engineering and Data Pre-Processing Tasks

### Transform the text-based sentiment into numerical values

In [None]:
# Register a permanent Python UDF that turns the text sentiment label into a number
@udf(name='convert_rating', session=session, is_permanent=True, replace=True, stage_location='python_load')
def convert_rating(x: str) -> int:
    """Map a sentiment label to its numeric rating.

    'NEGATIVE' maps to -1, 'NEUTRAL' to 0 and 'POSITIVE' to 1. The comparison
    is exact (case-sensitive); any other value yields None.
    """
    if x == 'POSITIVE':
        return 1
    if x == 'NEUTRAL':
        return 0
    if x == 'NEGATIVE':
        return -1
    return None

### Stop Word Removal
We want to remove text that is relevant for readers but not for our machine learning algorithm. In English, for example, this includes punctuation and articles such as "a" and "the", which are typically referred to as stop words. To do this we create a Python UDF and use the spaCy library to process the review text.

In [None]:
import spacy

session.add_import('@model_data/en_core_web_sm.zip.gz')

@udf(name='remove_stopwords_vect', packages=['spacy==2.3.5', 'cachetools'], session=session, is_permanent=True, replace=True, stage_location='python_load')
def remove_stopwords_vect(raw_text: PandasSeries[str]) -> PandasSeries[str]:
    """Lemmatise a batch of review texts and strip stop words and noise tokens.

    Args:
        raw_text: batch of raw review strings passed in by the vectorized UDF.

    Returns:
        A pandas Series with one string per input row: the lower-cased lemmas
        of the remaining tokens joined by single spaces. Stop words,
        punctuation, currency symbols and the '-PRON-' pronoun placeholder
        lemma are dropped.
    """
    # The spaCy model archive is shipped through the UDF import directory.
    import_dir = sys._xoptions['snowflake_import_directory']
    input_file = import_dir + 'en_core_web_sm.zip'
    # Per-process extraction directory so concurrent workers do not collide.
    output_dir = '/tmp/en_core_web_sm' + str(os.getpid())

    with zipfile.ZipFile(input_file, 'r') as zip_ref:
        zip_ref.extractall(output_dir)

    nlp = spacy.load(output_dir + "/en_core_web_sm/en_core_web_sm-2.3.0")
    stop_words = nlp.Defaults.stop_words

    result = []

    for s in raw_text:
        doc = nlp(s)
        # stop_words is a set of lower-case strings, so the token's text has to
        # be compared; testing the Token object itself never matches and would
        # leave every stop word in place.
        lemmas = [str(t.lemma_) for t in doc if
                  t.text.lower() not in stop_words
                  and not t.is_punct
                  and not t.is_currency
                  and t.lemma_ != '-PRON-']
        result.append(' '.join(lemma.lower() for lemma in lemmas))

    # The notebook imports pandas as `pd`; the bare name `pandas` is not bound.
    return pd.Series(result)

### Apply the UDFs to process the data
We can execute the functions so that all the processing runs inside Snowflake, and to confirm the sentiment values are converted and stop words have been removed we can create a quick preview of the table using a Snowpark query:

In [None]:
# Preview five rows with both UDFs applied. The call sits inside parentheses,
# so no backslash line continuation is needed.
reviews_annotated_df.select(
    'product_id',
    'product_review',
    call_udf("remove_stopwords_vect", col('product_review')).alias('processed_text'),
    'sentiment',
    call_udf("convert_rating", col('sentiment')).alias('sentiment_rating'),
).limit(5).toPandas()

In [None]:
# pre-process 20 million rows in reviews_annotated and write it to a new training table
# (overwrites REVIEWS_TRAINING if it already exists)
reviews_annotated_df.select(
    'product_id',
    'product_review',
    call_udf("remove_stopwords_vect", col('product_review')).alias('processed_text'),
    'sentiment',
    call_udf("convert_rating", col('sentiment')).alias('sentiment_rating'),
).write.mode('overwrite').save_as_table('reviews_training')

---

# Train and Deploy a Sentiment Analysis Model
Let's look at how we are able to execute model training inside Snowflake

### Snowpark code for model training

In [None]:
import numpy as np
import sklearn
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer

def train_sentiment_model(session: snowflake.snowpark.Session) -> float:
    """Train a bag-of-words logistic-regression sentiment model.

    Reads the pre-processed reviews from REVIEWS_TRAINING, splits them 80/20
    with a fixed seed, fits a CountVectorizer and a LogisticRegression on the
    training part, and saves both fitted objects to the MODEL_DATA stage.

    Args:
        session: the Snowpark session used to read the table and save files.

    Returns:
        The accuracy of the model on the held-out 20% split.
    """
    # retrieve data from processed table
    df_raw = session.table('REVIEWS_TRAINING')

    # split data: random_split returns a list of Snowpark DataFrames, so each
    # part has to be converted to pandas on its own
    train_sdf, test_sdf = df_raw.random_split([0.8, 0.2], seed=42)
    train = train_sdf.toPandas()
    test = test_sdf.toPandas()

    # vectorize the review text; the vocabulary is learned from the training
    # split only and then re-used for the test split
    vectorizer = CountVectorizer(token_pattern=r'\b\w+\b')
    x_train = vectorizer.fit_transform(train['PROCESSED_TEXT'])
    x_test = vectorizer.transform(test['PROCESSED_TEXT'])

    # split feature and label: the target is the text sentiment label
    y_train = train['SENTIMENT']
    y_test = test['SENTIMENT']

    # Regression Model
    lr = LogisticRegression(multi_class='multinomial', max_iter=10000)
    lr.fit(x_train, y_train)

    predictions = lr.predict(x_test)

    # Save vectorized embeddings and model files to stage
    # NOTE(review): save_file is not defined in the visible part of this
    # notebook — confirm it is defined before this procedure is registered.
    save_file(session, lr, '@MODEL_DATA/sentiment_large.pickle')
    save_file(session, vectorizer, '@MODEL_DATA/vectorizer_large.pickle')

    return accuracy_score(y_test, predictions)

### Deploy model training

In [None]:
# Change to High Memory Warehouse: switch the session to 'wh_high_mem'
# before the model-training steps that follow.
session.use_warehouse('wh_high_mem')

In [None]:
# Register train_sentiment_model as a permanent stored procedure (replacing
# any existing one); its code is uploaded to the 'python_load' stage.
session.sproc.register(
    func=train_sentiment_model,
    name='train_sentiment_model',
    stage_location='python_load',
    is_permanent=True,
    replace=True,
    packages=['snowflake-snowpark-python', 'pandas', 'scikit-learn', 'cachetools'],
)

In [None]:
# Call the stored proc and run the model training
# (Session exposes `call`; `cal` does not exist and raised AttributeError.)
session.call('train_sentiment_model')

We can see that the memory used in the training process exceeded 16 GB.

In [None]:
# Pull the training log into pandas. The chain is wrapped in parentheses
# because a backslash followed by trailing whitespace is a syntax error.
df_log = (
    session.table('raw.training_log')
    .select(
        col('timestamp'),
        col('memory_usage_gb'))
    .toPandas()
)

import seaborn as sns

# Plot memory usage over time on a wide figure.
sns.set(rc = {'figure.figsize': (25, 8)})
sns.lineplot(x='TIMESTAMP', y='MEMORY_USAGE_GB', data=df_log)

# Using the trained model

In [None]:
# Create a vectorized UDF to predict sentiment
session.clear_packages()
session.clear_imports()
session.add_import('@MODEL_DATA/sentiment_large.pickle')
session.add_import('@MODEL_DATA/vectorizer_large.pickle')

@udf(name='predict_sentiment_vect', packages=['pandas', 'scikit-learn'],
    session=session, is_permanent=True, replace=True, stage_location='python_load',
    max_batch_size=100000, input_types=[PandasSeriesType(StringType())], return_type=PandasSeriesType(VariantType()))
def predict_sentiment_vector(sentiment_str):
    """Score a batch of pre-processed review texts with the trained model.

    Args:
        sentiment_str: pandas Series of pre-processed review strings.

    Returns:
        A pandas Series with one dict per input row, holding the predicted
        probabilities under the keys 'NEGATIVE', 'NEUTRAL' and 'POSITIVE'.
    """
    # Files added through session.add_import are available in the UDF import
    # directory, which is exposed through sys._xoptions (note the underscore).
    import_dir = sys._xoptions.get("snowflake_import_directory")
    model_file_path = import_dir + 'sentiment_large.pickle'
    vector_file_path = import_dir + 'vectorizer_large.pickle'

    # NOTE(review): pickle.load can execute arbitrary code — only load
    # artifacts from a stage whose contents are trusted.
    with open(model_file_path, 'rb') as fm:
        model = pickle.load(fm)

    with open(vector_file_path, 'rb') as fv:
        vectorizer = pickle.load(fv)

    # An empty batch yields an empty result without calling the model.
    if len(sentiment_str) == 0:
        return pd.Series([], dtype=object)

    # Vectorize and score the whole batch in one call instead of row by row.
    matrix = vectorizer.transform(sentiment_str)

    # NOTE(review): the column names assume the model's class order is
    # NEGATIVE, NEUTRAL, POSITIVE — confirm against model.classes_.
    scores = pd.DataFrame(model.predict_proba(matrix), columns=('NEGATIVE', 'NEUTRAL', 'POSITIVE'))

    # Round-trip through JSON so each row becomes a plain dict of Python floats.
    records = json.loads(scores.to_json(orient='records'))

    # The notebook imports pandas as `pd`; the bare name `pandas` is not bound.
    return pd.Series(records)

### Let's quickly score new reviews that have been pre-processed to get the sentiment scores

In [None]:
# Score the pre-processed reviews and split the VARIANT result into one
# column per class; preview ten rows. The chain is parenthesized because a
# backslash followed by trailing whitespace is a syntax error.
(
    session.table('raw.new_processed_reviews')
    .select(
        col('product_id'),
        col('review_data'),
        col('product_review'),
        call_udf(
            'predict_sentiment_vect',
            col('processed_words')).cast(VariantType()).alias('sentiment'))
    .select(
        col('product_id'),
        col('review_data'),
        col('product_review'),
        col('sentiment')['NEGATIVE'].alias('negative'),
        col('sentiment')['NEUTRAL'].alias('neutral'),
        col('sentiment')['POSITIVE'].alias('positive'))
    .limit(10)
    .toPandas()
)

### Scoring large batch of data and write to a new table

In [None]:
# Score every pre-processed review and write the per-class probabilities to
# SCORED_REVIEWS, overwriting the table if it exists. The chain is
# parenthesized because a backslash followed by trailing whitespace is a
# syntax error.
(
    session.table('raw.new_processed_reviews')
    .select(
        col('product_id'),
        col('review_data'),
        col('product_review'),
        call_udf(
            'predict_sentiment_vect',
            col('processed_words')).cast(VariantType()).alias('sentiment'))
    .select(
        col('product_id'),
        col('review_data'),
        col('product_review'),
        col('sentiment')['NEGATIVE'].alias('negative'),
        col('sentiment')['NEUTRAL'].alias('neutral'),
        col('sentiment')['POSITIVE'].alias('positive'))
    .write.saveAsTable('scored_reviews', mode='overwrite')
)