# Establish Secure Connection to Snowflake

### Create a conda virtual environment and install snowpark and other dependencies
`conda create --name snowpark_env -c https://repo.anaconda.com/pkgs/snowflake python=3.8 numpy pandas cachetools spacy scikit-learn`
<br>
`conda activate snowpark_env`
<br>
`pip install "snowflake-snowpark-python[pandas]"`
<br>
`python -m spacy download en_core_web_sm`

### packages that Snowflake Anaconda doesn't support yet
`pip install spacytextblob`
<br>
`pip install pytextrank`
### Add virtual env to ipykernel for Jupyter Notebook to use
`conda install -c anaconda ipykernel`
<br>
`python -m ipykernel install --user --name=snowpark_env`
### Create stage "python_load" and stage "model_data" to store Python UDFs and trained model files
Instructions on how to create a stage on Snowflake: https://hevodata.com/learn/snowflake-stages/
### Upload spacy pretrained NLP pipeline to a named stage from local computer
`put file://C:\Users\an.jiang\src\snowpark_demo\spacy_trained_models\model_for_upload\en_core_web_sm.zip @model_data AUTO_COMPRESS=TRUE OVERWRITE = TRUE;`
<br>
How to use PUSH command in snowsql cli: https://docs.snowflake.com/en/sql-reference/sql/put.html
### Use Snowflake Dashboard and Chart tools of Snowsight to visualize Zendesk ticket sentiment trend
https://hevodata.com/learn/snowflake-dashboards/
### CI/CD pipelines i.e., automatic data load and model deployment using Streams and Tasks in Snowflake
https://community.snowflake.com/s/article/Using-Streams-and-Tasks-inside-Snowflake
https://docs.snowflake.com/en/user-guide/data-pipelines-intro.html
https://hevodata.com/learn/snowflake-triggers/

In [1]:
# Snowpark for Python
import snowflake.snowpark
from snowflake.snowpark.session import Session
from snowflake.snowpark.functions import udf, col, call_udf
from snowflake.snowpark.types import *

# Others
import sys, string, io, os, math
import zipfile
import pickle
import numpy as np
import pandas as pd
import json
from cachetools import cached

# increase the max number of columns to display - default 20, switch to truncate view if exceeded
pd.set_option('display.max_columns', 50)
# increase the width of the column so we can see more raw text - default 50 characters
pd.set_option('display.max_colwidth', 500)

In [2]:
connection_parameters = {
    "account": 'wne',
    "user": 'an.jiang@imaginelearning.com',
    "authenticator": 'externalbrowser',
    # "role": os.environ["SNOWFLAKE_ROLE"]
    "warehouse": 'RESEARCH_WH', # RESEARCH_WH
    "database": 'SANDBOX',
    "schema": 'SCRATCH'
  }
# connection_parameters = json.load(open(r'C:\Users\an.jiang\src\snowflake_credentials.json'))

In [3]:
# Create Snowflake Session object
session = Session.builder.configs(connection_parameters).create()
print(session.sql("select current_warehouse(), current_database(), current_schema()").collect())

Initiating login request with your identity provider. A browser window should have opened for you to complete the login. If you can't see it, check existing browser windows, or your OS settings. Press CTRL+C to abort and try again...
[Row(CURRENT_WAREHOUSE()='RESEARCH_WH', CURRENT_DATABASE()='SANDBOX', CURRENT_SCHEMA()='SCRATCH')]


----

===================================================================================================================

# Read unstructured Zendesk data 

In [55]:
df_zendesk = session.table('zendesk_tickets')
df_zendesk.limit(5).toPandas()

Unnamed: 0,ID,URL,EXTERNAL_ID,TYPE,SUBJECT,RAW_SUBJECT,DESCRIPTION,PRIORITY,STATUS,RECIPIENT,REQUESTER_ID,SUBMITTER_ID,ASSIGNEE_ID,ORGANIZATION_ID,GROUP_ID,COLLABORATOR_IDS,FOLLOWER_IDS,EMAIL_CC_IDS,FORUM_TOPIC_ID,PROBLEM_ID,HAS_INCIDENTS,DUE_AT,TAGS,VIA,CUSTOM_FIELDS,SATISFACTION_RATING,SHARING_AGREEMENT_IDS,FOLLOWUP_IDS,TICKET_FORM_ID,BRAND_ID,ALLOW_CHANNELBACK,ALLOW_ATTACHMENTS,IS_PUBLIC,CREATED_AT,UPDATED_AT,ROW_CHECKSUM
0,2054590,https://edgenuity.zendesk.com/api/v2/tickets/2054590.json,,question,"Cannot complete a bypass, unlock an assessment, add a retake, or complete another administrative action.","Cannot complete a bypass, unlock an assessment, add a retake, or complete another administrative action.",This issue was reported from LMS\Educator.\r\nDistrict ID = 8106\r\nSchool ID = 28457\r\nSchool Name = CLEVELAND HIGH SCHOOL\r\nTeacher User ID = 345305615\r\nPhone = \r\nSession ID = 75594651\r\nStudent Name = Christian Valdes Alvarez\r\nStudent User ID = 345763293\r\nCourse Name = TX-Foundations of Personal Fitness\r\nCourse ID = bb71387f-4385-436f-adff-888419bd68fd\r\nSubject = Electives\r\nMaster Build Key = 4d00172c-7361-6574-7242-75696c640000\r\nCourse Edition = Current\r\nCourse Node ...,normal,closed,,373744457413,373744457413,28632285008,360004300000.0,360002530974,[],[],[],,,False,NaT,"[\n ""contact_type_educator"",\n ""courseware_technical_issues"",\n ""customer_state_texas"",\n ""did_8106"",\n ""lms_help_center"",\n ""notify_assignee_solved"",\n ""product_courseware"",\n ""rai_no_admin_action"",\n ""realm_05"",\n ""standard_school"",\n ""user_teacher""\n]","{\n ""channel"": ""api"",\n ""source"": {\n ""from"": null,\n ""to"": null\n }\n}","[\n {\n ""id"": 360053997313\n },\n {\n ""id"": 360053495554\n },\n {\n ""id"": 360046824194\n },\n {\n ""id"": 360052996354\n },\n {\n ""id"": 360031704834\n },\n {\n ""id"": 360053998853\n },\n {\n ""id"": 360033966853\n },\n {\n ""id"": 1900006981125\n },\n {\n ""id"": 1500003320582\n },\n {\n ""id"": 1500007156742,\n ""value"": ""false""\n },\n {\n ""id"": 360000070407\n },\n {\n ""id"": 360047024393\n },\n {\n ""id"": 1900000749065\n },\n {\n ""i...","{\n ""score"": ""offered""\n}",[],[],765848,6738268,False,True,True,2022-05-02 03:06:10,2022-05-14 17:03:20,02310f5980f160b2a28e30f040d6b1ee62047a31d707560efda1352124e2843c
1,2054672,https://edgenuity.zendesk.com/api/v2/tickets/2054672.json,,question,Re: Imagine Learning - Instructional Services Implementation Recap,Re: Imagine Learning - Instructional Services Implementation Recap,"This is a follow-up to your previous request #2045292 ""Re: Meeting today""\n\nGood morning,\nWe have determined that it is in the best interest of J. Hudson that the course enrollments be changed to the credit recovery versions for semester 1. I will make those adjustments in the platform today.\n\n\nDr. Mitchell-Williams\n\nshe/her\n\n\nGreta Mitchell-Williams, PhD\n\nExecutive Director on Assignment\n\nGrade Level Principal - OYG 2023\n\nProviso Township High Schools District 209\n\ngmitche...",normal,closed,adrienne.giddens@edgenuity.zendesk.com,1918119938805,360472755388,1909542808645,360004200000.0,360004026594,"[\n ""360505470688"",\n ""1534383819762"",\n ""360472755388""\n]",[],"[\n ""360505470688"",\n ""1534383819762"",\n ""360472755388""\n]",,,False,NaT,"[\n ""agiddens_zendesk"",\n ""agiddens_zendesk_sent"",\n ""am_reactive__district_request"",\n ""amact_other"",\n ""content_cue_12567456-245e-5a1d-57bd-78dbcd126ce4"",\n ""customer_state_illinois"",\n ""did_11401"",\n ""product_courseware"",\n ""realm_12"",\n ""sc_ticket_up"",\n ""school_administrator"",\n ""solved_to_open"",\n ""standard_school"",\n ""status_enabled""\n]","{\n ""channel"": ""web"",\n ""source"": {\n ""from"": {\n ""subject"": ""Re: Meeting today"",\n ""ticket_id"": ""2045292""\n },\n ""rel"": ""follow_up"",\n ""to"": null\n }\n}","[\n {\n ""id"": 360053997313,\n ""value"": """"\n },\n {\n ""id"": 360053495554,\n ""value"": """"\n },\n {\n ""id"": 360046824194,\n ""value"": """"\n },\n {\n ""id"": 360052996354,\n ""value"": """"\n },\n {\n ""id"": 360031704834,\n ""value"": """"\n },\n {\n ""id"": 360053998853,\n ""value"": """"\n },\n {\n ""id"": 360033966853,\n ""value"": """"\n },\n {\n ""id"": 1900006981125,\n ""value"": """"\n },\n {\n ""id"": 1500003320582,\n ""value"": """"\n },\n {\n ""id"":...","{\n ""score"": ""unoffered""\n}",[],[],1500002916222,6738268,False,True,True,2022-05-02 12:57:49,2022-05-14 13:02:01,91b64c0dba4e627536101df8fe4de87391619baabdecc03e3a7c6ca4846bec26
2,2054680,https://edgenuity.zendesk.com/api/v2/tickets/2054680.json,,question,Issue Found,Issue Found,"Use the US Travel and Tourism Industries: A Year in Review 2010, US Department of Commerce, International Trade Administration, as your main source. - This source no longer works. It is from 2010...the course should probably be updated..... Thanks!\n\n\n# Course Info:\n**Course Name:** Career Explorations I v.21\n**Unit Name:** 3. INTRODUCTION TO HOSPITALITY AND TOURISM SYSTEMS\n**Assignment Name:** 4. Project: Travel Terminology\n\n\n# Customer Info:\n**Email:** andrew.weigold@imaginelearni...",normal,closed,,426289826293,426289826293,362309404187,360004300000.0,360003198953,[],[],[],,,False,NaT,"[\n ""contact_type_internal_employee"",\n ""content_electives"",\n ""content_support_no_eta_ign"",\n ""contentsupport_broken_link"",\n ""dg-macro-360138413753"",\n ""dg-macro-360139278254"",\n ""dg-macro-360139350233"",\n ""dg_agent_1508380235501"",\n ""dg_macro_ai_360139350233"",\n ""dg_macro_reply_1"",\n ""dg_macro_search_360139350233"",\n ""dg_macro_used"",\n ""is_help_center"",\n ""jira_escalated"",\n ""lms_help_center"",\n ""mpng_help_center"",\n ""notfy_rquestr_commnt_updte"",\n ""odysseyware_issue_fou...","{\n ""channel"": ""api"",\n ""source"": {\n ""from"": null,\n ""to"": null\n }\n}","[\n {\n ""id"": 360053997313\n },\n {\n ""id"": 360053495554\n },\n {\n ""id"": 360046824194\n },\n {\n ""id"": 360052996354\n },\n {\n ""id"": 360031704834\n },\n {\n ""id"": 360053998853\n },\n {\n ""id"": 360033966853\n },\n {\n ""id"": 1900006981125\n },\n {\n ""id"": 1500003320582\n },\n {\n ""id"": 1500007156742,\n ""value"": ""false""\n },\n {\n ""id"": 360000070407\n },\n {\n ""id"": 360047024393\n },\n {\n ""id"": 1900000749065\n },\n {\n ""i...","{\n ""score"": ""unoffered""\n}",[],[],360001274434,360004529234,False,True,True,2022-05-02 13:01:52,2022-05-11 01:01:48,9b10b7318ceeddc7c1cb26bbce1314d54ebf467f803c13bfc62d53435cc5a5b9
3,2054929,https://edgenuity.zendesk.com/api/v2/tickets/2054929.json,,question,Chat with Jaxon McCarver,Chat with Jaxon McCarver,"Chat started: 2022-05-02 02:44 PM UTC\nServed by: Tutor - Andrea\n\nIP: 76.8.237.122\nUser Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4896.127 Safari/537.36\nCountry: United States\nCity: Olive Branch\nURL: https://r12.core.learn.edgenuity.com/Player/\n\nChat ID: 2205.1801676.T4gj0KcusJBdR\n\nThe chat transcript will be appended when the agent or visitor leaves the chat.",normal,closed,,5818371722903,5818371722903,360683087288,,43851087,[],[],[],,,False,NaT,"[\n ""activitynameinstruction"",\n ""cc_ended_chat_solved"",\n ""concept_coaching_tutoring"",\n ""contact_type_student"",\n ""courseid155331357"",\n ""coursenamems_turner_6th_grade_english"",\n ""districtid11481"",\n ""edgenuity"",\n ""fullnamejaxon_mccarver"",\n ""lessonnamenarrative_structure_in_ltemgtholesltemgt"",\n ""schoolid42857"",\n ""schoolnamesouthhaven_middle_tutor"",\n ""sessionid38023741"",\n ""subjectlanguage_arts"",\n ""trigger_1500011690542"",\n ""typestudent"",\n ""userid135836290"",\n ""zop...","{\n ""channel"": ""chat"",\n ""source"": {\n ""from"": null,\n ""to"": null\n }\n}","[\n {\n ""id"": 360053997313\n },\n {\n ""id"": 360053495554\n },\n {\n ""id"": 360046824194\n },\n {\n ""id"": 360052996354\n },\n {\n ""id"": 360031704834\n },\n {\n ""id"": 360053998853\n },\n {\n ""id"": 360033966853\n },\n {\n ""id"": 1900006981125\n },\n {\n ""id"": 1500003320582\n },\n {\n ""id"": 1500007156742,\n ""value"": ""false""\n },\n {\n ""id"": 360000070407\n },\n {\n ""id"": 360047024393\n },\n {\n ""id"": 1900000749065\n },\n {\n ""i...","{\n ""score"": ""unoffered""\n}",[],[],360001090393,360002067674,False,True,True,2022-05-02 14:45:27,2022-05-06 16:03:32,ff06ecac9d3f403952fb708a5af78c7ca2ce08e0bb99023aba2fdd1c058a71e0
4,2054950,https://edgenuity.zendesk.com/api/v2/tickets/2054950.json,,question,Chat with Visitor 57298444,Chat with Visitor 57298444,Chat started: 2022-05-02 02:53 PM UTC\nServed by: Tutor-Mrs. Gibbons\n\nIP: 173.221.101.98\nUser Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:99.0) Gecko/20100101 Firefox/99.0\nCountry: United States\nCity: Vero Beach\nURL: https://alliancepartners.zendesk.com/hc/en-us\n\nChat ID: 2205.1801676.T4gl77DxEavFA\n\nThe chat transcript will be appended when the agent or visitor leaves the chat.,normal,closed,,5818664763543,5818664763543,420074885834,,43851087,[],[],[],,,False,NaT,"[\n ""alliance_partners"",\n ""cc_ended_chat_solved"",\n ""cc_transfer_4"",\n ""chat_alliancetutors"",\n ""concept_coaching_tutoring"",\n ""zopim_chat"",\n ""zopim_chat_ended""\n]","{\n ""channel"": ""chat"",\n ""source"": {\n ""from"": null,\n ""to"": null\n }\n}","[\n {\n ""id"": 360053997313\n },\n {\n ""id"": 360053495554\n },\n {\n ""id"": 360046824194\n },\n {\n ""id"": 360052996354\n },\n {\n ""id"": 360031704834\n },\n {\n ""id"": 360053998853\n },\n {\n ""id"": 360033966853\n },\n {\n ""id"": 1900006981125\n },\n {\n ""id"": 1500003320582\n },\n {\n ""id"": 1500007156742,\n ""value"": ""false""\n },\n {\n ""id"": 360000070407\n },\n {\n ""id"": 360047024393\n },\n {\n ""id"": 1900000749065\n },\n {\n ""i...","{\n ""score"": ""unoffered""\n}",[],[],360001090393,360004307694,False,True,True,2022-05-02 14:54:56,2022-05-06 17:03:22,1be7581c19547854854d7133b04925bf4ee1be7c93adecb0419184abe62f954b


In [56]:
df_zendesk = df_zendesk.toPandas()

In [57]:
df_zendesk.head()

Unnamed: 0,ID,URL,EXTERNAL_ID,TYPE,SUBJECT,RAW_SUBJECT,DESCRIPTION,PRIORITY,STATUS,RECIPIENT,REQUESTER_ID,SUBMITTER_ID,ASSIGNEE_ID,ORGANIZATION_ID,GROUP_ID,COLLABORATOR_IDS,FOLLOWER_IDS,EMAIL_CC_IDS,FORUM_TOPIC_ID,PROBLEM_ID,HAS_INCIDENTS,DUE_AT,TAGS,VIA,CUSTOM_FIELDS,SATISFACTION_RATING,SHARING_AGREEMENT_IDS,FOLLOWUP_IDS,TICKET_FORM_ID,BRAND_ID,ALLOW_CHANNELBACK,ALLOW_ATTACHMENTS,IS_PUBLIC,CREATED_AT,UPDATED_AT,ROW_CHECKSUM
0,1359153,https://edgenuity.zendesk.com/api/v2/tickets/1359153.json,,,Chat with Michael Kello,Chat with Michael Kello,"Chat started: 2021-02-09 01:54 PM UTC\nServed by: Tennis Watkins\n\nIP: 162.43.210.183\nUser Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.146 Safari/537.36\nCountry: United States\nCity: Scottsville\nURL: https://edgenuity.ada.support/chat/?introShown=true&embed2=1\n\nChat ID: 2102.1801676.SOWpGXuZKTIf6\n\nThe chat transcript will be appended when the agent or visitor leaves the chat.",normal,closed,,364554494833,364554494833,363276100000.0,360037900000.0,360012700000.0,[],[],[],,,False,NaT,"[\n ""18849"",\n ""58741"",\n ""ada"",\n ""adachat"",\n ""allen_county-scottsville_high_school"",\n ""autosolve_open_chat"",\n ""did_18849"",\n ""edgenuity"",\n ""httpsr10corelearnedgenuitycomeducatorstudenttoolsdashboardaspx"",\n ""il_chat"",\n ""lms_help_center"",\n ""michael_kello"",\n ""michaelkelloallenkyschoolsus"",\n ""no_csat"",\n ""product_courseware"",\n ""ps_upd_notice"",\n ""realm_10"",\n ""standard_school"",\n ""teacher"",\n ""windows"",\n ""zopim_chat"",\n ""zopim_chat_ended"",\n ""zopimtrigger_imag...","{\n ""channel"": ""chat"",\n ""source"": {\n ""from"": null,\n ""to"": null\n }\n}","[\n {\n ""id"": 360053997313\n },\n {\n ""id"": 360053495554\n },\n {\n ""id"": 360046824194\n },\n {\n ""id"": 360052996354\n },\n {\n ""id"": 360031704834\n },\n {\n ""id"": 360053998853\n },\n {\n ""id"": 360033966853\n },\n {\n ""id"": 360000070407\n },\n {\n ""id"": 360047024393\n },\n {\n ""id"": 360053996553\n },\n {\n ""id"": 360053496074\n },\n {\n ""id"": 360053565194\n },\n {\n ""id"": 360052996874\n },\n {\n ""id"": 360049465354\n },\n {...","{\n ""score"": ""unoffered""\n}",[],[],765848,6738268,False,True,True,2021-02-09 13:56:48,2021-02-13 16:03:01,7355a77c083a8e48bbc94c36070e40f8517dbf4bac0354e788d7dc36b301deee
1,1359282,https://edgenuity.zendesk.com/api/v2/tickets/1359282.json,,question,Payment Received - Tatyana Christensen - Edgenuity Inc.,Payment Received - Tatyana Christensen - Edgenuity Inc.,Payment has been received for an Edgenuity Inc. online course. Thank you!\n\nInvoice Date 2/8/2021\nDate Paid 2/8/2021\nInvoice # 799869\nStatus Paid In Full\nDistrict 732124 Edgenuity Virtual Academy\nStudent Name Tatyana Christensen\nEmail annec1968@msn.com\nStudent ID Tatyana Christensen\nPaid Amount 350.00\nMemo 2/08/21-03/07/21 - FT Monthly - Tatyana Christensen - EVA\n\nYou requested to receive an email alert for this record. Click here to modify your email alerts (https://1291624.app....,normal,closed,admissions@edgenuity.com,1500696329002,378327750514,364202400000.0,360102700000.0,360004500000.0,[],[],[],,,False,NaT,"[\n ""did_20003"",\n ""is"",\n ""is_admissions__finance_payment_made"",\n ""is_assignee_change"",\n ""realm_05""\n]","{\n ""channel"": ""email"",\n ""source"": {\n ""from"": {\n ""address"": ""annec1968@msn.com"",\n ""name"": ""Anne Christensen""\n },\n ""to"": {\n ""address"": ""admissions@edgenuity.com"",\n ""name"": ""Edgenuity""\n }\n }\n}","[\n {\n ""id"": 360053997313\n },\n {\n ""id"": 360053495554\n },\n {\n ""id"": 360046824194\n },\n {\n ""id"": 360052996354\n },\n {\n ""id"": 360031704834\n },\n {\n ""id"": 360053998853\n },\n {\n ""id"": 360033966853\n },\n {\n ""id"": 360000070407\n },\n {\n ""id"": 360047024393\n },\n {\n ""id"": 360053996553\n },\n {\n ""id"": 360053496074\n },\n {\n ""id"": 360053565194\n },\n {\n ""id"": 360052996874\n },\n {\n ""id"": 360049465354\n },\n {...","{\n ""score"": ""unoffered""\n}",[],[],360001090393,360002067674,False,True,False,2021-02-09 14:44:12,2021-02-13 16:02:38,5421adb00c2003d96a7cafa2ea9a4cc985f7d2850bb9ea7c73d5df4137aa37e3
2,1359365,https://edgenuity.zendesk.com/api/v2/tickets/1359365.json,,question,Chat with Jade Perkins,Chat with Jade Perkins,"Chat started: 2021-02-09 03:02 PM UTC\nServed by: Brittany Libell\n\nIP: 100.34.205.171\nUser Agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.0.2 Safari/605.1.15\nCountry: United States\nCity: Ambler\nURL: https://r06.core.learn.edgenuity.com/Player/\n\nChat ID: 2102.1801676.SOX6bl4ppjkih\n\nThe chat transcript will be appended when the agent or visitor leaves the chat.",normal,closed,,1502305669141,1502305669141,421372600000.0,,43851090.0,[],[],[],,,False,NaT,"[\n ""activitynameassignment"",\n ""activitynameinstruction"",\n ""cc_ended_chat_solved"",\n ""chg_is_brnd"",\n ""concept_coaching_tutoring"",\n ""courseid96828578"",\n ""coursenamepre-algebra_b"",\n ""districtid9687"",\n ""edgenuity"",\n ""fullnamejade_perkins"",\n ""il_chat"",\n ""jasontracking_360204256674"",\n ""lessonnameparallel_lines_cut_by_a_transversal"",\n ""lessonnamereflections"",\n ""notify_rqustr_of_is_cmmnt_updte"",\n ""schoolid39379"",\n ""schoolnamewissahickon_school_is"",\n ""sessionid41379...","{\n ""channel"": ""chat"",\n ""source"": {\n ""from"": null,\n ""to"": null\n }\n}","[\n {\n ""id"": 360053997313\n },\n {\n ""id"": 360053495554\n },\n {\n ""id"": 360046824194\n },\n {\n ""id"": 360052996354\n },\n {\n ""id"": 360031704834\n },\n {\n ""id"": 360053998853\n },\n {\n ""id"": 360033966853\n },\n {\n ""id"": 360000070407\n },\n {\n ""id"": 360047024393\n },\n {\n ""id"": 360053996553\n },\n {\n ""id"": 360053496074\n },\n {\n ""id"": 360053565194\n },\n {\n ""id"": 360052996874\n },\n {\n ""id"": 360049465354\n },\n {...","{\n ""score"": ""unoffered""\n}",[],[],360001090393,360002067674,False,True,True,2021-02-09 15:03:15,2021-02-13 17:02:24,5afb34a8f1dbe9551e44a6d83666c46e719bb4fba11b007a641facb7f12c557d
3,1359647,https://edgenuity.zendesk.com/api/v2/tickets/1359647.json,,question,Feature Request: Pending Enrollment Auto-Assign Queue,Feature Request: Pending Enrollment Auto-Assign Queue,"Customer stated that CR students often prefer to see one course at a time. He enrolls them in the needed courses and then disables them so they only see one. Then when they finish, he enables the second course and so forth.\n\nHe suggested a Course Cue feature in which the second course would automatically enable once the first course is marked as Complete.\n\nAlong the same lines, if he puts in a target date for ALL of the courses to be completed, can we automatically tabulate a target da...",normal,closed,featurerequest@edgenuity.com,379930997554,379930997554,398377400000.0,360004300000.0,360012700000.0,"[\n ""398377386373""\n]","[\n ""398377386373""\n]",[],,,False,NaT,"[\n ""assign_ps_group"",\n ""courseware_feature_request"",\n ""dg-macro-360000953408"",\n ""dg-macro-360001846048"",\n ""dg-macro-360090304474"",\n ""dg-macro-360138059134"",\n ""dg_agent_398377386373"",\n ""dg_macro_ai_360000953408"",\n ""dg_macro_reply_1"",\n ""dg_macro_search_360000953408"",\n ""dg_macro_used"",\n ""featurerequest"",\n ""jira_escalated"",\n ""lms_help_center"",\n ""macro-360001846048"",\n ""no_csat"",\n ""notify_assignee_solved"",\n ""ntfy_emplyee_intrnl_nte_created"",\n ""ntfy_emplyee_int...","{\n ""channel"": ""email"",\n ""source"": {\n ""from"": {\n ""address"": ""deanna.weine@edgenuity.com"",\n ""name"": ""Deanna Weine""\n },\n ""to"": {\n ""address"": ""featurerequest@edgenuity.com"",\n ""name"": ""Edgenuity""\n }\n }\n}","[\n {\n ""id"": 360053997313\n },\n {\n ""id"": 360053495554\n },\n {\n ""id"": 360046824194\n },\n {\n ""id"": 360052996354\n },\n {\n ""id"": 360031704834\n },\n {\n ""id"": 360053998853\n },\n {\n ""id"": 360033966853\n },\n {\n ""id"": 360000070407\n },\n {\n ""id"": 360047024393\n },\n {\n ""id"": 360053996553\n },\n {\n ""id"": 360053496074\n },\n {\n ""id"": 360053565194\n },\n {\n ""id"": 360052996874\n },\n {\n ""id"": 360049465354\n },\n {...","{\n ""score"": ""unoffered""\n}",[],[],765848,6738268,False,True,True,2021-02-09 16:15:07,2021-03-28 03:02:40,fd2a5a903eb7b0f1423387e3ca34652371878d0f35eda4370ddfd0e7e84a8688
4,1359845,https://edgenuity.zendesk.com/api/v2/tickets/1359845.json,,task,count day report 2/10/2021,count day report 2/10/2021,"This is a follow-up to your previous request #778242 ""Progress For specific Dates...""\n\n\nHi Whitney,\n\n\nI just wanted to give you a heads up. I will need our standard count day report of all students and their activities completed on 2/10/2021 (the full 24 hours.) I will need that report as soon as possible Thursday morning 2/11/2021.\n\n\n\nAlso any luck with the custom report request for 19-20 students requesting all time logged-in ever?\n\n\n\n--\n\nKeith Platte\n\nInformation Sys...",normal,closed,whitney.roessel@edgenuity.zendesk.com,367493859734,367493859734,411798400000.0,360002500000.0,360000000000.0,"[\n ""362553068013"",\n ""361387076988""\n]","[\n ""362553068013"",\n ""361387076988""\n]",[],,,False,NaT,"[\n ""am_followup_ticket"",\n ""am_inbound_email"",\n ""am_reactive__district_request"",\n ""am_taylor.olson"",\n ""am_tchpnt_upd"",\n ""amact_data_ssrs_report"",\n ""assign_ps_group"",\n ""did_8683"",\n ""district_administrator"",\n ""lms_help_center"",\n ""product_courseware"",\n ""realm_18"",\n ""standard_school"",\n ""tchpnt_trgr"",\n ""wroessel_zendesk"",\n ""wroessel_zendesk_sent""\n]","{\n ""channel"": ""web"",\n ""source"": {\n ""from"": {\n ""subject"": ""Progress For specific Dates - Report Requested"",\n ""ticket_id"": ""778242""\n },\n ""rel"": ""follow_up"",\n ""to"": null\n }\n}","[\n {\n ""id"": 360053997313,\n ""value"": """"\n },\n {\n ""id"": 360053495554,\n ""value"": """"\n },\n {\n ""id"": 360046824194,\n ""value"": """"\n },\n {\n ""id"": 360052996354,\n ""value"": """"\n },\n {\n ""id"": 360031704834,\n ""value"": """"\n },\n {\n ""id"": 360053998853,\n ""value"": """"\n },\n {\n ""id"": 360033966853,\n ""value"": """"\n },\n {\n ""id"": 360000070407,\n ""value"": """"\n },\n {\n ""id"": 360047024393,\n ""value"": """"\n },\n {\n ""id"": 3...","{\n ""score"": ""unoffered""\n}",[],[],360000483473,6738268,False,True,True,2021-02-09 17:02:10,2021-02-15 17:03:09,47f8b90e10b360dd7bc9d62ce5baf6fa951dff11cbc3f755f402f7b2f1d01766


# Data transformation to extract reviews/comments from raw data
This part should be able to be written as Python UDFs for Snowpark DataFrame
<br>
Here I simply write the transformation process explicitly for simplicity

## Extract comments, reasons and scores in the SATISFACTION_RATING column

In [58]:
rating_dicts = df_zendesk["SATISFACTION_RATING"].apply(json.loads) # Apply json.loads function along each row of the DataFrame(here is PandasSeries)
comments, reasons, scores = rating_dicts.apply(lambda x: x.get('comment', np.nan)), rating_dicts.apply(lambda x: x.get('reason', np.nan)), rating_dicts.apply(lambda x: x.get('score', np.nan))

##  Insert them into the DataFrame

In [59]:
insert_index = df_zendesk.columns.get_loc("SATISFACTION_RATING") + 1
df_zendesk.insert(insert_index, column="SCORE", value = scores)
df_zendesk.insert(insert_index, column="REASON", value = reasons)
df_zendesk.insert(insert_index, column="COMMENT", value = comments)

## Filter rows by comment and score columns

### By comment - drop rows with no comments

In [60]:
# replace an empty string and whitespaces ((spaces, tabs and new lines)) with nan
df_zendesk["COMMENT"].replace(r'^\s*$', np.nan, regex=True, inplace=True)
# Drop rows with no comments
df_zendesk.dropna(subset=['COMMENT'], inplace=True)

### By Score - turns out that all comments have been labeled 
"unoffered" and "offered" labels have no corresponding comments, which means we don't have unlabeled data

In [61]:
unique_scores = list(df_zendesk['SCORE'].unique())
print("The unique values for column SCORE are: {}".format(unique_scores))
# The unique values for column SCORE before droppng rows with no comments are:
# ['unoffered' 'offered' 'good' 'bad']

The unique values for column SCORE are: ['good', 'bad']


In [62]:
# select "good" and "bad" as two labels for sentiment
# "unoffered" and "offered" do not provide any sentiment information
df_zendesk_annotated = df_zendesk.loc[df_zendesk["SCORE"].isin(['good', 'bad'])]

### Drop duplicates

In [63]:
# Drop duplicate comments except the first occurence.
df_zendesk_annotated.drop_duplicates(subset=['COMMENT'], keep='first', inplace=True)

### Reset index

In [64]:
# Reset index
df_zendesk_annotated.reset_index(drop=True, inplace=True)

## Select columns

In [65]:
# Select columns
columns = ['ID', 'TYPE', 'SUBJECT', 'DESCRIPTION', 'PRIORITY', 'COMMENT', 'REASON', 'SCORE', 'CREATED_AT', 'UPDATED_AT']
df_zendesk_annotated = df_zendesk_annotated[columns]

In [66]:
df_zendesk_annotated.info(verbose=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45883 entries, 0 to 45882
Data columns (total 10 columns):
 #   Column       Non-Null Count  Dtype         
---  ------       --------------  -----         
 0   ID           45883 non-null  int32         
 1   TYPE         43061 non-null  object        
 2   SUBJECT      45883 non-null  object        
 3   DESCRIPTION  45883 non-null  object        
 4   PRIORITY     45883 non-null  object        
 5   COMMENT      45883 non-null  object        
 6   REASON       45883 non-null  object        
 7   SCORE        45883 non-null  object        
 8   CREATED_AT   45883 non-null  datetime64[ns]
 9   UPDATED_AT   45883 non-null  datetime64[ns]
dtypes: datetime64[ns](2), int32(1), object(7)
memory usage: 3.3+ MB


## Convert datetime to date datatype for constructing Snowpark dataframe later 
__datetime datatype is too big to convert and time part is not necessary__

In [100]:
df_zendesk_annotated["CREATED_AT"] = df_zendesk_annotated["CREATED_AT"].dt.date
df_zendesk_annotated["UPDATED_AT"] = df_zendesk_annotated["UPDATED_AT"].dt.date

## Turn Pandas DataFrame back to Snowpark DataFrame and save it to a table
$\color{red}{\text{Somehow datatime64[ns] datatype will convert to int64 when save as a Snowflake table, need to figure out how to specify datatype}}$
<br>
<font color='green'>Somehow datatime64[ns] datatype will convert to int64 when save as a Snowflake table, need to figure out how to specify datatype</font>

In [109]:
from snowflake.snowpark.types import IntegerType, StringType, DateType, TimeType, TimestampType, StructType, StructField
schema = StructType([StructField("ID", IntegerType()), StructField("TYPE", StringType()), StructField("SUBJECT", StringType()),\
                     StructField("DESCRIPTION", StringType()), StructField("PRIORITY", StringType()), StructField("COMMENT", StringType()),\
                     StructField("REASON", StringType()), StructField("SCORE", StringType()),\
                     StructField("CREATED_AT", DateType()), StructField("UPDATED_AT", DateType())])
# create_dataframe accepts list or pandas dataframe as value
df = session.create_dataframe(df_zendesk_annotated, schema) 

In [114]:
df.limit(1).toPandas()

Unnamed: 0,ID,TYPE,SUBJECT,DESCRIPTION,PRIORITY,COMMENT,REASON,SCORE,CREATED_AT,UPDATED_AT
0,1359965,question,Chat with Noah Fullbright,"Chat started: 2021-02-09 05:24 PM UTC\nServed by: Tutor Jennifer\n\nIP: 71.0.149.34\nUser Agent: Mozilla/5.0 (X11; CrOS x86_64 13421.102.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.199 Safari/537.36\nCountry: United States\nCity: The Villages\nURL: https://r18.core.learn.edgenuity.com/Player/\n\nChat ID: 2102.1801676.SOXgDUbin4DQN\n\nThe chat transcript will be appended when the agent or visitor leaves the chat.",normal,rude\n,No reason provided,good,2021-02-09,2021-02-13


In [112]:
# To save the contents of a DataFrame to a table
df.write.mode("overwrite").save_as_table("reviews_annotated")

---

======================================================================================================================

# Explore Historical Data
Lets look at the REVIEWS_ANNOTATED table which has the manually annotated sentiment labels for each customer review.

In [115]:
# Create a DataFrame from data in a table
df = session.table("reviews_annotated") # Snowpark DataFrame
df.limit(2).toPandas()

Unnamed: 0,ID,TYPE,SUBJECT,DESCRIPTION,PRIORITY,COMMENT,REASON,SCORE,CREATED_AT,UPDATED_AT
0,1359965,question,Chat with Noah Fullbright,"Chat started: 2021-02-09 05:24 PM UTC\nServed by: Tutor Jennifer\n\nIP: 71.0.149.34\nUser Agent: Mozilla/5.0 (X11; CrOS x86_64 13421.102.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.199 Safari/537.36\nCountry: United States\nCity: The Villages\nURL: https://r18.core.learn.edgenuity.com/Player/\n\nChat ID: 2102.1801676.SOXgDUbin4DQN\n\nThe chat transcript will be appended when the agent or visitor leaves the chat.",normal,rude\n,No reason provided,good,2021-02-09,2021-02-13
1,1360001,,District Admin in Language and Literacy,"Chat started: 2021-02-09 05:40 PM UTC\nServed by: Jacob\n\nIP: 166.127.1.100\nUser Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.146 Safari/537.36\nCountry: United States\nCity: Houston\nURL: https://help.imaginelearning.com/hc/en-us/requests/new\n\nChat ID: 2102.1801676.SOXkJB7HBpUOg\n\nThe chat transcript will be appended when the agent or visitor leaves the chat.",normal,Great customer care. Quick responses.,No reason provided,good,2021-02-09,2021-02-13


In [116]:
df.toPandas().info(verbose=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45883 entries, 0 to 45882
Data columns (total 10 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   ID           45883 non-null  int32 
 1   TYPE         43061 non-null  object
 2   SUBJECT      45883 non-null  object
 3   DESCRIPTION  45883 non-null  object
 4   PRIORITY     45883 non-null  object
 5   COMMENT      45883 non-null  object
 6   REASON       45883 non-null  object
 7   SCORE        45883 non-null  object
 8   CREATED_AT   45883 non-null  object
 9   UPDATED_AT   45883 non-null  object
dtypes: int32(1), object(9)
memory usage: 3.3+ MB


# Feature Engineering and Data Pre-Processing Tasks

### Transform the text-based sentiment into numerical values

In [117]:
# Create a Python UDF to bin the rating to sentiment
@udf(name='convert_rating', session=session, is_permanent=True, replace=True, stage_location='python_load')
def convert_rating(x: str) -> int:
    if x == 'good': return 1
    elif x == 'bad': return -1

In [118]:
df.select('ID', 'COMMENT', 'SCORE', call_udf("convert_rating", col('SCORE')).alias('sentiment_rating')).limit(5).toPandas()

Unnamed: 0,ID,COMMENT,SCORE,SENTIMENT_RATING
0,1359965,rude\n,good,1
1,1360001,Great customer care. Quick responses.,good,1
2,1361695,.,good,1
3,1362382,Thank you for the fast reset! I appreciate you helping me. Have a great day.,good,1
4,1371423,I'm not sure that my question was answered.,good,1


### Stop Word Removal
We want to remove text that is relevant for readers but not for our machine learning algorithm. In English for example, this includes punctuation and articles such as a & the - which are typically referred to as stop words. To do this we create a Python UDF and use the spaCy library to process the review text.

In [119]:
import spacy

session.add_import('@model_data/en_core_web_sm.zip.gz')

@udf(name='remove_stopwords_vect', packages=['spacy==3.3.1', 'cachetools'], session=session, is_permanent=True, replace=True, stage_location='python_load')
def remove_stopwords_vect(raw_text: PandasSeries[str]) -> PandasSeries[str]:
    IMPORT_DIRECTORY_NAME = "snowflake_import_directory"
    import_dir = sys._xoptions[IMPORT_DIRECTORY_NAME]
    input_file = import_dir + 'en_core_web_sm.zip'
    output_dir = '/tmp/en_core_web_sm' + str(os.getpid())
    
    with zipfile.ZipFile(input_file, 'r') as zip_ref:
        zip_ref.extractall(output_dir)
        
#     nlp = spacy.load(output_dir + "/en_core_web_sm/en_core_web_sm-3.3.0")
    nlp = spacy.load(output_dir + "/en_core_web_sm-3.3.0")
    stop_words = nlp.Defaults.stop_words
    
    result = []
    
    for s in raw_text:
        doc = nlp(s)
        text = [str(t.lemma_) for t in doc if 
                t not in stop_words
                and not t.is_punct
                and not t.is_currency
                and t.lemma_ != '-PRON-']
        result.append(' '.join(token.lower() for token in text))
        
    return pandas.Series(result)

In [120]:
df.select('ID', 'COMMENT', 'SCORE', \
          call_udf("remove_stopwords_vect", col('COMMENT')).alias('processed_text')).limit(5).toPandas()

Unnamed: 0,ID,COMMENT,SCORE,PROCESSED_TEXT
0,1359965,rude\n,good,rude \n
1,1360001,Great customer care. Quick responses.,good,great customer care quick response
2,1361695,.,good,
3,1362382,Thank you for the fast reset! I appreciate you helping me. Have a great day.,good,thank you for the fast reset i appreciate you help i have a great day
4,1371423,I'm not sure that my question was answered.,good,i be not sure that my question be answer


--------------------

### Apply the UDFs to process the data
We can execute the functions so that all the processing runs inside Snowflake, and to confirm the sentiment values are converted and stop words have been removed we can create a quick preview of the table using a Snowpark query:

In [121]:
df.select('ID', 'COMMENT',  
     call_udf("remove_stopwords_vect", col('COMMENT')).alias('PROCESSED_TEXT'),\
     'SCORE',\
     call_udf("convert_rating", col('SCORE')).alias('SENTIMENT_RATING')).limit(5).toPandas()

Unnamed: 0,ID,COMMENT,PROCESSED_TEXT,SCORE,SENTIMENT_RATING
0,1359965,rude\n,rude \n,good,1
1,1360001,Great customer care. Quick responses.,great customer care quick response,good,1
2,1361695,.,,good,1
3,1362382,Thank you for the fast reset! I appreciate you helping me. Have a great day.,thank you for the fast reset i appreciate you help i have a great day,good,1
4,1371423,I'm not sure that my question was answered.,i be not sure that my question be answer,good,1


In [128]:
# pre-process 45,000+ rows in reviews_annotated and write it to a new training table
df_processed = df.select('ID', 'TYPE', 'SUBJECT', 'DESCRIPTION', 'PRIORITY',\
                         'REASON', 'COMMENT',\
                         call_udf("remove_stopwords_vect", col('COMMENT')).alias('PROCESSED_TEXT'),\
                         'SCORE',\
                         call_udf("convert_rating", col('SCORE')).alias('SENTIMENT_RATING'),\
                         'CREATED_AT', 'UPDATED_AT')

__remove_stopwords_vect function may produce empty string in the PROCESSED_TEXT column, need to remove the corresponding rows__

In [129]:
training_df = df_processed.filter(col("PROCESSED_TEXT") != "")

In [130]:
# Save the processed dataframe to reviews_training table
training_df.write.mode('overwrite').save_as_table('reviews_training')

---

======================================================================

# Train and Deploy a Sentiment Analysis Model
Let's look at how we are able to execute model training inside Snowflake

### Snowpark code for model training

In [13]:
import numpy as np
import pickle
import sklearn
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer

def train_sentiment_model(session: snowflake.snowpark.Session) -> float:
    # retrieve data from processed table
    df_raw = session.table('REVIEWS_TRAINING')
    
    # split data
    train, test = df_raw.random_split([0.8, 0.2], seed=42)
    train = train.toPandas()
    test = test.toPandas()
    
    # vectorize the review text
    vectorizer = CountVectorizer(token_pattern=r'\b\w+\b')
    train_matrix = vectorizer.fit_transform(train['PROCESSED_TEXT'])
    test_matrix = vectorizer.transform(test['PROCESSED_TEXT'])
    
    #split feature and lable
    x_train = train_matrix
    x_test = test_matrix
    y_train = train['SENTIMENT_RATING']
    y_test = test['SENTIMENT_RATING']
    
    # Regression Model
    lr = LogisticRegression(multi_class='multinomial', max_iter=10000)
    lr.fit(x_train, y_train)
    
    predictions = lr.predict(x_test)
    
#     # Snowflake session seems to treat local system as read-only file system
#     model_file_path = r"C:\Users\an.jiang\src\snowpark_demo\models\sentiment_lr.pickle"
#     vectorizer_file_path = r"C:\Users\an.jiang\src\snowpark_demo\models\vectorizer_lr.pickle"
#     with open(model_file_path,'wb') as fm:
#         pickle.dump(lr, fm)
#         fm.close()
#     with open(vectorizer_file_path,'wb') as fv:
#         pickle.dump(vectorizer, fv)
#         fv.close()
#     # Save vectorized embeddings and model files to stage
#     put_result = session.file.put(model_file_path, '@MODEL_DATA/sentiment_large.pickle', overwrite=True)
#     print("Model file:", put_result[0].status)
#     put_result = session.file.put(vectorizer_file_path, '@MODEL_DATA/vectorizer_large.pickle', overwrite=True)
#     print("Vectorizer:", put_result[0].status)
    
#     # save_file is not defined in the demo!
#     save_file(session, lr, '@MODEL_DATA/sentiment_large.pickle')
#     save_file(session, vectorizer, '@MODEL_DATA/vectorizer_large.pickle')
    
    return accuracy_score(y_test, predictions)

### Deploy model training

In [8]:
# Change to High Memory Warehouse
session.use_warehouse('SUMITHA_RESEARCH_WH')

In [15]:
# Register the Stored Procedure
session.sproc.register(name='train_sentiment_model',
                      func=train_sentiment_model,
                      packages=['snowflake-snowpark-python', 'pandas', 'scikit-learn', 'cachetools'],
                      replace=True,
                      is_permanent=True,
                      stage_location='python_load')

<snowflake.snowpark.stored_procedure.StoredProcedure at 0x23a0917a340>

In [16]:
# Call the stored proc and run the model training
session.call('train_sentiment_model')

0.9163350897946015

We can see the memory used in the training process exceeded 16

In [21]:
# df_log = session.table('raw.training_log').select(
#                 col('timestamp'),
#                 col('memory_usage_gb')).toPandas()

# import seaborn as sns

# sns.set(rc = {'figure.figsize': (25, 8)})
# sns.lineplot(x='TIMESTAMP', y='MEMORY_USAGE_GB', data=df_log)

# Using the trained model - to predict comments with unoffered and offered ratings
Because the current data set has no comments with unoffered and offered tags, we apply pseudo prediction on comments in the training and testing data set for now.

In [5]:
# Create a vectorized UDF to predict sentiment
session.clear_packages()
session.clear_imports()
session.add_import('@MODEL_DATA/sentiment_large.pickle')
session.add_import('@MODEL_DATA/vectorizer_large.pickle')

@udf(name='predict_sentiment_vect', packages=['pandas', 'scikit-learn'],\
     session=session, is_permanent=True, replace=True, stage_location='python_load',\
     max_batch_size=100000, input_types=[PandasSeriesType(StringType())], return_type=PandasSeriesType(VariantType()))
def predict_sentiment_vect(sentiment_str):
    model_file_path = sys._xoptions.get("snowflake_import_directory") + 'sentiment_large.pickle'
    vector_file_path = sys._xoptions.get("snowflake_import_directory") + 'vectorizer_large.pickle'

    with open(model_file_path, 'rb') as fm:
        model = pickle.load(fm)
        
    with open(vector_file_path, 'rb') as fv:
        vectorizer = pickle.load(fv)
        
    result = []
    
    for s in sentiment_str:
        matrix = vectorizer.transform([s])
        
        df= pd.DataFrame(model.predict_proba(matrix), columns=('NEGATIVE', 'POSITIVE'))
        
        response = df.loc[0].to_json()
        result.append(json.loads(response))

    return pandas.Series(result)

### Let's score new reviews that have been pre-processed to get the sentiment scores

In [6]:
session.table('REVIEWS_TRAINING')\
.select(col('ID'),\
        col('TYPE'),\
        col('SUBJECT'),\
        col('DESCRIPTION'),\
        col('PRIORITY'),\
        col('REASON'),\
        col('COMMENT'),\
        col('PROCESSED_TEXT'),\
        call_udf('predict_sentiment_vect', col('PROCESSED_TEXT')).cast(VariantType()).alias('sentiment'),\
        col('SCORE'),\
        col('SENTIMENT_RATING'),\
        col('CREATED_AT'),\
        col('UPDATED_AT'))\
.select(col('ID'),\
        col('TYPE'),\
        col('SUBJECT'),\
        col('DESCRIPTION'),\
        col('PRIORITY'),\
        col('REASON'),\
        col('COMMENT'),\
        col('PROCESSED_TEXT'),\
        col('sentiment')['NEGATIVE'].cast(FloatType()).alias('negative'),\
        col('sentiment')['POSITIVE'].cast(FloatType()).alias('positive'),\
        col('SCORE'),\
        col('SENTIMENT_RATING'),\
        col('CREATED_AT'),\
        col('UPDATED_AT')).limit(2).toPandas()

Unnamed: 0,ID,TYPE,SUBJECT,DESCRIPTION,PRIORITY,REASON,COMMENT,PROCESSED_TEXT,NEGATIVE,POSITIVE,SCORE,SENTIMENT_RATING,CREATED_AT,UPDATED_AT
0,1458262,question,Chat with Markayaise Graham,"Chat started: 2021-04-16 01:33 AM UTC\nServed by: Tutor - Michael\n\nIP: 70.119.249.253\nUser Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.114 Safari/537.36\nCountry: United States\nCity: DeSoto\nURL: https://r23.core.learn.edgenuity.com/Player/\n\nChat ID: 2104.1801676.SUhjWVUfuY8Cq\n\nThe chat transcript will be appended when the agent or visitor leaves the chat.",normal,No reason provided,u suck,u suck,0.793502,0.206498,bad,-1,2021-04-16,2021-04-20
1,1467114,question,Chat with Edward Fry,"Chat started: 2021-04-23 04:00 PM UTC\nServed by: Tutor - Mrs. Misty\n\nIP: 50.44.31.66\nUser Agent: Mozilla/5.0 (X11; CrOS x86_64 13729.84.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.130 Safari/537.36\nCountry: United States\nCity: \nURL: https://r20.core.learn.edgenuity.com/Player/\n\nChat ID: 2104.1801676.SVQBVHpDar5Jp\n\nThe chat transcript will be appended when the agent or visitor leaves the chat.",normal,No reason provided,"she helped with everything, and very smart",she help with everything and very smart,0.010598,0.989402,good,1,2021-04-23,2021-04-27


### Scoring large batch of data and write to a new table

In [7]:
session.table('REVIEWS_TRAINING')\
.select(col('ID'),\
        col('TYPE'),\
        col('SUBJECT'),\
        col('DESCRIPTION'),\
        col('PRIORITY'),\
        col('REASON'),\
        col('COMMENT'),\
        col('PROCESSED_TEXT'),\
        call_udf('predict_sentiment_vect', col('PROCESSED_TEXT')).cast(VariantType()).alias('sentiment'),\
        col('SCORE'),\
        col('SENTIMENT_RATING'),\
        col('CREATED_AT'),\
        col('UPDATED_AT'))\
.select(col('ID'),\
        col('TYPE'),\
        col('SUBJECT'),\
        col('DESCRIPTION'),\
        col('PRIORITY'),\
        col('REASON'),\
        col('COMMENT'),\
        col('PROCESSED_TEXT'),\
        col('sentiment')['NEGATIVE'].cast(FloatType()).alias('negative'),\
        col('sentiment')['POSITIVE'].cast(FloatType()).alias('positive'),\
        col('SCORE'),\
        col('SENTIMENT_RATING'),\
        col('CREATED_AT'),\
        col('UPDATED_AT')).write.saveAsTable('scored_reviews', mode='overwrite')

# Entity extraction

In [4]:
import spacy
session.add_import('@model_data/en_core_web_sm.zip.gz')

# Create a vectorized UDF to extract entities
@udf(name='extract_entity_vect', packages=['pandas', 'spacy'],\
     session=session, is_permanent=True, replace=True, stage_location='python_load',\
     max_batch_size=100000, input_types=[PandasSeriesType(StringType())], return_type=PandasSeriesType(VariantType()))
def extract_entity_vect(sentiment_str):
    IMPORT_DIRECTORY_NAME = "snowflake_import_directory"
    import_dir = sys._xoptions[IMPORT_DIRECTORY_NAME]
    input_file = import_dir + 'en_core_web_sm.zip'
    output_dir = '/tmp/en_core_web_sm' + str(os.getpid())
    
    with zipfile.ZipFile(input_file, 'r') as zip_ref:
        zip_ref.extractall(output_dir)
        
    # load a spaCy model, depending on language, scale, etc.
    nlp = spacy.load(output_dir + "/en_core_web_sm-3.3.0")
    
    res = []
    for text in sentiment_str:
        doc = nlp(text)
        # examine the entities in the document
        entities = [entity.text + ' | ' + entity.label_ for entity in doc.ents]
        res.append(', '.join(entities))    
    return pandas.Series(res)

# Keyphrase extraction

In [5]:
import spacy
from collections import Counter
from string import punctuation
session.add_import('@model_data/en_core_web_sm.zip.gz')

# Create a vectorized UDF to extract keyphrases
@udf(name='extract_keyphrase_vect', packages=['pandas', 'spacy'],\
     session=session, is_permanent=True, replace=True, stage_location='python_load',\
     max_batch_size=100000, input_types=[PandasSeriesType(StringType())], return_type=PandasSeriesType(VariantType()))
def extract_keyphrase_vect(sentiment_str: PandasSeries[str]) -> PandasSeries[str]:
    IMPORT_DIRECTORY_NAME = "snowflake_import_directory"
    import_dir = sys._xoptions[IMPORT_DIRECTORY_NAME]
    input_file = import_dir + 'en_core_web_sm.zip'
    output_dir = '/tmp/en_core_web_sm' + str(os.getpid())
    
    with zipfile.ZipFile(input_file, 'r') as zip_ref:
        zip_ref.extractall(output_dir)
        
    # load a spaCy model, depending on language, scale, etc.
    nlp = spacy.load(output_dir + "/en_core_web_sm-3.3.0")
    
    res = []
    pos_tag = ['PROPN', 'ADJ', 'NOUN']
    for text in sentiment_str:
        words = []
        doc = nlp(text)
        for token in doc:
            if(token.text in nlp.Defaults.stop_words or token.text in punctuation):
                continue
            if(token.pos_ in pos_tag):
                words.append(token.text)
        most_common_list = Counter(words).most_common(5)
        res.append(', '.join([item[0] for item in most_common_list]))   
    return pandas.Series(res)

In [6]:
session.table('SCORED_REVIEWS')\
.select(col('ID'),\
        col('TYPE'),\
        col('SUBJECT'),\
        col('DESCRIPTION'),\
        col('PRIORITY'),\
        col('REASON'),\
        col('COMMENT'),\
        col('PROCESSED_TEXT'),\
        col('NEGATIVE'),\
        col('POSITIVE'),\
        col('SCORE'),\
        col('SENTIMENT_RATING'),\
        call_udf('extract_keyphrase_vect', col('COMMENT')).alias('KEYPHRASE_COMMENT'),\
        call_udf('extract_keyphrase_vect', col('DESCRIPTION')).alias('KEYPHRASE_DESCRIPTION'),\
        call_udf('extract_entity_vect', col('COMMENT')).alias('ENTITY_COMMENT'),\
        call_udf('extract_entity_vect', col('DESCRIPTION')).alias('ENTITY_DESCRIPTION'),\
        col('CREATED_AT'),\
        col('UPDATED_AT')).limit(2).toPandas()

Unnamed: 0,ID,TYPE,SUBJECT,DESCRIPTION,PRIORITY,REASON,COMMENT,PROCESSED_TEXT,NEGATIVE,POSITIVE,SCORE,SENTIMENT_RATING,KEYPHRASE_COMMENT,KEYPHRASE_DESCRIPTION,ENTITY_COMMENT,ENTITY_DESCRIPTION,CREATED_AT,UPDATED_AT
0,1359965,question,Chat with Noah Fullbright,"Chat started: 2021-02-09 05:24 PM UTC\nServed by: Tutor Jennifer\n\nIP: 71.0.149.34\nUser Agent: Mozilla/5.0 (X11; CrOS x86_64 13421.102.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.199 Safari/537.36\nCountry: United States\nCity: The Villages\nURL: https://r18.core.learn.edgenuity.com/Player/\n\nChat ID: 2102.1801676.SOXgDUbin4DQN\n\nThe chat transcript will be appended when the agent or visitor leaves the chat.",normal,No reason provided,rude\n,rude \n,0.844293,0.155707,good,1,"""rude""","""chat, Chat, PM, UTC, Tutor""","""rude | ORG""","""2021-02-09 | DATE, 05:24 PM | TIME, UTC | ORG, Jennifer\n\nIP | PERSON, CrOS x86_64 13421.102.0 | PERSON, AppleWebKit/537.36 | ORG, KHTML | ORG, Gecko | GPE, United States | GPE, Villages | ORG, 2102.1801676.SOXgDUbin4DQN | CARDINAL""",2021-02-09,2021-02-13
1,1360001,,District Admin in Language and Literacy,"Chat started: 2021-02-09 05:40 PM UTC\nServed by: Jacob\n\nIP: 166.127.1.100\nUser Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.146 Safari/537.36\nCountry: United States\nCity: Houston\nURL: https://help.imaginelearning.com/hc/en-us/requests/new\n\nChat ID: 2102.1801676.SOXkJB7HBpUOg\n\nThe chat transcript will be appended when the agent or visitor leaves the chat.",normal,No reason provided,Great customer care. Quick responses.,great customer care quick response,0.000648,0.999352,good,1,"""Great, customer, care, Quick, responses""","""Chat, chat, PM, UTC, Jacob""","""""","""2021-02-09 | DATE, 05:40 PM | TIME, UTC | ORG, Jacob\n\nIP | ORG, Win64 | ORG, KHTML | ORG, Gecko | GPE, United States | GPE, Houston | GPE, 2102.1801676.SOXkJB7HBpUOg | CARDINAL""",2021-02-09,2021-02-13


In [10]:
session.table('SCORED_REVIEWS')\
.select(col('ID'),\
        col('TYPE'),\
        col('SUBJECT'),\
        col('DESCRIPTION'),\
        col('PRIORITY'),\
        col('REASON'),\
        col('COMMENT'),\
        col('PROCESSED_TEXT'),\
        col('NEGATIVE'),\
        col('POSITIVE'),\
        col('SCORE'),\
        col('SENTIMENT_RATING'),\
        call_udf('extract_keyphrase_vect', col('COMMENT')).alias('KEYPHRASE_COMMENT'),\
        call_udf('extract_keyphrase_vect', col('DESCRIPTION')).alias('KEYPHRASE_DESCRIPTION'),\
        call_udf('extract_entity_vect', col('COMMENT')).alias('ENTITY_COMMENT'),\
        call_udf('extract_entity_vect', col('DESCRIPTION')).alias('ENTITY_DESCRIPTION'),\
        col('CREATED_AT'),\
        col('UPDATED_AT')).write.saveAsTable('scored_reviews', mode='overwrite')

Failed to execute query [queryID: None]  CREATE  OR  REPLACE    TABLE  scored_reviews AS  SELECT  *  FROM ( SELECT "ID", "TYPE", "SUBJECT", "DESCRIPTION", "PRIORITY", "REASON", "COMMENT", "PROCESSED_TEXT", "NEGATIVE", "POSITIVE", "SCORE", "SENTIMENT_RATING", extract_keyphrase_vect("COMMENT") AS "KEYPHRASE_COMMENT", extract_keyphrase_vect("DESCRIPTION") AS "KEYPHRASE_DESCRIPTION", extract_entity_vect("COMMENT") AS "ENTITY_COMMENT", extract_entity_vect("DESCRIPTION") AS "ENTITY_DESCRIPTION", "CREATED_AT", "UPDATED_AT" FROM ( SELECT  *  FROM (SCORED_REVIEWS)))
100357 (P0000): UDF available memory exhausted


SnowparkSQLException: (1304): 100357 (P0000): UDF available memory exhausted

======================================================

## UDF available memory exhausted when calling too many udfs at the same time - split calling - <font color='red'>Need a high memory warehouse</font>

In [20]:
session.table('SCORED_REVIEWS')\
.select(col('ID'),\
        col('TYPE'),\
        col('SUBJECT'),\
        col('DESCRIPTION'),\
        col('PRIORITY'),\
        col('REASON'),\
        col('COMMENT'),\
        col('PROCESSED_TEXT'),\
        col('NEGATIVE'),\
        col('POSITIVE'),\
        col('SCORE'),\
        col('SENTIMENT_RATING'),\
        call_udf('extract_keyphrase_vect', col('COMMENT')).alias('KEYPHRASE_COMMENT'),\
        col('CREATED_AT'),\
        col('UPDATED_AT')).write.saveAsTable('scored_reviews', mode='overwrite')

In [15]:
session.table('SCORED_REVIEWS')\
.select(col('ID'),\
        col('TYPE'),\
        col('SUBJECT'),\
        col('DESCRIPTION'),\
        col('PRIORITY'),\
        col('REASON'),\
        col('COMMENT'),\
        col('PROCESSED_TEXT'),\
        col('NEGATIVE'),\
        col('POSITIVE'),\
        col('SCORE'),\
        col('SENTIMENT_RATING'),\
        col('KEYPHRASE_COMMENT'),\
        call_udf('extract_keyphrase_vect', col('DESCRIPTION')).alias('KEYPHRASE_DESCRIPTION'),\
        col('CREATED_AT'),\
        col('UPDATED_AT')).write.saveAsTable('scored_reviews', mode='overwrite')

In [16]:
session.table('SCORED_REVIEWS')\
.select(col('ID'),\
        col('TYPE'),\
        col('SUBJECT'),\
        col('DESCRIPTION'),\
        col('PRIORITY'),\
        col('REASON'),\
        col('COMMENT'),\
        col('PROCESSED_TEXT'),\
        col('NEGATIVE'),\
        col('POSITIVE'),\
        col('SCORE'),\
        col('SENTIMENT_RATING'),\
        col('KEYPHRASE_COMMENT'),\
        col('KEYPHRASE_DESCRIPTION'),\
        call_udf('extract_entity_vect', col('DESCRIPTION')).alias('ENTITY_COMMENT'),\
        col('CREATED_AT'),\
        col('UPDATED_AT')).write.saveAsTable('scored_reviews', mode='overwrite')

In [17]:
session.table('SCORED_REVIEWS')\
.select(col('ID'),\
        col('TYPE'),\
        col('SUBJECT'),\
        col('DESCRIPTION'),\
        col('PRIORITY'),\
        col('REASON'),\
        col('COMMENT'),\
        col('PROCESSED_TEXT'),\
        col('NEGATIVE'),\
        col('POSITIVE'),\
        col('SCORE'),\
        col('SENTIMENT_RATING'),\
        col('KEYPHRASE_COMMENT'),\
        col('KEYPHRASE_DESCRIPTION'),\
        col('ENTITY_COMMENT'),\
        call_udf('extract_entity_vect', col('DESCRIPTION')).alias('ENTITY_DESCRIPTION'),\
        col('CREATED_AT'),\
        col('UPDATED_AT')).write.saveAsTable('scored_reviews', mode='overwrite')

***

================================================================

# SQL queries to all the codes above to an automatic pipeline

In [None]:
# Create a stream on directory table
# create stream unstructured_files_stream on stage unstructured_files;
session.sql("create stream unstructured_files_stream on stage unstructured_files").collect()

In [None]:
# Create a stream on the raw_reviews staging table
# create stream raw_reviews_stg_stream on table raw_reviews_stg;
session.sql("create stream raw_reviews_stg_stream on table raw_reviews_stg").collect()

In [None]:
# Create a task to process the new unstructured files that come in.
# create or replace task read_unstructured_reviews_task warehouse = 'wh_xs' schedule = '1 minute' 
# when system$stream_has_data('unstructured_files_stream')
# as 
# insert into raw_reviews_stg
# select 
#     u.file_url,
#     t.product_id,
#     t.product_review, 
#     t.reviews_date
# from 
#     unstructured_files_stream u
# join 
#     table(read_unstructured_reviews(u.file_url)) t
# where
#     metadata$action = 'INSERT'
# ;

session.sql('''create or replace task read_unstructured_reviews_task warehouse = 'wh_xs' schedule = '1 minute' 
when system$stream_has_data('unstructured_files_stream')
as 
insert into raw_reviews_stg
select 
    u.file_url,
    t.product_id,
    t.product_review, 
    t.reviews_date
from 
    unstructured_files_stream u
join 
    table(read_unstructured_reviews(u.file_url)) t
where
    metadata$action = 'INSERT'
    ''').collect()

In [None]:
# Create a task to process the newly ingested unstructured reviews
# create or replace task score_reviews_task warehouse = 'wh_xs' schedule = '1 minute'
# when system$stream_has_data('raw_reviews_stg_stream')
# as 
# insert into annotated.scored_reviews
# select
#     product_id,
#     product_review,
#     review_date,
#     annotated.predict_sentiment_vect(
#         remove_stopword_vect(product_review)) as sentiment
# from 
#     raw_reviews_stg_stream
# where
#     metadata$action = 'INSERT'
# ;

session.sql('''create or replace task score_reviews_task warehouse = 'wh_xs' schedule = '1 minute'
when system$stream_has_data('raw_reviews_stg_stream')
as 
insert into annotated.scored_reviews
select
    ID,
    COMMENT, --customer reviews on Zendesk ticket
    UPDATED_AT, --review date,
    annotated.predict_sentiment_vect(
        remove_stopword_vect(COMMENT)) as sentiment
from 
    raw_reviews_stg_stream
where
    metadata$action = 'INSERT'
    ''').collect()

In [None]:
# Start the tasks
# alter task read_unstructured_reviews_task resume;
# alter task score_reviews_task resume;
session.sql("alter task read_unstructured_reviews_task resume").collect()
session.sql("alter task score_reviews_task resume").collect()