### If running on AWS SageMaker, install the required libraries:

In [13]:
# Uncomment the next line to install the dependencies listed in de_requirements.txt
#!pip install -r de_requirements.txt

### Import libraries and custom classes

In [16]:
import os
import sys
import pandas as pd
sys.path.append('..')
import lib.snowpark_conn as spn
import lib.snowpark_runner as spr
import lib.file_helper as fh
import lib.aws_helper as aws

from dotenv import load_dotenv
from datetime import datetime

### Get Snowpark username and password

##### If running on AWS SageMaker, get the values from Parameter Store

##### If running locally, get the values from your environment variables

In [2]:
# IF RUNNING ON AWS SageMaker:
# Uncomment this section (and comment out the local section below) to read
# the credentials from AWS Parameter Store instead of the environment.

# aws_helper = aws.AWSHelper(region = 'us-east-2')
# parameter_names = ['uvarnd-snowpark-username', 'uvarnd-snowpark-password']
# parameters = aws_helper.get_parameters(parameter_names)

# USER = parameters['uvarnd-snowpark-username']
# PASSWORD = parameters['uvarnd-snowpark-password']

# IF RUNNING ON LOCAL:
# Change depending on environment variable names
load_dotenv()

USER = os.getenv('SP_USER')
PASSWORD = os.getenv('PASSWORD')

# os.getenv returns None for an unset variable. Stop here with a clear message
# instead of handing an empty credential to the Snowpark connection. Only the
# names of the missing values are reported, never the values themselves.
_missing_credentials = [
    name for name, value in (('USER', USER), ('PASSWORD', PASSWORD)) if not value
]
if _missing_credentials:
    raise ValueError(
        'Snowpark credential(s) not set: {}'.format(', '.join(_missing_credentials))
    )

### Create a Snowpark connection using the SnowparkConnector class

In [3]:
# Open the Snowpark connection.
# USER and PASSWORD come from the credentials cell; every other setting is
# fixed for this analysis.
sp_conn = spn.SnowparkConnector(
    ACCOUNT='hum.us-east-2.aws',
    USER=USER,
    PASSWORD=PASSWORD,
    ROLE='RUP',
    WAREHOUSE='UVARND_WH',
    DATABASE='CORE',
    SCHEMA='CLIENT',
)

### Create a SnowparkRunner object

This class contains all the Snowpark queries used for this analysis

In [4]:
# Create Snowpark Runner
# It is built on the connection opened earlier; every query in this notebook goes through it
sp_runner = spr.SnowparkRunner(sp_conn)

### Set threshold

The threshold is the minimum number of events a user must have to be included in the query. For this iteration of the model we use `16`, which is roughly equivalent to 4 full article reads.

In [5]:
# Minimum event count per user; passed unchanged to every query in this notebook
THRESHOLD = 16

### Extract classification dataset

This dataset will be used for the MLP model. All features are based on a user's first **16** events.

Note that this query extracts data from the **start of 2022** to the **present**.

In [6]:
# Run the classification query; the runner prints progress messages while it executes
classification_data = sp_runner.query_classification_dataset(THRESHOLD)

Querying from Snowpark...
Snowpark query done


In [7]:
# Display the extracted classification dataset for a visual sanity check
classification_data

Unnamed: 0,PROFILE_ID,REACHED_16_EVENTS,RECENT_LAST_EVENT,EVENT_CYCLES_F16,DISTINCT_ARTICLES_F16,PERCENT_GOOGLE_ARTICLES_F16,PERCENT_ARTICLE_CONTENT_F16,AVERAGE_CONTENT_SCORE_F16,DAYS_TO_16_EVENTS,EVENTS,FIRST_EVENT_TIME,EVENT_TIME_16,DISTINCT_DAYS_F16
0,--oINYIBUgM-564PCT_X,1,0,1,2,1.000000,1.000000,0.00,1,16,2022-07-25 11:04:56.852977,2022-07-26 05:58:49.636661,2
1,--0p74IB28LFeUkXSg6b,1,0,2,2,0.000000,1.000000,0.00,133,16,2022-08-30 14:30:38.171966,2023-01-10 16:43:55.720219,2
2,--7egIIBGoM3uBpFKICG,1,0,1,4,1.000000,1.000000,0.00,5,63,2022-08-09 04:30:20.132357,2022-08-14 21:53:19.151456,3
3,--VwtYUBwWEOklUH0r7u,1,0,3,4,0.750000,1.000000,0.00,18,16,2023-01-15 12:39:08.498457,2023-02-02 16:38:58.476149,4
4,--iEtIMBGoM3uBpFEpsG,1,0,4,3,0.000000,1.000000,0.00,109,16,2022-10-07 22:14:59.731847,2023-01-24 19:39:01.060469,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...
142527,zxtxKYQBwWEOklUHY9No,1,0,1,2,0.000000,1.000000,0.00,0,52,2022-10-30 15:10:08.024318,2022-10-30 15:21:27.509941,1
142528,zyl3_oEBuqp2E9ofVDxM,1,0,3,1,0.000000,1.000000,0.00,18,16,2022-07-14 20:47:22.374402,2022-08-01 15:55:51.194533,4
142529,zw0BoIYBwWEOklUHVSd-,1,0,1,1,0.000000,1.000000,0.00,8,21,2023-03-02 01:48:07.933678,2023-03-10 05:48:57.620802,4
142530,zxd8hoIBGoM3uBpF1MRU,1,0,2,2,0.500000,0.222222,23153.25,64,55,2022-08-10 06:41:45.518444,2022-10-13 23:11:26.568897,2


### Extract clustering dataset

This dataset will be used to build the 2 clusters from the k-means model. The query makes use of all of a user's events on the platform.

Note that this query also extracts data from the **start of 2022** to the **present**.

In [8]:
# Run the clustering query with the same threshold; the runner prints progress messages while it executes
clustering_data = sp_runner.query_clustering_dataset(THRESHOLD)

Querying from Snowpark...
Snowpark query done


In [9]:
# Both datasets must have the same number of rows. Check that here, before
# anything is written to disk, so a mismatch is caught at extraction time.
assert len(clustering_data) == len(classification_data), (
    'Clustering dataset has {} rows but classification dataset has {}'.format(
        len(clustering_data), len(classification_data)
    )
)

# Display the extracted clustering dataset for a visual sanity check
clustering_data

Unnamed: 0,PROFILE_ID,REACHED_16_EVENTS,RECENT_LAST_EVENT,EVENT_CYCLES_ALL,DISTINCT_ARTICLES_ALL,PERCENT_GOOGLE_ARTICLES_ALL,PERCENT_ARTICLE_CONTENT_ALL,AVERAGE_CONTENT_SCORE_ALL,DAYS_TO_16_EVENTS,EVENTS,FIRST_EVENT_TIME,LATEST_EVENT_TIME,DISTINCT_DAYS
0,--oINYIBUgM-564PCT_X,1,0,1,2,1.000000,1.000000,0.0,1,16,2022-07-25 11:04:56.852977,2022-07-26 05:58:49.636661,2
1,--bztoUBwWEOklUHcAu7,1,1,4,6,0.000000,1.000000,0.0,5,32,2023-01-15 19:41:28.062509,2023-03-28 20:16:17.420397,6
2,--fCf4IBGoM3uBpF-O2H,1,0,7,4,0.000000,1.000000,0.0,54,58,2022-08-08 23:21:01.538726,2023-02-11 09:14:34.449250,11
3,--7xtoIBGoM3uBpFf0FV,1,0,3,2,0.000000,1.000000,0.0,103,16,2022-08-19 16:30:57.064488,2022-11-30 19:41:43.155668,4
4,--GZ7IIB28LFeUkXTMrZ,1,0,2,1,0.000000,1.000000,0.0,7,16,2022-08-30 02:34:07.121719,2022-09-06 12:38:50.044481,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...
142527,zoD9rIIBGoM3uBpFuy-3,1,0,7,10,0.700000,1.000000,0.0,9,47,2022-08-17 18:08:06.453231,2022-12-04 10:06:47.979379,11
142528,zqB_KoUBEEdskq5qlRUN,1,1,2,3,0.333333,1.000000,0.0,101,17,2022-12-19 13:08:03.779795,2023-03-30 18:38:47.498104,3
142529,zsE4DIMB28LFeUkXgK-v,1,1,4,1,0.000000,1.000000,0.0,5,32,2022-09-05 05:56:15.066663,2023-03-13 02:54:33.890363,7
142530,zuQDaoMB28LFeUkXDaC1,1,0,4,2,0.500000,0.400000,33615.0,110,19,2022-09-23 11:02:09.489146,2023-01-11 10:03:29.005302,5


### Note: The clustering dataset must have the same length as the classification dataset.

### Generate proper file names

In [10]:
# Destination prefix, only needed when the files are saved to S3
s3_path = 's3://hum-rnd-data/UVA/'

# Stamp both output files with today's date (YYYY-MM-DD)
date_today = datetime.now().strftime('%Y-%m-%d')
classification_file_name, clustering_file_name = (
    name_template.format(date_today)
    for name_template in ('classification_data_{}.csv', 'clustering_data_{}.csv')
)

# If running on AWS SageMaker / Saving to S3
# classification_file_name = s3_path + classification_file_name
# clustering_file_name = s3_path + clustering_file_name


### Save files to CSV

In [12]:
# Write both datasets to CSV without the index column.
# `index` is a boolean option, so pass False explicitly rather than the integer 0.
classification_data.to_csv(classification_file_name, index=False)
clustering_data.to_csv(clustering_file_name, index=False)

### Another Option: Batch upload CSV files to S3/local to save on memory

#### 1. Get raw Snowpark query results

In [6]:
# Raw query result for the batched export, consumed by the FileHelper further down
# NOTE(review): the exact return type is defined in lib.snowpark_runner — confirm there
classification_results = sp_runner.query_classification_dataset_raw(THRESHOLD)

Querying from Snowpark...
Snowpark query done


In [8]:
# Raw query result for the batched export, consumed by the FileHelper further down
# NOTE(review): the exact return type is defined in lib.snowpark_runner — confirm there
clustering_results = sp_runner.query_clustering_dataset_raw(THRESHOLD)

Querying from Snowpark...
Snowpark query done


#### 2. Create FileHelper object

In [9]:
# Helper object whose batch_upload_snowpark_as_csv method writes the raw results out in batches
file_helper = fh.FileHelper()

#### 3. Batch create CSV files

In [11]:
# Date-stamped names for the batched exports; `date_today` comes from the
# "Generate proper file names" cell, so that cell must have been run first
batched_classification_file_name = 'batched_classification_data_{}.csv'.format(date_today)
batched_clustering_file_name = 'batched_clustering_data_{}.csv'.format(date_today)

# If running on AWS SageMaker / Saving to S3
# (prefix the batched names themselves so the batched files keep their own paths)
# batched_classification_file_name = s3_path + batched_classification_file_name
# batched_clustering_file_name = s3_path + batched_clustering_file_name

In [13]:
# Write the classification results batch by batch; one "Uploaded batch N" line is printed per batch
file_helper.batch_upload_snowpark_as_csv(classification_results, batched_classification_file_name)

Uploaded batch 1
Uploaded batch 2
Uploaded batch 3
Uploaded batch 4
Uploaded batch 5
Uploaded batch 6
Uploaded batch 7
Uploaded batch 8
Uploaded batch 9
Uploaded batch 10
Uploaded batch 11
Uploaded batch 12
Uploaded batch 13
Uploaded batch 14
Uploaded batch 15


In [14]:
# Write the clustering results batch by batch; one "Uploaded batch N" line is printed per batch
file_helper.batch_upload_snowpark_as_csv(clustering_results, batched_clustering_file_name)

Uploaded batch 1
Uploaded batch 2
Uploaded batch 3
Uploaded batch 4
Uploaded batch 5
Uploaded batch 6
Uploaded batch 7
Uploaded batch 8
Uploaded batch 9
Uploaded batch 10
Uploaded batch 11
Uploaded batch 12
Uploaded batch 13
Uploaded batch 14
Uploaded batch 15


#### Check that the files were created correctly

In [17]:
# Read the batched classification CSV back in and display it to confirm it was written correctly
batched_classification_df = pd.read_csv(batched_classification_file_name)
batched_classification_df

Unnamed: 0,PROFILE_ID,REACHED_16_EVENTS,RECENT_LAST_EVENT,EVENT_CYCLES_F16,DISTINCT_ARTICLES_F16,PERCENT_GOOGLE_ARTICLES_F16,PERCENT_ARTICLE_CONTENT_F16,AVERAGE_CONTENT_SCORE_F16,DAYS_TO_16_EVENTS,EVENTS,FIRST_EVENT_TIME,EVENT_TIME_16,DISTINCT_DAYS_F16
0,--oINYIBUgM-564PCT_X,1,0,1,2,1.000000,1.000000,0.00,1,16,2022-07-25 11:04:56.852977,2022-07-26 05:58:49.636661,2
1,--0p74IB28LFeUkXSg6b,1,0,2,2,0.000000,1.000000,0.00,133,16,2022-08-30 14:30:38.171966,2023-01-10 16:43:55.720219,2
2,--7egIIBGoM3uBpFKICG,1,0,1,4,1.000000,1.000000,0.00,5,63,2022-08-09 04:30:20.132357,2022-08-14 21:53:19.151456,3
3,--VwtYUBwWEOklUH0r7u,1,0,3,4,0.750000,1.000000,0.00,18,16,2023-01-15 12:39:08.498457,2023-02-02 16:38:58.476149,4
4,--iEtIMBGoM3uBpFEpsG,1,0,4,3,0.000000,1.000000,0.00,109,16,2022-10-07 22:14:59.731847,2023-01-24 19:39:01.060469,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...
142527,zo07B4UBEEdskq5q3i-V,1,1,2,2,0.000000,1.000000,0.00,24,86,2022-12-12 16:47:23.327153,2023-01-05 20:24:06.295754,2
142528,zoD9rIIBGoM3uBpFuy-3,1,0,3,3,0.666667,1.000000,0.00,9,47,2022-08-17 18:08:06.453231,2022-08-26 13:38:21.074052,3
142529,zw0BoIYBwWEOklUHVSd-,1,0,1,1,0.000000,1.000000,0.00,8,21,2023-03-02 01:48:07.933678,2023-03-10 05:48:57.620802,4
142530,zxd8hoIBGoM3uBpF1MRU,1,0,2,2,0.500000,0.222222,23153.25,64,55,2022-08-10 06:41:45.518444,2022-10-13 23:11:26.568897,2


In [18]:
# Read the batched clustering CSV back in and display it to confirm it was written correctly
batched_clustering_df = pd.read_csv(batched_clustering_file_name)
batched_clustering_df

Unnamed: 0,PROFILE_ID,REACHED_16_EVENTS,RECENT_LAST_EVENT,EVENT_CYCLES_ALL,DISTINCT_ARTICLES_ALL,PERCENT_GOOGLE_ARTICLES_ALL,PERCENT_ARTICLE_CONTENT_ALL,AVERAGE_CONTENT_SCORE_ALL,DAYS_TO_16_EVENTS,EVENTS,FIRST_EVENT_TIME,LATEST_EVENT_TIME,DISTINCT_DAYS
0,--oINYIBUgM-564PCT_X,1,0,1,2,1.000000,1.0,0.0,1,16,2022-07-25 11:04:56.852977,2022-07-26 05:58:49.636661,2
1,--bztoUBwWEOklUHcAu7,1,1,4,6,0.000000,1.0,0.0,5,32,2023-01-15 19:41:28.062509,2023-03-28 20:16:17.420397,6
2,--fCf4IBGoM3uBpF-O2H,1,0,7,4,0.000000,1.0,0.0,54,58,2022-08-08 23:21:01.538726,2023-02-11 09:14:34.449250,11
3,--7xtoIBGoM3uBpFf0FV,1,0,3,2,0.000000,1.0,0.0,103,16,2022-08-19 16:30:57.064488,2022-11-30 19:41:43.155668,4
4,--GZ7IIB28LFeUkXTMrZ,1,0,2,1,0.000000,1.0,0.0,7,16,2022-08-30 02:34:07.121719,2022-09-06 12:38:50.044481,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...
142527,zoD9rIIBGoM3uBpFuy-3,1,0,7,10,0.700000,1.0,0.0,9,47,2022-08-17 18:08:06.453231,2022-12-04 10:06:47.979379,11
142528,zqB_KoUBEEdskq5qlRUN,1,1,2,3,0.333333,1.0,0.0,101,17,2022-12-19 13:08:03.779795,2023-03-30 18:38:47.498104,3
142529,zsE4DIMB28LFeUkXgK-v,1,1,4,1,0.000000,1.0,0.0,5,32,2022-09-05 05:56:15.066663,2023-03-13 02:54:33.890363,7
142530,zuQDaoMB28LFeUkXDaC1,1,0,4,2,0.500000,0.4,33615.0,110,19,2022-09-23 11:02:09.489146,2023-01-11 10:03:29.005302,5


### Close Snowpark session

In [19]:
# Close the Snowpark session now that all queries and exports are finished
sp_conn.close_session()