### If running on AWS SageMaker, install the required libraries:

In [13]:
# Uncomment the line below when running on AWS SageMaker to install the dependencies.
# `%pip` (rather than `!pip`) installs into the environment of the running kernel.
# %pip install -r de_requirements.txt

### Import libraries and custom classes

In [1]:
import os
import sys
import pandas as pd
sys.path.append('..')
import lib.snowpark_conn as spn
import lib.snowpark_runner as spr
import lib.file_helper as fh
import lib.aws_helper as aws

from dotenv import load_dotenv
from datetime import datetime

### Get Snowpark username and password

##### If running on AWS SageMaker, get the values from Parameter Store

##### If running locally, get the values from your environment variables

In [2]:
# IF RUNNING ON AWS SageMaker:
# Uncomment this section (and comment out the local section below) to read
# the credentials from Parameter Store instead of environment variables.

# aws_helper = aws.AWSHelper(region = 'us-east-2')
# parameter_names = ['uvarnd-snowpark-username', 'uvarnd-snowpark-password']
# parameters = aws_helper.get_parameters(parameter_names)

# USER = parameters['uvarnd-snowpark-username']
# PASSWORD = parameters['uvarnd-snowpark-password']

# IF RUNNING ON LOCAL:
# Change depending on environment variable names
load_dotenv()

USER = os.getenv('SP_USER')
PASSWORD = os.getenv('PASSWORD')

# os.getenv returns None for an unset variable; fail fast with a clear message
# instead of handing None to the Snowpark connector in the next cell.
_missing_credentials = [
    name
    for name, value in (('SP_USER', USER), ('PASSWORD', PASSWORD))
    if not value
]
if _missing_credentials:
    raise EnvironmentError(
        'Missing environment variable(s): {}. '
        'Set them in your shell or in a .env file.'.format(', '.join(_missing_credentials))
    )

### Create a Snowpark connection using the SnowparkConnector class

In [3]:
# Gather the connection settings in one place, then open the Snowpark connection
_connection_settings = {
    'ACCOUNT': 'hum.us-east-2.aws',
    'USER': USER,
    'PASSWORD': PASSWORD,
    'ROLE': 'RUP',
    'WAREHOUSE': 'UVARND_WH',
    'DATABASE': 'CORE',
    'SCHEMA': 'CLIENT',
}
sp_conn = spn.SnowparkConnector(**_connection_settings)

### Create a SnowparkRunner object

This class contains all the Snowpark queries used for this analysis

In [4]:
# Create Snowpark Runner
# It wraps the connection created above and is used for every query in this notebook.
sp_runner = spr.SnowparkRunner(sp_conn)

### Set threshold

The threshold is the minimum number of events a user must have to be included in the query. For this iteration of the model we are using `16`, which is roughly equivalent to 4 full article reads.

Use `100` as the upper threshold to eliminate outliers.

Limit query to `March 31, 2023` for consistency.

In [5]:
# Minimum number of events; roughly equivalent to 4 full article reads
THRESHOLD = 16
# Upper threshold, used to eliminate outliers
LIMIT_THRESHOLD = 100
# Cut-off date passed to every query below, kept fixed for consistency
END_DATE = '2023-03-31'

### Extract classification dataset

This dataset will be used for the MLP model. All features are based on a user's first **16** events.

Note that this query extracts data from the **start of 2022** up to the `END_DATE` set above.

In [6]:
# Run the classification query with the thresholds and cut-off date defined above
classification_data = sp_runner.query_classification_dataset(THRESHOLD, LIMIT_THRESHOLD, END_DATE)

Querying from Snowpark...
Snowpark query done


In [7]:
# Display the classification dataset for a visual check
classification_data

Unnamed: 0,PROFILE_ID,REACHED_16_EVENTS,RECENT_LAST_EVENT,EVENT_CYCLES_F16,DISTINCT_ARTICLES_F16,PERCENT_GOOGLE_ARTICLES_F16,PERCENT_ARTICLE_CONTENT_F16,AVERAGE_CONTENT_SCORE_F16,DAYS_TO_16_EVENTS,EVENTS,FIRST_EVENT_TIME,EVENT_TIME_16,DISTINCT_DAYS_F16,ARTICLES_PER_EVENT_F16,EVENT_DENSITY_F16
0,--0p74IB28LFeUkXSg6b,1,0,2,2,0.000000,1.000000,0.0,133,16,2022-08-30 14:30:38.171966,2023-01-10 16:43:55.720219,2,0.125000,8.000000
1,--6dYIIB8Tq1gs32cnxN,1,0,1,1,1.000000,1.000000,0.0,7,16,2022-08-02 22:11:48.120981,2022-08-09 16:19:25.990118,4,0.062500,4.000000
2,--7CGYYBRkWobPtw2Qjb,1,1,3,1,1.000000,1.000000,0.0,27,29,2023-02-04 00:10:45.911994,2023-03-03 04:29:56.373198,3,0.062500,5.333333
3,--7egIIBGoM3uBpFKICG,1,0,1,4,1.000000,1.000000,0.0,5,63,2022-08-09 04:30:20.132357,2022-08-14 21:53:19.151456,3,0.250000,5.333333
4,--7xtoIBGoM3uBpFf0FV,1,0,3,2,0.000000,1.000000,0.0,103,16,2022-08-19 16:30:57.064488,2022-11-30 19:41:43.155668,4,0.125000,4.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
135540,zzlKjIIBGoM3uBpFBagQ,1,1,3,3,0.666667,1.000000,0.0,133,40,2022-08-11 09:43:58.953482,2022-12-22 09:37:29.844389,4,0.187500,4.000000
135541,zzpjDYQBEEdskq5qGMuW,1,0,2,1,0.000000,1.000000,0.0,17,28,2022-10-25 04:25:09.614383,2022-11-11 13:42:32.907519,7,0.062500,2.285714
135542,zzrPDIQBEEdskq5qZRER,1,1,5,3,0.333333,1.000000,0.0,129,18,2022-10-25 01:43:50.150416,2023-03-03 07:59:42.533219,4,0.187500,4.000000
135543,zzsQPYQBwWEOklUHCNVJ,1,0,1,0,0.000000,0.000000,0.0,2,56,2022-11-03 10:36:12.009305,2022-11-05 03:42:30.198338,3,0.000000,5.333333


### Extract clustering dataset

This dataset will be used to build the 2 clusters from the k-means model. The query makes use of all of a user's events on the platform.

Note that this query also extracts data from the **start of 2022** up to the `END_DATE` set above.

In [8]:
# Run the clustering query with the same thresholds and cut-off date as the classification query
clustering_data = sp_runner.query_clustering_dataset(THRESHOLD, LIMIT_THRESHOLD, END_DATE)

Querying from Snowpark...
Snowpark query done


In [9]:
# Display the clustering dataset for a visual check
clustering_data

Unnamed: 0,PROFILE_ID,REACHED_16_EVENTS,RECENT_LAST_EVENT,EVENT_CYCLES_ALL,DISTINCT_ARTICLES_ALL,PERCENT_GOOGLE_ARTICLES_ALL,PERCENT_ARTICLE_CONTENT_ALL,AVERAGE_CONTENT_SCORE_ALL,DAYS_TO_16_EVENTS,EVENTS,FIRST_EVENT_TIME,LATEST_EVENT_TIME,DISTINCT_DAYS,ARTICLES_PER_EVENT_ALL,EVENT_DENSITY_ALL
0,--0p74IB28LFeUkXSg6b,1,0,2,2,0.000000,1.000000,0.000000,133,16,2022-08-30 14:30:38.171966,2023-01-10 16:43:55.720219,2,0.125000,8.000000
1,--6dYIIB8Tq1gs32cnxN,1,0,1,1,1.000000,1.000000,0.000000,7,16,2022-08-02 22:11:48.120981,2022-08-09 16:19:25.990118,4,0.062500,4.000000
2,--7CGYYBRkWobPtw2Qjb,1,1,5,1,1.000000,1.000000,0.000000,27,29,2023-02-04 00:10:45.911994,2023-03-30 05:31:48.525783,6,0.034483,4.833333
3,--7egIIBGoM3uBpFKICG,1,0,11,11,1.000000,0.916667,1383.888889,5,63,2022-08-09 04:30:20.132357,2023-01-04 17:26:37.806226,15,0.174603,4.200000
4,--7xtoIBGoM3uBpFf0FV,1,0,3,2,0.000000,1.000000,0.000000,103,16,2022-08-19 16:30:57.064488,2022-11-30 19:41:43.155668,4,0.125000,4.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
135540,zzlKjIIBGoM3uBpFBagQ,1,1,6,5,0.800000,1.000000,0.000000,133,40,2022-08-11 09:43:58.953482,2023-03-25 12:01:41.668617,7,0.125000,5.714286
135541,zzpjDYQBEEdskq5qGMuW,1,0,5,1,0.000000,1.000000,0.000000,17,28,2022-10-25 04:25:09.614383,2023-02-11 00:25:21.186098,12,0.035714,2.333333
135542,zzrPDIQBEEdskq5qZRER,1,1,5,4,0.250000,1.000000,0.000000,129,18,2022-10-25 01:43:50.150416,2023-03-29 09:18:49.188192,5,0.222222,3.600000
135543,zzsQPYQBwWEOklUHCNVJ,1,0,1,0,0.000000,0.000000,0.000000,2,56,2022-11-03 10:36:12.009305,2022-11-08 11:59:11.293372,6,0.000000,9.333333


### Note: The clustering dataset must have the same length as the classification dataset.

### Generate proper file names

In [11]:
# Destination prefix; only used by the optional S3 lines at the bottom of this cell
s3_path = 's3://hum-rnd-data/UVA/'

# NOTE(review): date_today is not referenced by any other visible cell; the
# file names below are keyed on END_DATE instead.
date_today = datetime.now().strftime('%Y-%m-%d')

# Output file names carry the query cut-off date
classification_file_name = f'classification_data_{END_DATE}.csv'
clustering_file_name = f'clustering_data_{END_DATE}.csv'

# If running on AWS SageMaker / Saving to S3
# classification_file_name = s3_path + classification_file_name
# clustering_file_name = s3_path + clustering_file_name


### Save files to CSV

In [12]:
# Both datasets must have the same number of rows; stop before writing
# anything if the two queries returned different lengths.
assert len(classification_data) == len(clustering_data), (
    'Classification and clustering datasets differ in length: {} vs {}'.format(
        len(classification_data), len(clustering_data)
    )
)

# index=False keeps the row index out of the CSV files
classification_data.to_csv(classification_file_name, index=False)
clustering_data.to_csv(clustering_file_name, index=False)

### Another Option: Batch upload CSV files to S3/local to save on memory

#### 1. Get raw Snowpark query results

In [13]:
# Fetch the raw classification query results; they are written out in batches further below
classification_results = sp_runner.query_classification_dataset_raw(THRESHOLD, LIMIT_THRESHOLD, END_DATE)

Querying from Snowpark...
Snowpark query done


In [14]:
# Fetch the raw clustering query results; they are written out in batches further below
clustering_results = sp_runner.query_clustering_dataset_raw(THRESHOLD, LIMIT_THRESHOLD, END_DATE)

Querying from Snowpark...
Snowpark query done


#### 2. Create FileHelper object

In [15]:
# Helper object that provides the batched CSV export used below
file_helper = fh.FileHelper()

#### 3. Batch create CSV files

In [17]:
# File names for the batched export, keyed on the query cut-off date
batched_classification_file_name = 'batched_classification_data_{}.csv'.format(END_DATE)
batched_clustering_file_name = 'batched_clustering_data_{}.csv'.format(END_DATE)

# If running on AWS SageMaker / Saving to S3
# (prefix the batched names themselves, so the batched export does not
# reuse the non-batched file names)
# batched_classification_file_name = s3_path + batched_classification_file_name
# batched_clustering_file_name = s3_path + batched_clustering_file_name

In [18]:
# Write the raw classification results to CSV in batches; a progress line is printed per batch
file_helper.batch_upload_snowpark_as_csv(classification_results, batched_classification_file_name)

Uploaded batch 1
Uploaded batch 2
Uploaded batch 3
Uploaded batch 4
Uploaded batch 5
Uploaded batch 6
Uploaded batch 7
Uploaded batch 8
Uploaded batch 9
Uploaded batch 10
Uploaded batch 11
Uploaded batch 12
Uploaded batch 13
Uploaded batch 14


In [19]:
# Write the raw clustering results to CSV in batches; a progress line is printed per batch
file_helper.batch_upload_snowpark_as_csv(clustering_results, batched_clustering_file_name)

Uploaded batch 1
Uploaded batch 2
Uploaded batch 3
Uploaded batch 4
Uploaded batch 5
Uploaded batch 6
Uploaded batch 7
Uploaded batch 8
Uploaded batch 9
Uploaded batch 10
Uploaded batch 11
Uploaded batch 12
Uploaded batch 13
Uploaded batch 14


#### Check that the files were created correctly

In [20]:
# Read the batched classification file back and display it for a visual check
batched_classification_df = pd.read_csv(batched_classification_file_name)
batched_classification_df

Unnamed: 0,PROFILE_ID,REACHED_16_EVENTS,RECENT_LAST_EVENT,EVENT_CYCLES_F16,DISTINCT_ARTICLES_F16,PERCENT_GOOGLE_ARTICLES_F16,PERCENT_ARTICLE_CONTENT_F16,AVERAGE_CONTENT_SCORE_F16,DAYS_TO_16_EVENTS,EVENTS,FIRST_EVENT_TIME,EVENT_TIME_16,DISTINCT_DAYS_F16,ARTICLES_PER_EVENT_F16,EVENT_DENSITY_F16
0,--0p74IB28LFeUkXSg6b,1,0,2,2,0.000000,1.0,0.0,133,16,2022-08-30 14:30:38.171966,2023-01-10 16:43:55.720219,2,0.1250,8.000000
1,--6dYIIB8Tq1gs32cnxN,1,0,1,1,1.000000,1.0,0.0,7,16,2022-08-02 22:11:48.120981,2022-08-09 16:19:25.990118,4,0.0625,4.000000
2,--7CGYYBRkWobPtw2Qjb,1,1,3,1,1.000000,1.0,0.0,27,29,2023-02-04 00:10:45.911994,2023-03-03 04:29:56.373198,3,0.0625,5.333333
3,--7egIIBGoM3uBpFKICG,1,0,1,4,1.000000,1.0,0.0,5,63,2022-08-09 04:30:20.132357,2022-08-14 21:53:19.151456,3,0.2500,5.333333
4,--7xtoIBGoM3uBpFf0FV,1,0,3,2,0.000000,1.0,0.0,103,16,2022-08-19 16:30:57.064488,2022-11-30 19:41:43.155668,4,0.1250,4.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
135540,zzlKjIIBGoM3uBpFBagQ,1,1,3,3,0.666667,1.0,0.0,133,40,2022-08-11 09:43:58.953482,2022-12-22 09:37:29.844389,4,0.1875,4.000000
135541,zzpjDYQBEEdskq5qGMuW,1,0,2,1,0.000000,1.0,0.0,17,28,2022-10-25 04:25:09.614383,2022-11-11 13:42:32.907519,7,0.0625,2.285714
135542,zzrPDIQBEEdskq5qZRER,1,1,5,3,0.333333,1.0,0.0,129,18,2022-10-25 01:43:50.150416,2023-03-03 07:59:42.533219,4,0.1875,4.000000
135543,zzsQPYQBwWEOklUHCNVJ,1,0,1,0,0.000000,0.0,0.0,2,56,2022-11-03 10:36:12.009305,2022-11-05 03:42:30.198338,3,0.0000,5.333333


In [21]:
# Read the batched clustering file back and display it for a visual check
batched_clustering_df = pd.read_csv(batched_clustering_file_name)
batched_clustering_df

Unnamed: 0,PROFILE_ID,REACHED_16_EVENTS,RECENT_LAST_EVENT,EVENT_CYCLES_ALL,DISTINCT_ARTICLES_ALL,PERCENT_GOOGLE_ARTICLES_ALL,PERCENT_ARTICLE_CONTENT_ALL,AVERAGE_CONTENT_SCORE_ALL,DAYS_TO_16_EVENTS,EVENTS,FIRST_EVENT_TIME,LATEST_EVENT_TIME,DISTINCT_DAYS,ARTICLES_PER_EVENT_ALL,EVENT_DENSITY_ALL
0,--0p74IB28LFeUkXSg6b,1,0,2,2,0.00,1.000000,0.000000,133,16,2022-08-30 14:30:38.171966,2023-01-10 16:43:55.720219,2,0.125000,8.000000
1,--6dYIIB8Tq1gs32cnxN,1,0,1,1,1.00,1.000000,0.000000,7,16,2022-08-02 22:11:48.120981,2022-08-09 16:19:25.990118,4,0.062500,4.000000
2,--7CGYYBRkWobPtw2Qjb,1,1,5,1,1.00,1.000000,0.000000,27,29,2023-02-04 00:10:45.911994,2023-03-30 05:31:48.525783,6,0.034483,4.833333
3,--7egIIBGoM3uBpFKICG,1,0,11,11,1.00,0.916667,1383.888889,5,63,2022-08-09 04:30:20.132357,2023-01-04 17:26:37.806226,15,0.174603,4.200000
4,--7xtoIBGoM3uBpFf0FV,1,0,3,2,0.00,1.000000,0.000000,103,16,2022-08-19 16:30:57.064488,2022-11-30 19:41:43.155668,4,0.125000,4.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
135540,zzlKjIIBGoM3uBpFBagQ,1,1,6,5,0.80,1.000000,0.000000,133,40,2022-08-11 09:43:58.953482,2023-03-25 12:01:41.668617,7,0.125000,5.714286
135541,zzpjDYQBEEdskq5qGMuW,1,0,5,1,0.00,1.000000,0.000000,17,28,2022-10-25 04:25:09.614383,2023-02-11 00:25:21.186098,12,0.035714,2.333333
135542,zzrPDIQBEEdskq5qZRER,1,1,5,4,0.25,1.000000,0.000000,129,18,2022-10-25 01:43:50.150416,2023-03-29 09:18:49.188192,5,0.222222,3.600000
135543,zzsQPYQBwWEOklUHCNVJ,1,0,1,0,0.00,0.000000,0.000000,2,56,2022-11-03 10:36:12.009305,2022-11-08 11:59:11.293372,6,0.000000,9.333333


### Close Snowpark session

In [22]:
# Close the Snowpark session now that all queries and exports are done
sp_conn.close_session()