In [3]:
import numpy as np
import torch
# Wide, non-scientific printing in both libraries so long rows stay on one line.
np.set_printoptions(linewidth=400, suppress=True)
torch.set_printoptions(linewidth=400, sci_mode=False)

# Column positions of the label and the total-events count.
# NOTE(review): neither constant is referenced in the visible cells, and the
# slicing code further down reads the label from column 0 and total_events
# from column 1 -- confirm which layout is current.
LABEL_INDEX, TOTAL_EVENTS_INDEX = 1, 2
import boto3

  from .autonotebook import tqdm as notebook_tqdm


In [4]:

# Column-name lists. Neither is referenced by the visible cells.
# Together they hold 8 + 9 = 17 names, the same width as the 17-column feature
# slice taken further down -- presumably these are those features, in this
# order; TODO confirm against the code that wrote the arrays.
SCALED_COLS = [
    'timestamp',
    'time_diff_seconds',
    '30_minute_session_count',
    '5_minute_session_count',
    'task_within_session_count',
    'user_count',
    'project_count',
    'country_count',
]

GENERATED_COLS = [
    'cum_events',
    'cum_projects',
    'cum_time',
    'cum_time_within_session',
    'av_time_across_clicks',
    'av_time_across_clicks_session',
    'rolling_average_tasks_within_session',
    'rolling_av_time_within_session',
    'rolling_time_between_sessions',
]

In [5]:
# Module-level S3 client. Its only visible use is the type inspection in the
# next cell; the NPZExtractor call further down builds its own client.
client = boto3.client('s3')

In [6]:
# Inspect the concrete class boto3 returned for the S3 client.
type(client)

botocore.client.S3

In [7]:
# %load npz_extractor.py
import logging
import os
import zipfile

import boto3
import numpy as np
import torch
from data_module import ClickstreamDataModule
from data_module import ClickstreamDataset
np.set_printoptions(suppress=True, precision=3)
import pdb
class NPZExtractor:
    """Download per-sequence ``.npz`` archives from S3, unpack and load them.

    For every sequence index ``0, 10, 20, ... <= n_sequences`` the extractor
    expects an archive ``<input_path>/files_used_<n_files>/sequence_index_<i>.npz``
    holding a single member ``arr_0.npy``. The archive is downloaded (the same
    relative path is used as the S3 key and as the local file name), the member
    is unpacked to ``.../sequence_index_<i>/arr_0.npy`` and loaded with
    ``np.load``.

    Args:
        input_path: Local root directory; also the prefix of the S3 keys.
        n_files: Only used to build the ``files_used_<n_files>`` directory name.
        n_sequences: Largest sequence index to fetch (inclusive, step of 10).
        s3_client: Object exposing ``download_file(bucket, key, filename)``.
        data_partition: Optional stop index. When set, the returned *list of
            arrays* is truncated to ``[:data_partition]``; rows inside the
            arrays are not sliced.
    """

    _BUCKET = 'dissertation-data-dmiller'
    _MEMBER = 'arr_0.npy'
    _INDEX_STEP = 10

    def __init__(self, input_path, n_files, n_sequences, s3_client, data_partition=None) -> None:
        self.input_path = input_path
        self.n_files = n_files
        self.n_sequences = n_sequences
        self.s3_client = s3_client
        self.data_partition = data_partition

    def _sequence_indices(self):
        """Return the sequence indices to fetch: 0, 10, ... up to ``n_sequences`` inclusive."""
        return range(0, self.n_sequences + 1, self._INDEX_STEP)

    def get_dataset_pointer(self):
        """Ensure every array is available locally and return the loaded arrays.

        Returns:
            A list with one numpy array per sequence index, truncated to
            ``data_partition`` entries when that attribute is set.
        """
        read_path = os.path.join(self.input_path, f'files_used_{self.n_files}')
        if not os.path.exists(read_path):
            print(f'Creating directory: {read_path}')
            os.makedirs(read_path)

        for seq_index in self._sequence_indices():
            key_zip = os.path.join(read_path, f'sequence_index_{seq_index}.npz')
            key_npy = os.path.join(read_path, f'sequence_index_{seq_index}', self._MEMBER)

            print(f'Loading pointer to dataset: {key_npy}: derived from {key_zip}')

            # Only fetch when the unpacked array file is missing.
            if not os.path.isfile(key_npy):
                print(f'Zip file to extract: {key_zip}: npy file to load: {key_npy}')
                self.s3_client.download_file(self._BUCKET, key_zip, key_zip)
                print(f'Zip file downloaded: {key_zip}')
                self._zip_extract(key_zip, key_npy)

        lz_concatenated_results = self._lazy_concatenate()

        if self.data_partition is not None:
            print(f'Returning partition based on {self.data_partition}')
            return lz_concatenated_results[:self.data_partition]

        return lz_concatenated_results

    def _zip_extract(self, key_zip, key_npy):
        """Unpack the array member of ``key_zip`` so that it ends up at ``key_npy``.

        ``ZipFile.extractall`` takes a *directory*, so the member is extracted
        into the parent of ``key_npy``. Passing ``key_npy`` itself would create
        a directory with that name and nest the file one level too deep, where
        the loader cannot find it.
        """
        print(f'Extracting file: {key_zip} -> {key_npy}')

        target_dir = os.path.dirname(key_npy) or '.'
        with zipfile.ZipFile(key_zip, 'r') as zip_ref:
            zip_ref.extractall(path=target_dir, members=[os.path.basename(key_npy)])

        print(f'Zip file extracted: {key_zip} -> {key_npy}')

    def _lazy_concatenate(self):
        """Load every unpacked array and return them as a list.

        Despite the name, each array is read fully into memory by ``np.load``
        and nothing is concatenated here; callers join the list themselves.
        """
        base_dir = os.path.join(self.input_path, f'files_used_{self.n_files}')
        return [
            np.load(os.path.join(base_dir, f'sequence_index_{seq_index}', self._MEMBER))
            for seq_index in self._sequence_indices()
        ]


# extractor = NPZExtractor(
#     'torch_ready_data_4',
#     5,
#     10,
#     boto3.client('s3')
# )

# lz_concatenated_results = extractor.get_dataset_pointer()

In [8]:
def extract_info(tensor, n_sequences):
    """Split flat rows into label, ids and a time-ordered feature sequence.

    Each row is laid out as::

        [label, total_events, user_id, project_id, country,
         17 current-step features,
         (n_sequences - 1) shifted steps of 18 values: project_id + 17 features]

    The current step and the shifted steps are stacked along a new axis 1 and
    then reversed, so the current step ends up in the last position.

    Args:
        tensor: 2-D tensor of shape ``(N, 5 + 17 + 18 * (n_sequences - 1))``.
        n_sequences: Sequence length including the current step.

    Returns:
        dict with keys ``label`` and ``total_events`` (shape ``(N,)``),
        ``user_id``, ``project_id`` and ``country`` (shape ``(N, n_sequences)``)
        and ``features`` (shape ``(N, n_sequences, 17)``). ``user_id`` and
        ``country`` are set to 0 wherever ``project_id`` is 0.
    """
    n_meta, n_features = 5, 17
    shifter_width = n_features + 1  # project id followed by the features

    label, total_events = tensor[:, 0], tensor[:, 1]
    user_id, project_id, country = tensor[:, 2], tensor[:, 3], tensor[:, 4]
    features = tensor[:, n_meta:n_meta + n_features]
    shifters = tensor[:, n_meta + n_features:]

    shifters = torch.reshape(shifters, (shifters.shape[0], n_sequences - 1, shifter_width))
    shifter_project_id, shifter_features = shifters[:, :, 0], shifters[:, :, 1:]

    # Current step first, shifted steps after it, then reverse along axis 1.
    features = torch.flip(
        torch.cat((features.unsqueeze(1), shifter_features), dim=1), dims=[1]
    )
    # shifter_project_id is already (N, n_sequences - 1); only the current
    # step's project id needs the extra axis before concatenating.
    project_id = torch.flip(
        torch.cat((project_id.unsqueeze(1), shifter_project_id), dim=1), dims=[1]
    )
    # One copy per step along axis 1 -> (N, n_sequences).
    user_id = user_id.unsqueeze(1).repeat(1, n_sequences)
    country = country.unsqueeze(1).repeat(1, n_sequences)

    n_rows = tensor.shape[0]
    assert features.shape == (n_rows, n_sequences, n_features)
    assert user_id.shape == project_id.shape == country.shape == (n_rows, n_sequences)

    # Steps whose project id is 0 get their user and country zeroed as well.
    is_padding = project_id == 0
    user_id = user_id.masked_fill(is_padding, 0)
    country = country.masked_fill(is_padding, 0)

    return {
        'label': label,
        'total_events': total_events,
        'user_id': user_id,
        'project_id': project_id,
        'country': country,
        'features': features
    }

In [11]:
# Fetch (if missing) and load the arrays for sequence indices 0 and 10.
extractor = NPZExtractor(
    input_path='torch_ready_data_4',
    n_files=5,
    n_sequences=10,
    s3_client=boto3.client('s3'),
)
lz_concatenated_results = extractor.get_dataset_pointer()

Loading pointer to dataset: torch_ready_data_4/files_used_5/sequence_index_0/arr_0.npy: derived from torch_ready_data_4/files_used_5/sequence_index_0.npz
Loading pointer to dataset: torch_ready_data_4/files_used_5/sequence_index_10/arr_0.npy: derived from torch_ready_data_4/files_used_5/sequence_index_10.npz


In [12]:
# Join the per-index arrays column-wise and convert to a torch tensor in one step.
results = torch.tensor(np.concatenate(lz_concatenated_results, axis=1))

In [14]:
# Full-range slice: every row of `results` is used as the working sample.
sample = results[:]

In [15]:
# Leading scalar columns of every row.
label = sample[:, 0]
total_events = sample[:, 1]
user_id = sample[:, 2]
project_id = sample[:, 3]
country = sample[:, 4]

# 17 current-step features, then everything after them is the shifted steps.
features = sample[:, 5:5 + 17]
shifters = sample[:, 5 + 17:]

# Ten shifted steps of 18 values each: slot 0 is the step's project id,
# the remaining 17 are its features.
shifters = shifters.reshape(shifters.shape[0], 10, 18)
shifter_project_id = shifters[:, :, 0]
shifter_features = shifters[:, :, 1:]

In [16]:
# Insert a length-1 axis at position 1 so the current step can be stacked with
# the shifted steps. Re-running this cell adds yet another axis.
features = features.unsqueeze(1)
user_id = user_id.unsqueeze(1)
project_id = project_id.unsqueeze(1)
country = country.unsqueeze(1)

In [17]:
# Put the current step in front of the shifted steps, then reverse axis 1 so
# the current step ends up last.
features = torch.flip(torch.cat((features, shifter_features), dim=1), dims=[1])

In [18]:
# Copy user and country across all 11 steps; build the per-step project ids the
# same way the features were built (current step first, then reversed).
user_id = user_id.repeat(1, 11)
project = torch.flip(torch.cat((project_id, shifter_project_id), dim=1), dims=[1])
country = country.repeat(1, 11)

In [19]:
# Wherever the step's project id is 0, zero the user and country as well.
user_id = torch.where(project == 0, 0, user_id)
country = torch.where(project == 0, 0, country)


In [21]:
# Inspect the first row: steps whose project id is 0 were zeroed by the previous
# cell, so only steps with a non-zero project id still carry the user id.
user_id[0]

tensor([  0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0., 239.])