In [1]:
import os
import numpy as np
import pandas as pd
from textwrap import dedent
import matplotlib.pyplot as plt
import seaborn as sns
from clan_tools.secrets.Vault import Vault
from clan_tools.data_adapters.YTAdapter import YTAdapter
from clan_tools.data_adapters.YQLAdapter import YQLAdapter

In [2]:
# Upper bound on the number of threads numexpr may use.
# NOTE(review): this is assigned after pandas was imported in the previous cell; if numexpr
# reads the variable at import time the setting may arrive too late — confirm it takes effect.
os.environ['NUMEXPR_MAX_THREADS'] = '16'
# Presumably loads the credentials the adapters below rely on (the return value is
# discarded) — TODO confirm against clan_tools.
Vault().get_secrets()
# YT client; not referenced by any cell visible in this part of the notebook.
yt_adapter = YTAdapter()
# YQL client used by the sampling queries in the next cell.
yql_adapter = YQLAdapter()

In [3]:
def week_nums_list(table):
    """Return the distinct ``br_week_num`` values stored in one sample table.

    Args:
        table: Name of the table under
            ``home/cloud_analytics/ml/scoring/trial_to_paid/tables/samples``.

    Returns:
        The values of the query result flattened with ``.values.ravel()``;
        the query itself orders the rows by ``br_week_num``.
    """
    distinct_weeks_query = dedent(f"""
        SELECT DISTINCT br_week_num
        FROM hahn.`home/cloud_analytics/ml/scoring/trial_to_paid/tables/samples/{table}`
        ORDER BY br_week_num
    """)
    weeks_frame = yql_adapter.execute_query(distinct_weeks_query, to_pandas=True)
    return weeks_frame.values.ravel()

def req(week_num, table):
    """Build the YQL text that selects every column of one week of a sample table.

    Args:
        week_num: Value compared against ``br_week_num`` in the WHERE clause.
        table: Name of the table under
            ``home/cloud_analytics/ml/scoring/trial_to_paid/tables/samples``.

    Returns:
        The dedented query string.
    """
    week_query = f"""
        SELECT *
        FROM hahn.`home/cloud_analytics/ml/scoring/trial_to_paid/tables/samples/{table}`
        WHERE br_week_num = {week_num}
    """
    return dedent(week_query)

# Download each split one week at a time and store every chunk as its own pickle
# (Sample/<table>_<week_num>.pkl). Presumably done per week to keep individual
# query results small — TODO confirm.
# Create the output directory first so the cell also works on a fresh checkout;
# to_pickle does not create missing parent directories.
os.makedirs('Sample', exist_ok=True)
for table in ['train', 'test', 'oot']:
    week_nums = week_nums_list(table)
    for week_num in week_nums:
        print(table, week_num)
        df = yql_adapter.execute_query(req(week_num, table), to_pandas=True)
        print('\tShape:', df.shape)
        df.to_pickle(f'Sample/{table}_{week_num}.pkl')

train 0
	Shape: (57914, 144)
train 1
	Shape: (58631, 144)
train 2
	Shape: (59637, 144)
train 3
	Shape: (60569, 144)
train 4
	Shape: (61231, 144)
train 5
	Shape: (61800, 144)
train 6
	Shape: (62593, 144)
train 7
	Shape: (63535, 144)
test 0
	Shape: (24542, 144)
test 1
	Shape: (24868, 144)
test 2
	Shape: (25264, 144)
test 3
	Shape: (25639, 144)
test 4
	Shape: (25924, 144)
test 5
	Shape: (26153, 144)
test 6
	Shape: (26460, 144)
test 7
	Shape: (26837, 144)
oot 8
	Shape: (27211, 144)
oot 9
	Shape: (27688, 144)


In [4]:
# Merge the per-week pickles from Sample/ into a single frame per split and store
# it as Final_sample/<table>.pkl.
# Create the output directory first so the cell also works on a fresh checkout.
os.makedirs('Final_sample', exist_ok=True)
for table in ['train', 'test', 'oot']:
    # Select only "<table>_*.pkl": a plain substring test would also pick up any
    # stray entry whose name merely contains the split name.
    # Sorting makes the row order of the merged frame reproducible, because
    # os.listdir returns entries in arbitrary order. The order is lexicographic,
    # not numeric (e.g. "_10" sorts before "_2").
    week_files = sorted(
        name for name in os.listdir('Sample')
        if name.startswith(f'{table}_') and name.endswith('.pkl')
    )
    if not week_files:
        # pd.concat would raise the same exception type with a less helpful message.
        raise ValueError(f'No per-week pickles for {table!r} found in Sample/')
    # NOTE: read_pickle can execute arbitrary code; only load files produced by
    # the download cell of this notebook.
    frames = [pd.read_pickle(os.path.join('Sample', name)) for name in week_files]
    df = pd.concat(frames, axis=0).reset_index(drop=True)
    print(table, '-', df.shape)
    df.to_pickle(f'Final_sample/{table}.pkl')

train - (485910, 144)
test - (205687, 144)
oot - (54899, 144)
