In [1]:
# !gsutil cp -r gs://volume_ai_1yr_daily_data/* /home/jupyter/lstm/

In [176]:
# !pip install captum

Collecting captum
  Downloading captum-0.7.0-py3-none-any.whl.metadata (26 kB)
Downloading captum-0.7.0-py3-none-any.whl (1.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m6.6 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hInstalling collected packages: captum
Successfully installed captum-0.7.0


## Install Package

In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.model_selection import train_test_split
import torch
import torch.nn as nn
import torch.optim as optim
import tqdm
from plotnine import *
from plotnine.data import *
import random
from sklearn.utils import resample
from torch.utils.data import DataLoader, TensorDataset
from sklearn.metrics import classification_report
from sklearn.metrics import roc_curve, auc
import matplotlib.pyplot as plt
from torch.optim.lr_scheduler import ExponentialLR, OneCycleLR
from captum.attr import IntegratedGradients
from plotnine.data import *
import seaborn as sns
import gc

## Read Data

### Behavior Event data

In [6]:
fea_data = pd.concat([pd.read_csv('/home/jupyter/lstm/data/VolumeAI_nonactive_ec_data_aggregate_1yr.csv')
                , pd.read_csv('/home/jupyter/lstm/data/VolumeAI_nonactive_feature__data_aggregate_1yr.csv')
                , pd.read_csv('/home/jupyter/lstm/data/VolumeAI_nonactive_popup__data_aggregate_1yr.csv')
                ])
fea_data['expired_date'] = fea_data['expired_date'].str[:10]
fea_data.shape

(43407210, 6)

In [3]:
fea_data_202401 = fea_data[fea_data['expired_date']=='2024-01-31']
feature_table_202401 = pd.pivot_table(fea_data_202401, values=['event_'], index=['hashed_sn', 'expired_date', 'event_day_diff'], columns=['event'], aggfunc="max", fill_value=0)

In [4]:
fea_data_202402 = fea_data[fea_data['expired_date']=='2024-02-29']
feature_table_202402 = pd.pivot_table(fea_data_202402, values=['event_'], index=['hashed_sn', 'expired_date', 'event_day_diff'], columns=['event'], aggfunc="max", fill_value=0)

In [5]:
fea_data_202403 = fea_data[fea_data['expired_date']=='2024-03-31']
feature_table_202403 = pd.pivot_table(fea_data_202403, values=['event_'], index=['hashed_sn', 'expired_date', 'event_day_diff'], columns=['event'], aggfunc="max", fill_value=0)

In [7]:
feature_table_202401 = feature_table_202401.reset_index()
feature_table_202402 = feature_table_202402.reset_index()
feature_table_202403 = feature_table_202403.reset_index()

In [8]:
feature_table_202401.columns = [''.join(col).strip() if isinstance(col, tuple) else col for col in feature_table_202401.columns]
feature_table_202402.columns = [''.join(col).strip() if isinstance(col, tuple) else col for col in feature_table_202402.columns]
feature_table_202403.columns = [''.join(col).strip() if isinstance(col, tuple) else col for col in feature_table_202403.columns]

In [9]:
event_day_diff = pd.DataFrame({'event_day_diff': list(range(0, 391, 1))})
sn_202401 = feature_table_202401[['hashed_sn', 'expired_date']].drop_duplicates().reset_index(drop=True)
sn_202401 = sn_202401.merge(event_day_diff, how='cross')

sn_202402 = feature_table_202402[['hashed_sn', 'expired_date']].drop_duplicates().reset_index(drop=True)
sn_202402 = sn_202402.merge(event_day_diff, how='cross')

sn_202403 = feature_table_202403[['hashed_sn', 'expired_date']].drop_duplicates().reset_index(drop=True)
sn_202403 = sn_202403.merge(event_day_diff, how='cross')

In [10]:
feature_table_202401[feature_table_202401.isna().any(axis=1)]
feature_table_202402[feature_table_202402.isna().any(axis=1)]
feature_table_202403[feature_table_202403.isna().any(axis=1)]

Unnamed: 0,hashed_sn,expired_date,event_day_diff,event_DP_introduction-end,event_DP_introduction-start,event_DTP_introduction-end,event_DTP_introduction-start,event_FS_introduction-end,event_FS_introduction-start,event_OpenMonthlyReportPopup,...,event_privacy_scanner_tb,event_pt_scan,event_quickScan,event_schedule_scan,event_secure_erase,event_stopRegularScan,event_updateManually,event_wifi_check,event_wtp,event_wtp_tb


In [13]:
feature_table_202401 = sn_202401.merge(feature_table_202401, how='left', on=['hashed_sn', 'expired_date', 'event_day_diff']).fillna(0)
feature_table_202401.to_csv('/home/jupyter/lstm/temp_table/df_feature_202401.csv', index=False)

In [None]:
gc.collect()

In [15]:
feature_table_202402 = sn_202402.merge(feature_table_202402, how='left', on=['hashed_sn', 'expired_date', 'event_day_diff']).fillna(0)
feature_table_202402.to_csv('/home/jupyter/lstm/temp_table/df_feature_202402.csv', index=False)

0

In [None]:
gc.collect()

In [11]:
feature_table_202403 = sn_202403.merge(feature_table_202403, how='left', on=['hashed_sn', 'expired_date', 'event_day_diff']).fillna(0)
feature_table_202403.to_csv('/home/jupyter/lstm/temp_table/df_feature_202403.csv', index=False)

In [None]:
gc.collect()

### Behavior Feature Selection

In [3]:
df_feature_202401 = pd.read_csv('/home/jupyter/lstm/temp_table/df_feature_202401.csv')

In [11]:
df_feature_202402 = pd.read_csv('/home/jupyter/lstm/temp_table/df_feature_202402.csv')

In [25]:
df_feature_202403 = pd.read_csv('/home/jupyter/lstm/temp_table/df_feature_202403.csv')

In [13]:
drop_columns_fea = ['event_PP_Detection_', 'event_PP_Feature_', 'event_PP_Unknown', ] #'event_pt_scan_dp'

In [14]:
df_feature_202401 = df_feature_202401.drop(columns=drop_columns_fea)
df_feature_202402 = df_feature_202402.drop(columns=drop_columns_fea)
df_feature_202403 = df_feature_202403.drop(columns=drop_columns_fea)

In [15]:
df_feature_202401[df_feature_202401.isna().any(axis=1)]
df_feature_202402[df_feature_202402.isna().any(axis=1)]
df_feature_202403[df_feature_202403.isna().any(axis=1)]

Unnamed: 0,hashed_sn,expired_date,event_day_diff,event_DP_introduction-end,event_DP_introduction-start,event_DTP_introduction-end,event_DTP_introduction-start,event_FS_introduction-end,event_FS_introduction-start,event_OpenMonthlyReportPopup,...,event_pt_scan,event_pt_scan_dp,event_quickScan,event_schedule_scan,event_secure_erase,event_stopRegularScan,event_updateManually,event_wifi_check,event_wtp,event_wtp_tb


In [16]:
df_feature_202401 = feature_table_202401[feature_table_202401['event_day_diff']!=390]
df_feature_202402 = df_feature_202402[df_feature_202402['event_day_diff']!=390]
df_feature_202403 = df_feature_202403[df_feature_202403['event_day_diff']!=390]

In [11]:
feature_table_202401['event_day_diff'] = 390 - feature_table_202401['event_day_diff']
df_feature_202402['event_day_diff'] = 390 - df_feature_202402['event_day_diff']
# df_feature_202403['event_day_diff'] = 390 - df_feature_202403['event_day_diff']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [23]:
df_feature_202401.to_csv('/home/jupyter/lstm/temp_table/df_feature_selected_202401.csv')
df_feature_202402.to_csv('/home/jupyter/lstm/temp_table/df_feature_selected_202402.csv')
df_feature_202403.to_csv('/home/jupyter/lstm/temp_table/df_feature_selected_202403.csv')

### Aggregate days (Import Data monthly from here)

In [62]:
df_202401_feature_selected = pd.read_csv('/home/jupyter/lstm/temp_table/df_feature_selected_202401.csv')

In [2]:
df_202402_feature_selected = pd.read_csv('/home/jupyter/lstm/temp_table/df_feature_selected_202402.csv')

In [11]:
df_202403_feature_selected = pd.read_csv('/home/jupyter/lstm/temp_table/df_feature_selected_202403.csv')

In [64]:
df_feature_selected_202401 = df_202401_feature_selected.drop(columns=['Unnamed: 0'], axis=1)
df_feature_selected_202402 = df_202402_feature_selected.drop(columns=['Unnamed: 0'], axis=1)
df_feature_selected_202403 = df_202403_feature_selected.drop(columns=['Unnamed: 0'], axis=1)

In [14]:
df_feature_selected_202401 = df_feature_selected_202401.sort_values(by=['hashed_sn', 'event_day_diff'])
df_feature_selected_202402 = df_feature_selected_202402.sort_values(by=['hashed_sn', 'event_day_diff'])
df_feature_selected_202403 = df_feature_selected_202403.sort_values(by=['hashed_sn', 'event_day_diff'])

In [15]:
df_feature_selected_202401 = df_feature_selected_202401.reset_index(drop=True)
df_feature_selected_202402 = df_feature_selected_202402.reset_index(drop=True)
df_feature_selected_202403 = df_feature_selected_202403.reset_index(drop=True)

In [6]:
def training_length(data, max_day, min_day):
    c1 = data['event_day_diff']<=max_day
    c2 = data['event_day_diff']>min_day
    data = data[c1 & c2]
    return data

In [16]:
training_df_202401 = training_length(df_feature_selected_202401, 180, 30) # first 6 months: (240, 30); # first 4 months: (180, 30)
training_df_202402 = training_length(df_feature_selected_202402, 180, 30)
training_df_202403 = training_length(df_feature_selected_202403, 180, 30)

In [17]:
training_df_202401 = training_df_202401.reset_index(drop=True)
training_df_202402 = training_df_202402.reset_index(drop=True)
training_df_202403 = training_df_202403.reset_index(drop=True)

In [18]:
training_df_202401.to_csv('/home/jupyter/lstm/temp_table/df_training_202401_first4mon.csv', index=False)
training_df_202402.to_csv('/home/jupyter/lstm/temp_table/df_training_202402_first4mon.csv', index=False)
training_df_202403.to_csv('/home/jupyter/lstm/temp_table/df_training_202403_first4mon.csv', index=False)

In [19]:
training_df_202401 = pd.read_csv('/home/jupyter/lstm/temp_table/df_training_202401_first4mon.csv')
training_df_202402 = pd.read_csv('/home/jupyter/lstm/temp_table/df_training_202402_first4mon.csv')
training_df_202403 = pd.read_csv('/home/jupyter/lstm/temp_table/df_training_202403_first4mon.csv')

In [20]:
def aggregate_data(data, window_size):
    basic_data = data.iloc[:, :1].groupby([np.floor(data.index / window_size), 'hashed_sn']).max() # pd.concat([data.iloc[:, :1], data.iloc[:, 72:]], axis=1)
    aggregated_data = pd.concat([data.iloc[:, :1], data.iloc[:, 3:72]], axis=1).groupby([np.floor(data.index / window_size), 'hashed_sn']).sum()
    aggregated_data = pd.concat([basic_data, aggregated_data], axis=1)
    return aggregated_data.reset_index()

In [25]:
# Aggregate 30 days as an unit
data_30d_202401 = aggregate_data(training_df_202401, 30)
data_30d_202402 = aggregate_data(training_df_202402, 30)
data_30d_202403 = aggregate_data(training_df_202403, 30)

In [27]:
data_30d = pd.concat([data_30d_202401, data_30d_202402, data_30d_202403])

In [28]:
data_30d.to_csv('/home/jupyter/lstm/temp_table/df_fea_30day_fisrt4mon.csv', index=False)

### Account or Boolean Event data

In [37]:
# data_30d = pd.read_csv('/home/jupyter/lstm/temp_table/df_fea_30day_fisrt4mon.csv')

In [29]:
account = pd.read_csv('/home/jupyter/lstm/data/VolumeAI_nonactive_account_data_1yr_202401.csv')
account['expiry_date'] = '2024-01-31'
account_temp = pd.read_csv('/home/jupyter/lstm/data/VolumeAI_nonactive_account_data_1yr_202402.csv')
account_temp['expiry_date'] = '2024-02-29'
account = pd.concat([account, account_temp])
account_temp = pd.read_csv('/home/jupyter/lstm/data/VolumeAI_nonactive_account_data_1yr_202403.csv')
account_temp['expiry_date'] = '2024-03-31'
account = pd.concat([account, account_temp])
account = pd.concat([account, account_temp])
print(account.shape)
print(account.groupby('expiry_date')[['hashed_sn']].nunique())

(87497, 22)
             hashed_sn
expiry_date           
2024-01-31       22019
2024-02-29       19680
2024-03-31       22899


In [30]:
boolfea_data = pd.read_csv('/home/jupyter/lstm/data/VolumeAI_nonactive_feature__data_aggregate_1yr_booldata.csv')

In [31]:
boolfea_data = boolfea_data.drop_duplicates()

In [32]:
boolfea_data['expired_date'] = boolfea_data['expired_date'].str[:10]
boolfea_data = boolfea_data[boolfea_data['event_']==1][['hashed_sn', 'expired_date', 'event', 'event_']]

In [33]:
boolfea_data = pd.pivot_table(boolfea_data, values=['event_'], index=['hashed_sn', 'expired_date'], columns=['event'], aggfunc="max", fill_value=0)

In [34]:
boolfea_data.columns = [''.join(col).strip() if isinstance(col, tuple) else col for col in boolfea_data.columns]

In [35]:
boolfea_data = boolfea_data.reset_index()

In [36]:
boolfea_data

Unnamed: 0,hashed_sn,expired_date,event_add_file_exception,event_add_wifi_exception,event_competitor,event_dtp,event_folder_shield,event_parental_control,event_toolbar_enable
0,0002f1efd1f973f912b77be29e29b3f7411426ee,2024-02-29,0,0,0,0,0,0,1
1,000355f645e165689b9d17a2b812f055f61f4240,2024-03-31,1,0,0,0,0,0,1
2,000571256483bf5b632239e98969c09d8346913b,2024-02-29,0,0,0,0,0,0,1
3,0005fc0eb9649e4a1742ada4c1e0f4f39c7b8c1b,2024-02-29,0,0,0,1,0,0,1
4,00061926d583abffff0fb2e0da5f40d8b3629cfd,2024-02-29,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...
64713,fff4ffa932f99594ab650ab15d317d3676eff397,2024-01-31,0,0,0,1,0,0,1
64714,fff5b1eb180c0874a16d471910d439a90885d016,2024-01-31,1,0,0,0,0,0,1
64715,fff70967b7b7be8676fc77e3b8f699e848613eb4,2024-03-31,0,0,0,0,1,0,1
64716,fff98551ef421f27c68ea688806ceca00a792797,2024-03-31,0,0,0,0,0,0,1


In [37]:
account_bool = account.merge(boolfea_data, how='left', left_on=['hashed_sn', 'expiry_date'], right_on=['hashed_sn', 'expired_date'])
account_bool

Unnamed: 0,consumer_account_id,hashed_sn,license_subscription_days,license_seat_used_number,account_country,account_region,license_is_auto_renew,acct_sn_online,acct_sn_offline,license_previous_payment_method,...,license_is_pss,expiry_date,expired_date,event_add_file_exception,event_add_wifi_exception,event_competitor,event_dtp,event_folder_shield,event_parental_control,event_toolbar_enable
0,11578291,8fbf2a84eebf1eed007f15fcda2ab89d3a75851a,366,2,Global Japan,Japan,0,0,1,gNULL,...,0,2024-01-31,2024-01-31,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,11511083,b3e7d942031c0c2b40c48ed6a2eedc42e1980581,365,2,japan,japan,0,1,0,1,...,1,2024-01-31,2024-01-31,0.0,0.0,0.0,0.0,1.0,0.0,1.0
2,1149464,c658e7a0bab95462d194042ddadb4c475b003c2e,365,3,japan,japan,0,1,0,1,...,1,2024-01-31,2024-01-31,1.0,0.0,0.0,0.0,0.0,0.0,1.0
3,1147268,241f43dfb822fdbcbee47e67157cc8285275a782,365,3,japan,japan,0,0,1,gNULL,...,0,2024-01-31,2024-01-31,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,1147268,d81872a6854c4f4bf217fa3e5744f973a71e2298,365,1,japan,japan,0,0,1,gNULL,...,0,2024-01-31,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
87492,25129204,07747de957ebcfbae8015dac75d4636e44940235,365,3,japan,japan,0,1,0,1,...,0,2024-03-31,2024-03-31,1.0,0.0,1.0,0.0,0.0,0.0,1.0
87493,67864375,981a0c2fe5e2b3df20f22ce697912146ae4996cf,365,3,japan,japan,0,1,0,gNULL,...,0,2024-03-31,2024-03-31,1.0,0.0,0.0,0.0,0.0,0.0,1.0
87494,48657633,2af8bdf7040e832615cfe5ca2142138163e8938a,365,3,japan,japan,0,1,0,2,...,0,2024-03-31,2024-03-31,1.0,0.0,0.0,0.0,0.0,0.0,1.0
87495,67819484,4aafe4d0f104125b1f10707009322c09c4a5a5d1,365,3,japan,japan,0,1,0,gNULL,...,0,2024-03-31,2024-03-31,1.0,0.0,1.0,0.0,0.0,0.0,1.0


In [38]:
user_list = pd.read_csv('/home/jupyter/lstm/data/VolumeAI_nonactive_sn_list_1yr.csv') #.merge(data, left_on=['hashed_sn', 'expiry_date'], right_on=['hashed_sn', 'expired_date'], how='left')
user_list['expiry_date'] = user_list['expiry_date'].str[:10]
expiry_date_list = ['2024-01-31', '2024-02-29', '2024-03-31']
user_list = user_list[user_list['expiry_date'].isin(expiry_date_list)]
print(user_list.shape)
print(user_list.groupby('expiry_date')[['hashed_sn']].nunique())

(64599, 14)
             hashed_sn
expiry_date           
2024-01-31       22020
2024-02-29       19680
2024-03-31       22899


In [39]:
user_df = user_list.merge(account_bool, how='left', on=['hashed_sn', 'expiry_date', 'consumer_account_id'])

In [40]:
user_df = user_df.iloc[:, 1:].drop_duplicates()

In [43]:
# remove no age user (all: 6%, 202401: 6%, 202402: 7%, 202403: 6%)
user_df = user_df[user_df['age'].notna()]

In [45]:
drop_columns_user = ['non_active1', 'non_active2', 'non_active3', 'non_active4', 'non_active5', 'non_active6', 'last_active_date', 'license_seat_used_number_y',
                     'license_renew_count_y', 'expired_date', 'account_country', 'account_region', 'license_pss', 'license_previous_payment_method']
user_df = user_df.drop(columns=drop_columns_user)

In [47]:
user_df[user_df.isna().any(axis=1)]

Unnamed: 0,hashed_sn,expiry_date,license_seat_number,license_seat_used_number_x,subscription_year,license_renew_count_x,license_subscription_days,license_is_auto_renew,acct_sn_online,acct_sn_offline,acct_pay_credit,acct_pay_offcv,acct_pay_banktransfer,acct_pay_webcvs,acct_pay_renewpkg,age,acct_year_with_tm,license_is_trial_convert,license_is_pss,event_add_file_exception,event_add_wifi_exception,event_competitor,event_dtp,event_folder_shield,event_parental_control,event_toolbar_enable
15,cf82125810e403681756d20e86931ab47cc62aad,2024-01-31,3,1,1,1,365.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,35.0,4.0,0.0,1.0,,,,,,,
16,bff5990e40e3497f74f8b9f14fab2c65a35e70a4,2024-01-31,3,2,1,1,365.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,50.0,8.0,0.0,0.0,,,,,,,
24,3e571220bfe84ca67a01d71934437a33a58c4fae,2024-01-31,3,3,1,18,365.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,49.0,17.0,0.0,0.0,,,,,,,
28,0e21a73d61009866004a357acd33e9fd9fee56a4,2024-01-31,3,2,1,1,365.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,40.0,1.0,0.0,0.0,,,,,,,
34,2096aa94fa4a2dfc7995c7b0e81d5a8e70d7d729,2024-01-31,3,1,1,1,365.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,44.0,1.0,1.0,0.0,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
87482,befded3a67585d0c1425621b54a00ab687581485,2024-03-31,3,1,1,0,365.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,19.0,0.0,0.0,0.0,,,,,,,
87484,d3183bcd3a066d6da7237c73eead9d1ef62d28c8,2024-03-31,3,3,1,4,365.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,39.0,13.0,0.0,1.0,,,,,,,
87486,abb37601aa132bab670459e2af6d8b41d8f0e0a0,2024-03-31,3,1,1,5,365.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,40.0,8.0,0.0,0.0,,,,,,,
87490,38d4c16cc44db61ebcbd09dd73da85bbdf7d219b,2024-03-31,3,1,1,1,365.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,22.0,1.0,0.0,1.0,,,,,,,


In [48]:
user_df.iloc[:, -7:] = user_df.iloc[:, -7:].fillna(0)

In [49]:
user_df.shape

(60666, 26)

### Remove duplicate data

In [50]:
sn_202401 = set(data_30d_202401['hashed_sn'].unique())
sn_202403 = set(data_30d_202403['hashed_sn'].unique())
remove_sn = sn_202401 & sn_202403
remove_sn

{'9a38e29e1c68b672d188f5ea9e75c468be714e75',
 'abc6554978d8dbb7e819d63ab2a7fed29cb8b500',
 'e9e62750bd99b1cbfdb4c0f0b55f81c7480d616b'}

In [51]:
data_30d = data_30d[~data_30d['hashed_sn'].isin(remove_sn)]
user_df = user_df[~user_df['hashed_sn'].isin(remove_sn)]

In [52]:
user_df_sn = set(user_df['hashed_sn'].unique())
data_30d_sn = set(data_30d['hashed_sn'].unique())
sn_set = user_df_sn & data_30d_sn

In [53]:
user_df = user_df[user_df['hashed_sn'].isin(sn_set)].reset_index(drop=True)
data_30d = data_30d[data_30d['hashed_sn'].isin(sn_set)].reset_index(drop=True)

In [54]:
data_30d['hashed_sn'].nunique()

55854

In [55]:
user_df['hashed_sn'].nunique()

55854

In [56]:
data_30d = data_30d.reset_index(drop=True)

In [57]:
data_30d_y = data_30d[['event_eclog']]
data_30d = data_30d.drop(columns=['event_eclog', 'level_0'])
data_30d = pd.concat([data_30d, data_30d_y], axis=1)

In [58]:
data_30d[data_30d.isna().any(axis=1)]

Unnamed: 0,hashed_sn,event_DP_introduction-end,event_DP_introduction-start,event_DTP_introduction-end,event_DTP_introduction-start,event_FS_introduction-end,event_FS_introduction-start,event_OpenMonthlyReportPopup,event_PC_introduction-end,event_PC_introduction-start,event_PP_Detection,event_PP_Detection_1,event_PP_Detection_2,event_PP_Detection_3,event_PP_Detection_4,event_PP_Feature,event_PP_Feature_1,event_PP_Feature_2,event_PP_Feature_3,event_PP_Feature_4,event_PP_Info,event_PP_Info_1,event_PP_Info_2,event_PP_Info_3,event_PP_Info_4,event_PP_Product,event_PP_Product_1,event_PP_Product_2,event_PP_Product_3,event_PP_Product_4,event_RTS_ML,event_SE_introduction-end,event_SE_introduction-start,event_VT_introduction-end,event_VT_introduction-start,event_access_security_report,event_ad_block_tb,event_add_mute_mode_program,event_air_support,event_anti_exploit,event_anti_spam,event_bns,event_browser_exploit,event_checkDetailReport,event_checkDetailReportMakeFurtherImprovement,event_customizedScan,event_download_scan,event_dr_scan,event_emailreport,event_firewall_booster,event_folder_shield_th,event_fraud_buster,event_fraud_buster_tb,event_fullScan,event_mail_scan,event_manual_scan,event_openMainConsole,event_pay_guard,event_pay_guard_tb,event_privacy_scanner_tb,event_pt_scan,event_quickScan,event_schedule_scan,event_secure_erase,event_stopRegularScan,event_updateManually,event_wifi_check,event_wtp,event_wtp_tb,event_eclog


In [59]:
data_30d = pd.concat([data_30d.iloc[:, :1], data_30d.iloc[:, 1:].astype(int)], axis=1)

In [60]:
data_30d.to_csv('/home/jupyter/lstm/temp_table/df_fea_30day_first4mon.csv', index=False)

### Standardization, Min-Max Normalization

In [61]:
# Min-Max Scaler
def min_max(data):
    scaler = MinMaxScaler()
    data_fea = data.iloc[:, 1:].astype(int)
    data_fea = pd.DataFrame(scaler.fit_transform(data_fea), columns=data_fea.columns)
    data = pd.concat([data.iloc[:, :1], data_fea], axis=1)
    return data

In [62]:
data_30d_minmaxscaler = min_max(data_30d)

In [63]:
data_30d_y = data_30d_y.reset_index(drop=True)
data_30d_minmaxscaler = pd.concat([data_30d_minmaxscaler, data_30d_y], axis=1)
data_30d_minmaxscaler

Unnamed: 0,hashed_sn,event_DP_introduction-end,event_DP_introduction-start,event_DTP_introduction-end,event_DTP_introduction-start,event_FS_introduction-end,event_FS_introduction-start,event_OpenMonthlyReportPopup,event_PC_introduction-end,event_PC_introduction-start,event_PP_Detection,event_PP_Detection_1,event_PP_Detection_2,event_PP_Detection_3,event_PP_Detection_4,event_PP_Feature,event_PP_Feature_1,event_PP_Feature_2,event_PP_Feature_3,event_PP_Feature_4,event_PP_Info,event_PP_Info_1,event_PP_Info_2,event_PP_Info_3,event_PP_Info_4,event_PP_Product,event_PP_Product_1,event_PP_Product_2,event_PP_Product_3,event_PP_Product_4,event_RTS_ML,event_SE_introduction-end,event_SE_introduction-start,event_VT_introduction-end,event_VT_introduction-start,event_access_security_report,event_ad_block_tb,event_add_mute_mode_program,event_air_support,event_anti_exploit,event_anti_spam,event_bns,event_browser_exploit,event_checkDetailReport,event_checkDetailReportMakeFurtherImprovement,event_customizedScan,event_download_scan,event_dr_scan,event_emailreport,event_firewall_booster,event_folder_shield_th,event_fraud_buster,event_fraud_buster_tb,event_fullScan,event_mail_scan,event_manual_scan,event_openMainConsole,event_pay_guard,event_pay_guard_tb,event_privacy_scanner_tb,event_pt_scan,event_quickScan,event_schedule_scan,event_secure_erase,event_stopRegularScan,event_updateManually,event_wifi_check,event_wtp,event_wtp_tb,event_eclog,event_eclog.1
0,0008b2604e621b61740b50255a6b0eba0c1e139e,0.0,0.0,0.0,0.0,0.0,0.0,0.333333,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.200000,0.000000,0.0,0.0,0.000000,0.233333,0.033333,0.0,0.0,0.066667,0.066667,0.000000,0.0,0.000000,0.033333,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.133333,0.0,0.066667,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0,0.000000,0.0,0.233333,7.0
1,0008b2604e621b61740b50255a6b0eba0c1e139e,0.0,0.0,0.0,0.0,0.0,0.0,0.166667,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.133333,0.000000,0.0,0.0,0.000000,0.200000,0.033333,0.0,0.0,0.000000,0.033333,0.033333,0.0,0.000000,0.033333,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.133333,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.033333,0.0,0.000603,0.0,0.366667,11.0
2,0008b2604e621b61740b50255a6b0eba0c1e139e,0.0,0.0,0.0,0.0,0.0,0.0,0.166667,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.233333,0.000000,0.0,0.0,0.000000,0.233333,0.066667,0.0,0.0,0.033333,0.066667,0.000000,0.0,0.000000,0.033333,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.200000,0.0,0.066667,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.033333,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0,0.000038,0.0,0.400000,12.0
3,0008b2604e621b61740b50255a6b0eba0c1e139e,0.0,0.0,0.0,0.0,0.0,0.0,0.166667,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.233333,0.033333,0.0,0.0,0.033333,0.233333,0.033333,0.0,0.0,0.033333,0.066667,0.033333,0.0,0.033333,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.166667,0.0,0.100000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.038462,0.0,0.0,0.033333,0.0,0.000000,0.0,0.300000,9.0
4,0008b2604e621b61740b50255a6b0eba0c1e139e,0.0,0.0,0.0,0.0,0.0,0.0,0.666667,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.266667,0.000000,0.0,0.0,0.033333,0.333333,0.166667,0.0,0.0,0.033333,0.166667,0.033333,0.0,0.000000,0.033333,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.166667,0.0,0.133333,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.033333,0.0,0.000000,0.0,0.633333,19.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
279265,ffff88e9877ad0e66a46dd2f2396057172b20856,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0,0.000000,0.0,0.833333,25.0
279266,ffff88e9877ad0e66a46dd2f2396057172b20856,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0,0.000000,0.0,0.666667,20.0
279267,ffff88e9877ad0e66a46dd2f2396057172b20856,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0,0.000000,0.0,0.833333,25.0
279268,ffff88e9877ad0e66a46dd2f2396057172b20856,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0,0.000000,0.0,0.833333,25.0


In [64]:
data_30d_minmaxscaler.shape

(279270, 71)

In [65]:
data_30d_minmaxscaler['hashed_sn'].nunique()

55854

In [66]:
len(data_30d_minmaxscaler[data_30d_minmaxscaler.iloc[:, -1]>0])/len(data_30d_minmaxscaler)

0.8463279263794894

In [67]:
# first 4 months
(data_30d_minmaxscaler.shape[0])/data_30d_minmaxscaler['hashed_sn'].nunique()

5.0

### Create X, y dataset

In [68]:
def create_dataset(data, look_back):
    X, y = [], []
    user_ids = data['hashed_sn'].unique()
    for user_id in user_ids:
        user_data = data[data['hashed_sn'] == user_id]        
        for i in range(len(user_data) - look_back): # len(user_data) - 1
            X.append(user_data.iloc[i:i+look_back, 1:-1].values) # [i:i + look_back, :-1]
            y.append(int(user_data.iloc[i+look_back, -1] > 0)) # [i + look_back, -1]
    return np.array(X), np.array(y)

In [69]:
X_30d_minmaxscaler, y_30d_minmaxscaler = create_dataset(data_30d_minmaxscaler, 4) # first 6 months: 6; first 4 months: 4

In [72]:
y_30d_minmaxscaler.sum()/len(y_30d_minmaxscaler)

0.8387939986393096

In [73]:
X_30d_minmaxscaler.shape, y_30d_minmaxscaler.shape

((55854, 4, 69), (55854,))

In [75]:
np.save('/home/jupyter/lstm/training_array/X_30d_minmaxscaler_feaonly_first4mon_202401_03.npy', X_30d_minmaxscaler)
np.save('/home/jupyter/lstm/training_array/y_30d_minmaxscaler_feaonly_first4mon_202401_03.npy', y_30d_minmaxscaler)

#### Bool Event

In [76]:
sn_order = list(data_30d.hashed_sn.unique())
sn_order

['0008b2604e621b61740b50255a6b0eba0c1e139e',
 '001159d7bda5935cead291f3a8f97fa7d9155f90',
 '00162cd75cac32f350b0c42aff2f6d1a9910e059',
 '001631cda48fb0f2c692109cee3c3f045cfe6ab3',
 '001758d7a32b7d42f9fbb0d6cf0cdd35277b9d79',
 '001c641fe43956f7ca2ba3772ae09cb47f1a0ec5',
 '002007b281b4c07fa67c54da3b8a99123a0aa1e3',
 '0020813b072aadc6e5bc74c070df907eb6300e5e',
 '002087d8be96fdc1329f53c8051b5c6937d3fc6a',
 '002381dbf494a3fe8c54fbcef33c1f11cbde4714',
 '0025fa5b17543070f848d4682ba2dbca4625314f',
 '00295897b454b81f9442eae17e45c20e195aa92e',
 '002a892c2470acbbe7c660fe757a496ddb5d2552',
 '002ab48115a9133e260f0175745bc6dda958a3ca',
 '00302ef4173e34f1cc87b44231cfa84bcc723548',
 '0030c0b4a8fe591c5f616f3fe1da65a82756382a',
 '0039f91ec6d667d3aab7decfc094667f81549fe0',
 '003ad52655bce319260d7f94f820c28f2cfbb1c0',
 '003c58c5bb5fb4e4046b948012febe5f2e6f3171',
 '003c6f50ed89c54725261b17b9ef3c046ea93f1f',
 '003e710a65587734491e3ea43e5921ec5649455f',
 '003f5173498e043f9f70f6301a008ede1e332805',
 '0046a94d

In [77]:
user_df['hashed_sn'] = pd.Categorical(user_df['hashed_sn'], sn_order)
user_df = user_df.sort_values(by='hashed_sn')
user_df

Unnamed: 0,hashed_sn,expiry_date,license_seat_number,license_seat_used_number_x,subscription_year,license_renew_count_x,license_subscription_days,license_is_auto_renew,acct_sn_online,acct_sn_offline,acct_pay_credit,acct_pay_offcv,acct_pay_banktransfer,acct_pay_webcvs,acct_pay_renewpkg,age,acct_year_with_tm,license_is_trial_convert,license_is_pss,event_add_file_exception,event_add_wifi_exception,event_competitor,event_dtp,event_folder_shield,event_parental_control,event_toolbar_enable
10380,0008b2604e621b61740b50255a6b0eba0c1e139e,2024-01-31,3,3,1,15,365.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,56.0,11.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0
1472,001159d7bda5935cead291f3a8f97fa7d9155f90,2024-01-31,3,2,1,0,365.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,53.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
14617,00162cd75cac32f350b0c42aff2f6d1a9910e059,2024-01-31,3,1,1,0,365.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,80.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
14442,001631cda48fb0f2c692109cee3c3f045cfe6ab3,2024-01-31,3,2,1,7,365.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,77.0,9.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
16716,001758d7a32b7d42f9fbb0d6cf0cdd35277b9d79,2024-01-31,3,2,1,0,365.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,50.0,9.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
37991,fff2de57211e981ffbcfce7fa210ef12ed3b6eb6,2024-03-31,3,1,1,4,365.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,42.0,11.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
45574,fff395e8759ca55f896f0934e7dd9493feb536d4,2024-03-31,3,3,1,16,365.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,53.0,11.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
36290,fff70967b7b7be8676fc77e3b8f699e848613eb4,2024-03-31,3,1,1,1,365.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,51.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0
36699,fff98551ef421f27c68ea688806ceca00a792797,2024-03-31,3,3,1,2,365.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,55.0,11.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [78]:
user_df = user_df.reset_index(drop=True)

In [83]:
len(np.array(data_30d.iloc[:, -1]))

279270

In [84]:
ec_array = np.array(data_30d.iloc[:, -1]).reshape(user_df.shape[0], 5) # first 4 months: 4 = 5 - 1
ec_array

array([[ 7, 11, 12,  9, 19],
       [24, 24, 23, 24, 26],
       [ 6, 11,  6,  8, 16],
       ...,
       [ 3,  6,  9,  8,  4],
       [28, 30, 29, 30, 28],
       [25, 20, 25, 25, 24]])

In [87]:
ec_array[:, :4]

array([[ 7, 11, 12,  9],
       [24, 24, 23, 24],
       [ 6, 11,  6,  8],
       ...,
       [ 3,  6,  9,  8],
       [28, 30, 29, 30],
       [25, 20, 25, 25]])

In [88]:
first_x_month = 4
sum_array = np.sum(ec_array[:, :first_x_month], axis=1).reshape(user_df.shape[0], 1)
avg_array = np.mean(ec_array[:, :first_x_month], axis=1).reshape(user_df.shape[0], 1)
max_array = np.max(ec_array[:, :first_x_month], axis=1).reshape(user_df.shape[0], 1)
min_array = np.min(ec_array[:, :first_x_month], axis=1).reshape(user_df.shape[0], 1)
std_array = np.std(ec_array[:, :first_x_month], axis=1).reshape(user_df.shape[0], 1)

In [89]:
ec_df = pd.DataFrame(np.array((sum_array, avg_array, max_array, min_array, std_array)).squeeze().T, columns=['ec_sum', 'ec_avg', 'ec_max', 'ec_min', 'ec_std'])
ec_df

Unnamed: 0,ec_sum,ec_avg,ec_max,ec_min,ec_std
0,39.0,9.75,12.0,7.0,1.920286
1,95.0,23.75,24.0,23.0,0.433013
2,31.0,7.75,11.0,6.0,2.046338
3,29.0,7.25,8.0,7.0,0.433013
4,9.0,2.25,5.0,0.0,2.277608
...,...,...,...,...,...
55849,69.0,17.25,24.0,10.0,5.889609
55850,118.0,29.50,30.0,29.0,0.500000
55851,26.0,6.50,9.0,3.0,2.291288
55852,117.0,29.25,30.0,28.0,0.829156


In [90]:
user_df = pd.concat([user_df, ec_df], axis=1)

In [91]:
user_df = user_df.drop(columns=['expiry_date'], axis=1)

In [None]:
user_df.to_csv('/home/jupyter/lstm/temp_table/df_bool_30day_first4mon_orig.csv', drop=True)

In [92]:
user_df_minmaxscaler = min_max(user_df)

In [94]:
user_df_minmaxscaler

Unnamed: 0,hashed_sn,license_seat_number,license_seat_used_number_x,subscription_year,license_renew_count_x,license_subscription_days,license_is_auto_renew,acct_sn_online,acct_sn_offline,acct_pay_credit,acct_pay_offcv,acct_pay_banktransfer,acct_pay_webcvs,acct_pay_renewpkg,age,acct_year_with_tm,license_is_trial_convert,license_is_pss,event_add_file_exception,event_add_wifi_exception,event_competitor,event_dtp,event_folder_shield,event_parental_control,event_toolbar_enable,ec_sum,ec_avg,ec_max,ec_min,ec_std
0,0008b2604e621b61740b50255a6b0eba0c1e139e,0.0,0.222222,0.0,0.681818,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.827697,0.647059,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.325000,0.300000,0.400000,0.233333,0.066667
1,001159d7bda5935cead291f3a8f97fa7d9155f90,0.0,0.111111,0.0,0.000000,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.825282,0.058824,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.791667,0.766667,0.800000,0.766667,0.000000
2,00162cd75cac32f350b0c42aff2f6d1a9910e059,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.847021,0.058824,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.258333,0.233333,0.366667,0.200000,0.133333
3,001631cda48fb0f2c692109cee3c3f045cfe6ab3,0.0,0.111111,0.0,0.318182,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.844605,0.529412,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.241667,0.233333,0.266667,0.233333,0.000000
4,001758d7a32b7d42f9fbb0d6cf0cdd35277b9d79,0.0,0.111111,0.0,0.000000,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.822866,0.529412,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.075000,0.066667,0.166667,0.000000,0.133333
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
55849,fff2de57211e981ffbcfce7fa210ef12ed3b6eb6,0.0,0.000000,0.0,0.181818,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.816425,0.647059,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.575000,0.566667,0.800000,0.333333,0.333333
55850,fff395e8759ca55f896f0934e7dd9493feb536d4,0.0,0.222222,0.0,0.727273,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.825282,0.647059,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.983333,0.966667,1.000000,0.966667,0.000000
55851,fff70967b7b7be8676fc77e3b8f699e848613eb4,0.0,0.000000,0.0,0.045455,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.823671,0.058824,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.216667,0.200000,0.300000,0.100000,0.133333
55852,fff98551ef421f27c68ea688806ceca00a792797,0.0,0.222222,0.0,0.090909,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.826892,0.647059,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.975000,0.966667,1.000000,0.933333,0.000000


In [96]:
user_df_minmaxscaler = np.array(user_df_minmaxscaler.iloc[:, 1:])
user_df_minmaxscaler.shape

(55854, 29)

In [97]:
np.save('/home/jupyter/lstm/training_array/static_array_first4mon_202401_03.npy', user_df_minmaxscaler)

### (Import X, y Data)

In [5]:
X_30d_minmaxscaler = np.load('/home/jupyter/lstm/training_array/X_30d_minmaxscaler_feaonly_first4mon_202401_03.npy')
y_30d_minmaxscaler = np.load('/home/jupyter/lstm/training_array/y_30d_minmaxscaler_feaonly_first4mon_202401_03.npy')

In [28]:
user_df_minmaxscaler = np.load('/home/jupyter/lstm/training_array/static_array_first4mon_202401_03.npy')

## Split Train-Test data

In [6]:
X_train_all, X_test, y_train_all, y_test, fea_train_all, fea_test = train_test_split(X_30d_minmaxscaler, y_30d_minmaxscaler, user_df_minmaxscaler, test_size=0.2, random_state=42)
X_train, X_val, y_train, y_val, fea_train, fea_val = train_test_split(X_train_all, y_train_all, fea_train_all, test_size=0.1, random_state=42)

In [7]:
y_train.sum()/len(y_train)

0.8392102252946735

In [8]:
y_test.sum()/len(y_test)

0.8376152537821144

In [9]:
y_val.sum()/len(y_val)

0.8379950771984784

In [10]:
X_train.shape

(40214, 4, 69)

In [11]:
fea_train.shape

(40214, 29)

In [12]:
y_train.shape

(40214,)

In [13]:
X_train = torch.from_numpy(X_train).float()
y_train = torch.from_numpy(y_train).float().unsqueeze(-1)
fea_train = torch.from_numpy(fea_train).float()
X_test = torch.from_numpy(X_test).float()
y_test = torch.from_numpy(y_test).float().unsqueeze(-1)
fea_test = torch.from_numpy(fea_test).float()
X_val = torch.from_numpy(X_val).float()
y_val = torch.from_numpy(y_val).float().unsqueeze(-1)
fea_val = torch.from_numpy(fea_val).float()