In [1]:
import sys
sys.path.append('../')
import src.config as config
from src.paths import PREPROCESSED_DATA_DIR

In [2]:
import os
from dotenv import load_dotenv
import pandas as pd
from datetime import datetime
import hopsworks
import tqdm

In [4]:
import sys
sys.path.append('../')
from src.paths import PARENT_DIR, RAW_DATA_DIR, PREPROCESSED_DATA_DIR
from src.data import preprocess_data



In [5]:
# Read the project's .env file into the process environment, then pull out the
# Hopsworks API key. Indexing os.environ raises KeyError if the key is missing.
env_file = PARENT_DIR / '.env'
load_dotenv(env_file)
HOPSWORKS_API_KEY = os.environ['HOPSWORKS_API_KEY']

In [5]:
# Load the raw transactions CSV from the raw-data directory.
raw_csv_path = RAW_DATA_DIR / 'ibm_fraud_cc.csv'
df = pd.read_csv(raw_csv_path)

In [6]:
# Column names handed to preprocess_data through its drop_cols argument.
drop_cols = [
    'card',
    'merchant_city',
    'zip',
]

# Accumulator for the per-year preprocessed frames.
df_trans_per_year = list()

In [7]:
# Years to preprocess. range() excludes its stop value, so with these settings
# the loop covers 2000 through 2005 inclusive.
starting_year = 2000
ending_year = 2006
range_years = range(starting_year, ending_year)

# Reset the accumulator inside this cell so the cell is idempotent: without the
# reset, re-running it would append every year a second time and the later
# pd.concat would contain duplicated rows.
df_trans_per_year = []

for year in tqdm.tqdm(range_years, total=len(range_years), desc='Preprocessing Data Sets per Year'):
    # One preprocessed frame per calendar year.
    # NOTE(review): the unit of time_delta=60 is defined by preprocess_data,
    # which is not visible here — confirm before changing it.
    df_trans_ = preprocess_data(df, year=year, time_delta=60, drop_cols=drop_cols)
    df_trans_per_year.append(df_trans_)

Calculating RFM features: 100%|██████████| 198751/198751 [03:28<00:00, 952.43it/s]
Calculating RFM features: 100%|██████████| 286723/286723 [06:04<00:00, 787.11it/s]
Calculating RFM features: 100%|██████████| 396974/396974 [10:05<00:00, 655.89it/s]
Calculating RFM features: 100%|██████████| 530117/530117 [47:14<00:00, 186.99it/s]
Calculating RFM features: 100%|██████████| 680503/680503 [1:17:04<00:00, 147.14it/s]
Calculating RFM features: 100%|██████████| 855575/855575 [1:13:43<00:00, 193.43it/s]
Preprocessing Data Sets per Year: 100%|██████████| 6/6 [3:44:31<00:00, 2245.29s/it]


In [8]:
# Stack the per-year frames into a single DataFrame. ignore_index is left at its
# default, so each frame keeps its own index labels in the result.
df_trans_full = pd.concat(df_trans_per_year)

In [None]:
# Add a column holding each row's 'full_date' as integer Unix epoch milliseconds.
# NOTE(review): datetime.timestamp treats timezone-naive values as local time, so
# if 'full_date' is naive the result depends on the machine's timezone — confirm.
def _to_epoch_ms(ts):
    """Convert a datetime-like value to whole milliseconds since the Unix epoch."""
    seconds = datetime.timestamp(ts)
    return int(seconds * 1000)


df_trans_full['full_date_unix'] = df_trans_full['full_date'].apply(_to_epoch_ms)

In [39]:
# Remove rows that share the same (user, amount, full_date_unix) combination.
# The preprocessing function does not catch these; per the original author they
# are not relevant for the analysis (no frauds).
# keep=False flags every member of a duplicated group, so all copies are removed,
# not just the extras.
id_cols = ['user','amount','full_date_unix']
is_duplicated = df_trans_full.duplicated(subset=id_cols, keep=False)
df_trans_full = df_trans_full[~is_duplicated]

In [6]:
# Log in to the Hopsworks project named in the config, using the API key read
# from the environment.
login_kwargs = {
    'project': config.HOPSWORKS_PROJECT_NAME,
    'api_key_value': HOPSWORKS_API_KEY,
}
project = hopsworks.login(**login_kwargs)

Connected. Call `.close()` to terminate connection gracefully.

Logged in to project, explore it here https://c.app.hopsworks.ai:443/p/517268


In [7]:
# Get a handle to the feature store of the logged-in project.
feature_store = project.get_feature_store()

Connected. Call `.close()` to terminate connection gracefully.


In [8]:
# Retrieve the feature group with the configured name and version, or define it
# if it does not exist yet. The event-time column is also part of the primary key.
event_time_col = 'full_date_unix'
primary_key_cols = [event_time_col, 'user', 'amount']

feature_group = feature_store.get_or_create_feature_group(
    name=config.FEATURE_GROUP_NAME,
    version=config.FEATURE_GROUP_VERSION,
    description="Transaction data with RFM features at hourly frequency",
    primary_key=primary_key_cols,
    event_time=event_time_col,
)

In [47]:
# Upload the full frame to the feature group. With wait_for_job disabled the call
# is asked not to block on the materialization job it launches.
insert_options = {"wait_for_job": False}
feature_group.insert(df_trans_full, write_options=insert_options)


Feature Group created successfully, explore it at 
https://c.app.hopsworks.ai:443/p/517268/fs/513091/fg/574819


Uploading Dataframe: 0.00% |          | Rows 0/2442744 | Elapsed Time: 00:00 | Remaining Time: ?

Launching job: transactions_with_rfm_feature_group_1_offline_fg_materialization
Job started successfully, you can follow the progress at 
https://c.app.hopsworks.ai/p/517268/jobs/named/transactions_with_rfm_feature_group_1_offline_fg_materialization/executions


(<hsfs.core.job.Job at 0x27395a78730>, None)

In [50]:
# Persist the preprocessed frame locally as Parquet, without writing the index.
output_path = PREPROCESSED_DATA_DIR / 'full_data_2000_2005.parquet'
df_trans_full.to_parquet(output_path, index=False)