In [None]:
import pandas as pd
import matplotlib.pyplot as plt

In [None]:
# Lookup table relating each US state code to a region name.
df_states_regions = pd.read_csv(
    "./examples/components/CreditCardFraud/preprocessing/us_regions.csv"
)

# Flatten the two relevant columns into a plain {StateCode: Region} dictionary.
states_regions = dict(
    zip(df_states_regions["StateCode"], df_states_regions["Region"])
)

df_states_regions.head()

In [None]:
# Show the full StateCode -> Region mapping that is later used to derive the `region` column.
print(states_regions)

In [None]:
# Distinct region names present in the lookup table.
list(df_states_regions['Region'].unique())

In [None]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler

# Columns that `basic_transforms` keeps; every other column of the raw frame is dropped.
# `age` is derived from `dob` inside `basic_transforms`, and `region` is derived from
# `state` before `basic_transforms` is called. `zip` and `is_fraud` are kept but are
# neither encoded nor scaled (they appear in none of the lists below).
useful_props = [
    "amt",
    "age",
    # "cc_num",
    "merch_lat",
    "merch_long",
    "category",
    "region",
    "gender",
    "state",
    "zip",
    "lat",
    "long",
    "city_pop",
    "job",
    # "dob",
    "trans_date_trans_time",
    "is_fraud",
]
# Columns replaced by one-hot indicator columns.
categorical = ["category", "region", "gender", "state", "job"]
# Columns parsed as datetimes and converted to int64 before scaling.
datetimes = ["trans_date_trans_time"]
# Columns standardised with a per-column StandardScaler.
normalize = ["age", "merch_lat", "merch_long", "lat", "long", "city_pop", "trans_date_trans_time", "amt"]

# Caches of fitted transformers, keyed by column name. `basic_transforms` fits an entry
# the first time it sees a column and reuses it on every later call, so the first
# frame passed to `basic_transforms` determines the categories and scaling statistics.
ENCODERS = {}
SCALERS = {}

def basic_transforms(df, reference_time=None):
    """Turn a raw transactions frame into an all-numeric feature frame.

    Steps, in order:
      1. derive ``age`` (whole years) from the ``dob`` column;
      2. keep only the columns listed in ``useful_props``;
      3. replace every column in ``categorical`` by one-hot indicator columns
         named ``<column>_<category>``;
      4. convert every column in ``datetimes`` to its int64 representation;
      5. standardise every column in ``normalize``.

    Encoders and scalers are fitted the first time a column is seen and cached
    in the module-level ``ENCODERS`` / ``SCALERS`` dicts; later calls reuse the
    cached fit. Call this on the training frame first so that the test frame
    is transformed with the training statistics.

    Args:
        df: Raw transactions. Must contain ``dob`` and every name in
            ``useful_props`` other than ``age``. The frame is not modified.
        reference_time: Instant that ages are measured against. Defaults to
            ``pd.Timestamp.now()`` at call time; pass a fixed value to make
            the ``age`` feature reproducible across runs.

    Returns:
        A new DataFrame sharing ``df``'s index.
    """
    # Just so we are always aware of all available columns
    print(df.columns)

    if reference_time is None:
        reference_time = pd.Timestamp.now()
    # Current pandas rejects the ambiguous 'y' Timedelta unit, so the same
    # 365.2425-day year is spelled out explicitly.
    one_year = pd.Timedelta(days=365.2425)
    age = (pd.Timestamp(reference_time) - pd.to_datetime(df["dob"])) // one_year

    # Filter only useful columns. assign() + copy() leave the caller's frame
    # untouched and give us an independent frame that is safe to write into.
    df = df.assign(age=age)[useful_props].copy()

    encoded_frames = []
    for column in categorical:
        column_values = df[column].values.reshape(-1, 1)
        if column not in ENCODERS:
            print(f"Creating encoder for column: {column}")
            # Simply set all zeros if the category is unseen
            ENCODERS[column] = OneHotEncoder(handle_unknown="ignore").fit(column_values)

        encoder = ENCODERS[column]
        # Build the names from `categories_`, which exists in every scikit-learn
        # release, instead of get_feature_names() (removed in scikit-learn 1.2).
        encoded_names = [f"{column}_{category}" for category in encoder.categories_[0]]
        encoded_frames.append(
            pd.DataFrame(
                encoder.transform(column_values).toarray(),
                columns=encoded_names,
                index=df.index,
            )
        )
    # One concat instead of a join per column: original columns first, then the
    # indicator blocks in the order given by `categorical`.
    df = pd.concat([df.drop(columns=categorical), *encoded_frames], axis=1)

    for column in datetimes:
        # astype replaces the deprecated Series.view and yields the same integers.
        df[column] = pd.to_datetime(df[column]).astype("int64")

    for column in normalize:
        column_values = df[column].values.reshape(-1, 1)
        if column not in SCALERS:
            print(f"Creating scaler for column: {column}")
            SCALERS[column] = StandardScaler().fit(column_values)

        # transform() returns an (n, 1) array; flatten it back into a column.
        df[column] = SCALERS[column].transform(column_values).ravel()

    return df

In [None]:
def _load_transactions(csv_path):
    """Read one transactions CSV, attach its region and keep the matching rows.

    Args:
        csv_path: Path of the CSV file; its first column is used as the index.

    Returns:
        An independent DataFrame with an added ``region`` column (looked up
        from ``state`` via ``states_regions``), restricted to rows whose region
        matches the pattern ``.*est.*``.
    """
    frame = pd.read_csv(csv_path, index_col=0)
    frame = frame.assign(region=frame["state"].map(states_regions))
    # A state missing from the lookup yields a NaN region; na=False drops such
    # rows instead of producing a NaN mask that raises on indexing.
    keep = frame["region"].str.match(".*est.*", na=False)
    # Copy so later column writes never target a slice of the raw frame.
    return frame[keep].copy()


# Order matters: the first call fits the cached encoders/scalers, so the
# training split must be transformed before the test split.
df_train = basic_transforms(_load_transactions("fraudTrain.csv"))
df_test = basic_transforms(_load_transactions("fraudTest.csv"))

print(df_train.columns)
df_train.head()

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F

# Prefer the first CUDA device when one is available, otherwise run on the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
batch_size = 10000

# Funnel-shaped MLP: the input takes every column except the `is_fraud` label,
# each hidden layer halves the width, and a sigmoid emits one value per row.
hidden_widths = (512, 256, 128, 64, 32, 16, 8, 4)
layer_inputs = (len(df_train.columns) - 1,) + hidden_widths[:-1]

net = nn.Sequential(
    *(
        module
        for fan_in, fan_out in zip(layer_inputs, hidden_widths)
        for module in (nn.Linear(fan_in, fan_out), nn.ReLU())
    ),
    nn.Linear(hidden_widths[-1], 1),
    nn.Sigmoid(),
).to(device)


# Binary cross-entropy on the sigmoid outputs, optimised with Adam.
optimizer = torch.optim.Adam(net.parameters(), lr=1e-5)
criterion = nn.BCELoss()

In [None]:
class FraudDataset(torch.utils.data.Dataset):
    """Map-style dataset pairing feature rows with optional labels.

    Args:
        feature: Indexable, sized collection of feature rows.
        target: Optional indexable, sized collection of labels, one per
            feature row. When omitted the dataset is unlabeled.

    Raises:
        ValueError: If ``target`` is given and its length differs from the
            length of ``feature``.

    Indexing returns ``(feature_row, label)`` for a labeled dataset and the
    one-element list ``[feature_row]`` for an unlabeled one.
    """

    def __init__(self, feature, target=None):
        # Fail fast: a length mismatch would otherwise surface only as an
        # IndexError deep inside a DataLoader, or silently misalign rows.
        if target is not None and len(target) != len(feature):
            raise ValueError(
                f"feature and target lengths differ: {len(feature)} != {len(target)}"
            )
        self.X = feature
        self.Y = target

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        if self.Y is None:
            return [self.X[idx]]
        return self.X[idx], self.Y[idx]

In [None]:
# Build both datasets the same way: every column except `is_fraud` becomes a
# float feature tensor, and `is_fraud` itself becomes the integer label tensor.
train_dataset, test_dataset = (
    FraudDataset(
        torch.tensor(frame.drop(columns="is_fraud").values, dtype=torch.float),
        torch.tensor(frame["is_fraud"].values, dtype=int),
    )
    for frame in (df_train, df_test)
)

In [None]:
# Reshuffle the training rows every epoch: without it each epoch replays the
# batches in the same file order (which looks chronological - confirm), so
# consecutive gradient steps see correlated, non-representative batches.
train_dataloader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
# The evaluation split keeps a fixed order so repeated runs report comparable windows.
test_dataloader = torch.utils.data.DataLoader(test_dataset, batch_size=batch_size)

In [None]:
from torchmetrics.functional import precision_recall, accuracy

epochs = 5
log_step = 10  # report averaged metrics every `log_step` batches

for epoch in range(epochs):
    for phase in ['train', 'test']:
        print(phase)
        is_train = phase == 'train'
        dataloader = train_dataloader if is_train else test_dataloader

        if is_train:
            net.train()
        else:
            net.eval()

        # Start every phase with an empty window so that leftover sums from the
        # previous phase (or epoch) never leak into this phase's averages.
        running_loss = 0.0
        running_acc = 0.0
        running_rec = 0.0
        running_prec = 0.0
        window_batches = 0

        for i, batch in enumerate(dataloader, start=1):
            data, labels = batch[0].to(device), batch[1].to(device)

            if is_train:
                optimizer.zero_grad()

            # Only build the autograd graph when a backward pass will follow.
            with torch.set_grad_enabled(is_train):
                predictions = net(data)

            # Metrics never need gradients. Flatten the (N, 1) outputs so they
            # line up with the (N,) labels, and accumulate plain floats rather
            # than device tensors.
            with torch.no_grad():
                flat_predictions = predictions.detach().reshape(-1)
                running_acc += float(accuracy(flat_predictions, labels))
                precrec = precision_recall(flat_predictions, labels)
                running_prec += float(precrec[0])
                running_rec += float(precrec[1])
            window_batches += 1

            if is_train:
                cost = criterion(predictions, labels.reshape(-1,1).type(torch.float))
                cost.backward()
                optimizer.step()

                # BCELoss already averages over the batch; dividing by the
                # batch size again would under-report the loss.
                running_loss += cost.item()

            # Report on every full window and once more for the trailing
            # partial window, dividing by the batches actually accumulated.
            if i % log_step == 0 or i == len(dataloader):
                training_acc = running_acc / window_batches
                training_prec = running_prec / window_batches
                training_rec = running_rec / window_batches
                if is_train:
                    training_loss = running_loss / window_batches
                    print(
                        f"Epoch: {epoch}/{epochs}, Iteration: {i}/{len(dataloader)}, Phase: {phase}, Loss: {training_loss}, Accuracy: {training_acc}, Precision: {training_prec}, Recall: {training_rec}"
                    )
                else:
                    print(
                        f"Epoch: {epoch}/{epochs}, Iteration: {i}/{len(dataloader)}, Phase: {phase}, Accuracy: {training_acc}, Precision: {training_prec}, Recall: {training_rec}"
                    )

                running_loss = 0.0
                running_acc = 0.0
                running_prec = 0.0
                running_rec = 0.0
                window_batches = 0