In [184]:
import pandas as pd
import matplotlib.pyplot as plt

In [185]:
df_states_regions = pd.read_csv("./examples/components/CreditCardFraud/preprocessing/us_regions.csv")
states_regions = {row.StateCode: row.Region for row in df_states_regions.itertuples()}

df_states_regions.head()

Unnamed: 0,State,StateCode,Region,Division
0,Alaska,AK,West,Pacific
1,Alabama,AL,South,East South Central
2,Arkansas,AR,South,West South Central
3,Arizona,AZ,West,Mountain
4,California,CA,West,Pacific


In [186]:
list(df_states_regions['Region'].unique())

['West', 'South', 'Northeast', 'Midwest']

In [216]:
useful_props = ['amt','cc_num', 'merch_lat', 'merch_long', 'category', 'gender', 'state', 'zip', 'lat', 'long', 'city_pop', 'job', 'dob', 'trans_date_trans_time', 'is_fraud']
categorical = ['category', 'gender', 'region', 'state', 'job']
datetimes = ['dob', 'trans_date_trans_time']
normalize = ['dob', 'age']

def basic_transforms(df):
    # Just so we are always aware of all available columns
    print(df.columns)

    # Filter only useful columns
    df = df[useful_props]

    # df["full_name"] = df["first"] + " " + df["last"]
    df.loc[:, 'region'] = df['state'].map(states_regions)
    df.loc[:, 'age'] = (pd.Timestamp.now() - pd.to_datetime(df['dob'])) // pd.Timedelta('1y')
    for column in categorical:
        df = df.join(pd.get_dummies(df[column], prefix=column)).drop(column, axis=1)

    for column in datetimes:
        df.loc[:, column] = pd.to_datetime(df[column]).view('int64')
    for column in normalize:
        df.loc[:, column] = (df[column] - df[column].min())/(df[column].max() - df[column].min())

    return df

In [217]:
df_train = pd.read_csv("fraudTrain.csv", index_col=0)
df_test = pd.read_csv("fraudTest.csv", index_col=0)

df_train = basic_transforms(df_train)
df_test = basic_transforms(df_test)

print(df_train.columns)
df_train.head()

Index(['trans_date_trans_time', 'cc_num', 'merchant', 'category', 'amt',
       'first', 'last', 'gender', 'street', 'city', 'state', 'zip', 'lat',
       'long', 'city_pop', 'job', 'dob', 'trans_num', 'unix_time', 'merch_lat',
       'merch_long', 'is_fraud'],
      dtype='object')


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = _infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  isetter(ilocs[0], value)


Index(['trans_date_trans_time', 'cc_num', 'merchant', 'category', 'amt',
       'first', 'last', 'gender', 'street', 'city', 'state', 'zip', 'lat',
       'long', 'city_pop', 'job', 'dob', 'trans_num', 'unix_time', 'merch_lat',
       'merch_long', 'is_fraud'],
      dtype='object')
Index(['amt', 'cc_num', 'merch_lat', 'merch_long', 'zip', 'lat', 'long',
       'city_pop', 'dob', 'trans_date_trans_time',
       ...
       'job_Visual merchandiser', 'job_Volunteer coordinator',
       'job_Warden/ranger', 'job_Warehouse manager',
       'job_Waste management officer', 'job_Water engineer',
       'job_Water quality scientist', 'job_Web designer',
       'job_Wellsite geologist', 'job_Writer'],
      dtype='object', length=577)


Unnamed: 0,amt,cc_num,merch_lat,merch_long,zip,lat,long,city_pop,dob,trans_date_trans_time,...,job_Visual merchandiser,job_Volunteer coordinator,job_Warden/ranger,job_Warehouse manager,job_Waste management officer,job_Water engineer,job_Water quality scientist,job_Web designer,job_Wellsite geologist,job_Writer
0,4.97,2703186189652095,36.011293,-82.048315,28654,36.0788,-81.1781,3495,0.789499,1546300818000000000,...,0,0,0,0,0,0,0,0,0,0
1,107.23,630423337322,49.159047,-118.186462,99160,48.8878,-118.2105,149,0.668418,1546300844000000000,...,0,0,0,0,0,0,0,0,0,0
2,220.11,38859492057661,43.150704,-112.154481,83252,42.1808,-112.262,4154,0.463819,1546300851000000000,...,0,0,0,0,0,0,0,0,0,0
3,45.0,3534093764340240,47.034331,-112.561071,59632,46.2306,-112.1138,1939,0.525878,1546300876000000000,...,0,0,0,0,0,0,0,0,0,0
4,41.96,375534208663984,38.674999,-78.632459,24433,38.4207,-79.4629,99,0.765208,1546300986000000000,...,0,0,0,0,0,0,0,0,0,0


In [225]:
import torch

In [226]:
torch.utils.data.DataLoader(df_train)

<torch.utils.data.dataloader.DataLoader at 0x1bf749339a0>

In [295]:
import torch
import torch.nn as nn
import torch.nn.functional as F

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
batch_size = 1000
net = nn.Sequential(
    nn.Linear(len(df_train.columns) - 1, 512),
    nn.ReLU(),
    nn.Linear(512, 256),
    nn.ReLU(),
    nn.Linear(256, 128),
    nn.ReLU(),
    nn.Linear(128, 64),
    nn.ReLU(),
    nn.Linear(64, 32),
    nn.ReLU(),
    nn.Linear(32, 16),
    nn.ReLU(),
    nn.Linear(16, 8),
    nn.ReLU(),
    nn.Linear(8, 4),
    nn.ReLU(),
    nn.Linear(4, 1),
    nn.Sigmoid()
).to(device)


optimizer = torch.optim.Adam(net.parameters(), lr=1e-3)
criterion = nn.BCELoss()

In [296]:
class FraudDataset(torch.utils.data.Dataset):
    """FraudDataset Dataset - combination of features and labels

    Args:
        feature: Transaction detail tensors
        target: Tensor of labels corresponding to features

    Returns:
        None
    """

    def __init__(self, feature, target=None):
        self.X = feature
        self.Y = target

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        if self.Y is None:
            return [self.X[idx]]
        return self.X[idx], self.Y[idx]

In [297]:
train_dataset = FraudDataset(torch.tensor(df_train.loc[:, df_train.columns != "is_fraud"].values, dtype=torch.float), torch.tensor(df_train.loc[:, "is_fraud"].values, dtype=torch.float))
test_dataset = FraudDataset(torch.tensor(df_test.loc[:, df_test.columns != "is_fraud"].values, dtype=torch.float), torch.tensor(df_test.loc[:, "is_fraud"].values, dtype=torch.float))

In [298]:
train_dataloader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size)
test_dataloader = torch.utils.data.DataLoader(test_dataset, batch_size=batch_size)

In [299]:
epochs = 100
running_loss = 0.0
running_acc = 0.0
for epoch in range(100):
    for phase in ['train', 'test']:
        dataloader = train_dataloader if phase == 'train' else test_dataloader

        net.train() if phase == 'train'else net.eval()

        for i, batch in enumerate(dataloader):
            data, labels = batch[0].to(device), batch[1].to(device)

            if phase == 'train':
                optimizer.zero_grad()

            predictions = net(data)
            accuracy = float((predictions == labels.reshape(-1,1)).detach().cpu().numpy().sum()) / data.size()[0]
            running_acc += accuracy

            if phase == 'train':
                cost = criterion(predictions, labels.reshape(-1,1))
                cost.backward()
                optimizer.step()

                running_loss += cost.cpu().detach().numpy() / data.size()[0]
                if i != 0 and i % 50 == 0:
                    training_loss = running_loss / 50
                    training_acc = running_acc / 50
                    print(
                        f"Epoch: {epoch}/{epochs}, Iteration: {i}/{len(dataloader)//batch_size}, Phase: {phase}, Loss: {training_loss}, Accuracy: {training_acc}"
                    )

                    running_loss = 0.0
                    running_acc = 0.0
            else:
                if i != 0 and i % 50 == 0:
                    training_acc = running_acc / 50
                    print(
                        f"Epoch: {epoch}/{epochs}, Iteration: {i}/{len(dataloader)//batch_size}, Phase: {phase}, Accuracy: {training_acc}"
                    )

                    running_acc = 0.0

Epoch: 0/100, Iteration: 50/1, Phase: train, Loss: 0.0009260000346601012, Accuracy: 1.01074
Epoch: 0/100, Iteration: 100/1, Phase: train, Loss: 0.001092000042945147, Accuracy: 0.98908
Epoch: 0/100, Iteration: 150/1, Phase: train, Loss: 0.000750000025779009, Accuracy: 0.9924999999999998
Epoch: 0/100, Iteration: 200/1, Phase: train, Loss: 0.0005240000198781493, Accuracy: 0.9947599999999999
Epoch: 0/100, Iteration: 250/1, Phase: train, Loss: 0.0006160000251233578, Accuracy: 0.9938400000000002
Epoch: 0/100, Iteration: 300/1, Phase: train, Loss: 0.0006020000189542771, Accuracy: 0.9939799999999995
Epoch: 0/100, Iteration: 350/1, Phase: train, Loss: 0.0003260000059008599, Accuracy: 0.9967399999999997
Epoch: 0/100, Iteration: 400/1, Phase: train, Loss: 0.0004740000186860563, Accuracy: 0.9952599999999999
Epoch: 0/100, Iteration: 450/1, Phase: train, Loss: 0.00035400001078844076, Accuracy: 0.99646
Epoch: 0/100, Iteration: 500/1, Phase: train, Loss: 0.0004200000125169755, Accuracy: 0.9958
Epoch: 

RuntimeError: mat1 and mat2 shapes cannot be multiplied (1000x559 and 576x512)

In [275]:
net(torch.rand((2, 576), dtype=torch.float).to(device))

tensor([[0.6469],
        [0.6470]], device='cuda:0', grad_fn=<SigmoidBackward0>)

In [272]:
data

tensor([[4.9700e+00, 2.7032e+15, 3.6011e+01,  ..., 0.0000e+00, 0.0000e+00,
         0.0000e+00],
        [1.0723e+02, 6.3042e+11, 4.9159e+01,  ..., 0.0000e+00, 0.0000e+00,
         0.0000e+00]], device='cuda:0', dtype=torch.float64)

In [273]:
torch.rand((2, 576), dtype=torch.float64).to(device)

tensor([[0.1483, 0.1812, 0.1934,  ..., 0.5835, 0.4280, 0.1526],
        [0.7082, 0.6560, 0.4368,  ..., 0.4499, 0.8403, 0.8344]],
       device='cuda:0', dtype=torch.float64)