In [28]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import OneHotEncoder

In [29]:
df_states_regions = pd.read_csv("./examples/components/CreditCardFraud/preprocessing/us_regions.csv")
states_regions = {row.StateCode: row.Region for row in df_states_regions.itertuples()}

df_states_regions.head()

Unnamed: 0,State,StateCode,Region,Division
0,Alaska,AK,West,Pacific
1,Alabama,AL,South,East South Central
2,Arkansas,AR,South,West South Central
3,Arizona,AZ,West,Mountain
4,California,CA,West,Pacific


In [30]:
list(df_states_regions['Region'].unique())

['West', 'South', 'Northeast', 'Midwest']

In [56]:
useful_props = ['amt','cc_num', 'merch_lat', 'merch_long', 'category', 'gender', 'state', 'zip', 'lat', 'long', 'city_pop', 'job', 'dob', 'trans_date_trans_time', 'is_fraud']
categorical = ['category', 'gender', 'region', 'state', 'job']
datetimes = ['dob', 'trans_date_trans_time']
normalize = ['dob', 'age']

encoders = {}

def basic_transforms(df):
    # Just so we are always aware of all available columns
    print(df.columns)

    # Filter only useful columns
    df = df[useful_props]

    # df["full_name"] = df["first"] + " " + df["last"]
    df.loc[:, 'region'] = df['state'].map(states_regions)
    df.loc[:, 'age'] = (pd.Timestamp.now() - pd.to_datetime(df['dob'])) // pd.Timedelta('1y')
    for column in categorical:
        if column not in encoders:
            print(f"Creating encoder for column: {column}")
            # Simply set all zeros if the category is unseen
            encoder = OneHotEncoder(handle_unknown='ignore')
            encoder.fit(df[column].values.reshape(-1,1))
            encoders[column] = encoder

        encoder = encoders.get(column)
        encoded_data = encoder.transform(df[column].values.reshape(-1,1)).toarray()
        encoded_df = pd.DataFrame(encoded_data, columns = [column + "_" + '_'.join(x.split('_')[1:]) for x in encoder.get_feature_names()])
        encoded_df.index = df.index
        df = df.join(encoded_df).drop(column, axis=1)

    for column in datetimes:
        df.loc[:, column] = pd.to_datetime(df[column]).view('int64')
    for column in normalize:
        df.loc[:, column] = (df[column] - df[column].min())/(df[column].max() - df[column].min())

    return df

In [57]:
df_train = pd.read_csv("fraudTrain.csv", index_col=0)
df_test = pd.read_csv("fraudTest.csv", index_col=0)

df_train = basic_transforms(df_train)
df_test = basic_transforms(df_test)

print(df_train.columns)
df_train.head()

Index(['trans_date_trans_time', 'cc_num', 'merchant', 'category', 'amt',
       'first', 'last', 'gender', 'street', 'city', 'state', 'zip', 'lat',
       'long', 'city_pop', 'job', 'dob', 'trans_num', 'unix_time', 'merch_lat',
       'merch_long', 'is_fraud'],
      dtype='object')


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = _infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  isetter(ilocs[0], value)


Creating encoder for column: category
Creating encoder for column: gender
Creating encoder for column: region
Creating encoder for column: state
Creating encoder for column: job
Index(['trans_date_trans_time', 'cc_num', 'merchant', 'category', 'amt',
       'first', 'last', 'gender', 'street', 'city', 'state', 'zip', 'lat',
       'long', 'city_pop', 'job', 'dob', 'trans_num', 'unix_time', 'merch_lat',
       'merch_long', 'is_fraud'],
      dtype='object')


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = _infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  isetter(ilocs[0], value)


Index(['amt', 'cc_num', 'merch_lat', 'merch_long', 'zip', 'lat', 'long',
       'city_pop', 'dob', 'trans_date_trans_time',
       ...
       'job_Visual merchandiser', 'job_Volunteer coordinator',
       'job_Warden/ranger', 'job_Warehouse manager',
       'job_Waste management officer', 'job_Water engineer',
       'job_Water quality scientist', 'job_Web designer',
       'job_Wellsite geologist', 'job_Writer'],
      dtype='object', length=577)


Unnamed: 0,amt,cc_num,merch_lat,merch_long,zip,lat,long,city_pop,dob,trans_date_trans_time,...,job_Visual merchandiser,job_Volunteer coordinator,job_Warden/ranger,job_Warehouse manager,job_Waste management officer,job_Water engineer,job_Water quality scientist,job_Web designer,job_Wellsite geologist,job_Writer
0,4.97,2703186189652095,36.011293,-82.048315,28654,36.0788,-81.1781,3495,0.789499,1546300818000000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,107.23,630423337322,49.159047,-118.186462,99160,48.8878,-118.2105,149,0.668418,1546300844000000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,220.11,38859492057661,43.150704,-112.154481,83252,42.1808,-112.262,4154,0.463819,1546300851000000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,45.0,3534093764340240,47.034331,-112.561071,59632,46.2306,-112.1138,1939,0.525878,1546300876000000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,41.96,375534208663984,38.674999,-78.632459,24433,38.4207,-79.4629,99,0.765208,1546300986000000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [58]:
print(df_train.columns)
print(df_test.columns)

Index(['amt', 'cc_num', 'merch_lat', 'merch_long', 'zip', 'lat', 'long',
       'city_pop', 'dob', 'trans_date_trans_time',
       ...
       'job_Visual merchandiser', 'job_Volunteer coordinator',
       'job_Warden/ranger', 'job_Warehouse manager',
       'job_Waste management officer', 'job_Water engineer',
       'job_Water quality scientist', 'job_Web designer',
       'job_Wellsite geologist', 'job_Writer'],
      dtype='object', length=577)
Index(['amt', 'cc_num', 'merch_lat', 'merch_long', 'zip', 'lat', 'long',
       'city_pop', 'dob', 'trans_date_trans_time',
       ...
       'job_Visual merchandiser', 'job_Volunteer coordinator',
       'job_Warden/ranger', 'job_Warehouse manager',
       'job_Waste management officer', 'job_Water engineer',
       'job_Water quality scientist', 'job_Web designer',
       'job_Wellsite geologist', 'job_Writer'],
      dtype='object', length=577)


In [72]:
import torch
import torch.nn as nn
import torch.nn.functional as F

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
batch_size = 10000
net = nn.Sequential(
    nn.Linear(len(df_train.columns) - 1, 512),
    nn.ReLU(),
    nn.Linear(512, 256),
    nn.ReLU(),
    nn.Linear(256, 128),
    nn.ReLU(),
    nn.Linear(128, 64),
    nn.ReLU(),
    nn.Linear(64, 32),
    nn.ReLU(),
    nn.Linear(32, 16),
    nn.ReLU(),
    nn.Linear(16, 8),
    nn.ReLU(),
    nn.Linear(8, 4),
    nn.ReLU(),
    nn.Linear(4, 1),
    nn.Sigmoid()
).to(device)


optimizer = torch.optim.Adam(net.parameters(), lr=1e-7)
criterion = nn.BCELoss()

In [73]:
class FraudDataset(torch.utils.data.Dataset):
    """FraudDataset Dataset - combination of features and labels

    Args:
        feature: Transaction detail tensors
        target: Tensor of labels corresponding to features

    Returns:
        None
    """

    def __init__(self, feature, target=None):
        self.X = feature
        self.Y = target

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        if self.Y is None:
            return [self.X[idx]]
        return self.X[idx], self.Y[idx]

In [74]:
train_dataset = FraudDataset(torch.tensor(df_train.loc[:, df_train.columns != "is_fraud"].values, dtype=torch.float), torch.tensor(df_train.loc[:, "is_fraud"].values, dtype=torch.float))
test_dataset = FraudDataset(torch.tensor(df_test.loc[:, df_test.columns != "is_fraud"].values, dtype=torch.float), torch.tensor(df_test.loc[:, "is_fraud"].values, dtype=torch.float))

In [75]:
train_dataloader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size)
test_dataloader = torch.utils.data.DataLoader(test_dataset, batch_size=batch_size)

In [81]:
epochs = 100
log_step = 10
running_loss = 0.0
running_acc = 0.0
for epoch in range(100):
    for phase in ['train', 'test']:
        print(phase)
        dataloader = train_dataloader if phase == 'train' else test_dataloader

        net.train() if phase == 'train'else net.eval()

        for j, batch in enumerate(dataloader):
            i = j + 1
            data, labels = batch[0].to(device), batch[1].to(device)
            
            if phase == 'train':
                optimizer.zero_grad()

            predictions = net(data)
            accuracy = float((predictions == labels.reshape(-1,1)).detach().cpu().numpy().sum()) / labels.shape[0]
            running_acc += accuracy

            if phase == 'train':
                cost = criterion(predictions, labels.reshape(-1,1))
                cost.backward()
                optimizer.step()

                running_loss += cost.cpu().detach().numpy() / data.size()[0]
                if i != 0 and i % log_step == 0:
                    training_loss = running_loss / log_step
                    training_acc = running_acc / log_step
                    print(
                        f"Epoch: {epoch}/{epochs}, Iteration: {i}/{len(dataloader)}, Phase: {phase}, Loss: {training_loss}, Accuracy: {training_acc}"
                    )

                    running_loss = 0.0
                    running_acc = 0.0
            else:
                if i != 0 and i % log_step == 0:
                    training_acc = running_acc / log_step
                    print(
                        f"Epoch: {epoch}/{epochs}, Iteration: {i}/{len(dataloader)}, Phase: {phase}, Accuracy: {training_acc}"
                    )

                    running_acc = 0.0

train
Epoch: 0/100, Iteration: 10/130, Phase: train, Loss: 9.899999767541885e-05, Accuracy: 0.9901
Epoch: 0/100, Iteration: 20/130, Phase: train, Loss: 6.549999773502351e-05, Accuracy: 0.9934499999999999
Epoch: 0/100, Iteration: 30/130, Phase: train, Loss: 6.099999725818634e-05, Accuracy: 0.9939
Epoch: 0/100, Iteration: 40/130, Phase: train, Loss: 3.9899999424815176e-05, Accuracy: 0.9960099999999998
Epoch: 0/100, Iteration: 50/130, Phase: train, Loss: 3.869999848306179e-05, Accuracy: 0.99613
Epoch: 0/100, Iteration: 60/130, Phase: train, Loss: 4.769999884068967e-05, Accuracy: 0.99523
Epoch: 0/100, Iteration: 70/130, Phase: train, Loss: 6.599999845027924e-05, Accuracy: 0.9934
Epoch: 0/100, Iteration: 80/130, Phase: train, Loss: 5.1199998557567595e-05, Accuracy: 0.9948799999999999
Epoch: 0/100, Iteration: 90/130, Phase: train, Loss: 3.999999925494195e-05, Accuracy: 0.9960000000000001


KeyboardInterrupt: 