In [9]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from collections import Counter
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import networkx as nx
import torch
import torch.nn as nn
import torch.nn.functional as F

In [10]:
import community as com

In [11]:
df_A=pd.read_csv("/Users/corentinpla/Documents/C-2-Statap/transactions_A.csv")
df_B=pd.read_csv("/Users/corentinpla/Documents/C-2-Statap/transactions_B.csv")

In [12]:
def filter_df(time, df_to_filter):
    " Fonction qui renvoie un df trié en ne prenant que les dates du passé "
    " mettre le time sous forme 'yyyy-mm-dd' "
    df_1 = df_to_filter[df_to_filter['timestamp']<time].copy()
    return df_1

In [17]:
df_A=filter_df("2019-01-01",df_A)
df_B=filter_df("2019-01-01",df_B)

In [18]:
df_merge=df_A.merge(df_B, left_on='tx_id', right_on='tx_id')

In [34]:
df_merge.head(10)

Unnamed: 0,tx_id,sender_account_id_x,receiver_account_id_x,tx_amount_x,timestamp_x,tx_type_x,sender_bank_id_x,sender_first_name_x,sender_last_name_x,sender_country_code_x,...,receiver_first_name_y,receiver_last_name_y,receiver_country_code_y,receiver_birth_date_y,receiver_zip_code_y,is_alert_solo_y,is_alert_pooled_y,sender_account,receiver_account,tx_amount
0,1829,0.0,9011.0,117.83,2018-03-03,WIRE,B,Kurt,Brignac,FR,...,Joseph,Gable,FR,1933-12-23,62116,False,False,5754.0,9011.0,235.66
1,1830,0.0,7140.0,117.83,2018-03-03,WIRE,B,Kurt,Brignac,FR,...,Shelley,Spencer,FR,1944-12-23,34360,False,False,5754.0,7140.0,235.66
2,1838,2121.0,0.0,21.21,2018-03-03,WIRE,A,Gabrielle,Evans,FR,...,Jack,Abeles,FR,1960-11-20,89000,True,False,2121.0,3674.0,42.42
3,2735,9621.0,0.0,168.11,2018-03-03,WIRE,A,Billy,Dozier,FR,...,Jack,Abeles,FR,1960-11-20,89000,False,False,9621.0,3674.0,336.22
4,3351,0.0,9521.0,134.86,2018-03-03,WIRE,B,Myong,Byrd,FR,...,Grace,Davison,FR,1920-08-01,72120,False,False,9674.0,9521.0,269.72
5,4198,0.0,9521.0,25.8,2018-03-03,WIRE,B,Penny,Dobson,FR,...,Grace,Davison,FR,1920-08-01,72120,False,False,3184.0,9521.0,51.6
6,1824,0.0,8982.0,117.83,2018-03-03,WIRE,B,Kurt,Brignac,FR,...,Carol,Spain,FR,1930-02-06,2220,False,False,5754.0,8982.0,235.66
7,1801,0.0,6970.0,158.1,2018-03-03,WIRE,B,Alfred,Newberry,FR,...,Lois,Boulos,FR,1926-08-21,62270,False,False,3685.0,6970.0,316.2
8,1808,0.0,450.0,158.1,2018-03-03,WIRE,B,Alfred,Newberry,FR,...,Walter,Calhoun,FR,1943-06-29,41300,False,False,3685.0,450.0,316.2
9,1809,0.0,9873.0,502.7,2018-03-03,CHECK,B,Griselda,Beamon,FR,...,Allyson,Laxton,FR,1982-07-07,80135,False,False,1515.0,9873.0,1005.4


In [19]:
df_merge=df_merge.fillna(0)

In [20]:
df_merge["sender_account"]=df_merge["sender_account_id_x"]+df_merge["sender_account_id_y"]
df_merge["receiver_account"]=df_merge["receiver_account_id_x"]+df_merge["receiver_account_id_y"]
df_merge["tx_amount"]=df_merge["tx_amount_x"]+df_merge["tx_amount_y"]

In [33]:
#ML
#On oublie la variable temps 
#Plus un concatenate entre A et B mais un merge 
#Methode 1 : pytorch 
#Methode 2 : Inductive Graph Representation Learning for fraud detection
#création du graph
#multigraph orienté 
G=nx.MultiGraph()

#création des noeuds du graph:
G.add_nodes_from(df_merge["sender_account"].unique(), type='sender')
G.add_nodes_from(df_merge["receiver_account"].unique(), type='receiver')

#add edge 
a=0
for index,row in df_merge.iterrows ():
    a+=1

    amount=row["tx_amount"]

    G.add_edge(row['sender_account'],row["receiver_account"],amount = amount)

# Get the number of nodes and edges in the graph
num_nodes = G.number_of_nodes()
num_edges = G.number_of_edges()

# Print the number of nodes and edges
print("Number of nodes:", num_nodes)
print("Number of edges:", num_edges)

adj_matrix = nx.adjacency_matrix(G).todense()

edge_list = list(G.edges(data=True))

Number of nodes: 5081
Number of edges: 45991


  adj_matrix = nx.adjacency_matrix(G).todense()


In [38]:
len(edge_list)

45991

In [50]:
class FraudGNN(nn.Module):
    def __init__(self, input_dim, hidden_dim):
        super(FraudGNN, self).__init__()
        self.fc1 = nn.Linear(input_dim, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, 1)
        
    def forward(self, x):
        x = self.fc1(x)
        x = F.relu(x)
        x = self.fc2(x)
        return x.squeeze(-1)

In [51]:
x_train = []
for edge in edge_list[:10000]:
    edge_values = list(edge[2].values())
    edge_values = [float(i[0]) if type(i) == tuple and type(i[0]) == str else i[0] if type(i) == tuple else i for i in edge_values]
    x_train.append(edge_values)
x_train = torch.tensor(x_train , dtype=torch.float)

x_test=[]
for edge in edge_list[10000:]:
    edge_values = list(edge[2].values())
    edge_values = [float(i[0]) if type(i) == tuple and type(i[0]) == str else i[0] if type(i) == tuple else i for i in edge_values]
    x_test.append(edge_values)
x_test = torch.tensor(x_test , dtype=torch.float)

In [52]:
y_train = torch.tensor(df_merge[:10000]['is_alert_pooled_y'].values, dtype=torch.float)
y_test=torch.tensor(df_merge[10000:]['is_alert_pooled_y'].values, dtype=torch.float)

In [55]:
# Define the model
input_dim = len(x_train[0])
hidden_dim = 16
model = FraudGNN(input_dim, hidden_dim)
num_epochs=201

# Define the loss function and optimizer
criterion = nn.BCEWithLogitsLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)

In [56]:
# Train the model
for i in range(num_epochs):
    # Forward pass
    output = model(x_train)
    # Compute the loss
    loss = criterion(output, y_train)
    if i % 20 == 0:
        print(f'Epoch: {i}, Loss: {loss.item()}')
    # Zero the gradients
    optimizer.zero_grad()
    # Perform backpropagation
    loss.backward()
    # Update the parameters
    optimizer.step()

Epoch: 0, Loss: 122.7416763305664
Epoch: 20, Loss: 0.1649675965309143
Epoch: 40, Loss: 0.2023872286081314
Epoch: 60, Loss: 0.20553725957870483
Epoch: 80, Loss: 0.20295275747776031
Epoch: 100, Loss: 0.19932691752910614
Epoch: 120, Loss: 0.195427805185318
Epoch: 140, Loss: 0.19141945242881775
Epoch: 160, Loss: 0.18736447393894196
Epoch: 180, Loss: 0.18329878151416779
Epoch: 200, Loss: 0.17924432456493378


In [60]:
model(x_test)

tensor([ -202.9000,   -42.6846,   -42.6846,  ...,   -22.7400,    -7.2805,
        -5677.3838], grad_fn=<SqueezeBackward1>)

In [62]:
#########################################new model##################################
from torch_geometric.datasets import Planetoid
from torch_geometric.transforms import NormalizeFeatures

In [64]:
dataset = Planetoid(root='data/Planetoid', name='Cora', transform=NormalizeFeatures())

Downloading https://github.com/kimiyoung/planetoid/raw/master/data/ind.cora.x
Downloading https://github.com/kimiyoung/planetoid/raw/master/data/ind.cora.tx
Downloading https://github.com/kimiyoung/planetoid/raw/master/data/ind.cora.allx
Downloading https://github.com/kimiyoung/planetoid/raw/master/data/ind.cora.y
Downloading https://github.com/kimiyoung/planetoid/raw/master/data/ind.cora.ty
Downloading https://github.com/kimiyoung/planetoid/raw/master/data/ind.cora.ally
Downloading https://github.com/kimiyoung/planetoid/raw/master/data/ind.cora.graph
Downloading https://github.com/kimiyoung/planetoid/raw/master/data/ind.cora.test.index
Processing...
Done!


In [68]:
print(dataset.y)

tensor([3, 4, 4,  ..., 3, 3, 3])


In [63]:
df_A.num_features

AttributeError: 'DataFrame' object has no attribute 'num_features'