## Join JPMorgan and ICIJ to create User Mapping

In [1]:
import numpy as np
import pandas as pd
from faker import Faker
import torch
from torch_geometric.data import Data, DataLoader
from torch_geometric.nn import GCNConv
import torch.nn as nn
import torch.nn.functional as F
import pandas as pd
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import f1_score, precision_score, recall_score, roc_auc_score, roc_curve
from sklearn.model_selection import train_test_split
import numpy as np
import optuna
import shutil

  return torch._C._cuda_getDeviceCount() > 0


In [2]:
# File path in a Linux environment
file_path = 'Thesis/aml_syn_data.csv'

# Read the CSV file into a pandas DataFrame
df = pd.read_csv(file_path)

# Perform the data cleaning operations
df['Sender_Country'] = df['Sender_Country'].str.title()
df['Bene_Country'] = df['Bene_Country'].str.title()
df['Sender_Country'] = df['Sender_Country'].replace({'Usa': 'United States', 'South-Korea': 'South Korea'})
df['Bene_Country'] = df['Bene_Country'].replace({'Usa': 'United States', 'South-Korea': 'South Korea'})

df['Label'] = df['Label'].replace({'BAD': 1, 'GOOD': 0})

# Assuming the dataframe has a column named 'Time_step' for separating the years
df['Time_step'] = pd.to_datetime(df['Time_step'])

# Save the cleaned DataFrame to a new CSV file
output_file_path = 'Thesis/transactions.csv'
df.to_csv(output_file_path, index=False)

# Display DataFrame info
df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1484536 entries, 0 to 1484535
Data columns (total 13 columns):
 #   Column              Non-Null Count    Dtype         
---  ------              --------------    -----         
 0   Time_step           1484536 non-null  datetime64[ns]
 1   Label               1484536 non-null  int64         
 2   Transaction_Id      1484536 non-null  object        
 3   Sender_Id           1119436 non-null  object        
 4   Sender_Account      1119436 non-null  object        
 5   Sender_Institution  1119436 non-null  object        
 6   Sender_Country      1119436 non-null  object        
 7   USD_amount          1484536 non-null  float64       
 8   Bene_Id             1169913 non-null  object        
 9   Bene_Account        921185 non-null   object        
 10  Bene_Institution    921185 non-null   object        
 11  Bene_Country        921185 non-null   object        
 12  Transaction_Type    1484536 non-null  object        
dtypes: datetime6

In [3]:

df_sender = df[['Sender_Country', 'Sender_Id', 'Label']].rename(columns={'Sender_Country': 'Country', 'Sender_Id': 'User_Id'})
df_beneficiary = df[['Bene_Country', 'Bene_Id', 'Label']].rename(columns={'Bene_Country': 'Country', 'Bene_Id': 'User_Id'})

# Concatenate the sender and beneficiary dataframes
df_combined = pd.concat([df_sender, df_beneficiary])

# Drop duplicates and NaN values
df_combined = df_combined.drop_duplicates().dropna()

# Add a Type column based on the ID containing specific substrings
def determine_type(id_value):
    if pd.isna(id_value):
        return 'UNKNOWN'
    id_value = str(id_value).upper()
    if 'COMPANY' in id_value:
        return 'entities'
    elif 'OWNER' in id_value or 'CLIENT' in id_value or 'CUSTOMER' in id_value:
        return 'officer'
    elif 'BILLING' in id_value:
        return 'intermediaries'
    else:
        return 'UNKNOWN'

df_combined['Type'] = df_combined['User_Id'].apply(determine_type)

# Calculate unique users per country
unique_users_per_country = df_combined.groupby(['Country', 'Type'])['User_Id'].nunique().reset_index()
unique_users_per_country.rename(columns={'User_Id': 'Unique_User_Count'}, inplace=True)

# Display the results
print("Unique Users per Country and Type:")
print(unique_users_per_country)


Unique Users per Country and Type:
              Country      Type  Unique_User_Count
0           Argentina  entities                 81
1           Argentina   officer                186
2           Australia  entities                 56
3           Australia   officer                162
4              Canada  entities                 62
..                ...       ...                ...
61     United-Kingdom   officer              13271
62          Venezuela  entities                 70
63          Venezuela   officer                235
64  Virgin-Islands-Us  entities                 63
65  Virgin-Islands-Us   officer                203

[66 rows x 3 columns]


In [4]:
# Aggregate the maximum Label status for senders and beneficiaries
aggregated_sender_data = df_sender.groupby(['User_Id', 'Country']).agg({
    'Label': 'max'
}).reset_index()

aggregated_beneficiary_data = df_beneficiary.groupby(['User_Id', 'Country']).agg({
    'Label': 'max'
}).reset_index()

# Drop duplicates and NaN values from aggregated data
aggregated_sender_data = aggregated_sender_data.drop_duplicates().dropna()
aggregated_beneficiary_data = aggregated_beneficiary_data.drop_duplicates().dropna()


# Assuming the determine_type function is defined elsewhere and correctly identifies 'Officer'
aggregated_sender_data['Type'] = aggregated_sender_data['User_Id'].apply(determine_type)
aggregated_beneficiary_data['Type'] = aggregated_beneficiary_data['User_Id'].apply(determine_type)

# Combine the aggregated sender and beneficiary dataframes
combined_aggregated_data = pd.concat([aggregated_sender_data, aggregated_beneficiary_data])

# Sort by User_Id, Country, and Label (in descending order so that 1 comes before 0)
combined_aggregated_data = combined_aggregated_data.sort_values(by=['User_Id', 'Country', 'Label'], ascending=[True, True, False])

# Drop duplicates, keeping the first occurrence
combined_aggregated_data = combined_aggregated_data.drop_duplicates(subset=['User_Id', 'Country'], keep='first')

# Display the combined data
print("Combined Aggregated Data:")
print(combined_aggregated_data.head(10))



Combined Aggregated Data:
                      User_Id        Country  Label      Type
0  BILLING-COMPANY-1000026-06  United States      0  entities
1  BILLING-COMPANY-1000138-00  United States      1  entities
2  BILLING-COMPANY-1000188-04  United States      0  entities
0  BILLING-COMPANY-1000282-05  United States      0  entities
4  BILLING-COMPANY-1000643-04  United States      0  entities
1  BILLING-COMPANY-1000719-05  United States      0  entities
6  BILLING-COMPANY-1000913-10  United States      0  entities
7  BILLING-COMPANY-1001091-11  United States      0  entities
2   BILLING-COMPANY-100157-13  United States      0  entities
9  BILLING-COMPANY-1001571-00  United States      1  entities


In [5]:


# Ensure reproducibility
np.random.seed(42)

combined_aggregated_data['is_pep'] = 0

# Filter for officers
officers_indices = combined_aggregated_data[combined_aggregated_data['Type'] == 'officer'].index

# Function to randomly assign is_pep=1 to 10% of the rows
def assign_pep_flag(indices):
    num_pep = max(1, int(0.1 * len(indices)))  # Ensure at least one row is selected if the subset is small
    pep_indices = np.random.choice(indices, num_pep, replace=False)
    return pep_indices

# Apply the function to the officers indices
pep_indices = assign_pep_flag(officers_indices)

# Update the is_pep values in the combined_aggregated_data DataFrame
combined_aggregated_data.loc[pep_indices, 'is_pep'] = 1

# Count the number of officers with each label
officers_data = combined_aggregated_data.loc[officers_indices]
officers_label_count = officers_data['Label'].value_counts().reset_index()
officers_label_count.columns = ['Label', 'Count']

# Display the count and a sample of the modified dataframe
print("Number of officers with each label:")
print(officers_label_count)
print("\nSample of the modified combined aggregated data with is_pep flag:")
print(combined_aggregated_data.head(10))
overall_pep_count = combined_aggregated_data.groupby(['Label', 'is_pep']).size().reset_index(name='Count')

print("\nOverall count of pep status by label:")
print(overall_pep_count)

Number of officers with each label:
   Label   Count
0      0  177115
1      1   44759

Sample of the modified combined aggregated data with is_pep flag:
                      User_Id        Country  Label      Type  is_pep
0  BILLING-COMPANY-1000026-06  United States      0  entities       0
1  BILLING-COMPANY-1000138-00  United States      1  entities       0
2  BILLING-COMPANY-1000188-04  United States      0  entities       0
0  BILLING-COMPANY-1000282-05  United States      0  entities       0
4  BILLING-COMPANY-1000643-04  United States      0  entities       0
1  BILLING-COMPANY-1000719-05  United States      0  entities       0
6  BILLING-COMPANY-1000913-10  United States      0  entities       0
7  BILLING-COMPANY-1001091-11  United States      0  entities       0
2   BILLING-COMPANY-100157-13  United States      0  entities       0
9  BILLING-COMPANY-1001571-00  United States      1  entities       0

Overall count of pep status by label:
   Label  is_pep   Count
0      0    

In [6]:
grouped_data = combined_aggregated_data.groupby(['Country', 'Type']).size().reset_index(name='Count')

# Convert grouped data to dictionary for filtering nodes
grouped_dict = {(row['Type'], row['Country']): row['Count'] for _, row in grouped_data.iterrows()}
grouped_dict

{('entities', 'Argentina'): 81,
 ('officer', 'Argentina'): 186,
 ('entities', 'Australia'): 56,
 ('officer', 'Australia'): 162,
 ('entities', 'Canada'): 62,
 ('officer', 'Canada'): 186,
 ('entities', 'Chile'): 60,
 ('officer', 'Chile'): 182,
 ('entities', 'China'): 56,
 ('officer', 'China'): 207,
 ('entities', 'France'): 64,
 ('officer', 'France'): 197,
 ('entities', 'Germany'): 76,
 ('officer', 'Germany'): 236,
 ('entities', 'India'): 72,
 ('officer', 'India'): 234,
 ('entities', 'Iran'): 2348,
 ('officer', 'Iran'): 2280,
 ('entities', 'Isle-Of-Man'): 56,
 ('officer', 'Isle-Of-Man'): 182,
 ('entities', 'Israel'): 59,
 ('officer', 'Israel'): 173,
 ('entities', 'Italy'): 79,
 ('officer', 'Italy'): 212,
 ('entities', 'Luxembourg'): 56,
 ('officer', 'Luxembourg'): 182,
 ('entities', 'Marocco'): 74,
 ('officer', 'Marocco'): 195,
 ('entities', 'Mexico'): 59,
 ('officer', 'Mexico'): 194,
 ('entities', 'Nicaragua'): 59,
 ('officer', 'Nicaragua'): 183,
 ('entities', 'Nigeria'): 63,
 ('officer'

In [7]:


# Define file paths
base_path = 'Thesis/'
relationships_file = base_path + 'relationships.csv'
officers_file = base_path + 'nodes-officers.csv'
intermediaries_file = base_path + 'nodes-intermediaries.csv'
entities_file = base_path + 'nodes-entities.csv'
output_nodes_file = base_path + 'usa_nodes.csv'

# Function to generate fake entities
def generate_fake_entity(entity_type):
    fake = Faker()
    if entity_type.lower() == 'entities':
        return fake.company()
    elif entity_type.lower() == 'officer':
        return fake.name()
    else:
        return 'Invalid entity type. Please specify "company" or "person".'

# Function to load and clean data
def load_clean_data():
    edges = pd.read_csv(relationships_file)
    if 'rel_type' in edges.columns:
        edges = edges[edges["rel_type"] != "registrated address"]

    officers = pd.read_csv(officers_file)
    officers = officers.dropna(subset=['name', 'country_codes'])
    intermediaries = pd.read_csv(intermediaries_file)
    intermediaries = intermediaries.dropna(subset=['name', 'countries'])
    entities = pd.read_csv(entities_file)
    entities = entities.dropna(subset=['name', 'countries'])
    entities = entities[~entities['countries'].str.contains(';', na=False)]
    entities = entities[entities['country_codes'] != 'XXX']
    entities['countries'] = entities['countries'].replace({
        'Hong Kong': 'United States',
        'Monaco': 'United States',
        'United Kingdom': 'United-Kingdom',
        'South Africa': 'South-Africa',
        'Isle of Man': 'Isle-Of-Man',
        'Samoa': 'United States',
        'Cayman Islands': 'United States',
        'Taiwan': 'North-Korea',
        'Peru': 'United States',
        'Bermuda': 'United States',
        'British Virgin Islands': 'United States',
        'U.S. Virgin Islands': 'Virgin-Islands-Us',
        'Bahamas': 'Iran',
        'Malta': 'Iran',
        'Czech Republic': 'United States',
        'Seychelles': 'North-Korea',
        'Malaysia': 'Marocco',
        'Hungary': 'Nicaragua',
        'Mauritius': 'Iran',
        'Ecuador': 'Syria',
        'Barbados': 'Syria',
        'Lebanon': 'Syria',
        'Singapore': 'China',
        'Lithuania': 'North-Korea',
        'United Arab Emirates': 'Syria',
        'Anguilla': 'Virgin-Islands-Us',
        'Cyprus': 'United-Kingdom',
        'Cook Islands': 'United-Kingdom',
        'Belize': 'United States',
        'Bolivia': 'Iran',
        'Indonesia': 'India',
        'Brazil': 'Syria',
        'Denmark': 'Luxembourg',
        'Egypt': 'Venezuela',
        'Albania':'Italy',
        'Ukraine':'Italy',
        'Thailand':'Italy',
        'Uruguay':'Luxembourg',
        'Cayman Island':'Syria',
        'Malta':'Qatar',
        'Gabon':'Nigeria',
        'Poland':'South Korea',
        'Costa Rica':'Virgin-Islands-Us',
        'Cyprus':'Nigeria'
    })

    entities['country_codes'] = entities['country_codes'].replace({
        'HKG': 'USA',
        'MCO': 'USA',
        'GBR': 'USA',
        'WSM': 'USA',
        'CYM': 'USA',
        'TWN': 'NK',
        'BMU': 'USA',
        'VGB': 'USA',
        'BHS': 'IRN',
        'MLT': 'IRN',
        'CZE': 'USA',
        'SYC': 'NK',
        'MYS': 'MAR',
        'HUN': 'NIC',
        'MUS': 'IRN',
        'ECU': 'SYR',
        'BRB': 'SYR',
        'LBN': 'SYR',
        'SGP': 'CHN',
        'LTU': 'NK',
        'UAE': 'SYR',
        'AIA': 'VGB',
        'CYP': 'GBR',
        'COK': 'GBR',
        'BLZ': 'USA',
        'BOL': 'IRN',
        'IDN': 'IND',
        'AIA':'ITA',
        'UKR':'ITA',
        'THA':'ITA',
        'URY':':LUX',
        'CYM':'SYR',
        'MLT':'QAT',
        'GAB':'NGA',
        'POL':'KOR',
        'CRI':'VGB',
        'CYP':'NGA'
    })

    officers = officers.drop(columns=['valid_until', 'note', 'sourceID'])
    officers = officers[~officers['countries'].str.contains(';', na=False)]
    officers = officers[officers['country_codes'] != 'XXX']
    officers['countries'] = officers['countries'].replace({
        'Hong Kong': 'United States',
        'Isle of Man': 'Isle-Of-Man',
        'Seychelles': 'North-Korea',
        'United Kingdom': 'United-Kingdom',
        'South Africa': 'South-Africa',
        'Malaysia': 'Marocco',
        'Monaco': 'United States',
        'Samoa': 'United States',
        'Cayman Islands': 'United States',
        'Taiwan': 'United States',
        'Peru': 'United States',
        'Bermuda': 'United States',
        'British Virgin Islands': 'United States',
        'U.S. Virgin Islands': 'Virgin-Islands-Us',
        'Bahamas': 'United States',
        'Malta': 'United States',
        'Czech Republic': 'United States',
        'Hungary': 'Nicaragua',
        'Mauritius': 'Iran',
        'Ecuador': 'Syria',
        'Barbados': 'Syria',
        'Lebanon': 'Syria',
        'El Salvador': 'Syria',
        'Brazil': 'Syria',
        'Indonesia': 'Iran'
    })

    officers['country_codes'] = officers['country_codes'].replace({
        'HKG': 'USA',
        'SYC': 'NK',
        'MYS': 'MAR',
        'MCO': 'USA',
        'WSM': 'USA',
        'CYM': 'USA',
        'TWN': 'USA',
        'MUS':'IRN',
        'IDN':'IRN'
    })

    intermediaries = intermediaries.drop(columns=['valid_until', 'note', 'sourceID'])
    intermediaries = intermediaries[~intermediaries['countries'].str.contains(';', na=False)]
    intermediaries = intermediaries[intermediaries['country_codes'] != 'XXX']
    intermediaries['countries'] = intermediaries['countries'].replace({
        'Malaysia': 'Iran',
        'Seychelles': 'North-Korea',
        'Hong Kong': 'China',
        'Isle of Man': 'Isle-Of-Man',
        'United Kingdom': 'United-Kingdom',
        'South Africa': 'South-Africa',
        'Monaco': 'Iran',
        'Luxembourg': 'Iran',
        'Austria': 'Luxembourg',
        'Samoa': 'North-Korea',
        'Cayman Islands': 'Iran',
        'Taiwan': 'North-Korea',
        'Peru': 'Syria',
        'Bermuda': 'Syria',
        'Hungary': 'Iran',
        'Mauritius': 'Iran',
        'Bahamas': 'Iran',
        'Malta': 'Iran',
        'Czech Republic': 'North-Korea',
        'Ecuador': 'Syria',
        'Barbados': 'Syria',
        'Lebanon': 'Syria',
        'Andorra': 'India',
        'Aruba': 'Virgin-Islands-Us',
        'Cuba': 'Qatar',
        'China': 'North-Korea',
        'United States': 'Syria',
        'Venezuela': 'Syria'
    })

    intermediaries['country_codes'] = intermediaries['country_codes'].replace({
        'MYS': 'IRN',
        'SYC': 'NK',
        'HKG': 'CHN',
        'MCO': 'IRN',
        'LUX': 'IRN',
        'AUT': 'LUX',
        'WSM': 'NK',
        'CYM': 'IRN',
        'TWN': 'NK',
        'PER': 'SYR',
        'BMU': 'SYR',
        'HUN': 'IRN',
        'MUS': 'IRN',
        'BHS': 'IRN',
        'MLT': 'IRN',
        'CZE': 'NK',
        'ECU': 'SYR',
        'BRB': 'SYR',
        'LBN': 'SYR',
        'AND': 'IND',
        'ABW': 'VGB',
        'CUB': 'QAT',
        'CHN': 'NK',
        'USA': 'SYR',
        'VEN': 'SYR',
        'BRA': 'SYR',
        'DNK': 'LUX',
        'EGY': 'VEN',
        'BOL': 'VGB'
    })

    officers['type'] = "officer"
    intermediaries['type'] = "intermediary"
    entities['type'] = "entities"

    columns = ['node_id', 'name','type', 'countries', 'country_codes']

    officers = officers[columns]
    intermediaries = intermediaries[columns]
    entities = entities[columns]

    all_nodes = pd.concat([officers, intermediaries, entities]).reset_index(drop=True)

    all_nodes["name"] = all_nodes["name"].str.upper().str.replace(' ', '_')
    all_nodes["name"].replace(
        to_replace=[r"MRS?\.\s+", r"\.", r"\s+", "LIMITED", "THE BEARER", "BEARER", "BEARER 1", "EL PORTADOR", "AL PORTADOR"],
        value=["", "", "", "LTD", np.nan, np.nan, np.nan, np.nan, np.nan],
        inplace=True, regex=True)
    all_nodes = all_nodes[~all_nodes.index.duplicated(keep='first')]
    all_nodes = all_nodes[all_nodes['name'].str.strip() != '']
    all_nodes['countries'] = all_nodes['countries'].str.title()

    return all_nodes, edges

def filter_save_usa_and_foreign_nodes(grouped_dict, all_nodes):
    OUTPUT_NODES_FILE = '/content/drive/My Drive/Thesis/DataCleaning/usa_nodes.csv'
    all_nodes['type'] = all_nodes['type'].replace('entity', 'entities')
    filtered_nodes_list = []

    for (entity_type, country), count in grouped_dict.items():
        entity_type_lower = entity_type.lower()
        filtered_nodes = all_nodes[(all_nodes['type'] == entity_type_lower) & (all_nodes['countries'] == country)]

        # Replace NaN values in 'name' with an empty string for length check
        filtered_nodes['name'] = filtered_nodes['name'].fillna('')

        # Generate fake entities for names with length <= 3
        filtered_nodes['name'] = filtered_nodes['name'].apply(
            lambda name: generate_fake_entity(entity_type_lower) if len(name) <= 3 else name
        )

        # If filtered_nodes is empty and the entity_type is 'entities', use intermediaries
        if len(filtered_nodes) < count and entity_type_lower == 'entities':
            intermediaries_as_entities = all_nodes[(all_nodes['type'] == 'intermediary') & (all_nodes['countries'] == country)]
            intermediaries_as_entities['type'] = 'entities'
            # Fill NaN values in the 'name' column with an empty string
            intermediaries_as_entities['name'].fillna('', inplace=True)

            # Use .loc to apply the function to rows where the length of 'name' is less than or equal to 3
            intermediaries_as_entities.loc[intermediaries_as_entities['name'].str.len() <= 3, 'name'] = intermediaries_as_entities.loc[intermediaries_as_entities['name'].str.len() <= 3, 'name'].apply(lambda name: generate_fake_entity('entities'))

            filtered_nodes = pd.concat([filtered_nodes, intermediaries_as_entities])

        # If more nodes are found than needed, sample the specified count
        if len(filtered_nodes) >= count:
            filtered_nodes = filtered_nodes.sample(n=count, random_state=42)

        filtered_nodes_list.append(filtered_nodes)

        # Check if the actual count matches the expected count
        if len(filtered_nodes) >= count:
            print(f"{entity_type} in {country}: match")
        else:
            print(f"{entity_type} in {country}: not match (Needed = {count}, Available = {len(filtered_nodes)})")

    # Concatenate all filtered nodes into a single DataFrame
    all_filtered_nodes = pd.concat(filtered_nodes_list)
    return all_filtered_nodes

# Assuming 'grouped_data' is already defined as a DataFrame
grouped_dict = {(row['Type'], row['Country']): row['Count'] for _, row in grouped_data.iterrows()}

# Load data
all_nodes, edges = load_clean_data()

# Filter and save nodes
filtered_nodes = filter_save_usa_and_foreign_nodes(grouped_dict, all_nodes)
filtered_nodes.to_csv(output_nodes_file, index=False)


  if (await self.run_code(code, result,  async_=asy)):
  if (await self.run_code(code, result,  async_=asy)):
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


entities in Argentina: match
officer in Argentina: match
entities in Australia: match
officer in Australia: match
entities in Canada: match
officer in Canada: match
entities in Chile: match
officer in Chile: match
entities in China: match
officer in China: match
entities in France: match
officer in France: match
entities in Germany: match
officer in Germany: match
entities in India: match


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  downcast=downcast,
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  isetter(ilocs[0], value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/in

officer in India: match
entities in Iran: match
officer in Iran: match
entities in Isle-Of-Man: match
officer in Isle-Of-Man: match
entities in Israel: match
officer in Israel: match
entities in Italy: match
officer in Italy: match
entities in Luxembourg: match
officer in Luxembourg: match
entities in Marocco: match
officer in Marocco: match
entities in Mexico: match
officer in Mexico: match
entities in Nicaragua: match
officer in Nicaragua: match
entities in Nigeria: match
officer in Nigeria: match
entities in North-Korea: match
officer in North-Korea: match
entities in Panama: match
officer in Panama: match
entities in Portugal: match
officer in Portugal: match
entities in Qatar: match
officer in Qatar: match
entities in Russia: match
officer in Russia: match
entities in Singapore: match


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  downcast=downcast,
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  isetter(ilocs[0], value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/in

officer in Singapore: match
entities in South Korea: match
officer in South Korea: match
entities in South-Africa: match
officer in South-Africa: match
entities in Spain: match
officer in Spain: match
entities in Sweden: match


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  downcast=downcast,
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  isetter(ilocs[0], value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/in

officer in Sweden: match
entities in Switzerland: match
officer in Switzerland: match
entities in Syria: match
officer in Syria: match
entities in United States: match
officer in United States: match
entities in United-Kingdom: match
officer in United-Kingdom: match
entities in Venezuela: match
officer in Venezuela: match
entities in Virgin-Islands-Us: match
officer in Virgin-Islands-Us: match


In [8]:
import pandas as pd

def create_mappings(filtered_df, nodes_pd, used_nodes):
    mapping_list = []

    # Rename columns for consistency
    filtered_df = filtered_df.rename(columns={'countries': 'Country', 'type': 'Type'})
    nodes_pd = nodes_pd.rename(columns={ 'Type': 'Type', 'Country': 'Country'})

    # Get unique country-type combinations
    country_type_combinations = nodes_pd[['Country', 'Type']].drop_duplicates().to_dict('records')
    print(f"Unique country-type combinations in nodes_pd: {country_type_combinations}")

    # Convert nodes to a dictionary for faster lookups
    nodes_dict = filtered_df.to_dict('records')
    nodes_by_country_type = {}
    for node in nodes_dict:
        country_type_key = (node['Country'], node['Type'])
        if country_type_key not in nodes_by_country_type:
            nodes_by_country_type[country_type_key] = []
        nodes_by_country_type[country_type_key].append(node)

    for combo in country_type_combinations:
        country = combo['Country']
        type_ = combo['Type']
        country_type_key = (country, type_)

        # Filter individuals by country and type
        country_type_individuals_df = nodes_pd[(nodes_pd['Country'] == country) & (nodes_pd['Type'] == type_)]

        # Get nodes for the country and type, excluding already used nodes
        available_nodes = [node for node in nodes_by_country_type.get(country_type_key, []) if node['node_id'] not in used_nodes]

        print(f"Processing country: {country}, type: {type_}")
        print(f"Number of individuals in {country} of type {type_}: {len(country_type_individuals_df)}")
        print(f"Number of available nodes in {country} of type {type_}: {len(available_nodes)}")

        if available_nodes:
            # Sample nodes for each customer
            for _, customer_row in country_type_individuals_df.iterrows():
                if available_nodes:
                    sampled_node = available_nodes.pop(0)
                    node_id = sampled_node['node_id']

                    # Mark the node as used
                    used_nodes.add(node_id)

                    # Map Customer_Id to the sampled node
                    customer_id = customer_row['User_Id']
                    mapping_list.append({
                        'User_Id': customer_id,
                        'node_id': sampled_node['node_id'],
                        'name': sampled_node['name'],
                        'type': sampled_node['Type'],
                        'country_codes': sampled_node['country_codes'],
                        'countries': sampled_node['Country'],
                        'is_pep': customer_row['is_pep']
                    })
                else:
                    print(f"No available nodes left for country: {country}, type: {type_}")
        else:
            print(f"No nodes available for country: {country}, type: {type_}")

    return mapping_list

filtered_df = pd.DataFrame(filtered_nodes)
nodes_pd = pd.DataFrame(combined_aggregated_data)
used_nodes = set()

# Create mappings
mappings = create_mappings(filtered_df, nodes_pd, used_nodes)
mappings_df = pd.DataFrame(mappings)


Unique country-type combinations in nodes_pd: [{'Country': 'United States', 'Type': 'entities'}, {'Country': 'United-Kingdom', 'Type': 'entities'}, {'Country': 'Italy', 'Type': 'entities'}, {'Country': 'Nicaragua', 'Type': 'entities'}, {'Country': 'Panama', 'Type': 'entities'}, {'Country': 'Luxembourg', 'Type': 'entities'}, {'Country': 'France', 'Type': 'entities'}, {'Country': 'South-Africa', 'Type': 'entities'}, {'Country': 'Singapore', 'Type': 'entities'}, {'Country': 'Portugal', 'Type': 'entities'}, {'Country': 'Qatar', 'Type': 'entities'}, {'Country': 'Canada', 'Type': 'entities'}, {'Country': 'Germany', 'Type': 'entities'}, {'Country': 'India', 'Type': 'entities'}, {'Country': 'China', 'Type': 'entities'}, {'Country': 'Spain', 'Type': 'entities'}, {'Country': 'Australia', 'Type': 'entities'}, {'Country': 'Israel', 'Type': 'entities'}, {'Country': 'Virgin-Islands-Us', 'Type': 'entities'}, {'Country': 'Isle-Of-Man', 'Type': 'entities'}, {'Country': 'Mexico', 'Type': 'entities'}, {'

In [9]:
import random

mappings_df = mappings_df.rename(columns={'User_Id': 'account_number'})

# Function to generate unique 6-digit User_Id
def generate_unique_ids(n):
    ids = set()
    while len(ids) < n:
        new_id = random.randint(100000, 999999)
        ids.add(new_id)
    return list(ids)

# Generate unique 6-digit User_Id values
num_records = len(mappings_df)
new_user_ids = generate_unique_ids(num_records)
# mappings_df = mappings_df.drop(columns=['user_Id'])

# Assign new User_Id values to the DataFrame
mappings_df['customer_id'] = new_user_ids

# Print the updated DataFrame
print(mappings_df)
mappings_df.to_csv('Thesis/mappings.csv', index=False)
print("Mappings DataFrame saved to 'mappings.csv'.")

                    account_number   node_id  \
0       BILLING-COMPANY-1000026-06  10206359   
1       BILLING-COMPANY-1000138-00  10023735   
2       BILLING-COMPANY-1000188-04    163868   
3       BILLING-COMPANY-1000282-05  10057656   
4       BILLING-COMPANY-1000643-04    156515   
...                            ...       ...   
222755  STANDARD-COMPANY-993341-00  30019278   
222756  STANDARD-COMPANY-996272-00  30020097   
222757   STANDARD-COMPANY-99731-00  30009231   
222758   STANDARD-COMPANY-99756-00  30014526   
222759  STANDARD-COMPANY-999187-00  30020943   

                                              name      type country_codes  \
0       KINGSTONE_INTERNATIONAL_INVESTMENT_CO,_LTD  entities           USA   
1                             BM_ENTERTAINMENT_LTD  entities           USA   
2                ALYSSON_INTERNATIONAL_CORPORATION  entities           USA   
3                                       TAMWIN_LTD  entities           USA   
4                              TR

In [10]:
total_users = len(mappings_df)
print("Total Number of Users:", total_users)

# Count the number of PEPs (is_pep = 1) and non-PEPs (is_pep = 0)
pep_counts = mappings_df['is_pep'].value_counts()
print("PEP Counts:\n", pep_counts)

Total Number of Users: 222760
PEP Counts:
 0    200957
1     21803
Name: is_pep, dtype: int64


## Masking

In [11]:


mappings_df = pd.read_csv('Thesis/mappings.csv')

# Initialize Faker
fake = Faker()

# Sets to keep track of generated names to ensure uniqueness
generated_company_names = set()
generated_person_names = set()

# Function to generate a unique company name with a two-digit suffix
def generate_unique_company_name():
    name = fake.company() + " LTD " + str(random.randint(10, 99))
    while name in generated_company_names:
        name = fake.company() + " LTD " + str(random.randint(10, 99))
    generated_company_names.add(name)
    return name

# Function to generate a unique person name with a two-digit suffix
def generate_unique_person_name():
    name = fake.name() + " " + str(random.randint(10, 99))
    while name in generated_person_names:
        name = fake.name() + " " + str(random.randint(10, 99))
    generated_person_names.add(name)
    return name

# Mask customer_name based on customer_type
def mask_customer_name(row):
    if row['type'] == 'entities':
        return generate_unique_company_name()
    elif row['type'] == 'officer':
        return generate_unique_person_name()
    else:
        return row['name']  # Keep the original name if type is not matched

# Function to generate a dummy account part
def generate_dummy_account_part():
    return str(random.randint(1000, 9999))

# Mask the last two parts of the account number
def mask_account_number(account_number):
    parts = account_number.split('-')
    if len(parts) > 2:
        parts[-2] = generate_dummy_account_part()  # Replace the second-to-last part
        parts[-1] = generate_dummy_account_part()  # Replace the last part
    return '-'.join(parts)

mappings_df['name_masked'] = mappings_df.apply(mask_customer_name, axis=1)
mappings_df['account_number_masked'] = mappings_df['account_number'].apply(mask_account_number)


# Print the updated DataFrame
print(mappings_df)

# Save the masked mappings DataFrame to a CSV file
mappings_df.to_csv('Thesis/masked_mappings.csv', index=False)
print("Masked mappings DataFrame saved to 'masked_mappings.csv'.")


                    account_number   node_id  \
0       BILLING-COMPANY-1000026-06  10206359   
1       BILLING-COMPANY-1000138-00  10023735   
2       BILLING-COMPANY-1000188-04    163868   
3       BILLING-COMPANY-1000282-05  10057656   
4       BILLING-COMPANY-1000643-04    156515   
...                            ...       ...   
222755  STANDARD-COMPANY-993341-00  30019278   
222756  STANDARD-COMPANY-996272-00  30020097   
222757   STANDARD-COMPANY-99731-00  30009231   
222758   STANDARD-COMPANY-99756-00  30014526   
222759  STANDARD-COMPANY-999187-00  30020943   

                                              name      type country_codes  \
0       KINGSTONE_INTERNATIONAL_INVESTMENT_CO,_LTD  entities           USA   
1                             BM_ENTERTAINMENT_LTD  entities           USA   
2                ALYSSON_INTERNATIONAL_CORPORATION  entities           USA   
3                                       TAMWIN_LTD  entities           USA   
4                              TR

## Join JPMorgan and User (Masking)

In [12]:
transactions_df = pd.read_csv('Thesis/transactions.csv')

# Read the masked_mappings.csv file
mappings_df = pd.read_csv('Thesis/masked_mappings.csv')
# Join transactions_df with mappings_df on sender_id and Bene_id to account_number
sender_mappings_df = mappings_df.add_suffix('_Sender')
bene_mappings_df = mappings_df.add_suffix('_Bene')

# Perform the joins
joined_df = transactions_df.merge(sender_mappings_df, left_on='Sender_Id', right_on='account_number_Sender', how='left')
joined_df = joined_df.merge(bene_mappings_df, left_on='Bene_Id', right_on='account_number_Bene', how='inner')



In [13]:
columns_to_drop = [
    'Bene_Id', 'node_id_Sender', 'type_Sender', 'country_codes_Sender', 'countries_Sender','account_number_Sender','name_Sender',
    'Sender_Id', 'node_id_Bene', 'type_Bene', 'country_codes_Bene', 'countries_Bene','account_number_Bene','name_Bene'
]
joined_df = joined_df.drop(columns=columns_to_drop)
joined_df.head()

Unnamed: 0,Time_step,Label,Transaction_Id,Sender_Account,Sender_Institution,Sender_Country,USD_amount,Bene_Account,Bene_Institution,Bene_Country,Transaction_Type,is_pep_Sender,customer_id_Sender,name_masked_Sender,account_number_masked_Sender,is_pep_Bene,customer_id_Bene,name_masked_Bene,account_number_masked_Bene
0,2022-01-07 00:02:00,0,T-174791-02,CHECKING-174582-02,JPMORGANCHASE,United States,0.0,CHECKING-174582-02,JPMORGANCHASE,United States,KYC-ADD-ACCOUNT-OWNER,0.0,191692.0,Tanya Lopez 96,JPMC-CLIENT-4323-9405,0,148629,Christopher Brown 19,CUSTOMER-7684-8182
1,2022-01-07 00:02:00,0,T-105637-03,CHECKING-105426-03,JPMORGANCHASE,United States,0.0,CHECKING-105426-03,JPMORGANCHASE,United States,KYC-ADD-ACCOUNT-OWNER,0.0,177524.0,Amber Stokes 18,JPMC-CLIENT-9768-4398,0,138470,Ryan White 87,CUSTOMER-4405-5677
2,2022-07-24 00:51:00,0,T-105642-03,CHECKING-105425-03,JPMORGANCHASE,United States,0.0,CHECKING-105425-03,JPMORGANCHASE,United States,KYC-ADD-ACCOUNT-OWNER,0.0,177524.0,Amber Stokes 18,JPMC-CLIENT-9768-4398,0,138470,Ryan White 87,CUSTOMER-4405-5677
3,2022-01-07 00:02:00,0,T-235858-04,CHECKING-235577-04,JPMORGANCHASE,United States,0.0,CHECKING-235577-04,JPMORGANCHASE,United States,KYC-ADD-ACCOUNT-OWNER,0.0,608133.0,Mckinney-Hanna LTD 15,COMPANY-1419-6653,0,199857,Parker Rush 37,JPMC-CLIENT-3264-6924
4,2022-01-07 00:02:00,0,T-235858-04,CHECKING-235577-04,JPMORGANCHASE,United States,0.0,CHECKING-235577-04,JPMORGANCHASE,United States,KYC-ADD-ACCOUNT-OWNER,0.0,608133.0,Mckinney-Hanna LTD 15,COMPANY-1419-6653,0,981288,Jonathan Young 58,JPMC-CLIENT-1430-7697


In [14]:
joined_df['is_pep_Sender'] = joined_df['is_pep_Sender'].fillna(0).astype(int)
joined_df['customer_id_Sender'] = joined_df['customer_id_Sender'].fillna(0).astype(int)

joined_df['is_pep_Bene'] = joined_df['is_pep_Bene'].fillna(0).astype(int)
joined_df['customer_id_Bene'] = joined_df['customer_id_Bene'].fillna(0).astype(int)

joined_df = joined_df.rename(columns={'is_pep_Sender': 'Sender_Is_Pep', 'customer_id_Sender': 'Sender_Customer_Id',
                                      'is_pep_Bene':'Bene_Is_Pep','customer_id_Bene':'Bene_Customer_Id',
                                      'name_masked_Sender':'Sender_Name_Masked','USD_amount':'USD_Amount',
                                      'account_number_masked_Sender':'Sender_Account_Number_Masked','name_masked_Bene':'Bene_Name_Masked',
                                      'account_number_masked_Bene':'Bene_Account_Number_Masked'})
joined_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1262263 entries, 0 to 1262262
Data columns (total 19 columns):
 #   Column                        Non-Null Count    Dtype  
---  ------                        --------------    -----  
 0   Time_step                     1262263 non-null  object 
 1   Label                         1262263 non-null  int64  
 2   Transaction_Id                1262263 non-null  object 
 3   Sender_Account                748161 non-null   object 
 4   Sender_Institution            748161 non-null   object 
 5   Sender_Country                748161 non-null   object 
 6   USD_Amount                    1262263 non-null  float64
 7   Bene_Account                  1262263 non-null  object 
 8   Bene_Institution              1262263 non-null  object 
 9   Bene_Country                  1262263 non-null  object 
 10  Transaction_Type              1262263 non-null  object 
 11  Sender_Is_Pep                 1262263 non-null  int64  
 12  Sender_Customer_Id          

In [15]:
joined_df.to_csv('Thesis/transactions_all.csv', index=False)


## Split Train Test Dataset

In [17]:
transactions_df = pd.read_csv('Thesis/transactions_all.csv')

# Convert the date column to datetime if it isn't already
transactions_df['Time_step'] = pd.to_datetime(transactions_df['Time_step'])

# Sort the DataFrame by the Time_step column to ensure chronological order
transactions_df = transactions_df.sort_values(by='Time_step')

# Calculate the 70% split point based on the number of rows
split_index = int(len(transactions_df) * 0.7)

# Print the date at the split index
split_date = transactions_df.iloc[split_index]['Time_step']
print(f"The date at the 70% split point is: {split_date}")

# Split the data into train and test sets based on the calculated split index
train_df = transactions_df.iloc[:split_index]
test_df = transactions_df.iloc[split_index:]

# Print the class distribution in the train and test sets
print("Class distribution in the train set:")
print(train_df['Label'].value_counts())
print("Class distribution in the test set:")
print(test_dfin['Label'].value_counts())

# Save the train and test sets to CSV files
train_df.to_csv('Thesis/train.csv', index=False)
test_df.to_csv('Thesis/test.csv', index=False)
print("Train and test sets saved to 'Thesis/' directory.")


The date at the 70% split point is: 2023-05-03 20:41:00
Class distribution in the train set:
0    632532
1    251052
Name: Label, dtype: int64
Class distribution in the test set:
0    195666
1    183013
Name: Label, dtype: int64
Train and test sets saved to 'Thesis/' directory.


## Fuzzy Rules

In [41]:
# !pip install scikit-fuzzy

Keyring is skipped due to an exception: org.freedesktop.DBus.Error.FileNotFound: Failed to connect to socket /run/user/16444/bus: No such file or directory
Defaulting to user installation because normal site-packages is not writeable
Collecting scikit-fuzzy
  Using cached scikit-fuzzy-0.4.2.tar.gz (993 kB)
Collecting networkx>=1.9.0
  Using cached networkx-2.5.1-py3-none-any.whl (1.6 MB)
Building wheels for collected packages: scikit-fuzzy
  Building wheel for scikit-fuzzy (setup.py) ... [?25ldone
[?25h  Created wheel for scikit-fuzzy: filename=scikit_fuzzy-0.4.2-py3-none-any.whl size=894069 sha256=27377fb16f58f458f645c07b9f72f852ca7df137c9e4aafa9f8d7bb6a5693cdc
  Stored in directory: /home/echristi/.cache/pip/wheels/31/1e/58/db8cfe08f81c72d8c31bc58690ce63d9e3d93a6e97dca5ddb4
Successfully built scikit-fuzzy
Installing collected packages: networkx, scikit-fuzzy
Successfully installed networkx-2.5.1 scikit-fuzzy-0.4.2
You should consider upgrading via the '/usr/bin/python3 -m pip insta

In [33]:
import pandas as pd
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np
import pandas as pd
import skfuzzy as fuzz
import os
from skfuzzy import control as ctrl
import pandas as pd
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import f1_score, precision_score, recall_score
import numpy as np
import os
amount = ctrl.Antecedent(np.arange(0, 10000, 1000), 'amount')
cross_border = ctrl.Antecedent(np.arange(0, 2, 1), 'cross_border')
country_risk = ctrl.Antecedent(np.arange(0, 2, 1), 'country_risk')  # For country risk like Iran, Syria, North Korea
risk = ctrl.Consequent(np.arange(0, 101, 1), 'risk')

# Membership Functions
amount.automf(3, names=['low', 'medium', 'high'])
cross_border.automf(3, names=['domestic', 'mixed', 'international'])
country_risk['low'] = fuzz.trimf(country_risk.universe, [0, 0, 0.5])
country_risk['high'] = fuzz.trimf(country_risk.universe, [0.5, 1, 1])

risk['low'] = fuzz.trimf(risk.universe, [0, 0, 50])
risk['medium'] = fuzz.trimf(risk.universe, [20, 50, 80])
risk['high'] = fuzz.trimf(risk.universe, [60, 100, 100])

# Fuzzy rules
rule1 = ctrl.Rule(amount['high'] | cross_border['international'], risk['high'])
rule2 = ctrl.Rule(amount['medium'] & cross_border['domestic'], risk['medium'])
rule3 = ctrl.Rule(amount['low'] & (cross_border['domestic'] | cross_border['mixed']), risk['low'])
rule4 = ctrl.Rule(country_risk['high'], risk['high'])

# Control system setup
aml_control = ctrl.ControlSystem([rule1, rule2, rule3, rule4])
aml_sim = ctrl.ControlSystemSimulation(aml_control)

# Function to apply fuzzy system to each transaction
def evaluate_transaction(row):
    reasons = []


    aml_sim.inputs({
        'amount': row['USD_Amount'],
        'cross_border': 1 if row['Sender_Country'] != row['Bene_Country'] else 0,
        'country_risk': 1 if row['Bene_Country'] in ['Iran', 'Syria', 'North-Korea'] else 0
    })

    try:
        # Compute risk score
        aml_sim.compute()
        risk_score = aml_sim.output['risk']
        if risk_score >= 60:  # Assuming 60 as the threshold for high risk
            if row['USD_Amount'] >= amount.universe.max():
                reasons.append("High Amount")
            if 1 if row['Sender_Country'] != row['Bene_Country'] else 0 == 1:
                reasons.append("Cross Border Transaction")
            if 1 if row['Bene_Country'] in ['Iran', 'Syria', 'North-Korea'] else 0 == 1:
                reasons.append("High Risk Country")
        return risk_score, reasons
    except Exception as e:
        print(f"Error during risk evaluation: {e}")
        return np.nan, reasons  # Return NaN if an error occurs (no rules fired)



train_df['risk_score'], train_df['fuzzy_result'] = zip(*train_df.apply(evaluate_transaction, axis=1))
train_df['fuzzy_result'] = train_df['fuzzy_result'].apply(
    lambda x: 'None' if isinstance(x, list) and not x else x if isinstance(x, list) else str(x)
)
test_df['risk_score'], test_df['fuzzy_result'] = zip(*test_df.apply(evaluate_transaction, axis=1))
test_df['fuzzy_result'] = test_df['fuzzy_result'].apply(
    lambda x: 'None' if isinstance(x, list) and not x else x if isinstance(x, list) else str(x)
)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return array(a, dtype, copy=False, order=order)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation:

In [34]:
train_df.to_csv('Thesis/train_with_fuzzy_results.csv', index=False)
test_df.to_csv('Thesis/test_with_fuzzy_results.csv', index=False)


## Testing 1 GCN+LSTM

In [None]:
# !pip install torch
# !pip install torch-geometric
# !pip install torch-sparse
# !pip install torch-scatter


In [None]:
# !pip install optuna

Keyring is skipped due to an exception: org.freedesktop.DBus.Error.FileNotFound: Failed to connect to socket /run/user/16444/bus: No such file or directory
Defaulting to user installation because normal site-packages is not writeable
Collecting optuna
  Downloading optuna-3.0.6-py3-none-any.whl (348 kB)
[K     |################################| 348 kB 759 bytes/s  0:00:01
[?25hCollecting cliff
  Downloading cliff-3.10.1-py3-none-any.whl (81 kB)
[K     |################################| 81 kB 14 kB/s s eta 0:00:01
Collecting sqlalchemy>=1.3.0
  Downloading SQLAlchemy-1.4.53-cp36-cp36m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.6 MB)
[K     |################################| 1.6 MB 10 kB/s s eta 0:00:01
Collecting alembic>=1.5.0
  Downloading alembic-1.7.7-py3-none-any.whl (210 kB)
[K     |################################| 210 kB 9.6 kB/s  eta 0:00:01
Collecting colorlog
  Using cached colorlog-6.8.2-py3-none-any.whl (11 kB)
Collecting c

In [None]:

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

class EdgeGCN_LSTM(nn.Module):
    def __init__(self, hidden_channels, lstm_hidden_channels, out_channels, dropout_rate):
        super(EdgeGCN_LSTM, self).__init__()
        self.conv1 = GCNConv(1, hidden_channels)
        self.bn1 = nn.BatchNorm1d(hidden_channels)
        self.conv2 = GCNConv(hidden_channels, hidden_channels)
        self.bn2 = nn.BatchNorm1d(hidden_channels)
        self.lstm = nn.LSTM(input_size=hidden_channels * 2 + 4, hidden_size=lstm_hidden_channels, batch_first=True)
        self.lin = nn.Linear(lstm_hidden_channels, out_channels)
        self.dropout_rate = dropout_rate
    def forward(self, x, edge_index, edge_attr):
        x = F.dropout(F.relu(self.bn1(self.conv1(x, edge_index))), p=self.dropout_rate, training=self.training)
        x = F.dropout(F.relu(self.bn2(self.conv2(x, edge_index))), p=self.dropout_rate, training=self.training)
        sender_features = x[edge_index[0]]
        receiver_features = x[edge_index[1]]
        edge_features = torch.cat([sender_features, receiver_features, edge_attr], dim=1)
        edge_features = edge_features.unsqueeze(0)  # Add batch dimension for LSTM
        lstm_out, _ = self.lstm(edge_features)
        lstm_out = lstm_out.squeeze(0)  # Remove batch dimension
        out = self.lin(lstm_out)
        return out.view(-1)

class GraphDataProcessor:
    def __init__(self, df):
        self.df = df

    def undersample_df(self):
        fraud_df = self.df[self.df['Label'] == 1]
        non_fraud_df = self.df[self.df['Label'] == 0]
        print(f"Initial fraud cases: {len(fraud_df)}, non-fraud cases: {len(non_fraud_df)}")
        balanced_df = non_fraud_df.sample(len(fraud_df), random_state=42)
        self.df = pd.concat([fraud_df, balanced_df])
        print(f"Balanced dataset: {len(self.df)} records")

    def prepare_graph_data(self):
        self.undersample_df()
        self.df['Time_step'] = pd.to_datetime(self.df['Time_step'])
        self.df = self.df.sort_values(by=['Sender_Customer_Id', 'Time_step'])
        self.df['Label'] = pd.to_numeric(self.df['Label'], errors='coerce').fillna(0).astype(int)
        self.df['Days_Since_Last'] = self.df.groupby('Sender_Customer_Id')['Time_step'].diff().dt.days.fillna(0)

        all_ids = pd.concat([self.df['Sender_Customer_Id'], self.df['Bene_Customer_Id']]).unique()
        if len(all_ids) == 0:
            raise ValueError("No unique IDs found in the dataset")

        id_map = {id: idx for idx, id in enumerate(all_ids)}
        edge_index = torch.tensor([self.df['Sender_Customer_Id'].map(id_map).values, self.df['Bene_Customer_Id'].map(id_map).values], dtype=torch.long)

        node_features = torch.zeros((len(all_ids), 1))
        if self.df['Transaction_Type'].isnull().any() or self.df['USD_Amount'].isnull().any() or self.df['risk_score'].isnull().any():
            raise ValueError("Null values found in essential columns")

        transaction_type_encoded = torch.tensor(LabelEncoder().fit_transform(self.df['Transaction_Type']), dtype=torch.float).view(-1, 1)
        usd_amount = torch.tensor(StandardScaler().fit_transform(self.df[['USD_Amount']]), dtype=torch.float).view(-1, 1)
        risk_score = torch.tensor(self.df['risk_score'].values, dtype=torch.float).view(-1, 1)
        days_since_last = torch.tensor(StandardScaler().fit_transform(self.df[['Days_Since_Last']]), dtype=torch.float).view(-1, 1)

        edge_attr = torch.cat([transaction_type_encoded, usd_amount, risk_score, days_since_last], dim=1)
        edge_labels = torch.tensor(self.df['Label'].values, dtype=torch.long)

        return Data(x=node_features, edge_index=edge_index, edge_attr=edge_attr, y=edge_labels)

# Split the dataset into train, validation, and test sets
train_val_df, test_df = train_test_split(
    train_df,
    test_size=0.2,
    random_state=42,
    stratify=train_df['Label']
)

train_df, val_df = train_test_split(
    train_val_df,
    test_size=0.25,
    random_state=42,
    stratify=train_val_df['Label']
)

train_processor = GraphDataProcessor(train_df)
val_processor = GraphDataProcessor(val_df)
test_processor = GraphDataProcessor(test_df)

train_data = train_processor.prepare_graph_data()
val_data = val_processor.prepare_graph_data()
test_data = test_processor.prepare_graph_data()

train_loader = DataLoader([train_data], batch_size=32, shuffle=True)
val_loader = DataLoader([val_data], batch_size=32, shuffle=False)
test_loader = DataLoader([test_data], batch_size=32, shuffle=False)

def train(model, device, loader, optimizer, criterion):
    model.train()
    total_loss = 0
    for data in loader:
        data = data.to(device)
        optimizer.zero_grad()
        output = model(data.x, data.edge_index, data.edge_attr)
        loss = criterion(output, data.y.float())
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    return total_loss / len(loader)

def evaluate(model, device, loader, criterion):
    model.eval()
    y_true, y_pred, y_scores = [], [], []
    total_loss = 0
    with torch.no_grad():
        for data in loader:
            data = data.to(device)
            output = model(data.x, data.edge_index, data.edge_attr)
            loss = criterion(output, data.y.float())
            total_loss += loss.item()

            probs = torch.sigmoid(output).cpu().numpy()
            preds = (probs > 0.5).astype(int)

            y_scores.extend(probs)
            y_pred.extend(preds)
            y_true.extend(data.y.cpu().numpy())

    f1 = f1_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred)
    recall = recall_score(y_true, y_pred)
    auc = roc_auc_score(y_true, y_scores)
    fpr, tpr, thresholds = roc_curve(y_true, y_scores)
    ks_statistic = max(tpr - fpr)

    return total_loss / len(loader), f1, precision, recall, auc, ks_statistic

def objective(trial):
    lr = trial.suggest_loguniform('lr', 1e-5, 1e-1)
    hidden_channels = trial.suggest_categorical('hidden_channels', [16, 32, 64])
    lstm_hidden_channels = trial.suggest_categorical('lstm_hidden_channels', [16, 32, 64])
    dropout_rate = trial.suggest_uniform('dropout_rate', 0.1, 0.7)

    model = EdgeGCN_LSTM(hidden_channels=hidden_channels, lstm_hidden_channels=lstm_hidden_channels, out_channels=1, dropout_rate=dropout_rate).to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    criterion = nn.BCEWithLogitsLoss()

    best_f1 = 0
    best_model_path = "gcn_lstm.pth"
    for epoch in range(10):
        train_loss = train(model, device, train_loader, optimizer, criterion)
        val_loss, f1, precision, recall, auc, ks_statistic = evaluate(model, device, val_loader, criterion)
        if f1 > best_f1:
            best_f1 = f1
            # Save both model state and hyperparameters
            checkpoint = {
                'state_dict': model.state_dict(),
                'hyperparameters': {
                    'hidden_channels': hidden_channels,
                    'lstm_hidden_channels': lstm_hidden_channels,
                    'out_channels': 1,
                    'dropout_rate': dropout_rate
                },
                'metrics': {
                    'f1': f1,
                    'precision': precision,
                    'recall': recall,
                    'auc': auc,
                    'ks_statistic': ks_statistic
                }
            }
            torch.save(checkpoint, best_model_path)
            shutil.copy(best_model_path, f"/content/drive/My Drive/{best_model_path}")

    return best_f1  # Optimize for the best F1 score

study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=10)

print("Best trial:")
trial = study.best_trial
print(f" Value (F1 Score): {trial.value}")
print(" Params: ")
for key, value in trial.params.items():
    print(f"    {key}: {value}")

# Load the best model to print all metrics
checkpoint = torch.load(f"/content/drive/My Drive/gcn_lstm.pth")
metrics = checkpoint['metrics']
print(" Validation set metrics:")
print(f"    F1 Score: {metrics['f1']}")
print(f"    Precision: {metrics['precision']}")
print(f"    Recall: {metrics['recall']}")
print(f"    AUC: {metrics['auc']}")
print(f"    KS Statistic: {metrics['ks_statistic']}")

# Evaluate on test set
model = EdgeGCN_LSTM(
    hidden_channels=checkpoint['hyperparameters']['hidden_channels'],
    lstm_hidden_channels=checkpoint['hyperparameters']['lstm_hidden_channels'],
    out_channels=1,
    dropout_rate=checkpoint['hyperparameters']['dropout_rate']
).to(device)
model.load_state_dict(checkpoint['state_dict'])

criterion = nn.BCEWithLogitsLoss()
test_loss, test_f1, test_precision, test_recall, test_auc, test_ks_statistic = evaluate(model, device, test_loader, criterion)

print(" Test set metrics:")
print(f"    Loss: {test_loss}")
print(f"    F1 Score: {test_f1}")
print(f"    Precision: {test_precision}")
print(f"    Recall: {test_recall}")
print(f"    AUC: {test_auc}")
print(f"    KS Statistic: {test_ks_statistic}")


## Testing 2 GCN+GRU

In [7]:
train_df = pd.read_csv('/content/drive/My Drive/Thesis/train_with_fuzzy_results.csv')
test_df = pd.read_csv('/content/drive/My Drive/Thesis/test_with_fuzzy_results.csv')


In [None]:
import torch
from torch_geometric.data import Data, DataLoader
from torch_geometric.nn import GCNConv
import torch.nn as nn
import torch.nn.functional as F
import pandas as pd
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import f1_score, precision_score, recall_score, roc_auc_score, roc_curve
from sklearn.model_selection import train_test_split
import numpy as np
import optuna
import shutil

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
class GraphDataProcessor:
    def __init__(self, df):
        self.df = df

    def undersample_df(self):
        fraud_df = self.df[self.df['Label'] == 1]
        non_fraud_df = self.df[self.df['Label'] == 0]
        print(f"Initial fraud cases: {len(fraud_df)}, non-fraud cases: {len(non_fraud_df)}")
        balanced_df = non_fraud_df.sample(len(fraud_df), random_state=42)
        self.df = pd.concat([fraud_df, balanced_df])
        print(f"Balanced dataset: {len(self.df)} records")

    def prepare_graph_data(self):
        self.undersample_df()
        self.df['Time_step'] = pd.to_datetime(self.df['Time_step'])
        self.df = self.df.sort_values(by=['Sender_Customer_Id', 'Time_step'])
        self.df['Label'] = pd.to_numeric(self.df['Label'], errors='coerce').fillna(0).astype(int)
        self.df['Days_Since_Last'] = self.df.groupby('Sender_Customer_Id')['Time_step'].diff().dt.days.fillna(0)

        all_ids = pd.concat([self.df['Sender_Customer_Id'], self.df['Bene_Customer_Id']]).unique()
        if len(all_ids) == 0:
            raise ValueError("No unique IDs found in the dataset")

        id_map = {id: idx for idx, id in enumerate(all_ids)}
        edge_index = torch.tensor([self.df['Sender_Customer_Id'].map(id_map).values, self.df['Bene_Customer_Id'].map(id_map).values], dtype=torch.long)

        node_features = torch.zeros((len(all_ids), 1))
        if self.df['Transaction_Type'].isnull().any() or self.df['USD_Amount'].isnull().any() or self.df['risk_score'].isnull().any():
            raise ValueError("Null values found in essential columns")

        transaction_type_encoded = torch.tensor(LabelEncoder().fit_transform(self.df['Transaction_Type']), dtype=torch.float).view(-1, 1)
        usd_amount = torch.tensor(StandardScaler().fit_transform(self.df[['USD_Amount']]), dtype=torch.float).view(-1, 1)
        risk_score = torch.tensor(self.df['risk_score'].values, dtype=torch.float).view(-1, 1)
        days_since_last = torch.tensor(StandardScaler().fit_transform(self.df[['Days_Since_Last']]), dtype=torch.float).view(-1, 1)

        edge_attr = torch.cat([transaction_type_encoded, usd_amount, risk_score, days_since_last], dim=1)
        edge_labels = torch.tensor(self.df['Label'].values, dtype=torch.long)

        return Data(x=node_features, edge_index=edge_index, edge_attr=edge_attr, y=edge_labels)

class EdgeGCN_GRU(nn.Module):
    def __init__(self, hidden_channels, gru_hidden_channels, out_channels, dropout_rate):
        super(EdgeGCN_GRU, self).__init__()
        self.conv1 = GCNConv(1, hidden_channels)
        self.bn1 = nn.BatchNorm1d(hidden_channels)
        self.conv2 = GCNConv(hidden_channels, hidden_channels)
        self.bn2 = nn.BatchNorm1d(hidden_channels)
        self.gru = nn.GRU(input_size=hidden_channels * 2 + 4, hidden_size=gru_hidden_channels, batch_first=True)
        self.lin = nn.Linear(gru_hidden_channels, out_channels)
        self.dropout_rate = dropout_rate

    def forward(self, x, edge_index, edge_attr):
        x = F.dropout(F.relu(self.bn1(self.conv1(x, edge_index))), p=self.dropout_rate, training=self.training)
        x = F.dropout(F.relu(self.bn2(self.conv2(x, edge_index))), p=self.dropout_rate, training=self.training)
        sender_features = x[edge_index[0]]
        receiver_features = x[edge_index[1]]
        edge_features = torch.cat([sender_features, receiver_features, edge_attr], dim=1)
        edge_features = edge_features.unsqueeze(0)  # Add batch dimension for GRU
        gru_out, _ = self.gru(edge_features)
        gru_out = gru_out.squeeze(0)  # Remove batch dimension
        out = self.lin(gru_out)
        return out.view(-1)
def train(model, device, loader, optimizer, criterion):
    model.train()
    total_loss = 0
    for data in loader:
        data = data.to(device)
        optimizer.zero_grad()
        output = model(data.x, data.edge_index, data.edge_attr)
        loss = criterion(output, data.y.float())
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    return total_loss / len(loader)

def evaluate(model, device, loader, criterion):
    model.eval()
    y_true, y_pred, y_scores = [], [], []
    total_loss = 0
    with torch.no_grad():
        for data in loader:
            data = data.to(device)
            output = model(data.x, data.edge_index, data.edge_attr)
            loss = criterion(output, data.y.float())
            total_loss += loss.item()

            probs = torch.sigmoid(output).cpu().numpy()
            preds = (probs > 0.5).astype(int)

            y_scores.extend(probs)
            y_pred.extend(preds)
            y_true.extend(data.y.cpu().numpy())

    f1 = f1_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred)
    recall = recall_score(y_true, y_pred)
    auc = roc_auc_score(y_true, y_scores)
    fpr, tpr, thresholds = roc_curve(y_true, y_scores)
    ks_statistic = max(tpr - fpr)

    return total_loss / len(loader), f1, precision, recall, auc, ks_statistic
def objective(trial):
    lr = trial.suggest_loguniform('lr', 1e-5, 1e-1)
    hidden_channels = trial.suggest_categorical('hidden_channels', [16, 32, 64])
    gru_hidden_channels = trial.suggest_categorical('gru_hidden_channels', [16, 32, 64])
    dropout_rate = trial.suggest_uniform('dropout_rate', 0.1, 0.7)

    model = EdgeGCN_GRU(hidden_channels=hidden_channels, gru_hidden_channels=gru_hidden_channels, out_channels=1, dropout_rate=dropout_rate).to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    criterion = nn.BCEWithLogitsLoss()

    best_f1 = 0
    best_model_path = "gcn_gru.pth"
    for epoch in range(10):
        train_loss = train(model, device, train_loader, optimizer, criterion)
        val_loss, f1, precision, recall, auc, ks_statistic = evaluate(model, device, val_loader, criterion)
        if f1 > best_f1:
            best_f1 = f1
            # Save both model state and hyperparameters
            checkpoint = {
                'state_dict': model.state_dict(),
                'hyperparameters': {
                    'hidden_channels': hidden_channels,
                    'gru_hidden_channels': gru_hidden_channels,
                    'out_channels': 1,
                    'dropout_rate': dropout_rate
                },
                'metrics': {
                    'f1': f1,
                    'precision': precision,
                    'recall': recall,
                    'auc': auc,
                    'ks_statistic': ks_statistic
                }
            }
            torch.save(checkpoint, best_model_path)
            shutil.copy(best_model_path, f"/content/drive/My Drive/{best_model_path}")

    return best_f1  # Optimize for the best F1 score


train_val_df, test_df = train_test_split(
    train_df,
    test_size=0.2,
    random_state=42,
    stratify=train_df['Label']
)

train_df, val_df = train_test_split(
    train_val_df,
    test_size=0.25,
    random_state=42,
    stratify=train_val_df['Label']
)

train_processor = GraphDataProcessor(train_df)
val_processor = GraphDataProcessor(val_df)
test_processor = GraphDataProcessor(test_df)

train_data = train_processor.prepare_graph_data()
val_data = val_processor.prepare_graph_data()
test_data = test_processor.prepare_graph_data()

train_loader = DataLoader([train_data], batch_size=32, shuffle=True)
val_loader = DataLoader([val_data], batch_size=32, shuffle=False)
test_loader = DataLoader([test_data], batch_size=32, shuffle=False)
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=10)
print("Best trial:")
trial = study.best_trial
print(f" Value (F1 Score): {trial.value}")
print(" Params: ")
for key, value in trial.params.items():
    print(f"    {key}: {value}")

# Load the best model to print all metrics
checkpoint = torch.load(f"/content/drive/My Drive/gcn_gru.pth")
metrics = checkpoint['metrics']
print(" Validation set metrics:")
print(f"    F1 Score: {metrics['f1']}")
print(f"    Precision: {metrics['precision']}")
print(f"    Recall: {metrics['recall']}")
print(f"    AUC: {metrics['auc']}")
print(f"    KS Statistic: {metrics['ks_statistic']}")
model = EdgeGCN_GRU(
    hidden_channels=checkpoint['hyperparameters']['hidden_channels'],
    gru_hidden_channels=checkpoint['hyperparameters']['gru_hidden_channels'],
    out_channels=1,
    dropout_rate=checkpoint['hyperparameters']['dropout_rate']
).to(device)
model.load_state_dict(checkpoint['state_dict'])

criterion = nn.BCEWithLogitsLoss()
test_loss, test_f1, test_precision, test_recall, test_auc, test_ks_statistic = evaluate(model, device, test_loader, criterion)

print(" Test set metrics:")
print(f"    Loss: {test_loss}")
print(f"    F1 Score: {test_f1}")
print(f"    Precision: {test_precision}")
print(f"    Recall: {test_recall}")
print(f"    AUC: {test_auc}")
print(f"    KS Statistic: {test_ks_statistic}")


Initial fraud cases: 53343, non-fraud cases: 203323
Balanced dataset: 106686 records


  edge_index = torch.tensor([self.df['Sender_Customer_Id'].map(id_map).values, self.df['Bene_Customer_Id'].map(id_map).values], dtype=torch.long)


Initial fraud cases: 17781, non-fraud cases: 67775
Balanced dataset: 35562 records
Initial fraud cases: 17781, non-fraud cases: 67775
Balanced dataset: 35562 records


[I 2024-08-06 10:19:26,767] A new study created in memory with name: no-name-02d53bf9-977a-4234-a44f-4930786e05ce
  lr = trial.suggest_loguniform('lr', 1e-5, 1e-1)
  dropout_rate = trial.suggest_uniform('dropout_rate', 0.1, 0.7)
  _warn_prf(average, modifier, msg_start, len(result))
[I 2024-08-06 10:25:43,160] Trial 0 finished with value: 0.7127356489058617 and parameters: {'lr': 0.03182292588617258, 'hidden_channels': 32, 'gru_hidden_channels': 64, 'dropout_rate': 0.12964183950169697}. Best is trial 0 with value: 0.7127356489058617.
  lr = trial.suggest_loguniform('lr', 1e-5, 1e-1)
  dropout_rate = trial.suggest_uniform('dropout_rate', 0.1, 0.7)
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
[I 2024-08-06 10:32:01,929] Trial 1 finished with value: 0.7035024479474902 and parameters: {'lr': 0.042479005419784444, 'hidden_channels': 64, 'gru_hidden_channels': 64, 'dropout_rate': 0.4742345879102148}. Best is trial 0 with value:

## Isolation Forest

In [None]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import f1_score, precision_score, recall_score, roc_auc_score, roc_curve
from sklearn.ensemble import IsolationForest
from sklearn.model_selection import train_test_split
import numpy as np

class DataProcessor:
    def __init__(self, df):
        self.df = df

    def prepare_data(self):
        self.df['Time_step'] = pd.to_datetime(self.df['Time_step'])
        self.df = self.df.sort_values(by=['Sender_Customer_Id', 'Time_step'])
        self.df['Label'] = pd.to_numeric(self.df['Label'], errors='coerce').fillna(0).astype(int)
        self.df['Days_Since_Last'] = self.df.groupby('Sender_Customer_Id')['Time_step'].diff().dt.days.fillna(0)

        features = ['Transaction_Type', 'USD_Amount', 'risk_score', 'Days_Since_Last']
        self.df = self.df[features + ['Label']]

        le = LabelEncoder()
        self.df['Transaction_Type'] = le.fit_transform(self.df['Transaction_Type'])

        scaler = StandardScaler()
        self.df[['USD_Amount', 'risk_score', 'Days_Since_Last']] = scaler.fit_transform(self.df[['USD_Amount', 'risk_score', 'Days_Since_Last']])

        X = self.df[features].values
        y = self.df['Label'].values

        return X, y

def train_and_evaluate(X_train, y_train, X_val, y_val):
    model = IsolationForest(n_estimators=100, contamination='auto', random_state=42)
    model.fit(X_train)

    # Predict anomalies on validation set
    y_scores = -model.decision_function(X_val)
    y_pred = model.predict(X_val)
    y_pred = np.where(y_pred == 1, 0, 1)  # Convert from {1, -1} to {0, 1}

    # Evaluation metrics
    f1 = f1_score(y_val, y_pred)
    precision = precision_score(y_val, y_pred)
    recall = recall_score(y_val, y_pred)
    auc = roc_auc_score(y_val, y_scores)
    fpr, tpr, thresholds = roc_curve(y_val, y_scores)
    ks_statistic = max(tpr - fpr)

    return f1, precision, recall, auc, ks_statistic, model

def test_model(model, X_test, y_test):
    y_scores = -model.decision_function(X_test)
    y_pred = model.predict(X_test)
    y_pred = np.where(y_pred == 1, 0, 1)  # Convert from {1, -1} to {0, 1}

    # Evaluation metrics
    f1 = f1_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    auc = roc_auc_score(y_test, y_scores)
    fpr, tpr, thresholds = roc_curve(y_test, y_scores)
    ks_statistic = max(tpr - fpr)

    return f1, precision, recall, auc, ks_statistic

# Load your data
# Assuming train_df, val_df, and test_df are already loaded DataFrames

# Data processing
train_processor = DataProcessor(train_df)
val_processor = DataProcessor(val_df)
test_processor = DataProcessor(test_df)

X_train, y_train = train_processor.prepare_data()
X_val, y_val = val_processor.prepare_data()
X_test, y_test = test_processor.prepare_data()

# Train and evaluate the model on validation set
f1, precision, recall, auc, ks_statistic, trained_model = train_and_evaluate(X_train, y_train, X_val, y_val)

# Print the validation metrics
print(" Validation set metrics:")
print(f"    F1 Score: {f1}")
print(f"    Precision: {precision}")
print(f"    Recall: {recall}")
print(f"    AUC: {auc}")
print(f"    KS Statistic: {ks_statistic}")

# Test the model on the test set
test_f1, test_precision, test_recall, test_auc, test_ks_statistic = test_model(trained_model, X_test, y_test)

# Print the test metrics
print(" Test set metrics:")
print(f"    F1 Score: {test_f1}")
print(f"    Precision: {test_precision}")
print(f"    Recall: {test_recall}")
print(f"    AUC: {test_auc}")
print(f"    KS Statistic: {test_ks_statistic}")


## GAT with GRU

In [None]:
import torch
from torch_geometric.data import Data, DataLoader
from torch_geometric.nn import GATConv
import torch.nn as nn
import torch.nn.functional as F
import pandas as pd
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import f1_score, precision_score, recall_score, roc_auc_score, roc_curve
from sklearn.model_selection import train_test_split
import numpy as np
import optuna
import shutil

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

class GraphDataProcessor:
    def __init__(self, df):
        self.df = df

    def undersample_df(self):
        fraud_df = self.df[self.df['Label'] == 1]
        non_fraud_df = self.df[self.df['Label'] == 0]
        print(f"Initial fraud cases: {len(fraud_df)}, non-fraud cases: {len(non_fraud_df)}")
        balanced_df = non_fraud_df.sample(len(fraud_df), random_state=42)
        self.df = pd.concat([fraud_df, balanced_df])
        print(f"Balanced dataset: {len(self.df)} records")

    def prepare_graph_data(self):
        self.undersample_df()
        self.df['Time_step'] = pd.to_datetime(self.df['Time_step'])
        self.df = self.df.sort_values(by=['Sender_Customer_Id', 'Time_step'])
        self.df['Label'] = pd.to_numeric(self.df['Label'], errors='coerce').fillna(0).astype(int)
        self.df['Days_Since_Last'] = self.df.groupby('Sender_Customer_Id')['Time_step'].diff().dt.days.fillna(0)

        all_ids = pd.concat([self.df['Sender_Customer_Id'], self.df['Bene_Customer_Id']]).unique()
        if len(all_ids) == 0:
            raise ValueError("No unique IDs found in the dataset")

        id_map = {id: idx for idx, id in enumerate(all_ids)}
        edge_index = torch.tensor([self.df['Sender_Customer_Id'].map(id_map).values, self.df['Bene_Customer_Id'].map(id_map).values], dtype=torch.long)

        node_features = torch.zeros((len(all_ids), 1))
        if self.df['Transaction_Type'].isnull().any() or self.df['USD_Amount'].isnull().any() or self.df['risk_score'].isnull().any():
            raise ValueError("Null values found in essential columns")

        transaction_type_encoded = torch.tensor(LabelEncoder().fit_transform(self.df['Transaction_Type']), dtype=torch.float).view(-1, 1)
        usd_amount = torch.tensor(StandardScaler().fit_transform(self.df[['USD_Amount']]), dtype=torch.float).view(-1, 1)
        risk_score = torch.tensor(self.df['risk_score'].values, dtype=torch.float).view(-1, 1)
        days_since_last = torch.tensor(StandardScaler().fit_transform(self.df[['Days_Since_Last']]), dtype=torch.float).view(-1, 1)

        edge_attr = torch.cat([transaction_type_encoded, usd_amount, risk_score, days_since_last], dim=1)
        edge_labels = torch.tensor(self.df['Label'].values, dtype=torch.long)

        return Data(x=node_features, edge_index=edge_index, edge_attr=edge_attr, y=edge_labels)

class EdgeGAT_GRU(nn.Module):
    def __init__(self, hidden_channels, gru_hidden_channels, out_channels, dropout_rate, heads=1):
        super(EdgeGAT_GRU, self).__init__()
        self.conv1 = GATConv(1, hidden_channels, heads=heads)
        self.bn1 = nn.BatchNorm1d(hidden_channels * heads)
        self.conv2 = GATConv(hidden_channels * heads, hidden_channels, heads=heads)
        self.bn2 = nn.BatchNorm1d(hidden_channels * heads)
        self.gru = nn.GRU(input_size=hidden_channels * 2 * heads + 4, hidden_size=gru_hidden_channels, batch_first=True)
        self.lin = nn.Linear(gru_hidden_channels, out_channels)
        self.dropout_rate = dropout_rate

    def forward(self, x, edge_index, edge_attr):
        x = F.dropout(F.relu(self.bn1(self.conv1(x, edge_index))), p=self.dropout_rate, training=self.training)
        x = F.dropout(F.relu(self.bn2(self.conv2(x, edge_index))), p=self.dropout_rate, training=self.training)
        sender_features = x[edge_index[0]]
        receiver_features = x[edge_index[1]]
        edge_features = torch.cat([sender_features, receiver_features, edge_attr], dim=1)
        edge_features = edge_features.unsqueeze(0)  # Add batch dimension for GRU
        gru_out, _ = self.gru(edge_features)
        gru_out = gru_out.squeeze(0)
        out = self.lin(gru_out)
        return out.view(-1)

def train(model, device, loader, optimizer, criterion):
    model.train()
    total_loss = 0
    for data in loader:
        data = data.to(device)
        optimizer.zero_grad()
        output = model(data.x, data.edge_index, data.edge_attr)
        loss = criterion(output, data.y.float())
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    return total_loss / len(loader)

def evaluate(model, device, loader, criterion):
    model.eval()
    y_true, y_pred, y_scores = [], [], []
    total_loss = 0
    with torch.no_grad():
        for data in loader:
            data = data.to(device)
            output = model(data.x, data.edge_index, data.edge_attr)
            loss = criterion(output, data.y.float())
            total_loss += loss.item()

            probs = torch.sigmoid(output).cpu().numpy()
            preds = (probs > 0.5).astype(int)

            y_scores.extend(probs)
            y_pred.extend(preds)
            y_true.extend(data.y.cpu().numpy())

    f1 = f1_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred)
    recall = recall_score(y_true, y_pred)
    auc = roc_auc_score(y_true, y_scores)
    fpr, tpr, thresholds = roc_curve(y_true, y_scores)
    ks_statistic = max(tpr - fpr)

    return total_loss / len(loader), f1, precision, recall, auc, ks_statistic

def objective(trial):
    lr = trial.suggest_loguniform('lr', 1e-5, 1e-1)
    hidden_channels = trial.suggest_categorical('hidden_channels', [16, 32, 64])
    gru_hidden_channels = trial.suggest_categorical('gru_hidden_channels', [16, 32, 64])
    dropout_rate = trial.suggest_uniform('dropout_rate', 0.1, 0.7)
    heads = trial.suggest_int('heads', 1, 4)

    model = EdgeGAT_GRU(hidden_channels=hidden_channels, gru_hidden_channels=gru_hidden_channels, out_channels=1, dropout_rate=dropout_rate, heads=heads).to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    criterion = nn.BCEWithLogitsLoss()

    best_f1 = 0
    best_model_path = "gat_gru.pth"
    for epoch in range(10):
        train_loss = train(model, device, train_loader, optimizer, criterion)
        val_loss, f1, precision, recall, auc, ks_statistic = evaluate(model, device, val_loader, criterion)
        if f1 > best_f1:
            best_f1 = f1
            # Save both model state and hyperparameters
            checkpoint = {
                'state_dict': model.state_dict(),
                'hyperparameters': {
                    'hidden_channels': hidden_channels,
                    'gru_hidden_channels': gru_hidden_channels,
                    'out_channels': 1,
                    'dropout_rate': dropout_rate,
                    'heads': heads
                },
                'metrics': {
                    'f1': f1,
                    'precision': precision,
                    'recall': recall,
                    'auc': auc,
                    'ks_statistic': ks_statistic
                }
            }
            torch.save(checkpoint, best_model_path)
            shutil.copy(best_model_path, f"/content/drive/My Drive/{best_model_path}")

    return best_f1  # Optimize for the best F1 score

# Split data into training, validation, and test sets
train_val_df, test_df = train_test_split(
    train_df,
    test_size=0.2,
    random_state=42,
    stratify=train_df['Label']
)

train_df, val_df = train_test_split(
    train_val_df,
    test_size=0.25,
    random_state=42,
    stratify=train_val_df['Label']
)

train_processor = GraphDataProcessor(train_df)
val_processor = GraphDataProcessor(val_df)
test_processor = GraphDataProcessor(test_df)

train_data = train_processor.prepare_graph_data()
val_data = val_processor.prepare_graph_data()
test_data = test_processor.prepare_graph_data()

train_loader = DataLoader([train_data], batch_size=32, shuffle=True)
val_loader = DataLoader([val_data], batch_size=32, shuffle=False)
test_loader = DataLoader([test_data], batch_size=32, shuffle=False)

# Optuna study for hyperparameter optimization
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=10)

print("Best trial:")
trial = study.best_trial
print(f" Value (F1 Score): {trial.value}")
print(" Params: ")
for key, value in trial.params.items():
    print(f"    {key}: {value}")

# Load the best model to print all metrics
checkpoint = torch.load(f"/content/drive/My Drive/gat_gru.pth")
metrics = checkpoint['metrics']
print(" Validation set metrics:")
print(f"    F1 Score: {metrics['f1']}")
print(f"    Precision: {metrics['precision']}")
print(f"    Recall: {metrics['recall']}")
print(f"    AUC: {metrics['auc']}")
print(f"    KS Statistic: {metrics['ks_statistic']}")

model = EdgeGAT_GRU(
    hidden_channels=checkpoint['hyperparameters']['hidden_channels'],
    gru_hidden_channels=checkpoint['hyperparameters']['gru_hidden_channels'],
    out_channels=1,
    dropout_rate=checkpoint['hyperparameters']['dropout_rate'],
    heads=checkpoint['hyperparameters']['heads']
).to(device)
model.load_state_dict(checkpoint['state_dict'])

criterion = nn.BCEWithLogitsLoss()
test_loss, test_f1, test_precision, test_recall, test_auc, test_ks_statistic = evaluate(model, device, test_loader, criterion)

print(" Test set metrics:")
print(f"    Loss: {test_loss}")
print(f"    F1 Score: {test_f1}")
print(f"    Precision: {test_precision}")
print(f"    Recall: {test_recall}")
print(f"    AUC: {test_auc}")
print(f"    KS Statistic: {test_ks_statistic}")
