In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import IsolationForest
from sklearn.metrics import f1_score
import warnings

# Silence every warning in the UserWarning category for the rest of the session.
# NOTE(review): UserWarning is a broad category (it is the default for
# warnings.warn), so library diagnostics raised under it are hidden too —
# consider narrowing this filter to the specific message being silenced.
warnings.filterwarnings('ignore', category=UserWarning)

print("--- Execution Started (Stable Code) ---")
print("Objective: Identify Anomalous Accounts based on their behavior.")

# --- Load Data (Make sure this name matches the file you uploaded) ---
file_path = 'PS_20174392719_1491204439457_log.csv'
try:
    df = pd.read_csv(file_path)
except Exception as e:
    print(f"Error during data loading: {e}")
    print("!!! Make sure the file name in the code (file_path) matches the file in the folder exactly !!!")
    # Re-raise rather than calling exit(): the cell still stops here, but the
    # traceback is kept and the notebook kernel is not asked to shut down.
    raise
else:
    print(f"Successfully loaded the full file ({len(df)} rows).")

# --- Step 1: Create the 'Smart Sample' ---
print("\n--- Step 1: Creating the 'Smart Sample' ---")

# Every account that appears on either side of a transaction labelled isFraud == 1.
df_fraud = df[df['isFraud'] == 1]
fraud_dest_ids = df_fraud['nameDest'].unique()
fraud_orig_ids = df_fraud['nameOrig'].unique()
all_fraud_user_ids = np.union1d(fraud_dest_ids, fraud_orig_ids)

# "Lifecycle": every transaction (fraudulent or not) that touches one of those accounts.
df_fraud_lifecycle = df[
    df['nameOrig'].isin(all_fraud_user_ids) |
    df['nameDest'].isin(all_fraud_user_ids)
]

# Reproducible random sample of the non-fraud transactions, capped at 500,000 rows.
df_normal = df[df['isFraud'] == 0]
sample_size = min(500000, len(df_normal))
df_normal_sample = df_normal.sample(n=sample_size, random_state=42)

# Both pieces are slices of `df`, so a transaction that lands in both carries the
# same index label twice. De-duplicate on that label instead of comparing row
# values: distinct transactions that happen to share every field are kept, and
# no column-by-column comparison is needed. Row order is unchanged.
df_combined = pd.concat([df_fraud_lifecycle, df_normal_sample])
df_smart_sample = df_combined[~df_combined.index.duplicated(keep='first')]
print(f"The final 'Smart Sample' was created with {len(df_smart_sample)} rows.")

# --- Step 2: Build Behavioral Profiles (Strong Features) ---
print("\n--- Step 2: Building Behavioral Profiles ---")

# (a) Per-account aggregates, each a Series indexed by account id:
#     amount received and number of distinct senders (TRANSFER / CASH_IN rows,
#     keyed by nameDest), and amount cashed out (CASH_OUT rows, keyed by nameOrig).
df_received = df_smart_sample[df_smart_sample['type'].isin(['TRANSFER', 'CASH_IN'])]
received_by_dest = df_received.groupby('nameDest')
total_received = received_by_dest['amount'].sum()
unique_senders = received_by_dest['nameOrig'].nunique()

df_cashed_out = df_smart_sample[df_smart_sample['type'] == 'CASH_OUT']
total_cashed_out = df_cashed_out.groupby('nameOrig')['amount'].sum()

# Outer-align the three aggregates on account id; an account missing from one
# aggregate contributes 0 there (same default the per-account lookup used).
final_profiles = pd.concat(
    [
        total_received.rename('received'),
        total_cashed_out.rename('cashed_out'),
        unique_senders.rename('dest_unique_senders'),
    ],
    axis=1,
).fillna(0)

# Share of received money that was cashed out. The 1e-6 avoids division by
# zero and the result is capped at 1.0 (the ratio cannot exceed 100%).
final_profiles['dest_cash_out_ratio'] = (
    final_profiles['cashed_out'] / (final_profiles['received'] + 1e-6)
).clip(upper=1.0)
# The outer alignment turned the counts into floats; restore integers.
final_profiles['dest_unique_senders'] = final_profiles['dest_unique_senders'].astype('int64')

final_profiles = (
    final_profiles[['dest_cash_out_ratio', 'dest_unique_senders']]
    .rename_axis('user_id')
    .reset_index()
)
print("Behavioral profiles created successfully.")

# --- Step 3: Merge Features with Transactions ---
print("\n--- Step 3: Merging Features with Transactions ---")

# Attach the profile table twice: once for the recipient (nameDest) and once
# for the sender (nameOrig). Columns are suffixed up front so each join adds
# its own '_dest' / '_orig' set.
recipient_profiles = final_profiles.add_suffix('_dest')
sender_profiles = final_profiles.add_suffix('_orig')
df_model_data = df_smart_sample.merge(
    recipient_profiles, how='left', left_on='nameDest', right_on='user_id_dest'
).merge(
    sender_profiles, how='left', left_on='nameOrig', right_on='user_id_orig'
)

# Accounts without a profile get 0 for every profile feature.
for profile_col in (
    'dest_cash_out_ratio_dest',
    'dest_unique_senders_dest',
    'dest_cash_out_ratio_orig',
    'dest_unique_senders_orig',
):
    df_model_data[profile_col] = df_model_data[profile_col].fillna(0)

# --- Step 4: Train an Unsupervised Model ---
print("\n--- Step 4: Training Isolation Forest Model ---")

# Integer code for the transaction type, stored next to the other features.
type_as_category = df_model_data['type'].astype('category')
df_model_data['type_encoded'] = type_as_category.cat.codes

features = [
    'amount',
    'dest_cash_out_ratio_dest',  # Recipient's cash-out ratio
    'dest_unique_senders_dest',  # Recipient's unique senders
    'dest_cash_out_ratio_orig',  # Sender's cash-out ratio
    'dest_unique_senders_orig',  # Sender's unique senders
    'type_encoded',
]

X = df_model_data.loc[:, features]
y_true = df_model_data['isFraud']  # Ground-truth label

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# The expected outlier share is set to the labelled fraud share of the sample.
contamination = y_true.mean()
print(f"Fraud (Contamination) rate in the Smart Sample: {contamination:.2%}")

model = IsolationForest(contamination=contamination, random_state=42)
# predict() returns -1 for anomalous rows and 1 for normal rows.
predictions = model.fit(X_scaled).predict(X_scaled)
print("Model training complete.")

# --- Step 5: Answer Your Question ---
print("\n--- Step 5: Identifying Anomalous Accounts ---")

# Store the model's verdict as a flag: 1 = anomalous (model said -1), 0 = normal.
df_model_data['anomaly_prediction'] = np.where(predictions == -1, 1, 0).astype('int64')

# Transactions the model flagged.
is_flagged = df_model_data['anomaly_prediction'] == 1
anomalous_transactions = df_model_data[is_flagged]

# Accounts on either side (recipient or sender) of a flagged transaction.
anomalous_dest_accounts = anomalous_transactions['nameDest'].unique()
anomalous_orig_accounts = anomalous_transactions['nameOrig'].unique()
all_anomalous_accounts = np.union1d(anomalous_dest_accounts, anomalous_orig_accounts)

print(f"\n[Final Result]: The model found {len(all_anomalous_accounts)} 'anomalous' accounts.")

# Show the first 20 flagged accounts.
print("Sample of accounts the model considered 'anomalous':")
flagged_preview = all_anomalous_accounts[:20]
print(flagged_preview)

# --- For Comparison: accounts involved in transactions labelled isFraud=1 ---
print("\n--- For Comparison (The Truth) ---")
print(f"The number of 'real' fraudulent accounts (isFraud=1) was: {len(all_fraud_user_ids)}")
print("Sample of 'real' fraudulent accounts:")
fraud_preview = all_fraud_user_ids[:20]
print(fraud_preview)

# Transaction-level F1 of the anomaly flag against the isFraud label.
f1 = f1_score(y_true, df_model_data['anomaly_prediction'])
print(f"\nThe model's F1-Score (for confirmation): {f1:.2%}")

print("\n--- Stable Code Execution Complete ---")

--- Execution Started (Stable Code) ---
Objective: Identify Anomalous Accounts based on their behavior.
Successfully loaded the full file (83561 rows).

--- Step 1: Creating the 'Smart Sample' ---
The final 'Smart Sample' was created with 83560 rows.

--- Step 2: Building Behavioral Profiles ---
Behavioral profiles created successfully.

--- Step 3: Merging Features with Transactions ---

--- Step 4: Training Isolation Forest Model ---
Fraud (Contamination) rate in the Smart Sample: 0.13%
Model training complete.

--- Step 5: Identifying Anomalous Accounts ---

[Final Result]: The model found 149 'anomalous' accounts.
Sample of accounts the model considered 'anomalous':
['C1017653240' 'C1023714065' 'C1032959800' 'C1041381648' 'C1047512213'
 'C1057507014' 'C106297322' 'C1076035261' 'C1090678364' 'C1141137903'
 'C1156269888' 'C1171007108' 'C1189372023' 'C12139181' 'C1219553025'
 'C1220897602' 'C1225081767' 'C1234776885' 'C1238013097' 'C1240696061']

--- For Comparison (The Truth) ---
The

In [9]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import IsolationForest
from sklearn.metrics import f1_score
import warnings

# Silence every warning in the UserWarning category for the rest of the session.
# NOTE(review): UserWarning is a broad category (it is the default for
# warnings.warn), so library diagnostics raised under it are hidden too —
# consider narrowing this filter to the specific message being silenced.
warnings.filterwarnings('ignore', category=UserWarning)

print("--- Execution Started (Stable Code) ---")
print("Objective: Identify Anomalous Accounts based on their behavior.")

# --- Load Data (Make sure this name matches the file you uploaded) ---
file_path = 'PS_20174392719_1491204439457_log.csv'
try:
    df = pd.read_csv(file_path)
except Exception as e:
    print(f"Error during data loading: {e}")
    print("!!! Make sure the file name in the code (file_path) matches the file in the folder exactly !!!")
    # Re-raise rather than calling exit(): the cell still stops here, but the
    # traceback is kept and the notebook kernel is not asked to shut down.
    raise
else:
    print(f"Successfully loaded the full file ({len(df)} rows).")

# --- Step 1: Create the 'Smart Sample' ---
print("\n--- Step 1: Creating the 'Smart Sample' ---")

# Every account that appears on either side of a transaction labelled isFraud == 1.
df_fraud = df[df['isFraud'] == 1]
fraud_dest_ids = df_fraud['nameDest'].unique()
fraud_orig_ids = df_fraud['nameOrig'].unique()
all_fraud_user_ids = np.union1d(fraud_dest_ids, fraud_orig_ids)

# "Lifecycle": every transaction (fraudulent or not) that touches one of those accounts.
df_fraud_lifecycle = df[
    df['nameOrig'].isin(all_fraud_user_ids) |
    df['nameDest'].isin(all_fraud_user_ids)
]

# Reproducible random sample of the non-fraud transactions, capped at 500,000 rows.
df_normal = df[df['isFraud'] == 0]
sample_size = min(500000, len(df_normal))
df_normal_sample = df_normal.sample(n=sample_size, random_state=42)

# Both pieces are slices of `df`, so a transaction that lands in both carries the
# same index label twice. De-duplicate on that label instead of comparing row
# values: distinct transactions that happen to share every field are kept, and
# no column-by-column comparison is needed. Row order is unchanged.
df_combined = pd.concat([df_fraud_lifecycle, df_normal_sample])
df_smart_sample = df_combined[~df_combined.index.duplicated(keep='first')]
print(f"The final 'Smart Sample' was created with {len(df_smart_sample)} rows.")

# --- Step 2: Build Behavioral Profiles (Strong Features) ---
print("\n--- Step 2: Building Behavioral Profiles ---")

# (a) Per-account aggregates, each a Series indexed by account id:
#     amount received and number of distinct senders (TRANSFER / CASH_IN rows,
#     keyed by nameDest), and amount cashed out (CASH_OUT rows, keyed by nameOrig).
df_received = df_smart_sample[df_smart_sample['type'].isin(['TRANSFER', 'CASH_IN'])]
received_by_dest = df_received.groupby('nameDest')
total_received = received_by_dest['amount'].sum()
unique_senders = received_by_dest['nameOrig'].nunique()

df_cashed_out = df_smart_sample[df_smart_sample['type'] == 'CASH_OUT']
total_cashed_out = df_cashed_out.groupby('nameOrig')['amount'].sum()

# Outer-align the three aggregates on account id; an account missing from one
# aggregate contributes 0 there (same default the per-account lookup used).
final_profiles = pd.concat(
    [
        total_received.rename('received'),
        total_cashed_out.rename('cashed_out'),
        unique_senders.rename('dest_unique_senders'),
    ],
    axis=1,
).fillna(0)

# Share of received money that was cashed out. The 1e-6 avoids division by
# zero and the result is capped at 1.0 (the ratio cannot exceed 100%).
final_profiles['dest_cash_out_ratio'] = (
    final_profiles['cashed_out'] / (final_profiles['received'] + 1e-6)
).clip(upper=1.0)
# The outer alignment turned the counts into floats; restore integers.
final_profiles['dest_unique_senders'] = final_profiles['dest_unique_senders'].astype('int64')

final_profiles = (
    final_profiles[['dest_cash_out_ratio', 'dest_unique_senders']]
    .rename_axis('user_id')
    .reset_index()
)
print("Behavioral profiles created successfully.")

# --- Step 3: Merge Features with Transactions ---
print("\n--- Step 3: Merging Features with Transactions ---")

# Attach the profile table twice: once for the recipient (nameDest) and once
# for the sender (nameOrig). Columns are suffixed up front so each join adds
# its own '_dest' / '_orig' set.
recipient_profiles = final_profiles.add_suffix('_dest')
sender_profiles = final_profiles.add_suffix('_orig')
df_model_data = df_smart_sample.merge(
    recipient_profiles, how='left', left_on='nameDest', right_on='user_id_dest'
).merge(
    sender_profiles, how='left', left_on='nameOrig', right_on='user_id_orig'
)

# Accounts without a profile get 0 for every profile feature.
for profile_col in (
    'dest_cash_out_ratio_dest',
    'dest_unique_senders_dest',
    'dest_cash_out_ratio_orig',
    'dest_unique_senders_orig',
):
    df_model_data[profile_col] = df_model_data[profile_col].fillna(0)

# --- Step 4: Train an Unsupervised Model ---
print("\n--- Step 4: Training Isolation Forest Model ---")

# Integer code for the transaction type, stored next to the other features.
type_as_category = df_model_data['type'].astype('category')
df_model_data['type_encoded'] = type_as_category.cat.codes

features = [
    'amount',
    'dest_cash_out_ratio_dest',  # Recipient's cash-out ratio
    'dest_unique_senders_dest',  # Recipient's unique senders
    'dest_cash_out_ratio_orig',  # Sender's cash-out ratio
    'dest_unique_senders_orig',  # Sender's unique senders
    'type_encoded',
]

X = df_model_data.loc[:, features]
y_true = df_model_data['isFraud']  # Ground-truth label

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# The expected outlier share is set to the labelled fraud share of the sample.
contamination = y_true.mean()
print(f"Fraud (Contamination) rate in the Smart Sample: {contamination:.2%}")

model = IsolationForest(contamination=contamination, random_state=42)
# predict() returns -1 for anomalous rows and 1 for normal rows.
predictions = model.fit(X_scaled).predict(X_scaled)
print("Model training complete.")

# --- Step 5: Answer Your Question ---
print("\n--- Step 5: Identifying Anomalous Accounts ---")

# Store the model's verdict as a flag: 1 = anomalous (model said -1), 0 = normal.
df_model_data['anomaly_prediction'] = np.where(predictions == -1, 1, 0).astype('int64')

# Transactions the model flagged.
is_flagged = df_model_data['anomaly_prediction'] == 1
anomalous_transactions = df_model_data[is_flagged]

# Accounts on either side (recipient or sender) of a flagged transaction.
anomalous_dest_accounts = anomalous_transactions['nameDest'].unique()
anomalous_orig_accounts = anomalous_transactions['nameOrig'].unique()
all_anomalous_accounts = np.union1d(anomalous_dest_accounts, anomalous_orig_accounts)

print(f"\n[Final Result]: The model found {len(all_anomalous_accounts)} 'anomalous' accounts.")

# Show the first 20 flagged accounts.
print("Sample of accounts the model considered 'anomalous':")
flagged_preview = all_anomalous_accounts[:20]
print(flagged_preview)

# --- For Comparison: accounts involved in transactions labelled isFraud=1 ---
print("\n--- For Comparison (The Truth) ---")
print(f"The number of 'real' fraudulent accounts (isFraud=1) was: {len(all_fraud_user_ids)}")
print("Sample of 'real' fraudulent accounts:")
fraud_preview = all_fraud_user_ids[:20]
print(fraud_preview)

# Transaction-level F1 of the anomaly flag against the isFraud label.
f1 = f1_score(y_true, df_model_data['anomaly_prediction'])
print(f"\nThe model's F1-Score (for confirmation): {f1:.2%}")

print("\n--- Stable Code Execution Complete ---")

--- Execution Started (Stable Code) ---
Objective: Identify Anomalous Accounts based on their behavior.
Successfully loaded the full file (6362620 rows).

--- Step 1: Creating the 'Smart Sample' ---
The final 'Smart Sample' was created with 561154 rows.

--- Step 2: Building Behavioral Profiles ---
Behavioral profiles created successfully.

--- Step 3: Merging Features with Transactions ---

--- Step 4: Training Isolation Forest Model ---
Fraud (Contamination) rate in the Smart Sample: 1.46%
Model training complete.

--- Step 5: Identifying Anomalous Accounts ---

[Final Result]: The model found 11376 'anomalous' accounts.
Sample of accounts the model considered 'anomalous':
['C1000156006' 'C1000484178' 'C1000628778' 'C1000868784' 'C1001249070'
 'C1001476563' 'C1001658373' 'C1001742979' 'C1001844551' 'C1002031672'
 'C1002055980' 'C1002446735' 'C1002469873' 'C1003024596' 'C1003280328'
 'C1003526443' 'C1003663195' 'C100367356' 'C1003909463' 'C100394411']

--- For Comparison (The Truth) -

In [2]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
# --- (Fix: Imported the correct model name) ---
from sklearn.ensemble import IsolationForest
from sklearn.metrics import f1_score, precision_score, recall_score, confusion_matrix
import warnings

# Silence every warning in the UserWarning category for the rest of the session.
# NOTE(review): UserWarning is a broad category (it is the default for
# warnings.warn), so library diagnostics raised under it are hidden too —
# consider narrowing this filter to the specific message being silenced.
warnings.filterwarnings('ignore', category=UserWarning)

print("--- Execution Started (Smartest Code: With 'Time' Feature) ---")
print("Objective: Improve F1-Score by adding 'avg_time_to_cash_out'.")

# --- Load Data ---
# Make sure this name matches the file you uploaded
file_path = 'PS_20174392719_1491204439457_log.csv'
try:
    df = pd.read_csv(file_path)
except Exception as e:
    print(f"Error during data loading: {e}")
    # Re-raise rather than calling exit(): the cell still stops here, but the
    # traceback is kept and the notebook kernel is not asked to shut down.
    raise
else:
    print(f"Successfully loaded the full file ({len(df)} rows).")

# --- Step 1: Create the 'Smart Sample' ---
print("\n--- Step 1: Creating the 'Smart Sample' ---")

# Every account that appears on either side of a transaction labelled isFraud == 1.
df_fraud = df[df['isFraud'] == 1]
fraud_dest_ids = df_fraud['nameDest'].unique()
fraud_orig_ids = df_fraud['nameOrig'].unique()
all_fraud_user_ids = np.union1d(fraud_dest_ids, fraud_orig_ids)

# "Lifecycle": every transaction (fraudulent or not) that touches one of those accounts.
df_fraud_lifecycle = df[
    df['nameOrig'].isin(all_fraud_user_ids) |
    df['nameDest'].isin(all_fraud_user_ids)
]

# Reproducible random sample of the non-fraud transactions, capped at 500,000 rows.
df_normal = df[df['isFraud'] == 0]
sample_size = min(500000, len(df_normal))
df_normal_sample = df_normal.sample(n=sample_size, random_state=42)

# Both pieces are slices of `df`, so a transaction that lands in both carries the
# same index label twice. De-duplicate on that label instead of comparing row
# values: distinct transactions that happen to share every field are kept, and
# no column-by-column comparison is needed. Row order is unchanged.
df_combined = pd.concat([df_fraud_lifecycle, df_normal_sample])
df_smart_sample = df_combined[~df_combined.index.duplicated(keep='first')]
print(f"The final 'Smart Sample' was created with {len(df_smart_sample)} rows.")

# --- Step 2: Building Behavioral Profiles (Including 'Time') ---
print("\n--- Step 2: Building Behavioral Profiles (Full) ---")

# (a) Per-account aggregates, each a Series indexed by account id:
#     amount received and number of distinct senders (TRANSFER / CASH_IN rows,
#     keyed by nameDest), and amount cashed out (CASH_OUT rows, keyed by nameOrig).
print("   (a) Calculating cash-out ratio and unique senders...")
df_received = df_smart_sample[df_smart_sample['type'].isin(['TRANSFER', 'CASH_IN'])]
received_by_dest = df_received.groupby('nameDest')
total_received = received_by_dest['amount'].sum()
unique_senders = received_by_dest['nameOrig'].nunique()

df_cashed_out = df_smart_sample[df_smart_sample['type'] == 'CASH_OUT']
total_cashed_out = df_cashed_out.groupby('nameOrig')['amount'].sum()

# Outer-align the three aggregates on account id; an account missing from one
# aggregate contributes 0 there (same default the per-account lookup used).
final_profiles = pd.concat(
    [
        total_received.rename('received'),
        total_cashed_out.rename('cashed_out'),
        unique_senders.rename('dest_unique_senders'),
    ],
    axis=1,
).fillna(0)

# Share of received money that was cashed out. The 1e-6 avoids division by
# zero and the result is capped at 1.0.
final_profiles['dest_cash_out_ratio'] = (
    final_profiles['cashed_out'] / (final_profiles['received'] + 1e-6)
).clip(upper=1.0)
# The outer alignment turned the counts into floats; restore integers.
final_profiles['dest_unique_senders'] = final_profiles['dest_unique_senders'].astype('int64')

final_profiles = (
    final_profiles[['dest_cash_out_ratio', 'dest_unique_senders']]
    .rename_axis('user_id')
    .reset_index()
)

# (b) Calculate average time to cash-out (the smartest feature using 'step')
print("   (b) Calculating average time to cash-out (avg_time_to_cash_out)...")

# Value used for accounts that never show the TRANSFER -> CASH_OUT pattern.
NO_PATTERN_TIME = 999

# Build the two event logs with .loc + rename/assign so that new frames are
# created, instead of mutating a filtered slice in place (the chained-indexing
# pattern that triggers pandas' SettingWithCopyWarning).
is_transfer = df_smart_sample['type'] == 'TRANSFER'
is_cash_out = df_smart_sample['type'] == 'CASH_OUT'
df_transfers = (
    df_smart_sample.loc[is_transfer, ['step', 'nameDest']]
    .rename(columns={'nameDest': 'user_id'})
    .assign(tx_type='TRANSFER_IN')
)
df_cashouts = (
    df_smart_sample.loc[is_cash_out, ['step', 'nameOrig']]
    .rename(columns={'nameOrig': 'user_id'})
    .assign(tx_type='CASH_OUT')
)

# One chronological log per account; each row also carries the step and type
# of that account's previous event.
user_log = pd.concat([df_transfers, df_cashouts]).sort_values(by=['user_id', 'step'])
user_log['prev_step'] = user_log.groupby('user_id')['step'].shift(1)
user_log['prev_type'] = user_log.groupby('user_id')['tx_type'].shift(1)

# Elapsed steps for every CASH_OUT whose immediately preceding event was an
# incoming transfer.
user_log['time_since_transfer'] = user_log['step'] - user_log['prev_step']
is_pattern = (user_log['tx_type'] == 'CASH_OUT') & (user_log['prev_type'] == 'TRANSFER_IN')
pattern_times = user_log[is_pattern]

avg_time_profile = (
    pattern_times.groupby('user_id')['time_since_transfer']
    .mean()
    .rename('avg_time_to_cash_out')
    .reset_index()
)

# (c) Aggregate final profiles
print("   (c) Aggregating final profiles...")
final_profiles = pd.merge(final_profiles, avg_time_profile, on='user_id', how='left')
# Accounts that never followed the pattern get the long placeholder time.
final_profiles['avg_time_to_cash_out'] = final_profiles['avg_time_to_cash_out'].fillna(NO_PATTERN_TIME)
print("Complex behavioral profiles created successfully.")

# --- Step 3: Merge Features with Transactions ---
print("\n--- Step 3: Merging Features with Transactions ---")

# Attach the profile table twice: once for the recipient (nameDest) and once
# for the sender (nameOrig). Columns are suffixed up front so each join adds
# its own '_dest' / '_orig' set.
recipient_profiles = final_profiles.add_suffix('_dest')
sender_profiles = final_profiles.add_suffix('_orig')
df_model_data = df_smart_sample.merge(
    recipient_profiles, how='left', left_on='nameDest', right_on='user_id_dest'
).merge(
    sender_profiles, how='left', left_on='nameOrig', right_on='user_id_orig'
)

# Default for accounts without a profile: 0 for ratio and sender count,
# 999 for the average time to cash out.
merge_fill_values = {
    'dest_cash_out_ratio_dest': 0,
    'dest_unique_senders_dest': 0,
    'avg_time_to_cash_out_dest': 999,
    'dest_cash_out_ratio_orig': 0,
    'dest_unique_senders_orig': 0,
    'avg_time_to_cash_out_orig': 999,
}
for profile_col, default_value in merge_fill_values.items():
    df_model_data[profile_col] = df_model_data[profile_col].fillna(default_value)

# --- Step 4: Train Unsupervised Model ---
print("\n--- Step 4: Training Isolation Forest Model (With New Features) ---")

# Integer code for the transaction type, stored next to the other features.
type_as_category = df_model_data['type'].astype('category')
df_model_data['type_encoded'] = type_as_category.cat.codes

features = [
    'amount',
    'dest_cash_out_ratio_dest',
    'dest_unique_senders_dest',
    'avg_time_to_cash_out_dest',  # Time feature for the recipient
    'dest_cash_out_ratio_orig',
    'dest_unique_senders_orig',
    'avg_time_to_cash_out_orig',  # Time feature for the sender
    'type_encoded',
]

X = df_model_data.loc[:, features]
y_true = df_model_data['isFraud']  # Ground-truth label (for evaluation only)

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# The expected outlier share is set to the labelled fraud share of the sample.
contamination = y_true.mean()
print(f"Fraud (Contamination) rate in the Smart Sample: {contamination:.2%}")

model = IsolationForest(contamination=contamination, random_state=42)
# predict() returns -1 for anomalous rows and 1 for normal rows.
predictions = model.fit(X_scaled).predict(X_scaled)
print("Model training complete.")

# --- Step 5: Evaluate the Model (Comparison) ---
print("\n--- Step 5: Evaluating the Model (New Score) ---")

# Map the model output onto the isFraud convention: -1 (anomalous) -> 1, else 0.
y_pred = np.where(predictions == -1, 1, 0).tolist()

# Transaction-level scores against the isFraud label.
f1 = f1_score(y_true, y_pred)
precision = precision_score(y_true, y_pred)
recall = recall_score(y_true, y_pred)
cm = confusion_matrix(y_true, y_pred)

for metric_line in (
    f"Precision: {precision:.2%}",
    f"Recall: {recall:.2%}",
    f"F1-Score (New Final Score): {f1:.2%}",
    "\nConfusion Matrix:",
):
    print(metric_line)
print(cm)

print("\n--- Smartest Code Execution Complete ---")

--- Execution Started (Smartest Code: With 'Time' Feature) ---
Objective: Improve F1-Score by adding 'avg_time_to_cash_out'.
Successfully loaded the full file (192593 rows).

--- Step 1: Creating the 'Smart Sample' ---
The final 'Smart Sample' was created with 192592 rows.

--- Step 2: Building Behavioral Profiles (Full) ---
   (a) Calculating cash-out ratio and unique senders...
   (b) Calculating average time to cash-out (avg_time_to_cash_out)...
   (c) Aggregating final profiles...
Complex behavioral profiles created successfully.

--- Step 3: Merging Features with Transactions ---

--- Step 4: Training Isolation Forest Model (With New Features) ---
Fraud (Contamination) rate in the Smart Sample: 0.07%
Model training complete.

--- Step 5: Evaluating the Model (New Score) ---
Precision: 2.10%
Recall: 2.10%
F1-Score (New Final Score): 2.10%

Confusion Matrix:
[[192309    140]
 [   140      3]]

--- Smartest Code Execution Complete ---


In [3]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
# --- (Change: We will use a Supervised model) ---
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score, precision_score, recall_score, confusion_matrix
# --- (Change: We need to split the data) ---
from sklearn.model_selection import train_test_split
import warnings

# Silence every warning in the UserWarning category for the rest of the session.
# NOTE(review): UserWarning is a broad category (it is the default for
# warnings.warn), so library diagnostics raised under it are hidden too —
# consider narrowing this filter to the specific message being silenced.
warnings.filterwarnings('ignore', category=UserWarning)

print("--- Execution Started (Successful Model: Supervised) ---")
print("Objective: Prove that 'Behavioral Features' succeed with a Supervised model.")

# --- Load Data ---
file_path = 'PS_20174392719_1491204439457_log.csv'
try:
    df = pd.read_csv(file_path)
except Exception as e:
    print(f"Error during data loading: {e}")
    # Re-raise rather than calling exit(): the cell still stops here, but the
    # traceback is kept and the notebook kernel is not asked to shut down.
    raise
else:
    print(f"Successfully loaded the full file ({len(df)} rows).")

# --- Step 1: Create the 'Smart Sample' ---
# All transactions touching a fraud-involved account, plus a random sample of
# non-fraud transactions.
print("\n--- Step 1: Creating the 'Smart Sample' ---")

# Every account that appears on either side of a transaction labelled isFraud == 1.
df_fraud = df[df['isFraud'] == 1]
fraud_dest_ids = df_fraud['nameDest'].unique()
fraud_orig_ids = df_fraud['nameOrig'].unique()
all_fraud_user_ids = np.union1d(fraud_dest_ids, fraud_orig_ids)

# "Lifecycle": every transaction (fraudulent or not) that touches one of those accounts.
df_fraud_lifecycle = df[
    df['nameOrig'].isin(all_fraud_user_ids) |
    df['nameDest'].isin(all_fraud_user_ids)
]

# Reproducible random sample of the non-fraud transactions, capped at 500,000 rows.
df_normal = df[df['isFraud'] == 0]
sample_size = min(500000, len(df_normal))
df_normal_sample = df_normal.sample(n=sample_size, random_state=42)

# Both pieces are slices of `df`, so a transaction that lands in both carries the
# same index label twice. De-duplicate on that label instead of comparing row
# values: distinct transactions that happen to share every field are kept, and
# no column-by-column comparison is needed. Row order is unchanged.
df_combined = pd.concat([df_fraud_lifecycle, df_normal_sample])
df_smart_sample = df_combined[~df_combined.index.duplicated(keep='first')]
print(f"The final 'Smart Sample' was created with {len(df_smart_sample)} rows.")

# --- Step 2: Building Behavioral Profiles (Strong Features) ---
print("\n--- Step 2: Building Behavioral Profiles ---")

# Per-account aggregates, each a Series indexed by account id:
# amount received and number of distinct senders (TRANSFER / CASH_IN rows,
# keyed by nameDest), and amount cashed out (CASH_OUT rows, keyed by nameOrig).
df_received = df_smart_sample[df_smart_sample['type'].isin(['TRANSFER', 'CASH_IN'])]
received_by_dest = df_received.groupby('nameDest')
total_received = received_by_dest['amount'].sum()
unique_senders = received_by_dest['nameOrig'].nunique()

df_cashed_out = df_smart_sample[df_smart_sample['type'] == 'CASH_OUT']
total_cashed_out = df_cashed_out.groupby('nameOrig')['amount'].sum()

# Outer-align the three aggregates on account id; an account missing from one
# aggregate contributes 0 there (same default the per-account lookup used).
final_profiles = pd.concat(
    [
        total_received.rename('received'),
        total_cashed_out.rename('cashed_out'),
        unique_senders.rename('dest_unique_senders'),
    ],
    axis=1,
).fillna(0)

# Share of received money that was cashed out. The 1e-6 avoids division by
# zero and the result is capped at 1.0.
final_profiles['dest_cash_out_ratio'] = (
    final_profiles['cashed_out'] / (final_profiles['received'] + 1e-6)
).clip(upper=1.0)
# The outer alignment turned the counts into floats; restore integers.
final_profiles['dest_unique_senders'] = final_profiles['dest_unique_senders'].astype('int64')

final_profiles = (
    final_profiles[['dest_cash_out_ratio', 'dest_unique_senders']]
    .rename_axis('user_id')
    .reset_index()
)
print("Behavioral profiles created successfully.")

# --- Step 3: Merge Features with Transactions ---
print("\n--- Step 3: Merging Features with Transactions ---")

# Attach the profile table twice: once for the recipient (nameDest) and once
# for the sender (nameOrig). Columns are suffixed up front so each join adds
# its own '_dest' / '_orig' set.
recipient_profiles = final_profiles.add_suffix('_dest')
sender_profiles = final_profiles.add_suffix('_orig')
df_model_data = df_smart_sample.merge(
    recipient_profiles, how='left', left_on='nameDest', right_on='user_id_dest'
).merge(
    sender_profiles, how='left', left_on='nameOrig', right_on='user_id_orig'
)

# Accounts without a profile get 0 for every profile feature.
for profile_col in (
    'dest_cash_out_ratio_dest',
    'dest_unique_senders_dest',
    'dest_cash_out_ratio_orig',
    'dest_unique_senders_orig',
):
    df_model_data[profile_col] = df_model_data[profile_col].fillna(0)

# --- Step 4: Prepare Train/Test Data (Supervised) ---
print("\n--- Step 4: Preparing Train/Test Data ---")

features = [
    'amount',
    'dest_cash_out_ratio_dest', # Recipient's cash-out ratio
    'dest_unique_senders_dest', # Recipient's unique senders
    'dest_cash_out_ratio_orig', # Sender's cash-out ratio
    'dest_unique_senders_orig'  # Sender's unique senders
]
# Integer code for the transaction type, used as one more feature.
df_model_data['type_encoded'] = df_model_data['type'].astype('category').cat.codes
features.append('type_encoded')

X = df_model_data[features]
y_true = df_model_data['isFraud'] # The "Correct Answer"

# Split BEFORE scaling (70% train / 30% test, stratified on the label) so the
# scaler's mean and variance are learned from training rows only. Fitting the
# scaler on all rows would let test-set statistics leak into preprocessing.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y_true, test_size=0.3, random_state=42, stratify=y_true
)

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)
print(f"Data split into {len(X_train)} rows for training and {len(X_test)} rows for testing.")

# --- Step 5: Train Supervised RandomForest Model ---
print("\n--- Step 5: Training RandomForest Model (Supervised) ---")

# class_weight='balanced' is set because the fraud class is heavily outnumbered.
forest = RandomForestClassifier(class_weight='balanced', random_state=42)
model = forest.fit(X_train, y_train)

# Predict on the held-out test split only.
predictions = model.predict(X_test)
print("Model training complete.")

# --- Step 6: Evaluate (The Successful Result) ---
print("\n--- Step 6: Evaluating the Model (New Result) ---")

# Scores on the held-out test split.
f1 = f1_score(y_test, predictions)
precision = precision_score(y_test, predictions)
recall = recall_score(y_test, predictions)
cm = confusion_matrix(y_test, predictions)

for report_line in (
    "!!! Results on 'Test Data' (data the model has never seen before) !!!",
    f"Precision: {precision:.2%}",
    f"Recall: {recall:.2%}",
    f"F1-Score (New Final Score): {f1:.2%}",
    "\nConfusion Matrix:",
):
    print(report_line)
print(cm)

# --- Feature importances reported by the fitted forest, largest first ---
print("\n--- Feature Importance (Why did the model succeed?) ---")
importance_by_feature = pd.Series(model.feature_importances_, index=features)
feature_imp = importance_by_feature.sort_values(ascending=False)
print(feature_imp)

print("\n--- Successful Code Execution Complete ---")

--- Execution Started (Successful Model: Supervised) ---
Objective: Prove that 'Behavioral Features' succeed with a Supervised model.
Successfully loaded the full file (355422 rows).

--- Step 1: Creating the 'Smart Sample' ---
The final 'Smart Sample' was created with 355421 rows.

--- Step 2: Building Behavioral Profiles ---
Behavioral profiles created successfully.

--- Step 3: Merging Features with Transactions ---

--- Step 4: Preparing Train/Test Data ---
Data split into 248794 rows for training and 106627 rows for testing.

--- Step 5: Training RandomForest Model (Supervised) ---
Model training complete.

--- Step 6: Evaluating the Model (New Result) ---
!!! Results on 'Test Data' (data the model has never seen before) !!!
Precision: 20.00%
Recall: 8.47%
F1-Score (New Final Score): 11.90%

Confusion Matrix:
[[106548     20]
 [    54      5]]

--- Feature Importance (Why did the model succeed?) ---
amount                      0.451471
type_encoded                0.251712
dest_uni

In [4]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
# --- (Change: We will use LOF) ---
from sklearn.neighbors import LocalOutlierFactor
from sklearn.metrics import f1_score, precision_score, recall_score, confusion_matrix
import warnings

# Silence every warning in the UserWarning category for the rest of the session.
# NOTE(review): UserWarning is a broad category (it is the default for
# warnings.warn), so library diagnostics raised under it are hidden too —
# consider narrowing this filter to the specific message being silenced.
warnings.filterwarnings('ignore', category=UserWarning)

print("--- Execution Started (Comparison with LOF) ---")
print("Objective: Test LOF's performance on the same behavioral features.")

# --- Load Data ---
file_path = 'PS_20174392719_1491204439457_log.csv'
try:
    df = pd.read_csv(file_path)
except Exception as e:
    print(f"Error during data loading: {e}")
    # Re-raise rather than calling exit(): the cell still stops here, but the
    # traceback is kept and the notebook kernel is not asked to shut down.
    raise
else:
    print(f"Successfully loaded the full file ({len(df)} rows).")

# --- Step 1: Create the 'Smart Sample' ---
print("\n--- Step 1: Creating the 'Smart Sample' ---")

# Every account that appears on either side of a transaction labelled isFraud == 1.
df_fraud = df[df['isFraud'] == 1]
fraud_dest_ids = df_fraud['nameDest'].unique()
fraud_orig_ids = df_fraud['nameOrig'].unique()
all_fraud_user_ids = np.union1d(fraud_dest_ids, fraud_orig_ids)

# "Lifecycle": every transaction (fraudulent or not) that touches one of those accounts.
df_fraud_lifecycle = df[
    df['nameOrig'].isin(all_fraud_user_ids) |
    df['nameDest'].isin(all_fraud_user_ids)
]

# Reproducible random sample of the non-fraud transactions, capped at 500,000 rows.
df_normal = df[df['isFraud'] == 0]
sample_size = min(500000, len(df_normal))
df_normal_sample = df_normal.sample(n=sample_size, random_state=42)

# Both pieces are slices of `df`, so a transaction that lands in both carries the
# same index label twice. De-duplicate on that label instead of comparing row
# values: distinct transactions that happen to share every field are kept, and
# no column-by-column comparison is needed. Row order is unchanged.
df_combined = pd.concat([df_fraud_lifecycle, df_normal_sample])
df_smart_sample = df_combined[~df_combined.index.duplicated(keep='first')]
print(f"The final 'Smart Sample' was created with {len(df_smart_sample)} rows.")

# --- Step 2: Build Behavioral Profiles (Strong Features) ---
print("\n--- Step 2: Building Behavioral Profiles ---")

# (a) Per-account aggregates, each a Series indexed by account id:
#     amount received and number of distinct senders (TRANSFER / CASH_IN rows,
#     keyed by nameDest), and amount cashed out (CASH_OUT rows, keyed by nameOrig).
df_received = df_smart_sample[df_smart_sample['type'].isin(['TRANSFER', 'CASH_IN'])]
received_by_dest = df_received.groupby('nameDest')
total_received = received_by_dest['amount'].sum()
unique_senders = received_by_dest['nameOrig'].nunique()

df_cashed_out = df_smart_sample[df_smart_sample['type'] == 'CASH_OUT']
total_cashed_out = df_cashed_out.groupby('nameOrig')['amount'].sum()

# Outer-align the three aggregates on account id; an account missing from one
# aggregate contributes 0 there (same default the per-account lookup used).
final_profiles = pd.concat(
    [
        total_received.rename('received'),
        total_cashed_out.rename('cashed_out'),
        unique_senders.rename('dest_unique_senders'),
    ],
    axis=1,
).fillna(0)

# Share of received money that was cashed out. The 1e-6 avoids division by
# zero and the result is capped at 1.0.
final_profiles['dest_cash_out_ratio'] = (
    final_profiles['cashed_out'] / (final_profiles['received'] + 1e-6)
).clip(upper=1.0)
# The outer alignment turned the counts into floats; restore integers.
final_profiles['dest_unique_senders'] = final_profiles['dest_unique_senders'].astype('int64')

final_profiles = (
    final_profiles[['dest_cash_out_ratio', 'dest_unique_senders']]
    .rename_axis('user_id')
    .reset_index()
)
print("Behavioral profiles created successfully.")

# --- Step 3: Merge Features with Transactions ---
print("\n--- Step 3: Merging Features with Transactions ---")

# Attach the profile table twice: once for the recipient (nameDest) and once
# for the sender (nameOrig). Columns are suffixed up front so each join adds
# its own '_dest' / '_orig' set.
recipient_profiles = final_profiles.add_suffix('_dest')
sender_profiles = final_profiles.add_suffix('_orig')
df_model_data = df_smart_sample.merge(
    recipient_profiles, how='left', left_on='nameDest', right_on='user_id_dest'
).merge(
    sender_profiles, how='left', left_on='nameOrig', right_on='user_id_orig'
)

# Accounts without a profile get 0 for every profile feature.
for profile_col in (
    'dest_cash_out_ratio_dest',
    'dest_unique_senders_dest',
    'dest_cash_out_ratio_orig',
    'dest_unique_senders_orig',
):
    df_model_data[profile_col] = df_model_data[profile_col].fillna(0)

# --- Step 4: Train Unsupervised LOF Model ---
print("\n--- Step 4: Training Local Outlier Factor (LOF) Model ---")

# Integer code for the transaction type, stored next to the other features.
type_as_category = df_model_data['type'].astype('category')
df_model_data['type_encoded'] = type_as_category.cat.codes

features = [
    'amount',
    'dest_cash_out_ratio_dest',
    'dest_unique_senders_dest',
    'dest_cash_out_ratio_orig',
    'dest_unique_senders_orig',
    'type_encoded',
]

X = df_model_data.loc[:, features]
y_true = df_model_data['isFraud']  # Ground-truth label (for comparison only)

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# The expected outlier share is set to the labelled fraud share of the sample.
contamination = y_true.mean()
print(f"Fraud (Contamination) rate in the Smart Sample: {contamination:.2%}")

# 20 neighbours per point.
model = LocalOutlierFactor(n_neighbors=20, contamination=contamination)

# fit_predict() labels each row -1 (anomalous) or 1 (normal).
predictions = model.fit_predict(X_scaled)
print("Model training complete.")

# --- Step 5: Evaluation (Comparison) ---
print("\n--- Step 5: Evaluating the LOF Model ---")

# Map the model output onto the isFraud convention: -1 (anomalous) -> 1, else 0.
y_pred = np.where(predictions == -1, 1, 0).tolist()

# Transaction-level scores against the isFraud label.
f1 = f1_score(y_true, y_pred)
precision = precision_score(y_true, y_pred)
recall = recall_score(y_true, y_pred)
cm = confusion_matrix(y_true, y_pred)

for metric_line in (
    f"Precision: {precision:.2%}",
    f"Recall: {recall:.2%}",
    f"F1-Score (Final score for LOF): {f1:.2%}",
    "\nConfusion Matrix:",
):
    print(metric_line)
print(cm)

print("\n--- LOF Code Execution Complete ---")

--- Execution Started (Comparison with LOF) ---
Objective: Test LOF's performance on the same behavioral features.
Successfully loaded the full file (490697 rows).

--- Step 1: Creating the 'Smart Sample' ---
The final 'Smart Sample' was created with 490696 rows.

--- Step 2: Building Behavioral Profiles ---
Behavioral profiles created successfully.

--- Step 3: Merging Features with Transactions ---

--- Step 4: Training Local Outlier Factor (LOF) Model ---
Fraud (Contamination) rate in the Smart Sample: 0.05%
Model training complete.

--- Step 5: Evaluating the LOF Model ---
Precision: 9.52%
Recall: 9.52%
F1-Score (Final score for LOF): 9.52%

Confusion Matrix:
[[490256    209]
 [   209     22]]

--- LOF Code Execution Complete ---


In [5]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import f1_score, precision_score, recall_score, confusion_matrix
from sklearn.model_selection import train_test_split
import warnings
# --- (Change: We will use neural network libraries) ---
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense

# Ignore unimportant warnings
warnings.filterwarnings('ignore', category=UserWarning)

print("--- Execution Started (Smarter Autoencoder Model) ---")
print("Objective: Use a neural network (Autoencoder) to learn 'normal behavior' only.")

# --- Load Data ---
# Read the full transaction log into `df`; every later step of this cell
# starts from this frame.
file_path = 'PS_20174392719_1491204439457_log.csv'
try:
    df = pd.read_csv(file_path)
except Exception as e:
    print(f"Error during data loading: {e}")
    # Re-raise rather than calling exit(): exit() would end the whole session
    # (losing all notebook state) and, run as a plain script, would report
    # success. Raising stops this cell with the original traceback instead.
    raise
else:
    print(f"Successfully loaded the full file ({len(df)} rows).")

# --- Step 1: Building Behavioral Profiles (Full Dataset) ---
# (We need to build profiles on the full dataset for accuracy)
# For every account that received a TRANSFER/CASH_IN or originated a CASH_OUT,
# compute:
#   dest_cash_out_ratio  = total cashed out / total received, capped at 1.0
#   dest_unique_senders  = number of distinct accounts that sent it money
print("\n--- Step 1: Building Behavioral Profiles (Full) ---")
df_received = df[df['type'].isin(['TRANSFER', 'CASH_IN'])]
df_cashed_out = df[df['type'] == 'CASH_OUT']

# Per-account aggregates, kept as Series so the profiles can be computed
# with vectorised operations instead of a Python loop over every account.
received_by_dest = df_received.groupby('nameDest')['amount'].sum()
senders_by_dest = df_received.groupby('nameDest')['nameOrig'].nunique()
cashed_out_by_orig = df_cashed_out.groupby('nameOrig')['amount'].sum()

# The same aggregates as plain dicts, under the names this cell has always defined.
total_received = received_by_dest.to_dict()
unique_senders = senders_by_dest.to_dict()
total_cashed_out = cashed_out_by_orig.to_dict()

# Outer-align the three aggregates on account id; an account missing from
# one of them contributes 0 there.
profile_totals = pd.concat(
    {
        'received': received_by_dest,
        'cashed_out': cashed_out_by_orig,
        'senders': senders_by_dest,
    },
    axis=1,
).fillna(0)
all_user_ids = set(profile_totals.index)

# The 1e-6 term avoids division by zero for accounts that never received
# anything; the ratio is capped at 1.0.
cash_out_ratio = (
    profile_totals['cashed_out'] / (profile_totals['received'] + 1e-6)
).clip(upper=1.0)

# Built from explicit columns so the frame always has 'user_id' and both
# feature columns, even when no account qualifies.
final_profiles = pd.DataFrame({
    'user_id': profile_totals.index,
    'dest_cash_out_ratio': cash_out_ratio.to_numpy(),
    'dest_unique_senders': profile_totals['senders'].astype('int64').to_numpy(),
})
print("Behavioral profiles created successfully.")

# --- Step 2: Merge Features with Transactions ---
print("\n--- Step 2: Merging Features with Transactions ---")

# Attach the profile of the receiving account first, then the profile of the
# sending account. The second merge suffixes the clashing profile columns
# with '_dest' (first merge) and '_orig' (second merge).
df_model_data = (
    df
    .merge(final_profiles, how='left', left_on='nameDest', right_on='user_id')
    .merge(final_profiles, how='left', left_on='nameOrig', right_on='user_id',
           suffixes=('_dest', '_orig'))
)

# Accounts without a profile come out of the left merges as NaN; treat them as 0.
for side in ('dest', 'orig'):
    for base in ('dest_cash_out_ratio', 'dest_unique_senders'):
        profile_col = f'{base}_{side}'
        df_model_data[profile_col] = df_model_data[profile_col].fillna(0)

# --- Step 3: Prepare Train/Test Data ---
print("\n--- Step 3: Preparing Train/Test Data ---")

# Rows without an 'isFraud' label are dropped: the stratified split below
# cannot handle NaN labels and such rows cannot be scored anyway.
# .copy() makes the filtered frame independent, so adding 'type_encoded'
# below does not write into a slice of the merged frame.
print(f"Original data size: {len(df_model_data)}")
df_model_data = df_model_data.dropna(subset=['isFraud']).copy()
print(f"Data size after dropping NaN labels: {len(df_model_data)}")

features_list = [
    'amount',
    'dest_cash_out_ratio_dest',
    'dest_unique_senders_dest',
    'dest_cash_out_ratio_orig',
    'dest_unique_senders_orig'
]
# The transaction type is added as an integer category code.
df_model_data['type_encoded'] = df_model_data['type'].astype('category').cat.codes
features_list.append('type_encoded')

X_all_features = df_model_data[features_list]
y_all_labels = df_model_data['isFraud'] # The "Correct Answer"

# (1) Split the data first (70% train, 30% test), stratified on the label so
# both parts keep the same fraud rate.
X_train_unscaled, X_test_unscaled, y_train, y_test = train_test_split(
    X_all_features, y_all_labels,
    test_size=0.3, random_state=42, stratify=y_all_labels
)

# (2) Scale the features. The scaler is fitted on the training split only,
# so the test rows do not influence the mean/variance used for preprocessing;
# the test split is transformed with the training statistics.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_unscaled)
X_test = scaler.transform(X_test_unscaled)
# Full scaled matrix under the name this cell previously defined, now
# produced with the training-only statistics.
X_all_features_scaled = scaler.transform(X_all_features)

# (3) (Most important step) Create 'clean training data'
# The model will train *only* on the normal transactions from the training set.
# The label Series is converted to a plain boolean array for positional masking.
X_train_normal = X_train[(y_train == 0).to_numpy()]
print(f"Data split. We will train on {len(X_train_normal)} 'normal' transactions.")

# --- Step 4: Build the Autoencoder Model ---
print("\n--- Step 4: Building the Autoencoder Model ---")

# Seed Python, NumPy and TensorFlow before any layer is created, so that
# weight initialisation and batch shuffling are repeatable between runs.
# NOTE(review): bit-exact results on GPU may need further determinism settings.
tf.keras.utils.set_random_seed(42)

input_dim = X_train_normal.shape[1] # Number of features

# Layer widths are derived from the feature count by integer division.
# max(1, ...) keeps every Dense layer at least one unit wide when there are
# fewer than four features; widths for larger inputs are unchanged.
hidden_units = max(1, input_dim // 2)
bottleneck_units = max(1, input_dim // 4)

input_layer = Input(shape=(input_dim, ))

# Encoder (The "compression" part): input -> hidden -> bottleneck
encoder = Dense(hidden_units, activation='relu')(input_layer)
encoder = Dense(bottleneck_units, activation='relu')(encoder)

# Decoder (The "reconstruction" part): bottleneck -> hidden -> original shape
decoder = Dense(hidden_units, activation='relu')(encoder)
decoder = Dense(input_dim, activation='linear')(decoder)

# Assemble the model; it is trained to reproduce its own input (MSE loss).
autoencoder = Model(inputs=input_layer, outputs=decoder)
autoencoder.compile(optimizer='adam', loss='mean_squared_error')
print("Model built successfully.")
autoencoder.summary()

# --- Step 5: Train the Model (On Normal Data Only) ---
print("\n--- Step 5: Training the model (on normal data only)... ---")

# Input and target are the same array: the network is fitted to reconstruct
# the normal training transactions.
# NOTE(review): validation loss is measured on the full test set. It is only
# reported per epoch here, but a split of the normal training rows would keep
# the test set untouched until Step 6.
fit_options = dict(
    epochs=10,  # 10 training cycles (can be increased)
    batch_size=32,
    shuffle=True,
    validation_data=(X_test, X_test),
    verbose=1,
)
autoencoder.fit(X_train_normal, X_train_normal, **fit_options)
print("Model training complete.")

# --- Step 6: Evaluate the Model (Calculate "Reconstruction Error") ---
print("\n--- Step 6: Evaluating the Model ---")


def _row_reconstruction_error(inputs, reconstructions):
    """Return the mean squared difference between each input row and its reconstruction."""
    return np.mean(np.power(inputs - reconstructions, 2), axis=1)


# (a) Reconstruction error for every transaction in the test set
predictions = autoencoder.predict(X_test)
mse = _row_reconstruction_error(X_test, predictions)
df_test = pd.DataFrame({'Reconstruction_Error': mse, 'True_Label': y_test})

# (b) Anomaly threshold: the 99th percentile of the reconstruction error on
# the clean (normal-only) training data, so anything "weirder" than 99% of
# the normal training transactions gets flagged.
train_predictions = autoencoder.predict(X_train_normal)
train_mse = _row_reconstruction_error(X_train_normal, train_predictions)
threshold = np.quantile(train_mse, 0.99)
print(f"Anomaly threshold determined at: {threshold:.4f}")

# (c) Predictions: error strictly above the threshold = anomaly (1), otherwise 0
y_pred = (df_test['Reconstruction_Error'] > threshold).astype(int).tolist()

# (d) Score the thresholded predictions against the true test labels
cm = confusion_matrix(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

# Report the scores followed by the confusion matrix.
print("\n--- (Final Result for Autoencoder Model) ---")
print(f"Precision: {precision:.2%}")
print(f"Recall: {recall:.2%}")
print(f"F1-Score (New Final Score): {f1:.2%}")
print("\nConfusion Matrix:")
print(cm)


--- Execution Started (Smarter Autoencoder Model) ---
Objective: Use a neural network (Autoencoder) to learn 'normal behavior' only.
Successfully loaded the full file (1159457 rows).

--- Step 1: Building Behavioral Profiles (Full) ---
Behavioral profiles created successfully.

--- Step 2: Merging Features with Transactions ---

--- Step 3: Preparing Train/Test Data ---
Original data size: 1159457
Data size after dropping NaN labels: 1159456
Data split. We will train on 810563 'normal' transactions.

--- Step 4: Building the Autoencoder Model ---
Model built successfully.



--- Step 5: Training the model (on normal data only)... ---
Epoch 1/10
[1m25331/25331[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m57s[0m 2ms/step - loss: 0.7424 - val_loss: 0.5533
Epoch 2/10
[1m25331/25331[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m55s[0m 2ms/step - loss: 0.6964 - val_loss: 0.4487
Epoch 3/10
[1m25331/25331[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m56s[0m 2ms/step - loss: 0.5880 - val_loss: 0.4374
Epoch 4/10
[1m25331/25331[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m56s[0m 2ms/step - loss: 0.6366 - val_loss: 0.4303
Epoch 5/10
[1m25331/25331[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m55s[0m 2ms/step - loss: 0.5308 - val_loss: 0.4317
Epoch 6/10
[1m25331/25331[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m57s[0m 2ms/step - loss: 0.4675 - val_loss: 0.4288
Epoch 7/10
[1m25331/25331[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m56s[0m 2ms/step - loss: 0.5977 - val_loss: 0.4300
Epoch 8/10
[1m25331/25331[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37