In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')

# !pip install category_encoders

from sklearn.model_selection import train_test_split
from category_encoders import TargetEncoder
from sklearn.ensemble import IsolationForest
from sklearn.manifold import TSNE

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the read-only Kaggle input tree and print the full path of every file found.
for folder, _subfolders, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        file_path = os.path.join(folder, file_name)
        print(file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Read the transaction dataset CSV into the working DataFrame `df`.
df = pd.read_csv(
    filepath_or_buffer="/kaggle/input/ethereum-frauddetection-dataset/transaction_dataset.csv"
)

In [None]:
# Preview the first five rows.
df.head(n=5)

In [None]:
# Preview the last five rows.
df.tail(n=5)

In [None]:
# DataFrame.info() writes its report to stdout itself and returns None, so wrapping
# it in print() only appends a stray "None" line. Call it directly.
df.info()

In [None]:
# Unpack the (rows, columns) shape once and report both dimensions.
n_rows, n_cols = df.shape
print(f"Number of rows in DataFrame is {n_rows}")
print(f"Number of columns in DataFrame is {n_cols}")

In [None]:
# Print pandas' structural summary of the frame.
df.info()

In [None]:
# Count missing values column by column and print the resulting Series.
nan_per_column = df.isna().sum()
print("Number of NaN values in each columns:\n\n", nan_per_column)

In [None]:
# Grand total of missing cells: per-column counts summed once more.
total_nan = df.isna().sum().sum()
print(f"Total number of NaN values is {total_nan}")

In [None]:
# Rows whose 'Index' value repeats an earlier row's 'Index' value.
duplicated_index_count = df.duplicated(subset='Index').sum()
print(f"Number of duplicated values in Index columns is {duplicated_index_count}")

In [None]:
# Rows that are exact duplicates of an earlier row across every column.
duplicated_row_count = df.duplicated().sum()
print(f"Number of duplicated values is {duplicated_row_count}")

In [None]:
# Normalise the headers: trim surrounding whitespace, then replace every run of
# whitespace that sits between two word characters with a single underscore.
trimmed_names = df.columns.str.strip()
df.columns = trimmed_names.str.replace(r'\b\s+\b', '_', regex=True)
df.columns

In [None]:
# Descriptive statistics, transposed so that each feature is a row.
df.describe().transpose()

In [None]:
# Share of each FLAG class, expressed as a percentage of all rows.
df['FLAG'].value_counts(normalize=True).mul(100)

In [None]:
# FLAG class percentages restricted to rows whose 'Index' value is a repeat.
is_repeated_index = df.duplicated(subset='Index')
df.loc[is_repeated_index, 'FLAG'].value_counts(normalize=True).mul(100)

In [None]:
# Drop the 'Unnamed: 0', 'Index' and 'Address' columns.
# Rebinding `df` (instead of inplace=True) together with errors='ignore' makes the
# cell safe to re-run: a second execution no longer raises KeyError for columns that
# are already gone. The former `axis=0` was dropped because it has no effect when
# `columns=` is given.
df = df.drop(columns=['Unnamed: 0', 'Index', 'Address'], errors='ignore')

In [None]:
# Display the frame as it stands at this point in the notebook.
df

In [None]:
# Names of the object-dtype (categorical) columns; reused later as encoder columns.
categories = df.select_dtypes(include=['object']).columns

# Report cardinality and missing-value count for each categorical column.
for column_name in categories:
    column = df[column_name]
    unique_count = column.nunique()
    nan_count = column.isna().sum()
    print(f"The number of unique values in {column_name} is {unique_count} and it has {nan_count} NaN values")

In [None]:
# Find numeric columns whose variance is exactly zero (every value identical).
numeric = df.select_dtypes(include=['number']).columns
variances = df[numeric].var()
constant_var = variances.index[variances == 0].tolist()
print(f"Number of features that have constant value is {len(constant_var)}")
constant_var

In [None]:
# Drop constant variance features.
# Rebinding `df` with errors='ignore' keeps the cell idempotent: running it again
# with the same `constant_var` list no longer raises KeyError. The former `axis=0`
# was dropped because it has no effect when `columns=` is given.
df = df.drop(columns=constant_var, errors='ignore')

In [None]:
# Display the frame again after the constant-variance columns were dropped.
df

In [None]:
# Pairwise correlations between all numeric columns.
numeric = df.select_dtypes(include=['number']).columns
corr = df[numeric].corr()

# Mask the diagonal and upper triangle so only the lower triangle is drawn.
mask = np.triu(np.ones_like(corr))

fig, ax = plt.subplots(figsize=(10, 7))
sns.heatmap(corr, mask=mask, annot=False, vmin=-1, vmax=1, ax=ax)
plt.show()

In [None]:
# Visual map of missing cells: one row per sample, one column per feature.
fig, ax = plt.subplots(figsize=(12, 6))
sns.heatmap(df.isna(), cmap='coolwarm', cbar=False, ax=ax)
ax.set_yticks([])  # hide the row ticks
plt.show()

In [None]:
# Boolean row mask: True where the row has at least one missing value.
missing_mask = df.isna().any(axis='columns')
print(f"Number of rows that has at least one missing value: {missing_mask.sum()}")

In [None]:
# FLAG distribution among rows that contain missing values: counts, then percentages.
flag_of_incomplete_rows = df.loc[missing_mask, 'FLAG']
print(flag_of_incomplete_rows.value_counts())
print()
print(flag_of_incomplete_rows.value_counts(normalize=True).round(2) * 100)

In [None]:
# FLAG distribution over the whole frame: counts, then percentages.
flag_column = df['FLAG']
print(flag_column.value_counts())
print()
print(flag_column.value_counts(normalize=True).round(2) * 100)

In [None]:
# Keep only the rows without any missing value and report the resulting shape.
sub_df = df.loc[~missing_mask]
print(sub_df.shape)

In [None]:
# Split the data into train and test set
X = sub_df.drop(columns='FLAG')
y = sub_df['FLAG']

# Stratify on the label so both partitions keep the same fraud / non-fraud ratio;
# a purely random 80/20 split can skew the minority-class share in the test set.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

In [None]:
# Report the dimensions of each partition produced by the split.
print(
    f"Shape of X_train is {X_train.shape}",
    f"Shape of y_train is {y_train.shape}",
    f"Shape of X_test is {X_test.shape}",
    f"Shape of y_test is {y_test.shape}",
    sep="\n",
)

In [None]:
# Target-encode the columns listed in `categories`.
# The encoder is fitted on the training partition (features + labels) and then
# applied to the test partition without passing any test labels.
encoder = TargetEncoder(cols=categories)
X_train_encoded = encoder.fit_transform(X_train, y_train)
X_test_encoded = encoder.transform(X_test)

In [None]:
# Display the test-set labels.
y_test 

In [None]:
# Re-attach the labels to the encoded features, column-wise, for each partition.
# NOTE: `train` and `test` therefore contain the FLAG column alongside the features.
train = pd.concat(objs=[X_train_encoded, y_train], axis="columns")
test = pd.concat(objs=[X_test_encoded, y_test], axis="columns")

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from category_encoders import TargetEncoder

In [None]:
#model = IsolationForest(contamination=0.22, random_state=42)
#model.fit(train)


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from category_encoders import TargetEncoder

# Impute missing values instead of dropping incomplete rows.
# Work on a copy so the original `df` stays untouched.
df_imputed = df.copy()

# Column groups are derived from dtypes rather than hard-coded names, so the cell
# keeps working if the header normalisation changes or another text column appears.
numeric_cols = df_imputed.select_dtypes(include=['number']).columns
categorical_cols = df_imputed.select_dtypes(include=['object']).columns.tolist()

# Numeric gaps become 0; categorical gaps become the explicit 'Unknown' level.
# NOTE(review): `numeric_cols` includes FLAG, so a missing label would be filled
# with 0 as well — presumably FLAG has no missing values; confirm.
df_imputed[numeric_cols] = df_imputed[numeric_cols].fillna(0)
df_imputed[categorical_cols] = df_imputed[categorical_cols].fillna('Unknown')



In [None]:
# Split data
X = df_imputed.drop(columns='FLAG')
y = df_imputed['FLAG']
# Stratify on the label so train and test keep the same fraud / non-fraud ratio;
# with an imbalanced target a purely random split can distort the test metrics.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Encode categorical columns (encoder fitted on the training partition only)
encoder = TargetEncoder(cols=categorical_cols)
X_train_encoded = encoder.fit_transform(X_train, y_train)
X_test_encoded = encoder.transform(X_test)

# Scale features (important for Logistic Regression); statistics come from the
# training partition and are reused unchanged on the test partition
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_encoded)
X_test_scaled = scaler.transform(X_test_encoded)

In [None]:
# Fit a class-weight-balanced logistic regression on the scaled training features.
lr_model = LogisticRegression(
    class_weight='balanced',
    max_iter=1000,
    random_state=42,
)
lr_model.fit(X_train_scaled, y_train)

In [None]:
# Predict on the scaled test features and print the per-class report.
lr_pred = lr_model.predict(X_test_scaled)
class_names = ['Non-Fraud (0)', 'Fraud (1)']
report = classification_report(y_test, lr_pred, target_names=class_names)
print("Logistic Regression Classification Report:")
print(report)

# Test accuracy: percentage of predictions that equal the true label.
test_accuracy = np.mean(lr_pred == y_test.to_numpy()) * 100
print(f"Logistic Regression Test accuracy: {test_accuracy:.2f}%")

In [None]:
# Inspect individual predictions for test-set positions 1000-1299.
# The upper bound is clamped to the size of the test set so a smaller split cannot
# raise IndexError.
first_idx = 1000
last_idx = min(1300, X_test_scaled.shape[0])
if first_idx < last_idx:
    window_scaled = X_test_scaled[first_idx:last_idx]  # Use scaled features
    # Score the whole window in one call instead of calling the model once per row.
    window_pred = lr_model.predict(window_scaled)
    window_prob = lr_model.predict_proba(window_scaled)  # Probability of each class
    for i, prediction, prob in zip(range(first_idx, last_idx), window_pred, window_prob):
        true_label = y_test.iloc[i]
        print(f"Index: {i}")
        print(f"Prediction: {'Fraud (1)' if prediction == 1 else 'Non-Fraud (0)'}")
        print(f"Probability (Non-Fraud, Fraud): [{prob[0]:.4f}, {prob[1]:.4f}]")
        print(f"True label (FLAG): {true_label} (1=fraud, 0=non-fraud)")
        print("\n")

# Feature importance (coefficients), ranked by absolute magnitude.
# Coefficients are paired with the encoded training columns, which are the columns
# the scaler and the model were fitted on.
feature_importance = pd.DataFrame({
    'Feature': X_train_encoded.columns,
    'Coefficient': lr_model.coef_[0]
}).sort_values('Coefficient', key=abs, ascending=False)
print("Top 10 Feature Importances:")
print(feature_importance.head(10))

In [None]:
# Fit the Isolation Forest in this cell so it runs on a fresh kernel: no `model`
# object exists before this point when the notebook is executed top to bottom.
# The forest is fitted on the encoded features only (not on `train`/`test`, which
# also carry the FLAG column), so the predictions line up row-for-row with
# X_train_encoded / X_test_encoded.
# contamination=0.01 matches the value used in the accuracy cell later on.
model = IsolationForest(contamination=0.01, random_state=42)
model.fit(X_train_encoded)

# predict() returns -1 for samples flagged as outliers and 1 for inliers.
anomaly_train_pred = model.predict(X_train_encoded)
anomaly_test_pred = model.predict(X_test_encoded)

is_train = np.sum(anomaly_train_pred == -1)
is_test = np.sum(anomaly_test_pred == -1)

print(f"Number of outliers in training set is {is_train}")
print(f"Number of outliers in testing set is {is_test}")

In [None]:
# Project the encoded training features onto two dimensions with a seeded t-SNE.
t_sne = TSNE(random_state=42, n_components=2)
X_train_embeded = t_sne.fit_transform(X_train_encoded)

In [None]:
# Scatter the 2-D training embedding, coloured by the Isolation Forest verdict.
fig, ax = plt.subplots(figsize=(8, 6))
ax.scatter(X_train_embeded[:, 0], X_train_embeded[:, 1], c=anomaly_train_pred)
ax.set_title('t-SNE Visualization')
ax.set_xlabel("Feature 1")
ax.set_ylabel("Feature 2")
plt.show()

In [None]:
# Re-fit the same TSNE object on the encoded test features; this embedding is
# computed independently of the training embedding.
X_test_embeded = t_sne.fit_transform(X_test_encoded)

In [None]:
# Scatter the 2-D test embedding, coloured by the Isolation Forest verdict.
fig, ax = plt.subplots(figsize=(8, 6))
ax.scatter(X_test_embeded[:, 0], X_test_embeded[:, 1], c=anomaly_test_pred)
ax.set_title('t-SNE Visualization')
ax.set_xlabel("Feature 1")
ax.set_ylabel("Feature 2")
plt.show()

In [None]:
# Evaluate the Isolation Forest as if it were a fraud classifier.
# The forest is fitted on the encoded features alone (X_train_encoded) rather than
# on `train`, which also carries the FLAG column.
model = IsolationForest(random_state=42, contamination=0.01)
model.fit(X_train_encoded)

# Raw detector output: -1 marks an anomaly, 1 marks a normal sample.
anomaly_train_pred = model.predict(X_train_encoded)
anomaly_test_pred = model.predict(X_test_encoded)

# Translate to the FLAG convention: anomaly -> 1 (fraud), normal -> 0 (non-fraud).
predicted_train_label = np.where(anomaly_train_pred == -1, 1, 0)
predicted_test_label = np.where(anomaly_test_pred == -1, 1, 0)

# Percentage of samples whose translated label equals the true FLAG.
train_accuracy = np.mean(predicted_train_label == y_train.values) * 100
test_accuracy = np.mean(predicted_test_label == y_test.values) * 100

print(f"Training accuracy: {train_accuracy:.2f}%")
print(f"Test accuracy: {test_accuracy:.2f}%")

In [None]:
# Write the raw (un-encoded) test features to the working directory, without the index.
X_test.to_csv(path_or_buf='/kaggle/working/X_test_full.csv', index=False)

# Confirm where the file was written.
print("X_test saved to /kaggle/working/X_test_full.csv")

In [None]:
# Display the raw test features.
X_test

In [None]:
# Inspect one test sample end to end: raw row, encoded row, model verdict, true label.
i = 1200
example_row = X_test.iloc[i]  # raw features, including categorical
true_label = y_test.iloc[i]
print("True Labe:",true_label)

# Take the encoded row at the SAME position `i` as the true label; a different
# position would compare the prediction for one sample with the label of another.
example_row_encoded = X_test_encoded.iloc[i].values.reshape(1, -1)  # 2-D single-row array for predict
print("\nEncoded features for X_test[" + str(i) + "]:")
print(example_row_encoded)

# Predict anomaly: -1 means anomaly, 1 means normal
prediction = model.predict(example_row_encoded)
anomaly_score = model.decision_function(example_row_encoded)  # Optional: anomaly score

# Output results
print("\nPrediction for X_test[" + str(i) +"]:")
print("Anomaly 1" if prediction[0] == -1 else "Normal 0")
print(f"Anomaly score: {anomaly_score[0]:.4f} (lower is more anomalous)")
print(f"True label (FLAG): {true_label} (1=fraud, 0=non-fraud)")

In [None]:
# Inspect Isolation Forest verdicts for test-set positions 500-699.
# The upper bound is clamped to the size of the test set so a smaller split cannot
# raise IndexError.
first_idx = 500
last_idx = min(700, len(X_test_encoded))
if first_idx < last_idx:
    # Score the whole window in one call instead of calling the model once per row.
    # The rows are passed as a DataFrame slice of X_test_encoded.
    window_encoded = X_test_encoded.iloc[first_idx:last_idx]
    window_pred = model.predict(window_encoded)
    window_score = model.decision_function(window_encoded)  # Optional: anomaly score
    for i, prediction, anomaly_score in zip(range(first_idx, last_idx), window_pred, window_score):
        true_label = y_test.iloc[i]
        print("True Labe:",true_label)

        print(i)
        print("Anomaly 1" if prediction == -1 else "Normal 0")
        print(f"Anomaly score: {anomaly_score:.4f} (lower is more anomalous)")
        print(f"True label (FLAG): {true_label} (1=fraud, 0=non-fraud)")
        print("")
        print("")