In [1]:
# Two types of data preparation are covered: synthetic data generation and anonymization.
# We start by building a synthetic dataset manually.

In [2]:
# Basic libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Synthetic data generation
# (third-party; install with `%pip install faker` if this import fails)
from faker import Faker

# Scikit-learn tools
from sklearn.decomposition import PCA
from sklearn.ensemble import IsolationForest
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

# Imbalanced-learn resampling
from imblearn.over_sampling import SMOTE

# Plot styling: apply seaborn's white-grid theme to every figure in the notebook.
# `set_theme` is the preferred spelling; `sns.set` is an alias for it.
sns.set_theme(style='whitegrid')


ModuleNotFoundError: No module named 'faker'

In [None]:
# Build a synthetic, labelled transaction table: mostly legitimate rows plus a
# small share of fraud rows drawn from visibly different distributions.
#
# NOTE: no Faker instance is created here. Every column below comes from NumPy,
# so this cell does not depend on the `faker` package.

# Set seed for reproducibility (legacy global NumPy RNG; the order of the draws
# below determines the generated values, so keep it stable).
np.random.seed(42)

# Number of samples
n_samples = 10000

# Fraud ratio ~2%
fraud_ratio = 0.02
# round() before int(): a bare int() truncates, so a float product that lands
# just below a whole number (e.g. 28.999999999999996) would lose one fraud row.
n_fraud = int(round(n_samples * fraud_ratio))
n_legit = n_samples - n_fraud

# Transaction amounts: exponential for realistic skew (fraud uses a larger scale)
amount_legit = np.random.exponential(scale=70, size=n_legit)
amount_fraud = np.random.exponential(scale=300, size=n_fraud)

# Anonymized numeric features: normal distribution, shifted and wider for fraud
feature_legit = np.random.normal(loc=0, scale=1, size=(n_legit, 4))
feature_fraud = np.random.normal(loc=2, scale=1.5, size=(n_fraud, 4))

# Combine: legitimate rows first, fraud rows last (rows are not shuffled here)
amounts = np.concatenate([amount_legit, amount_fraud])
features = np.vstack([feature_legit, feature_fraud])
labels = np.concatenate([np.zeros(n_legit, dtype=int), np.ones(n_fraud, dtype=int)])

# Create DataFrame: four feature columns, then the amount, then the 0/1 label
df = pd.DataFrame(features, columns=['feature_1', 'feature_2', 'feature_3', 'feature_4'])
df['transaction_amount'] = amounts
df['is_fraud'] = labels

df.head()

In [None]:
# Quick exploratory look at the synthetic table: structure, summary statistics,
# class balance, amount distribution and pairwise correlations.

# DataFrame.info() writes its report straight to stdout and returns None, so it
# must not be wrapped in print() (that would append a stray "None" line).
df.info()
print(df.describe())

# Class balance
sns.countplot(x='is_fraud', data=df)
plt.title('Fraud vs Non-Fraud Class Balance')
plt.show()

# Transaction amount distribution
plt.figure(figsize=(10,5))
sns.histplot(df['transaction_amount'], bins=50, kde=True)
plt.title('Transaction Amount Distribution')
plt.xlabel('Transaction Amount')
plt.show()

# Correlation heatmap (includes the is_fraud label column)
plt.figure(figsize=(8,6))
sns.heatmap(df.corr(), annot=True, cmap='coolwarm')
plt.title('Correlation Matrix')
plt.show()


In [None]:
# Project the predictors onto two principal components and colour the points
# by the fraud label to see how separable the classes are.

# Predictors are every column except the label; the label is kept separately.
y = df['is_fraud']
X = df.drop(columns=['is_fraud'])

# Standardise first so that no single column dominates the components.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Keep only the first two components for a 2D view.
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X_scaled)

# Attach the labels positionally (via .values) to the projected points.
pca_df = pd.DataFrame(X_pca, columns=['PC1', 'PC2']).assign(is_fraud=y.values)

plt.figure(figsize=(8, 6))
sns.scatterplot(data=pca_df, x='PC1', y='PC2', hue='is_fraud', palette='Set1')
plt.title('PCA - 2D Projection')
plt.show()


In [None]:
# Hold out 20% of the rows for testing. Stratifying on y keeps the fraud share
# the same in both splits; the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# Labels are 0/1, so summing them counts the fraud rows in each split.
print("Train fraud cases:", int(y_train.sum()))
print("Test fraud cases:", int(y_test.sum()))


In [None]:
# Oversample with SMOTE on the training split only, so the test split is
# left untouched by synthetic rows.
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(
    X_train,
    y_train,
)

# Labels are 0/1, so the sum is the number of fraud rows after resampling.
print("After SMOTE:")
print("Resampled train fraud cases:", int(y_train_resampled.sum()))


In [None]:
# Unsupervised anomaly detection: scale the features, then fit an Isolation
# Forest that flags roughly `fraud_ratio` of the rows as anomalies.
pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('clf', IsolationForest(contamination=fraud_ratio, random_state=42))
])

# Fit on the ORIGINAL (imbalanced) training split, not the SMOTE output.
# Isolation Forest ignores labels and relies on fraud being rare; after SMOTE
# about half of the rows are fraud, which contradicts contamination=fraud_ratio
# and makes fraud look like normal data to the model.
pipeline.fit(X_train)

y_pred_train = pipeline.predict(X_train)

# Note: Isolation Forest output: -1 for anomaly, 1 for normal
unique, counts = np.unique(y_pred_train, return_counts=True)
# Cast to plain ints so the dict prints the same across NumPy versions.
print("Isolation Forest predictions:",
      {int(label): int(count) for label, count in zip(unique, counts)})

# Compare against the true labels on the held-out split. Map the detector's
# -1 (anomaly) / 1 (normal) output onto the 1 (fraud) / 0 (legit) convention.
y_pred_test = (pipeline.predict(X_test) == -1).astype(int)
print(classification_report(y_test, y_pred_test,
                            target_names=['legit', 'fraud'], digits=3))

In [None]:
# 1. This synthetic dataset shows the class imbalance common in real-world fraud detection.
# 2. PCA scatterplot shows partial separation — more features may improve separation.
# 3. SMOTE can help balance training but must be carefully validated.
# 4. Pipelines ensure consistent preprocessing, which is critical for fraud detection.
# 5. Isolation Forest is an unsupervised anomaly detection method; supervised methods can also be tried.


In [None]:
# Synthetic data demo: open and run the following notebook on Colab:
# https://colab.research.google.com/drive/1Xx_8rPkJy3stSN2dEg-AGDLfX34U5ltr#scrollTo=qZMM3daNxeGb