In [7]:
import pandas as pd
import boto3

s3_bucket = "fraud-detection-buckets"
s3_key = "transformed/fraud_data.csv"

s3 = boto3.client("s3")
obj = s3.get_object(Bucket=s3_bucket, Key=s3_key)

df = pd.read_csv(obj["Body"])
df.head()


Unnamed: 0,transaction_id,user_id,amount,location,transaction_type,fraud_probability
0,500278,236,1087.98,Los Angeles,Online,0.84
1,566538,176,4424.95,Los Angeles,Online,0.36
2,518238,754,1330.54,Los Angeles,Online,0.86
3,595525,855,243.23,Los Angeles,Offline,0.15
4,482449,458,1614.09,Los Angeles,Online,0.84


In [8]:
from sklearn.model_selection import train_test_split

# Define features and target
X = df[['transaction_id', 'user_id', 'amount', 'location', 'transaction_type']]  # keep numeric + categorical
y = df['fraud_probability']

# One-hot encode categorical features
X = pd.get_dummies(X, columns=['transaction_type'])

# Split data into train and test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [9]:
train_df = pd.concat([y_train, X_train], axis=1)
test_df = pd.concat([y_test, X_test], axis=1)

train_file = "train_data.csv"
test_file = "test_data.csv"

train_df.to_csv(train_file, index=False, header=False)
test_df.to_csv(test_file, index=False, header=False)

# Upload to S3
s3_resource = boto3.resource("s3")
s3_resource.Bucket(s3_bucket).upload_file(train_file, "ml/train/train_data.csv")
s3_resource.Bucket(s3_bucket).upload_file(test_file, "ml/test/test_data.csv")

In [10]:
import sagemaker
from sagemaker import get_execution_role
from sagemaker.inputs import TrainingInput
from sagemaker.estimator import Estimator

role = get_execution_role()
session = sagemaker.Session()

# XGBoost Container URI
container = sagemaker.image_uris.retrieve("xgboost", session.boto_region_name, version="1.5-1")

# Define the estimator
xgb_estimator = Estimator(
    image_uri=container,
    role=role,
    instance_count=1,
    instance_type="ml.m5.large",
    output_path=f"s3://{s3_bucket}/ml/output/",
    sagemaker_session=session
)

# Set hyperparameters
xgb_estimator.set_hyperparameters(
    objective="reg:squarederror",
    num_round=100
)

# Define input data
train_input = TrainingInput(s3_data=f"s3://{s3_bucket}/ml/train/train_data.csv", content_type="csv")

# Train model
xgb_estimator.fit({"train": train_input})

2025-04-06 14:55:00 Starting - Starting the training job...
2025-04-06 14:55:15 Starting - Preparing the instances for training...
2025-04-06 14:55:37 Downloading - Downloading input data...
2025-04-06 14:56:28 Downloading - Downloading the training image......
2025-04-06 14:57:29 Training - Training image download completed. Training in progress.
  from pandas import MultiIndex, Int64Index[0m
[34m[2025-04-06 14:57:24.180 ip-10-2-77-208.ec2.internal:7 INFO utils.py:28] RULE_JOB_STOP_SIGNAL_FILENAME: None[0m
[34m[2025-04-06 14:57:24.210 ip-10-2-77-208.ec2.internal:7 INFO profiler_config_parser.py:111] User has disabled profiler.[0m
[34m[2025-04-06:14:57:24:INFO] Imported framework sagemaker_xgboost_container.training[0m
[34m[2025-04-06:14:57:24:INFO] Failed to parse hyperparameter objective value reg:squarederror to Json.[0m
[34mReturning the value itself[0m
[34m[2025-04-06:14:57:24:INFO] No GPUs detected (normal if no gpus installed)[0m
[34m[2025-04-06:14:57:24:INFO] Runn

In [11]:
# Deploy the model to an endpoint
xgb_predictor = xgb_estimator.deploy(
    initial_instance_count=1,
    instance_type="ml.m5.large"
)

------!

In [12]:
location_mapping = {'New York': 0, 'San Francisco': 1, 'Chicago': 2}
type_mapping = {'transfer': 0, 'payment': 1, 'withdrawal': 2}

sample = {
    'amount': 259.5,
    'location': location_mapping['New York'],
    'transaction_type': type_mapping['transfer']
}

payload = f"{sample['amount']},{sample['location']},{sample['transaction_type']}"

In [13]:
import pandas as pd
df = pd.read_csv('train_data.csv') 
print(df.columns)



Index(['0.2', '526526', '895', '4223.11', 'Los Angeles', '0', '1'], dtype='object')


In [14]:
import pandas as pd

df = pd.read_csv('train_data.csv') 
print(df.columns.tolist())

['0.2', '526526', '895', '4223.11', 'Los Angeles', '0', '1']


In [15]:
df.columns = df.columns.str.strip().str.lower()
print(df.columns.tolist())

['0.2', '526526', '895', '4223.11', 'los angeles', '0', '1']


In [16]:
import pandas as pd

df = pd.read_csv('train_data.csv')

print("Columns in the dataset:")
print(df.columns.tolist())

Columns in the dataset:
['0.2', '526526', '895', '4223.11', 'Los Angeles', '0', '1']


In [17]:
import pandas as pd

# Define correct column names manually
column_names = ['fraud_probability', 'user_id', 'transaction_id', 'amount', 'location', 'transaction_type', 'is_fraud']

# Read CSV with no header and assign names
df = pd.read_csv('train_data.csv', header=None, names=column_names)

print("✅ Columns in the dataset:")
print(df.columns.tolist())
print(df.head())

✅ Columns in the dataset:
['fraud_probability', 'user_id', 'transaction_id', 'amount', 'location', 'transaction_type', 'is_fraud']
   fraud_probability  user_id  transaction_id   amount     location  \
0               0.20   526526             895  4223.11  Los Angeles   
1               0.64   404096             795  3629.87  Los Angeles   
2               0.89   220524             989  3863.93  Los Angeles   
3               0.87   358679              36  2908.28  Los Angeles   
4               0.22   702622             894    17.56  Los Angeles   

   transaction_type  is_fraud  
0                 0         1  
1                 1         0  
2                 1         0  
3                 1         0  
4                 0         1  


In [18]:
from sklearn.preprocessing import LabelEncoder

location_encoder = LabelEncoder()
df['location'] = location_encoder.fit_transform(df['location'])

type_encoder = LabelEncoder()
df['transaction_type'] = type_encoder.fit_transform(df['transaction_type'])

In [19]:
import pickle

# Save location encoder
with open('location_encoder.pkl', 'wb') as f:
    pickle.dump(location_encoder, f)

# Save transaction type encoder
with open('type_encoder.pkl', 'wb') as f:
    pickle.dump(type_encoder, f)

print("✅ Encoders saved successfully.")

✅ Encoders saved successfully.


In [20]:
# Define features and label
X = df[['amount', 'location', 'transaction_type']]
y = df['is_fraud']

print("✅ Features and labels prepared.")

✅ Features and labels prepared.


In [21]:
processed_df = pd.concat([y, X], axis=1)
processed_df.to_csv('processed_train_data.csv', index=False, header=False)
print("✅ Processed training data saved.")

✅ Processed training data saved.


In [22]:
pip install sagemaker pandas xgboost

Note: you may need to restart the kernel to use updated packages.


In [23]:
import pandas as pd
import xgboost as xgb

# Load processed training data
df = pd.read_csv('processed_train_data.csv', header=None)

# Split features and labels
y = df.iloc[:, 0]
X = df.iloc[:, 1:]

# Convert to DMatrix format
dtrain = xgb.DMatrix(X, label=y)

Note: You have installed the 'manylinux2014' variant of XGBoost. Certain features such as GPU algorithms or federated learning are not available. To use these features, please upgrade to a recent Linux distro with glibc 2.28+, and install the 'manylinux_2_28' variant.


In [24]:
params = {
    'objective': 'binary:logistic',
    'eval_metric': 'logloss'
}

# Train the model
xgb_model = xgb.train(params, dtrain, num_boost_round=50)

In [25]:
xgb_model.save_model('model.bst')

import tarfile
with tarfile.open('model.tar.gz', 'w:gz') as tar:
    tar.add('model.bst')

  xgb_model.save_model('model.bst')


In [26]:
import boto3
import sagemaker
from sagemaker import get_execution_role

s3 = boto3.client('s3')
session = sagemaker.Session()
bucket = session.default_bucket()  # Or specify your own bucket

s3_key = 'fraud-model/model.tar.gz'
s3.upload_file('model.tar.gz', bucket, s3_key)

model_uri = f's3://{bucket}/{s3_key}'
print("✅ Model uploaded to:", model_uri)

✅ Model uploaded to: s3://sagemaker-us-east-1-084375588036/fraud-model/model.tar.gz


In [1]:
import pandas as pd
import pickle
from sklearn.preprocessing import LabelEncoder

# Load dataset with header=None since your file has no proper headers
try:
    df = pd.read_csv('train_data.csv', header=None)
    print("✅ CSV loaded successfully. First row:", df.iloc[0].tolist())
    
    # Assign proper column names based on your data structure
    # Modify these names according to what each column represents
    df.columns = [
        'transaction_id',  # example: 0.2
        'user_id',        # example: 526526
        'account_id',     # example: 895
        'amount',         # example: 4223.11
        'location',       # example: Los Angeles
        'transaction_type', # example: 0
        'is_fraud'        # example: 1
    ]
    print("✅ Assigned column names:", df.columns.tolist())
except Exception as e:
    print("❌ Error loading CSV:", e)
    exit()

# Verify we have the expected columns
required_columns = ['amount', 'location', 'transaction_type', 'is_fraud']
missing_columns = [col for col in required_columns if col not in df.columns]
if missing_columns:
    print(f"❌ Missing required columns: {missing_columns}")
    print("Available columns:", df.columns.tolist())
    exit()

# Encode categorical features
try:
    print("Encoding categorical features...")
    location_encoder = LabelEncoder()
    df['location'] = location_encoder.fit_transform(df['location'])

    type_encoder = LabelEncoder()
    df['transaction_type'] = type_encoder.fit_transform(df['transaction_type'])
    print("✅ Encoding completed successfully")
except Exception as e:
    print("❌ Encoding error:", e)
    exit()

# Save encoders
try:
    with open('location_encoder.pkl', 'wb') as f:
        pickle.dump(location_encoder, f)
    with open('type_encoder.pkl', 'wb') as f:
        pickle.dump(type_encoder, f)
    print("✅ Encoders saved successfully")
except Exception as e:
    print("❌ Error saving encoders:", e)
    exit()

# Create features and labels
try:
    X = df[['amount', 'location', 'transaction_type']]
    y = df['is_fraud']
    processed_df = pd.concat([y, X], axis=1)
    processed_df.to_csv('processed_train_data.csv', index=False, header=False)
    print("✅ Processed data saved successfully")
except Exception as e:
    print("❌ Error processing training data:", e)
    exit()

✅ CSV loaded successfully. First row: [0.2, 526526, 895, 4223.11, 'Los Angeles', 0, 1]
✅ Assigned column names: ['transaction_id', 'user_id', 'account_id', 'amount', 'location', 'transaction_type', 'is_fraud']
Encoding categorical features...
✅ Encoding completed successfully
✅ Encoders saved successfully
✅ Processed data saved successfully


In [2]:
df.columns = [
    'transaction_id',  # change if this is something else
    'user_id',        # change if this is something else
    'account_id',     # change if this is something else
    'amount',         # appears to be monetary amount
    'location',       # appears to be city name
    'transaction_type', # appears to be numeric code
    'is_fraud'        # appears to be fraud label (0/1)
]

In [3]:
import pandas as pd
import pickle
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score

# Load processed data
try:
    # Note: We use header=None because we saved without headers
    processed_df = pd.read_csv('processed_train_data.csv', header=None)
    print("✅ Processed data loaded successfully")
    
    # Split into features (X) and target (y)
    # Column 0 is the target (is_fraud), columns 1-3 are features
    X = processed_df.iloc[:, 1:4]  # amount, location, transaction_type
    y = processed_df.iloc[:, 0]    # is_fraud
    
    print(f"Data shape - Features: {X.shape}, Target: {y.shape}")
    print(f"Fraud rate: {y.mean():.2%}")  # Show class imbalance
except Exception as e:
    print("❌ Error loading processed data:", e)
    exit()

# Split into train and test sets
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)
print(f"\nTrain set: {X_train.shape[0]} samples")
print(f"Test set: {X_test.shape[0]} samples")

# Train Random Forest classifier
try:
    print("\nTraining Random Forest model...")
    model = RandomForestClassifier(
        n_estimators=100,
        max_depth=10,
        random_state=42,
        class_weight='balanced'  # Important for imbalanced fraud detection
    )
    model.fit(X_train, y_train)
    print("✅ Model trained successfully")
except Exception as e:
    print("❌ Error training model:", e)
    exit()

# Evaluate model
try:
    print("\nModel Evaluation:")
    
    # Test set predictions
    y_pred = model.predict(X_test)
    y_proba = model.predict_proba(X_test)[:, 1]  # Probability of fraud
    
    # Metrics
    print("\nClassification Report:")
    print(classification_report(y_test, y_pred))
    
    print("\nConfusion Matrix:")
    print(confusion_matrix(y_test, y_pred))
    
    print(f"\nROC AUC Score: {roc_auc_score(y_test, y_proba):.4f}")
except Exception as e:
    print("❌ Error evaluating model:", e)
    exit()

# Save the trained model
try:
    with open('fraud_detection_model.pkl', 'wb') as f:
        pickle.dump(model, f)
    print("\n✅ Model saved as 'fraud_detection_model.pkl'")
except Exception as e:
    print("❌ Error saving model:", e)
    exit()

print("\nFraud detection pipeline completed successfully!")

✅ Processed data loaded successfully
Data shape - Features: (870, 3), Target: (870,)
Fraud rate: 47.36%

Train set: 696 samples
Test set: 174 samples

Training Random Forest model...
✅ Model trained successfully

Model Evaluation:

Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        92
           1       1.00      1.00      1.00        82

    accuracy                           1.00       174
   macro avg       1.00      1.00      1.00       174
weighted avg       1.00      1.00      1.00       174


Confusion Matrix:
[[92  0]
 [ 0 82]]

ROC AUC Score: 1.0000

✅ Model saved as 'fraud_detection_model.pkl'

Fraud detection pipeline completed successfully!


In [14]:
import boto3

runtime = boto3.client('sagemaker-runtime')

# Assume Los Angeles → 3
csv_input = "0.5,1500.0,3"

response = runtime.invoke_endpoint(
    EndpointName='sagemaker-xgboost-2025-04-06-14-50-16-020',
    ContentType='text/csv',
    Body=csv_input
)

prediction = response['Body'].read().decode('utf-8')
print("Prediction:", prediction)

Prediction: 0.5881953835487366



In [16]:
import pickle
import pandas as pd

# Load encoder
with open('location_encoder.pkl', 'rb') as f:
    location_encoder = pickle.load(f)

# Encode new input
location_encoded = location_encoder.transform(['Los Angeles'])[0]

# Then build input string
csv_input = f"0.37,1837.0,{location_encoded}"

In [17]:
import pickle
import boto3
import pandas as pd

# Load the location encoder
with open('location_encoder.pkl', 'rb') as f:
    location_encoder = pickle.load(f)

# Encode the location
location_encoded = location_encoder.transform(['Los Angeles'])[0]

# Build the CSV input string
csv_input = f"0.5,1500.0,{location_encoded}"

# Invoke SageMaker endpoint
runtime = boto3.client('sagemaker-runtime')

response = runtime.invoke_endpoint(
    EndpointName='sagemaker-xgboost-2025-04-06-14-50-16-020',  # Replace with your actual endpoint name
    ContentType='text/csv',
    Body=csv_input
)

# Decode and print the prediction
prediction = response['Body'].read().decode('utf-8')
print("Prediction:", prediction)

Prediction: 0.5881953835487366



In [18]:
import pickle
import boto3
import pandas as pd

# Load the location encoder
with open('location_encoder.pkl', 'rb') as f:
    location_encoder = pickle.load(f)

# Encode the location
location_encoded = location_encoder.transform(['New York'])[0]

# Build the CSV input string
csv_input = f"0.5,1500.0,{location_encoded}"

# Invoke SageMaker endpoint
runtime = boto3.client('sagemaker-runtime')

response = runtime.invoke_endpoint(
    EndpointName='sagemaker-xgboost-2025-04-06-14-50-16-020',  # Replace with your actual endpoint name
    ContentType='text/csv',
    Body=csv_input
)

# Decode and print the prediction
prediction = response['Body'].read().decode('utf-8')
print("Prediction:", prediction)

Prediction: 0.5881953835487366



In [26]:
import pandas as pd
import boto3

# Load CSV
df = pd.read_csv('train_data_1.csv')

# Initialize SNS
sns = boto3.client('sns', region_name='us-east-1')  # Change region if needed
topic_arn = 'arn:aws:sns:us-east-1:084375588036:Fraud-alert'  # Replace with your ARN

# Filter fraudulent transactions
fraud_transactions = df[df['is_fraud'] == 1]

# Notify for each fraudulent transaction
for index, row in fraud_transactions.iterrows():
    message = (
        f"🚨 Fraudulent Transaction Detected!\n"
        f"Amount: ${row['amount']}\n"
        f"User ID: {row['user_id']}\n"
        f"Transaction ID: {row['transaction_id']}\n"
        f"Location: {row['location']}\n"
    )
    sns.publish(
        TopicArn=topic_arn,
        Message=message,
        Subject="Fraud Alert Notification"
    )

print("Notifications sent for all fraudulent transactions.")


Notifications sent for all fraudulent transactions.
