# Local Model
LocalModel is our Python interface that gives direct access to TurboML's machine learning models.

We will use the transactions.csv and labels.csv datasets for our experiments.

## Set up the environment and install TurboML's SDK
We use `turboml-installer` to set up the environment for TurboML's SDK.

In [1]:
# Install the TurboML installer quietly, then run its Colab bootstrap.
# Per the recorded output, this step finishes by restarting the kernel.
!pip install -q turboml-installer
import turboml_installer ; turboml_installer.install_on_colab()

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/1.6 MB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.6/1.6 MB[0m [31m15.9 MB/s[0m eta [36m0:00:01[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m1.6/1.6 MB[0m [31m27.7 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.6/1.6 MB[0m [31m18.4 MB/s[0m eta [36m0:00:00[0m
[?25h📦 Installing...
🩹 Patching environment...
⏲ Done in 0:02:17
🔁 Restarting kernel...


The kernel should now be restarted with TurboML's SDK installed.

## Login to your TurboML instance

Note that you can copy and replace this snippet with one from your TurboML homepage.

In [1]:
import turboml as tb
# tb.init(backend_url=BACKEND_URL, api_key=API_KEY)

In [2]:
import pandas as pd
from turboml import LocalModel
from turboml.common.models import InputSpec
import numpy as np
from sklearn import metrics
import time
import base64

## Load Datasets

In [3]:
# Load the fraud-detection feature and label datasets shipped with TurboML.
transactions = tb.datasets.FraudDetectionDatasetFeatures()
labels = tb.datasets.FraudDetectionDatasetLabels()

# Rows [0, 100000) form the training split; rows [100000, 120000) the test split.
train_rows = slice(None, 100000)
test_rows = slice(100000, 120000)

transactions_train, labels_train = transactions[train_rows], labels[train_rows]
transactions_test, labels_test = transactions[test_rows], labels[test_rows]

## Define Input Specification

In [4]:
# Feature columns handed to the models as numerical inputs.
numerical_fields = [
    "transactionAmount",
    "localHour",
]

# Feature columns handed to the models as categorical inputs.
categorical_fields = [
    "digitalItemCount",
    "physicalItemCount",
    "isProxyIP",
]

# Input specification used when converting models to LocalModel in this
# notebook: rows are keyed on "index", the label column is "is_fraud", and the
# textual, imaginal and time fields are left empty.
input_spec = InputSpec(
    key_field="index",
    numerical_fields=numerical_fields,
    categorical_fields=categorical_fields,
    textual_fields=[],
    imaginal_fields=[],
    time_field="",
    label_field="is_fraud",
)

## Prepare Input and Label Data

In [None]:
# Both splits select the same feature columns, so share the keyword arguments.
feature_selection = {
    "numerical_fields": numerical_fields,
    "categorical_fields": categorical_fields,
}

# Training split: model inputs and labels.
train_features = transactions_train.get_model_inputs(**feature_selection)
train_labels = labels_train.get_model_labels(label_field="is_fraud")

# Test split: model inputs and labels.
test_features = transactions_test.get_model_inputs(**feature_selection)
test_labels = labels_test.get_model_labels(label_field="is_fraud")

## Define Model Configurations

In [None]:
# Three classifier configurations, each declared for a two-class problem
# (n_classes=2). They are converted to LocalModel instances in the next cell.

# Hoeffding tree configuration.
hoeffding_tree = tb.HoeffdingTreeClassifier(
    delta=1e-7,
    tau=0.05,
    grace_period=200,
    n_classes=2,
    leaf_pred_method="mc",
    split_method="gini",
)

# AMF classifier configuration with 10 estimators.
amf_classifier = tb.AMFClassifier(
    n_classes=2,
    n_estimators=10,
    step=1,
    use_aggregation=True,
    dirichlet=0.5,
    split_pure=False,
)

# Multinomial naive Bayes configuration.
multinomial_nb = tb.MultinomialNB(n_classes=2, alpha=1.0)

In [None]:
# Turn every configured model into a LocalModel bound to the shared input spec.
hoeffding_tree_local, amf_classifier_local, multinomial_nb_local = (
    configured.to_local_model(input_spec)
    for configured in (hoeffding_tree, amf_classifier, multinomial_nb)
)

## Training and Evaluation

In [None]:
# Filled by the training loop: fitted models and their test-set predictions,
# both keyed by display name.
model_trained_100K, initial_results = {}, {}

# (display name, LocalModel) pairs, in the order they will be trained.
models_to_train = list(
    zip(
        ("HoeffdingTree", "AMF", "MultinomialNB"),
        (hoeffding_tree_local, amf_classifier_local, multinomial_nb_local),
    )
)

In [None]:
# Fit each LocalModel on the training split, score it on the test split, and
# keep the fitted model together with its predictions. A failure in one model
# is reported and does not prevent the remaining models from running.
for name, model in models_to_train:
    try:
        print(f"Training {name} model on first 100K records...")
        model.learn(train_features, train_labels)
        predictions = model.predict(test_features)

        expected = test_labels.dataframe["is_fraud"]
        roc_auc = metrics.roc_auc_score(expected, predictions["score"])
        accuracy = metrics.accuracy_score(expected, predictions["predicted_class"])

        print(f"{name} Model Results:")
        print(f"ROC AUC Score: {roc_auc:.4f}")
        print(f"Accuracy Score: {accuracy:.4f}")

        # Only models that trained and scored successfully are recorded.
        model_trained_100K[name] = model
        initial_results[name] = predictions
    except Exception as e:
        print(f"Error with {name} model: {e}")

## Further Training in Batches
We will continue training the Hoeffding Tree model with additional data in batches.

In [None]:
# Continue training the already-fitted Hoeffding Tree on rows [start, stop),
# fed in consecutive windows of `step` rows.
model_hoeffding_tree = model_trained_100K.get("HoeffdingTree")
start = 100000
step = 100
stop = 102000

if model_hoeffding_tree is not None:
    # NOTE(review): these rows fall inside the slice used as the test split
    # earlier in this notebook — confirm that overlap is intended.
    #
    # range(start, stop, step) visits every window, including the last one;
    # batch numbers are 1-based and stay the same across all three messages.
    for i, pos in enumerate(range(start, stop, step), start=1):
        batch_end = min(pos + step, stop)  # never read past `stop`

        print(f"\nPreparing batch {i}...")
        feat_batch = transactions[pos:batch_end].get_model_inputs(
            numerical_fields=numerical_fields,
            categorical_fields=categorical_fields,
        )
        label_batch = labels[pos:batch_end].get_model_labels(label_field="is_fraud")

        print(f"Training batch {i}...")
        start_time = time.perf_counter()
        model_hoeffding_tree.learn(feat_batch, label_batch)
        end_time = time.perf_counter()
        print(
            f"Batch {i} training completed in {end_time - start_time:.2f} seconds."
        )
else:
    # The model is absent when its initial training raised and was skipped.
    print("Hoeffding Tree model not found in trained models.")

## ONNX Model

In [6]:
# Extra packages for the ONNX and custom Python model sections.
# Only onnx is version-pinned here.
!pip install -q onnx==1.14.1 scikit-learn skl2onnx river

In [7]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from skl2onnx import convert_sklearn
from skl2onnx.common.data_types import FloatTensorType

# Prepare features and target as pandas objects.
transactions_df = transactions.df
labels_df = labels.df

# transactionID rides along only as a row key for the LocalDataset wrappers
# built later; it is never used as a training feature.
feature_columns = numerical_fields + categorical_fields
X = transactions_df[feature_columns + ["transactionID"]]
y = labels_df["is_fraud"]

# Split the data (seeded so the split is reproducible)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Train sklearn model. The forest is seeded as well; otherwise the metrics
# and the sklearn-vs-ONNX match rate change on every run.
clf = RandomForestClassifier(random_state=42)
clf.fit(X_train[feature_columns], y_train)

In [8]:
# Convert the fitted forest to ONNX.
# The model takes one float tensor whose width equals the number of feature
# columns the forest was fitted on (numerical + categorical, no ID column).
n_model_features = len(numerical_fields) + len(categorical_fields)
initial_type = [("float_input", FloatTensorType([None, n_model_features]))]

onx = convert_sklearn(
    clf,
    initial_types=initial_type,
    options={type(clf): {"zipmap": False}},
)

# Serialize the ONNX graph to bytes, then base64-encode it as UTF-8 text so it
# can be embedded in the model config.
onnx_model_data = onx.SerializeToString()
model_data_base64 = base64.b64encode(onnx_model_data).decode("utf-8")

In [9]:
# Create ONNX model config with the encoded model data
onnx_model_config = [
    {
        "algorithm": "ONNX",
        "onnx_config": {
            "model_save_name": "randomforest",
            "model_data": model_data_base64,
        },
    }
]


# Input spec for the ONNX model: every feature is declared numerical and
# categorical_fields is empty, in line with the single float tensor input
# declared when the model was converted.
onnx_input_spec = InputSpec(
    key_field="index",
    numerical_fields=numerical_fields + categorical_fields,
    categorical_fields=[],
    textual_fields=[],
    imaginal_fields=[],
    time_field="",
    label_field="is_fraud",
)

# LocalModel built from the ONNX config and its input spec.
local_onnx_model = LocalModel(
    model_configs=onnx_model_config,
    input_spec=onnx_input_spec,
)

In [10]:
# Training split wrapped as LocalDatasets keyed on transactionID.
# All five feature columns are passed as numerical fields.
train_features_dataset = tb.LocalDataset.from_pd(
    df=X_train, key_field="transactionID"
)
train_input_data = train_features_dataset.get_model_inputs(
    numerical_fields=numerical_fields + categorical_fields
)

# Labels are paired with the same transactionID key as the features.
train_labels_frame = pd.DataFrame(
    {"transactionID": X_train["transactionID"], "is_fraud": y_train}
)
train_labels_dataset = tb.LocalDataset.from_pd(
    df=train_labels_frame, key_field="transactionID"
)
train_label_data = train_labels_dataset.get_model_labels(label_field="is_fraud")

In [11]:
# Test split wrapped as LocalDatasets keyed on transactionID.
# All five feature columns are passed as numerical fields.
test_features_dataset = tb.LocalDataset.from_pd(
    df=X_test, key_field="transactionID"
)
test_input_data = test_features_dataset.get_model_inputs(
    numerical_fields=numerical_fields + categorical_fields
)

# Labels are paired with the same transactionID key as the features.
test_labels_frame = pd.DataFrame(
    {"transactionID": X_test["transactionID"], "is_fraud": y_test}
)
test_labels_dataset = tb.LocalDataset.from_pd(
    df=test_labels_frame, key_field="transactionID"
)
test_label_data = test_labels_dataset.get_model_labels(label_field="is_fraud")

In [12]:
def onnx_model():
    """Score the ONNX-backed LocalModel on the test data and print its metrics.

    Returns:
        The predictions returned by ``local_onnx_model.predict``, or ``None``
        when any step raised (the error message is printed instead).
    """
    try:
        predictions = local_onnx_model.predict(test_input_data)
        expected = test_label_data.dataframe["is_fraud"]

        roc_auc = metrics.roc_auc_score(expected, predictions["score"])
        accuracy = metrics.accuracy_score(expected, predictions["predicted_class"])

        print("ONNX Model Results:")
        print(f"ROC AUC Score: {roc_auc:.4f}")
        print(f"Accuracy Score: {accuracy:.4f}")
        return predictions
    except Exception as e:
        print(f"Error testing ONNX model: {e}")
        return None


# Run the test
predictions = onnx_model()

if predictions is not None:
    # The forest was fitted without the ID column, so drop it before predicting.
    test_features_only = X_test.drop(columns=["transactionID"])
    sklearn_preds = clf.predict(test_features_only)
    onnx_preds = predictions["predicted_class"]

    # Fraction of test rows on which both implementations predict the same class.
    match_rate = (sklearn_preds == onnx_preds).mean()
    print("\nPrediction Comparison:")
    print(f"Sklearn vs ONNX prediction match rate: {match_rate:.4f}")

ONNX Model Results:
ROC AUC Score: 0.6597
Accuracy Score: 0.9946

Prediction Comparison:
Sklearn vs ONNX prediction match rate: 0.9987


## Python Model Testing

In [None]:
# Source code of a custom model class, kept as a string. The class wraps
# river's LogisticRegression: learn_one and predict_one both flatten the
# input's numeric and categorical values into one feature dict, and
# predict_one thresholds the positive-class probability at 0.5.
python_model_code = """
from river import linear_model
import turboml.common.pytypes as types

class MyLogisticRegression:

    def init_imports(self):
        from river import linear_model
        import turboml.common.pytypes as types

    def __init__(self):
        self.model = linear_model.LogisticRegression()

    def learn_one(self, input):
        # Combine numerical and categorical features into a dictionary
        features = {}
        features.update({f'num_{i}': val for i, val in enumerate(input.numeric)})
        features.update({f'cat_{i}': val for i, val in enumerate(input.categ)})
        self.model.learn_one(features, input.label)

    def predict_one(self, input, output):
        # Combine numerical and categorical features into a dictionary
        features = {}
        features.update({f'num_{i}': val for i, val in enumerate(input.numeric)})
        features.update({f'cat_{i}': val for i, val in enumerate(input.categ)})
        proba = self.model.predict_proba_one(features)
        score = float(proba.get(True, 0))
        output.set_score(score)
        output.set_predicted_class(int(score >= 0.5))
"""

In [None]:
# Configuration pointing the "Python" algorithm at the custom class by name,
# with its source code attached.
python_model_config = {
    "algorithm": "Python",
    "python_config": {"class_name": "MyLogisticRegression", "code": python_model_code},
}

# LocalModel wrapping that single config, using the shared input spec.
local_python_model = LocalModel(model_configs=[python_model_config], input_spec=input_spec)

In [None]:
# Fit the custom Python model, then predict on the held-out split.
local_python_model.learn(train_input_data, train_label_data)
predictions = local_python_model.predict(test_input_data)

# Score the predictions against the held-out labels.
true_labels = test_label_data.dataframe["is_fraud"]
roc_auc = metrics.roc_auc_score(true_labels, predictions["score"])
accuracy = metrics.accuracy_score(true_labels, predictions["predicted_class"])

print(f"Python Model ROC AUC Score: {roc_auc:.4f}")
print(f"Python Model Accuracy Score: {accuracy:.4f}")

## Python Ensemble Model

In [None]:
# Base models come from the earlier training loop; a KeyError here means the
# corresponding model was not stored by that loop.
hoeffding_tree_model, amf_classifier_model, multinomial_nb_model = (
    model_trained_100K[model_key]
    for model_key in ("HoeffdingTree", "AMF", "MultinomialNB")
)

# The first config entry of each base model, in the same order.
base_model_configs = [
    base_model.model_configs[0]
    for base_model in (hoeffding_tree_model, amf_classifier_model, multinomial_nb_model)
]

In [None]:
# Prepare ensemble model code
# The source is kept as a string. The class forwards learn_one to every base
# model and, in predict_one, averages the base models' scores and thresholds
# the average at 0.5.
ensemble_model_code = """
import turboml.common.pymodel as model
from typing import List

class MyEnsembleModel:
    def __init__(self, base_models: List[model.Model]):
        if not base_models:
            raise ValueError("PythonEnsembleModel requires at least one base model.")
        self.base_models = base_models

    def init_imports(self):
        import turboml.common.pytypes as types
        from typing import List

    def learn_one(self, input):
        for model in self.base_models:
            model.learn_one(input)

    def predict_one(self, input, output):
        total_score = 0.0
        for model in self.base_models:
            model_output = model.predict_one(input)
            total_score += model_output.score()
        average_score = total_score / len(self.base_models)
        output.set_score(average_score)
        output.set_predicted_class(int(average_score >= 0.5))
"""

In [None]:
# Configuration pointing the ensemble algorithm at the custom class by name,
# with its source code attached.
ensemble_model_config = {
    "algorithm": "PythonEnsembleModel",
    "python_ensemble_config": {"class_name": "MyEnsembleModel", "code": ensemble_model_code},
}

# The ensemble config goes first, followed by the base model configs.
model_configs = [ensemble_model_config, *base_model_configs]

# Build the ensemble LocalModel on the shared input spec.
ensemble_model = tb.LocalModel(model_configs=model_configs, input_spec=input_spec)

In [None]:
# Fit the ensemble, then predict on the held-out split.
ensemble_model.learn(train_input_data, train_label_data)
ensemble_predictions = ensemble_model.predict(test_input_data)

# Score the ensemble predictions against the held-out labels.
true_labels = test_label_data.dataframe["is_fraud"]
roc_auc = metrics.roc_auc_score(true_labels, ensemble_predictions["score"])
accuracy = metrics.accuracy_score(true_labels, ensemble_predictions["predicted_class"])

print(f"Ensemble Model ROC AUC Score: {roc_auc:.4f}")
print(f"Ensemble Model Accuracy Score: {accuracy:.4f}")

In [37]:
import pandas as pd

# Load the image dataset from a parquet file in the working directory.
df = pd.read_parquet("images_dataset.parquet")

In [38]:
# Inspect the column names of the loaded frame.
df.columns

Index(['image_path', 'category', 'image_bytes'], dtype='object')

In [39]:
# Remove the image_path column. errors="ignore" keeps this cell re-runnable:
# the parquet file is overwritten without that column further down, so on a
# second run the column is already gone and a strict drop would raise KeyError.
df = df.drop(columns=["image_path"], errors="ignore")

In [40]:
# Write the frame back to the same parquet file it was read from, without the
# pandas index (this overwrites the original file).
df.to_parquet("images_dataset.parquet", index=False)

In [41]:
# Confirm which columns remain after dropping image_path.
df.columns

Index(['category', 'image_bytes'], dtype='object')

In [1]:
import psycopg2
import os

# Fetch connection settings from the environment so that no credentials are
# stored in the notebook. The password and host have no defaults on purpose:
# a missing variable fails loudly with KeyError.
import os

USER = os.environ.get("SUPABASE_DB_USER", "postgres")
PASSWORD = os.environ["SUPABASE_DB_PASSWORD"]
HOST = os.environ["SUPABASE_DB_HOST"]
PORT = int(os.environ.get("SUPABASE_DB_PORT", "5432"))
DBNAME = os.environ.get("SUPABASE_DB_NAME", "postgres")

# Connect to the database and run a trivial query as a connectivity check.
try:
    connection = psycopg2.connect(
        user=USER,
        password=PASSWORD,
        host=HOST,
        port=PORT,
        dbname=DBNAME,
    )
    print("Connection successful!")

    try:
        # The cursor context manager closes the cursor even if the query fails.
        with connection.cursor() as cursor:
            # Example query
            cursor.execute("SELECT NOW();")
            result = cursor.fetchone()
            print("Current Time:", result)
    finally:
        # Always release the connection, including on query errors.
        connection.close()
        print("Connection closed.")

except Exception as e:
    print(f"Failed to connect: {e}")

Connection successful!
Current Time: (datetime.datetime(2025, 2, 14, 8, 29, 54, 272121, tzinfo=datetime.timezone.utc),)
Connection closed.


In [7]:
# Draw 5 random rows (unseeded, so the selection changes between runs).
df_sample = df.sample(5)

In [9]:
# Display the sampled rows.
df_sample.head()

Unnamed: 0,category,image_bytes
1350,neutral,b'\x89PNG\r\n\x1a\n\x00\x00\x00\rIHDR\x00\x00\...
1019,hentai,b'\x89PNG\r\n\x1a\n\x00\x00\x00\rIHDR\x00\x00\...
7175,hentai,b'\x89PNG\r\n\x1a\n\x00\x00\x00\rIHDR\x00\x00\...
9160,sexy,b'\x89PNG\r\n\x1a\n\x00\x00\x00\rIHDR\x00\x00\...
3561,hentai,b'\x89PNG\r\n\x1a\n\x00\x00\x00\rIHDR\x00\x00\...


In [50]:
import psycopg2
from psycopg2 import sql, Binary
import base64

# 1) Connect to the database. Credentials are read from the environment so
#    that none are stored in the notebook.
import os

connection = psycopg2.connect(
    user=os.environ.get("SUPABASE_DB_USER", "postgres"),
    password=os.environ["SUPABASE_DB_PASSWORD"],
    host=os.environ["SUPABASE_DB_HOST"],
    port=int(os.environ.get("SUPABASE_DB_PORT", "5432")),
    dbname=os.environ.get("SUPABASE_DB_NAME", "postgres"),
)

# 2) Prepare the INSERT query
#    If your table name contains a dash, be sure to use double-quotes around it.
insert_query = """
    INSERT INTO "nsfw-db" (category, image_bytes)
    VALUES (%s, %s)
"""

# 3) Build one (category, image bytes) record per DataFrame row.
#    image_bytes is taken as-is and wrapped in psycopg2.Binary.
#    NOTE(review): assumes the column already holds raw bytes — if it holds
#    base64 text, decode it with base64.b64decode before wrapping.
records = [
    (cat_val, Binary(img_bytes_val))
    for cat_val, img_bytes_val in zip(df["category"], df["image_bytes"])
]

# 4) Use executemany for bulk insert. The connection is closed even when the
#    insert fails; closing without a commit discards the partial work.
try:
    with connection.cursor() as cursor:
        cursor.executemany(insert_query, records)
    connection.commit()
finally:
    # 5) Close the connection
    connection.close()

print("Data inserted successfully!")


Data inserted successfully!


In [2]:
import psycopg2
from psycopg2 import sql, Binary
import base64

# 1) Connect to the database. Credentials are read from the environment so
#    that none are stored in the notebook.
import os

connection = psycopg2.connect(
    user=os.environ.get("SUPABASE_DB_USER", "postgres"),
    password=os.environ["SUPABASE_DB_PASSWORD"],
    host=os.environ["SUPABASE_DB_HOST"],
    port=int(os.environ.get("SUPABASE_DB_PORT", "5432")),
    dbname=os.environ.get("SUPABASE_DB_NAME", "postgres"),
)

# Fetch the table into a DataFrame; the connection is released even if the
# read fails.
query = 'SELECT * FROM "nsfw-db";'  # Use double quotes for the table name
try:
    df = pd.read_sql_query(query, connection)
finally:
    connection.close()

# Convert the BYTEA values to plain bytes objects.
df["image_bytes"] = df["image_bytes"].apply(bytes)


  df = pd.read_sql_query(query, connection)


In [5]:
# Display 5 random rows of the fetched table.
df.sample(5)

Unnamed: 0,id,created_at,image_bytes,category
875,876,2025-02-14 10:03:17.112239+00:00,b'\x89PNG\r\n\x1a\n\x00\x00\x00\rIHDR\x00\x00\...,hentai
104,105,2025-02-14 10:03:17.112239+00:00,b'\x89PNG\r\n\x1a\n\x00\x00\x00\rIHDR\x00\x00\...,sexy
585,586,2025-02-14 10:03:17.112239+00:00,b'\x89PNG\r\n\x1a\n\x00\x00\x00\rIHDR\x00\x00\...,neutral
824,825,2025-02-14 10:03:17.112239+00:00,b'\x89PNG\r\n\x1a\n\x00\x00\x00\rIHDR\x00\x00\...,neutral
690,691,2025-02-14 10:03:17.112239+00:00,b'\x89PNG\r\n\x1a\n\x00\x00\x00\rIHDR\x00\x00\...,drawings


In [35]:
# Persist the current frame to a parquet file without the pandas index.
df.to_parquet("images_dataset_sample.parquet", index=False)

In [19]:
# Install the Supabase Python client (no version pin).
!pip install -q supabase


[notice] A new release of pip is available: 24.3.1 -> 25.0
[notice] To update, run: python.exe -m pip install --upgrade pip


In [26]:
from supabase import create_client

# Supabase credentials are read from the environment so that neither the
# project URL nor the API key is stored in the notebook. A missing variable
# fails loudly with KeyError.
import os

SUPABASE_URL = os.environ["SUPABASE_URL"]
SUPABASE_KEY = os.environ["SUPABASE_KEY"]

# Initialize Supabase client
supabase = create_client(SUPABASE_URL, SUPABASE_KEY)

In [32]:
# Select all columns from the "nsfw-db" table through the Supabase client.
response = supabase.table("nsfw-db").select("*").execute()

In [33]:
# Display the API response. The recorded output shows an empty data list.
# NOTE(review): possibly an access-policy restriction on the key used — confirm.
response

APIResponse[TypeVar](data=[], count=None)

In [36]:
# Inspect the column names of the current frame.
df.columns

Index(['id', 'created_at', 'image_bytes', 'category'], dtype='object')

In [45]:
import pandas as pd
import uuid
from datetime import datetime

row_count = len(df)

# Unique key per row: a fresh random UUID rendered as 32 hex characters.
df["image_id"] = [uuid.uuid4().hex for _ in range(row_count)]

# One datetime.now() call per row, so each row gets its own timestamp.
df["timestamp"] = [datetime.now() for _ in range(row_count)]


In [None]:
# List the frame's columns after adding image_id and timestamp.
df.columns

Index(['category', 'image_bytes', 'image_id', 'timestamp'], dtype='object')

In [48]:
# Save a seeded 1000-row sample without the pandas index. This writes to the
# same file name used by an earlier to_parquet cell, replacing that file.
df.sample(1000, random_state=42).to_parquet("images_dataset_sample.parquet", index=False)

In [6]:
# Check the dtype of the created_at column.
df["created_at"].dtype

datetime64[ns, UTC]

In [None]:
import psycopg2

# The full connection URL (including the password) is read from the
# environment so that it is not stored in the notebook.
import os

conn = psycopg2.connect(os.environ["AIVEN_DATABASE_URL"])

query_sql = 'SELECT VERSION()'

# Run the query, then release the cursor and connection even if it fails.
try:
    with conn.cursor() as cur:
        cur.execute(query_sql)
        version = cur.fetchone()[0]
finally:
    conn.close()

print(version)


In [7]:
import psycopg2
from psycopg2 import sql, Binary
import base64

# 1) Connect to the database. Credentials are read from the environment so
#    that none are stored in the notebook.
import os

connection = psycopg2.connect(
    user=os.environ["KOYEB_DB_USER"],
    password=os.environ["KOYEB_DB_PASSWORD"],
    host=os.environ["KOYEB_DB_HOST"],
    port=int(os.environ.get("KOYEB_DB_PORT", "5432")),
    dbname=os.environ["KOYEB_DB_NAME"],
)
try:
    # Test the connection with a trivial query.
    with connection.cursor() as cursor:
        cursor.execute("SELECT NOW();")
        result = cursor.fetchone()
    print("Current Time:", result)
finally:
    # Close the connection even if the query fails.
    connection.close()

Current Time: (datetime.datetime(2025, 2, 14, 11, 10, 11, 817271, tzinfo=datetime.timezone.utc),)


In [15]:
# Drop and recreate table with auto-incrementing image_id.
# WARNING: this is destructive — any existing "nsfw-db" table and its rows
# are removed.
import os

create_table_query = """
DROP TABLE IF EXISTS "nsfw-db";
CREATE TABLE "nsfw-db" (
    image_id SERIAL PRIMARY KEY,
    category VARCHAR(255),
    image_bytes BYTEA,
    created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
);
"""

# Connect to the database. Credentials are read from the environment so that
# none are stored in the notebook.
connection = psycopg2.connect(
    user=os.environ["KOYEB_DB_USER"],
    password=os.environ["KOYEB_DB_PASSWORD"],
    host=os.environ["KOYEB_DB_HOST"],
    port=int(os.environ.get("KOYEB_DB_PORT", "5432")),
    dbname=os.environ["KOYEB_DB_NAME"],
)
try:
    # Execute the DDL and commit; closing without a commit discards it.
    with connection.cursor() as cursor:
        cursor.execute(create_table_query)
    connection.commit()
finally:
    # Close the connection even if the statement fails.
    connection.close()

In [19]:
# Check if the table was created by reading back all of its rows.
import os

# Connect to the database. Credentials are read from the environment so that
# none are stored in the notebook.
connection = psycopg2.connect(
    user=os.environ["KOYEB_DB_USER"],
    password=os.environ["KOYEB_DB_PASSWORD"],
    host=os.environ["KOYEB_DB_HOST"],
    port=int(os.environ.get("KOYEB_DB_PORT", "5432")),
    dbname=os.environ["KOYEB_DB_NAME"],
)
try:
    # Query the table
    with connection.cursor() as cursor:
        cursor.execute('SELECT * FROM "nsfw-db";')
        result = cursor.fetchall()
    print(result)
finally:
    # Close the connection even if the query fails.
    connection.close()

[(1, 'porn', <memory at 0x000001B64E10F1C0>, datetime.datetime(2025, 2, 14, 11, 22, 46, 787387)), (2, 'sexy', <memory at 0x000001B64E10F340>, datetime.datetime(2025, 2, 14, 11, 22, 46, 787387)), (3, 'drawings', <memory at 0x000001B64E10ED40>, datetime.datetime(2025, 2, 14, 11, 22, 46, 787387)), (4, 'sexy', <memory at 0x000001B64E10F7C0>, datetime.datetime(2025, 2, 14, 11, 22, 46, 787387)), (5, 'neutral', <memory at 0x000001B64E10F700>, datetime.datetime(2025, 2, 14, 11, 22, 46, 787387))]


In [17]:
# Insert 5 randomly sampled rows of the DataFrame df as a smoke test.
import os

# Connect to the database. Credentials are read from the environment so that
# none are stored in the notebook.
connection = psycopg2.connect(
    user=os.environ["KOYEB_DB_USER"],
    password=os.environ["KOYEB_DB_PASSWORD"],
    host=os.environ["KOYEB_DB_HOST"],
    port=int(os.environ.get("KOYEB_DB_PORT", "5432")),
    dbname=os.environ["KOYEB_DB_NAME"],
)

# Prepare the INSERT query
insert_query = """
    INSERT INTO "nsfw-db" (category, image_bytes)
    VALUES (%s, %s)
"""

# Build one (category, image bytes) record per sampled row.
sampled_rows = df.sample(5)
records = [
    (cat_val, Binary(img_bytes_val))
    for cat_val, img_bytes_val in zip(
        sampled_rows["category"], sampled_rows["image_bytes"]
    )
]

# Use executemany for bulk insert. The connection is closed even when the
# insert fails; closing without a commit discards the partial work.
try:
    with connection.cursor() as cursor:
        cursor.executemany(insert_query, records)
    connection.commit()
finally:
    connection.close()

print("Data inserted successfully!")

Data inserted successfully!


In [None]:
# Open a connection that the following cell reads from and then closes.
# Credentials are read from the environment so that none are stored in the
# notebook.
import os

connection = psycopg2.connect(
    user=os.environ["KOYEB_DB_USER"],
    password=os.environ["KOYEB_DB_PASSWORD"],
    host=os.environ["KOYEB_DB_HOST"],
    port=int(os.environ.get("KOYEB_DB_PORT", "5432")),
    dbname=os.environ["KOYEB_DB_NAME"],
)

In [21]:
# Fetch the table into a DataFrame using the connection opened in the
# previous cell; the connection is released even if the read fails.
query = 'SELECT * FROM "nsfw-db";'  # Use double quotes for the table name
try:
    df = pd.read_sql_query(query, connection)
finally:
    connection.close()

# Convert the BYTEA values to plain bytes objects.
df["image_bytes"] = df["image_bytes"].apply(bytes)

  df = pd.read_sql_query(query, connection)


In [22]:
# Display the fetched frame (5 rows in the recorded output).
df

Unnamed: 0,image_id,category,image_bytes,created_at
0,1,porn,b'\x89PNG\r\n\x1a\n\x00\x00\x00\rIHDR\x00\x00\...,2025-02-14 11:22:46.787387
1,2,sexy,b'\x89PNG\r\n\x1a\n\x00\x00\x00\rIHDR\x00\x00\...,2025-02-14 11:22:46.787387
2,3,drawings,b'\x89PNG\r\n\x1a\n\x00\x00\x00\rIHDR\x00\x00\...,2025-02-14 11:22:46.787387
3,4,sexy,b'\x89PNG\r\n\x1a\n\x00\x00\x00\rIHDR\x00\x00\...,2025-02-14 11:22:46.787387
4,5,neutral,b'\x89PNG\r\n\x1a\n\x00\x00\x00\rIHDR\x00\x00\...,2025-02-14 11:22:46.787387


In [1]:
import pandas as pd

In [6]:
# Load the UNSW-NB15 CSV from the working directory; this rebinds `df`.
df = pd.read_csv("UNSW_NB15_traintest_backdoor.csv")

In [7]:
# Report (rows, columns) of the loaded frame.
df.shape

(95329, 197)

In [13]:
# Preview the first rows of the loaded frame.
df.head()

Unnamed: 0,dur,proto=udp,proto=arp,proto=tcp,proto=igmp,proto=ospf,proto=sctp,proto=gre,proto=ggp,proto=ip,...,ct_src_dport_ltm,ct_dst_sport_ltm,ct_dst_src_ltm,is_ftp_login,ct_ftp_cmd,ct_flw_http_mthd,ct_src_ltm,ct_srv_dst,is_sm_ips_ports,class
0,0.0,1,0,0,0,0,0,0,0,0,...,0.0,0.0,0.016129,0.0,0.0,0.0,0.0,0.016393,0,0
1,0.0,1,0,0,0,0,0,0,0,0,...,0.0,0.0,0.016129,0.0,0.0,0.0,0.0,0.016393,0,0
2,0.0,1,0,0,0,0,0,0,0,0,...,0.0,0.0,0.032258,0.0,0.0,0.0,0.0,0.032787,0,0
3,0.0,1,0,0,0,0,0,0,0,0,...,0.022222,0.0,0.032258,0.0,0.0,0.0,0.021739,0.032787,0,0
4,0.0,1,0,0,0,0,0,0,0,0,...,0.022222,0.0,0.032258,0.0,0.0,0.0,0.021739,0.032787,0,0


In [16]:
# Count the distinct values in the "dur" column.
df["dur"].nunique()

24518