In [2]:
import os
import pandas as pd
import numpy as np
from sqlalchemy import create_engine
from dotenv import load_dotenv

# Load environment variables
load_dotenv()

DB_USER = os.getenv("DB_USER")
DB_PASSWORD = os.getenv("DB_PASSWORD") or ""
DB_HOST = os.getenv("DB_HOST")
DB_PORT = os.getenv("DB_PORT")
DB_NAME = os.getenv("DB_NAME")

# Create engine for PostgreSQL connection
DATABASE_URL = f"postgresql+psycopg2://{DB_USER}:{DB_PASSWORD}@{DB_HOST}:{DB_PORT}/{DB_NAME}"
engine = create_engine(DATABASE_URL)

print("✅ Database connection ready")


✅ Database connection ready


In [3]:
query = """
SELECT 
    rc.ad_id,
    rc.ip_address,
    rc.device_type,
    rc.click_time,
    ap.date,
    ap.impressions,
    ap.clicks,
    ap.conversions,
    ac.email
FROM clean.clicks rc
JOIN clean.ad_performance ap ON rc.ad_id = ap.ad_id
JOIN clean.connections ac ON rc.ad_id = ac.ad_id
LIMIT 1000
"""

df = pd.read_sql(query, engine)
print(f"✅ Loaded {len(df):,} rows from the clean schema")
df.head()


✅ Loaded 1,000 rows from the clean schema


Unnamed: 0,ad_id,ip_address,device_type,click_time,date,impressions,clicks,conversions,email
0,AD003,5314,2,2017-11-08 10:11:48,2025-07-12,2468,1846,1515,timothywise@example.com
1,AD003,5314,2,2017-11-08 10:11:48,2025-07-13,2370,1575,807,timothywise@example.com
2,AD003,5314,2,2017-11-08 10:11:48,2025-07-14,3244,2518,1644,timothywise@example.com
3,AD003,5314,2,2017-11-08 10:11:48,2025-07-15,2442,1912,283,timothywise@example.com
4,AD003,5314,2,2017-11-08 10:11:48,2025-07-16,1264,584,411,timothywise@example.com


In [4]:
df_features = df.groupby("ip_address").agg(
    total_clicks=("clicks", "sum"),
    total_conversions=("conversions", "sum")
).reset_index()

# Avoid division by zero
df_features["click_to_conv_ratio"] = df_features.apply(
    lambda row: row["total_clicks"] / row["total_conversions"]
    if row["total_conversions"] > 0 else np.inf,
    axis=1
)

df_features.head()


Unnamed: 0,ip_address,total_clicks,total_conversions,click_to_conv_ratio
0,5314,1329508,637659,2.084983


In [5]:
df_sorted = df.sort_values(by=["ip_address", "click_time"])
df_sorted["prev_click_time"] = df_sorted.groupby("ip_address")["click_time"].shift(1)
df_sorted["time_diff"] = (df_sorted["click_time"] - df_sorted["prev_click_time"]).dt.total_seconds()

avg_time = df_sorted.groupby("ip_address")["time_diff"].mean().reset_index()
avg_time.rename(columns={"time_diff": "avg_time_between_clicks_sec"}, inplace=True)

df_features = df_features.merge(avg_time, on="ip_address", how="left")
df_features.head()


Unnamed: 0,ip_address,total_clicks,total_conversions,click_to_conv_ratio,avg_time_between_clicks_sec
0,5314,1329508,637659,2.084983,0.0


In [6]:
unique_devices = df.groupby("ip_address")["device_type"].nunique().reset_index()
unique_devices.rename(columns={"device_type": "unique_devices_per_ip"}, inplace=True)

df_features = df_features.merge(unique_devices, on="ip_address", how="left")
df_features.head()


Unnamed: 0,ip_address,total_clicks,total_conversions,click_to_conv_ratio,avg_time_between_clicks_sec,unique_devices_per_ip
0,5314,1329508,637659,2.084983,0.0,1


In [7]:
def email_risk(email):
    if not isinstance(email, str):
        return 0.0
    email = email.lower()
    risky_domains = ["mailinator.com", "tempmail.com", "dispostable.com"]
    return 1.0 if any(domain in email for domain in risky_domains) else 0.0

email_scores = df.groupby("ip_address")["email"].apply(
    lambda emails: np.mean([email_risk(e) for e in emails])
).reset_index()

email_scores.rename(columns={"email": "avg_email_risk_score"}, inplace=True)

df_features = df_features.merge(email_scores, on="ip_address", how="left")
df_features.head()


Unnamed: 0,ip_address,total_clicks,total_conversions,click_to_conv_ratio,avg_time_between_clicks_sec,unique_devices_per_ip,avg_email_risk_score
0,5314,1329508,637659,2.084983,0.0,1,0.0


In [8]:
CLICK_TO_CONV_RATIO_THRESHOLD = 50
AVG_TIME_BETWEEN_CLICKS_THRESHOLD = 5
UNIQUE_DEVICES_THRESHOLD = 5
EMAIL_RISK_THRESHOLD = 0.5

df_features["flag_high_click_to_conv_ratio"] = df_features["click_to_conv_ratio"] > CLICK_TO_CONV_RATIO_THRESHOLD
df_features["flag_short_avg_time_between_clicks"] = df_features["avg_time_between_clicks_sec"] < AVG_TIME_BETWEEN_CLICKS_THRESHOLD
df_features["flag_many_devices_per_ip"] = df_features["unique_devices_per_ip"] > UNIQUE_DEVICES_THRESHOLD
df_features["flag_high_email_risk"] = df_features["avg_email_risk_score"] > EMAIL_RISK_THRESHOLD

df_features["fraud_risk_score"] = df_features[
    ["flag_high_click_to_conv_ratio",
     "flag_short_avg_time_between_clicks",
     "flag_many_devices_per_ip",
     "flag_high_email_risk"]
].sum(axis=1)

df_features["fraud_suspected"] = df_features["fraud_risk_score"] >= 2

print("✅ Fraud flags added")
df_features.head(10)


✅ Fraud flags added


Unnamed: 0,ip_address,total_clicks,total_conversions,click_to_conv_ratio,avg_time_between_clicks_sec,unique_devices_per_ip,avg_email_risk_score,flag_high_click_to_conv_ratio,flag_short_avg_time_between_clicks,flag_many_devices_per_ip,flag_high_email_risk,fraud_risk_score,fraud_suspected
0,5314,1329508,637659,2.084983,0.0,1,0.0,False,True,False,False,1,False


In [None]:
df_features.to_sql("fraud_features", engine, schema="bi", if_exists="replace", index=False)
print("✅ Features saved to schema 'bi' for Tableau/Looker")

✅ Features saved to schema 'bi' for Tableau/Looker
