In [1]:
# 📦 Imports
import os
from urllib.parse import quote_plus

import numpy as np
import pandas as pd
from dotenv import load_dotenv
from sqlalchemy import create_engine, text

# 🔑 Load environment variables
load_dotenv()

DB_USER = os.getenv("DB_USER")
DB_PASSWORD = os.getenv("DB_PASSWORD") or ""
DB_HOST = os.getenv("DB_HOST")
DB_PORT = os.getenv("DB_PORT") or "5432"  # fall back to PostgreSQL's default port
DB_NAME = os.getenv("DB_NAME")

# Fail fast with a readable message instead of building a URL that contains
# the literal text "None" for an unset variable.
_required = {"DB_USER": DB_USER, "DB_HOST": DB_HOST, "DB_NAME": DB_NAME}
_missing = [name for name, value in _required.items() if not value]
if _missing:
    raise RuntimeError(f"Missing required environment variables: {', '.join(_missing)}")

# 🔌 Connect to PostgreSQL
# Percent-encode the credentials so characters such as "@", ":" or "/" in the
# user name or password cannot corrupt the URL structure.
# NOTE(review): this expects the RAW password in the environment; a value that
# was already percent-encoded by hand would be encoded twice.
DATABASE_URL = (
    f"postgresql+psycopg2://{quote_plus(DB_USER)}:{quote_plus(DB_PASSWORD)}"
    f"@{DB_HOST}:{DB_PORT}/{DB_NAME}"
)
engine = create_engine(DATABASE_URL)

# create_engine() does not open a connection by itself; run a trivial query so
# the success message is only printed once the database is actually reachable.
with engine.connect() as probe_conn:
    probe_conn.execute(text("SELECT 1"))
print("✅ Connected to Postgres")


✅ Connected to Postgres


In [2]:
# 📥 Pull the ads, raw-clicks and ad-connections views from the `clean` schema
query_ads = "SELECT * FROM clean.mv_ads"
query_clicks = "SELECT * FROM clean.mv_raw_clicks"
query_conn = "SELECT * FROM clean.mv_ad_connections"

# One DataFrame per view, read through the shared engine in the order
# ads -> clicks -> connections.
df_ads = pd.read_sql(sql=query_ads, con=engine)
df_clicks = pd.read_sql(sql=query_clicks, con=engine)
df_conn = pd.read_sql(sql=query_conn, con=engine)

# Row counts as a quick confirmation of what was loaded
print(f"Loaded {len(df_ads)} ads, {len(df_clicks)} clicks, {len(df_conn)} connections.")


ProgrammingError: (psycopg2.errors.UndefinedTable) relation "clean.mv_ads" does not exist
LINE 1: SELECT * FROM clean.mv_ads
                      ^

[SQL: SELECT * FROM clean.mv_ads]
(Background on this error at: https://sqlalche.me/e/20/f405)

In [3]:
# 📊 Click concentration per ad: total clicks divided by distinct source IPs.
# This is a clicks-per-IP ratio, not a true CTR: this cell only has click rows
# to work with, no impression counts.
ctr_df = df_clicks.groupby("ad_id").agg(
    # "size" counts every click row; "count" would skip rows whose ip_address
    # is NULL and under-report total_clicks.
    total_clicks=("ip_address", "size"),
    unique_ips=("ip_address", "nunique"),
).reset_index()

# Merge with ads info. Keep one row per ad_id on the right-hand side so that,
# if df_ads ever contains a repeated ad_id, the per-ad feature rows are not
# multiplied by the join.
ads_info = df_ads[["ad_id", "advertiser", "category"]].drop_duplicates(subset="ad_id")
ctr_df = ctr_df.merge(ads_info, on="ad_id", how="left")

# Compute click ratio. An ad whose clicks all lack an IP has unique_ips == 0;
# mask the zero so the ratio is NaN rather than inf.
ctr_df["clicks_per_unique_ip"] = (
    ctr_df["total_clicks"] / ctr_df["unique_ips"].replace(0, np.nan)
)

ctr_df.head()


NameError: name 'df_clicks' is not defined

In [None]:
# Parse click timestamps in place so datetime arithmetic works on the column
df_clicks["click_time"] = pd.to_datetime(df_clicks["click_time"])

# Seconds elapsed since the previous click on the same ad; the first click of
# each ad has no predecessor and therefore yields NaN.
clicks_by_ad_time = df_clicks.sort_values(["ad_id", "click_time"])
click_gaps = clicks_by_ad_time.groupby("ad_id")["click_time"].diff()
time_diff_df = click_gaps.dt.total_seconds()

# Per-ad mean and median gap. assign() lines the gaps back up with df_clicks
# by index label, so the sorted order above does not matter here.
clicks_with_gap = df_clicks.assign(time_diff=time_diff_df)
ttc_df = (
    clicks_with_gap
    .groupby("ad_id")
    .agg(
        avg_time_between_clicks=("time_diff", "mean"),
        median_time_between_clicks=("time_diff", "median"),
    )
    .reset_index()
)

ttc_df.head()


In [None]:
# 📌 An (ad, IP) pair is suspicious when that IP clicked the ad more than `threshold` times
threshold = 10  # You can tweak this
ip_clicks = (
    df_clicks
    .groupby(["ad_id", "ip_address"])
    .size()
    .reset_index(name="click_count")
)
ip_clicks["suspicious_ip"] = ip_clicks["click_count"].gt(threshold).astype(int)

# Per ad: share of its distinct IPs that carry the suspicious flag
suspicious_ip_df = (
    ip_clicks
    .groupby("ad_id")["suspicious_ip"]
    .mean()
    .reset_index()
    .rename(columns={"suspicious_ip": "ratio_suspicious_ips"})
)

suspicious_ip_df.head()


In [None]:
# Extract the email domain from ad_connections.
# - Surrounding whitespace is stripped and the domain lower-cased, because
#   domains are case-insensitive; otherwise "user@Mailinator.COM " would slip
#   past the list below.
# - [^@\s]+ anchors the capture to the text after the LAST "@", so a malformed
#   address with several "@" still yields its final domain.
# - expand=False makes str.extract return a Series instead of a one-column DataFrame.
df_conn["email_domain"] = (
    df_conn["email"]
    .str.strip()
    .str.extract(r'@([^@\s]+)$', expand=False)
    .str.lower()
)

# List of suspicious domains (temp mail services); entries must be lower-case
temp_domains = ["tempmail.com", "mailinator.com", "10minutemail.com"]

# Flag suspicious domains (rows without a parsable domain are flagged 0)
df_conn["temp_email_flag"] = df_conn["email_domain"].isin(temp_domains).astype(int)

# Per ad: share of connections that used a temp-mail domain
email_flags = (
    df_conn
    .groupby("ad_id")["temp_email_flag"]
    .mean()
    .reset_index()
    .rename(columns={"temp_email_flag": "ratio_temp_emails"})
)

email_flags.head()


In [None]:
# Count distinct ads per IP
ip_ad_counts = df_conn.groupby("ip_address")["ad_id"].nunique().reset_index(name="ads_per_ip")

# Attach the count to every connection row via a lookup on ip_address.
# Assigning the column (instead of re-merging into df_conn) keeps this cell
# re-runnable: a second merge would produce ads_per_ip_x / ads_per_ip_y and the
# groupby below would then fail with a KeyError.
ads_per_ip_lookup = ip_ad_counts.set_index("ip_address")["ads_per_ip"]
df_conn["ads_per_ip"] = df_conn["ip_address"].map(ads_per_ip_lookup)

# Aggregate max ads per IP per ad_id (proxy for click farms)
clickfarm_df = (
    df_conn
    .groupby("ad_id")["ads_per_ip"]
    .max()
    .reset_index()
    .rename(columns={"ads_per_ip": "max_ads_per_ip"})
)

clickfarm_df.head()


In [None]:
# A device_type is treated as invalid when it is missing or only whitespace
device_col = df_clicks["device_type"]
device_missing = device_col.isna()
device_blank = device_col.str.strip().eq("")
df_clicks["invalid_device"] = device_missing | device_blank

# Per ad: share of clicks with an invalid device_type
device_flags = (
    df_clicks
    .groupby("ad_id")["invalid_device"]
    .mean()
    .reset_index()
    .rename(columns={"invalid_device": "ratio_invalid_devices"})
)

device_flags.head()


In [None]:
# Mark clicks whose ip_address is exactly the broadcast address 255.255.255.255
df_clicks["broadcast_ip_flag"] = df_clicks["ip_address"].eq("255.255.255.255").astype(int)

# Per ad: share of clicks carrying the broadcast flag
broadcast_flags = (
    df_clicks
    .groupby("ad_id")["broadcast_ip_flag"]
    .mean()
    .reset_index()
    .rename(columns={"broadcast_ip_flag": "ratio_broadcast_ips"})
)

broadcast_flags.head()


In [None]:
# Assemble the ad-level feature table: start from the click-ratio frame and
# left-join every other per-ad feature on ad_id, so each ad in ctr_df is kept
# even when a given feature table has no row for it.
features_df = (
    ctr_df
    .merge(ttc_df, on="ad_id", how="left")
    .merge(suspicious_ip_df, on="ad_id", how="left")
    .merge(email_flags, on="ad_id", how="left")
    .merge(clickfarm_df, on="ad_id", how="left")
    .merge(device_flags, on="ad_id", how="left")
    .merge(broadcast_flags, on="ad_id", how="left")
)

# Replace NaNs with 0 in numeric columns only. A blanket fillna(0) would also
# write the integer 0 into text columns such as advertiser / category, leaving
# them with mixed str/int values; missing text stays missing instead.
numeric_cols = features_df.select_dtypes(include="number").columns
features_df[numeric_cols] = features_df[numeric_cols].fillna(0)

print(f"Final features shape: {features_df.shape}")
features_df.head()


In [None]:
# 💾 Save to database in a new schema 'features'
# engine.begin() opens a transaction that is committed when the block exits
# without error. With a plain engine.connect() the CREATE SCHEMA is rolled back
# on close under SQLAlchemy 2.x, which no longer autocommits.
# The table is written through the SAME connection so it sees the schema
# created a line earlier; passing `engine` would use a separate connection.
with engine.begin() as conn:
    conn.execute(text("CREATE SCHEMA IF NOT EXISTS features"))
    features_df.to_sql("ad_features", conn, schema="features", if_exists="replace", index=False)

# Also save locally; create the target directory first so to_csv cannot fail
# on a fresh checkout where data/ does not exist yet.
os.makedirs("data", exist_ok=True)
features_df.to_csv("data/ad_features.csv", index=False)

print("✅ Features saved to database and CSV")


In [None]:
# Sanity check: read back up to five rows of features.ad_features
pd.read_sql(sql="SELECT * FROM features.ad_features LIMIT 5", con=engine)
