In [None]:
# %% [markdown]
# # Fraud Project — Bootstrap (clean run)
# Charge .env, connecte à Postgres, et prépare les chargements.

# %%
import os
from dotenv import load_dotenv
from sqlalchemy import create_engine, text
import pandas as pd
import numpy as np
from faker import Faker
from datetime import datetime, timedelta
import random

load_dotenv()

DB_USER = os.getenv("DB_USER", os.getenv("PGUSER"))
DB_PASSWORD = os.getenv("DB_PASSWORD", "")
DB_HOST = os.getenv("DB_HOST", os.getenv("PGHOST", "localhost"))
DB_PORT = os.getenv("DB_PORT", os.getenv("PGPORT", "5432"))
DB_NAME = os.getenv("DB_NAME", os.getenv("PGDATABASE"))
KAGGLE_FILE = os.getenv("KAGGLE_FILE")  # ex: /Users/.../fraud_project/data/train_sample.csv

if not all([DB_USER, DB_HOST, DB_PORT, DB_NAME]):
    raise RuntimeError("Variables DB_* (ou PG*) manquantes dans .env")

# Construire l'URL proprement (mot de passe optionnel)
if DB_PASSWORD:
    DATABASE_URL = f"postgresql+psycopg2://{DB_USER}:{DB_PASSWORD}@{DB_HOST}:{DB_PORT}/{DB_NAME}"
else:
    DATABASE_URL = f"postgresql+psycopg2://{DB_USER}@{DB_HOST}:{DB_PORT}/{DB_NAME}"

engine = create_engine(DATABASE_URL)

with engine.connect() as conn:
    v = conn.execute(text("select version()")).scalar()
print("✅ Connecté à PostgreSQL\n", v)
print("KAGGLE_FILE =", KAGGLE_FILE)


✅ Connecté à PostgreSQL
 PostgreSQL 17.5 (Postgres.app) on aarch64-apple-darwin23.6.0, compiled by Apple clang version 15.0.0 (clang-1500.3.9.4), 64-bit
KAGGLE_FILE = /Users/deborahgozlan/Documents/fraud_project/data/train_sample.csv


In [8]:
# %%
def table_count(tbl):
    with engine.connect() as conn:
        try:
            return conn.execute(text(f"SELECT COUNT(*) FROM {tbl}")).scalar()
        except Exception:
            return None

for t in ["ads", "raw_clicks", "ad_performance", "ad_connections"]:
    c = table_count(t)
    print(f"{t:15s} ->", "absente" if c is None else f"{c} lignes")

print("ℹ️ On ne recrée PAS les tables si elles existent déjà.")

ads             -> 10 lignes
raw_clicks      -> 105000 lignes
ad_performance  -> 300 lignes
ad_connections  -> 500 lignes
ℹ️ On ne recrée PAS les tables si elles existent déjà.


In [9]:
# %%
fake = Faker()

ads_rows = table_count("ads")
if ads_rows == 0:
    ads_data = [{
        "ad_id": f"AD{i:03d}",
        "advertiser": fake.company(),
        "campaign_name": f"Campaign {i}",
        "category": random.choice(["Retail", "Tech", "Finance", "Travel"]),
        "created_at": fake.date_time_between(start_date="-1y", end_date="now")
    } for i in range(1, 11)]
    pd.DataFrame(ads_data).to_sql("ads", engine, if_exists="append", index=False)
    print("✅ ads peuplée (10 lignes).")
else:
    print("⏭️ ads déjà peuplée, on passe.")


⏭️ ads déjà peuplée, on passe.


In [10]:
# %%
if not KAGGLE_FILE or not os.path.exists(KAGGLE_FILE):
    raise FileNotFoundError("KAGGLE_FILE introuvable — mets le bon chemin dans .env")

# Lecture TalkingData
df = pd.read_csv(KAGGLE_FILE)

# TalkingData: ip (int), app, device, os, click_time, attributed_time, is_attributed
# Notre schéma raw_clicks: ad_id, ip_address (INET), device_type, click_time (timestamp), referrer_url, user_agent

# 1) ad_id existants
ads_ids = pd.read_sql("SELECT ad_id FROM ads", engine)["ad_id"].tolist()
df["ad_id"] = np.random.choice(ads_ids, size=len(df))

# 2) convertir ip (int) -> IPv4 valide (pseudonyme déterministe)
def int_to_ipv4(n: int) -> str:
    n = int(n) % (2**32)
    return ".".join(str((n >> (8*i)) & 255) for i in [3,2,1,0])

df["ip_address"] = df["ip"].apply(int_to_ipv4)

# 3) device_type, click_time
df["device_type"] = df["device"].astype(str)
df["click_time"] = pd.to_datetime(df["click_time"], errors="coerce")

# 4) colonnes synthétiques
df["referrer_url"] = [fake.url() for _ in range(len(df))]
df["user_agent"]  = [fake.user_agent() for _ in range(len(df))]

df_clicks = df[["ad_id", "ip_address", "device_type", "click_time", "referrer_url", "user_agent"]].copy()

# 5) (optionnel) un peu de "messy data" contrôlée
rng = df_clicks.sample(frac=0.01, random_state=2).index
df_clicks.loc[rng, "ip_address"] = "255.255.255.255"  # IP valide mais louche

# 6) insertion : si raw_clicks déjà peuplée, on APPEND (pas de truncate ici)
before = table_count("raw_clicks") or 0
df_clicks.to_sql("raw_clicks", engine, if_exists="append", index=False)
after = table_count("raw_clicks") or 0
print(f"✅ raw_clicks peuplée (+{after - before} nouvelles lignes).")


✅ raw_clicks peuplée (+100000 nouvelles lignes).


In [11]:
# %%
rows = table_count("ad_performance")
if rows == 0:
    perf = []
    today = datetime.utcnow().date()
    for ad_id in ads_ids:
        for d in range(30):
            date_ = today - timedelta(days=d)
            impressions = random.randint(100, 5000)
            clicks = random.randint(0, impressions)
            ctr = clicks / impressions if impressions else 0
            conversions = random.randint(0, clicks)
            conv_rate = conversions / clicks if clicks else 0
            bounce_rate = random.random()
            fraud_flag = random.choice([True] + [False]*3)  # ~25%
            perf.append({
                "ad_id": ad_id,
                "date": date_,
                "impressions": impressions,
                "clicks": clicks,
                "ctr": round(ctr, 4),
                "conversions": conversions,
                "conversion_rate": round(conv_rate, 4),
                "bounce_rate": round(bounce_rate, 4),
                "fraud": fraud_flag
            })
    pd.DataFrame(perf).to_sql("ad_performance", engine, if_exists="append", index=False)
    print("✅ ad_performance générée (30j x 10 ads).")
else:
    print("⏭️ ad_performance déjà peuplée, on passe.")


⏭️ ad_performance déjà peuplée, on passe.


In [12]:
# %%
rows = table_count("ad_connections")
if rows == 0:
    conns = [{
        "ad_id": random.choice(ads_ids),
        "ip_address": f"{random.randint(1,255)}.{random.randint(0,255)}.{random.randint(0,255)}.{random.randint(1,254)}",
        "connection_datetime": fake.date_time_between(start_date="-30d", end_date="now"),
        "email": fake.email()
    } for _ in range(500)]
    pd.DataFrame(conns).to_sql("ad_connections", engine, if_exists="append", index=False)
    print("✅ ad_connections générée (500 lignes).")
else:
    print("⏭️ ad_connections déjà peuplée, on passe.")


⏭️ ad_connections déjà peuplée, on passe.


In [15]:
# %%
fraud_sql = """
WITH ip_activity AS (
    SELECT ip_address, COUNT(*) AS total_clicks
    FROM raw_clicks
    GROUP BY ip_address
),
click_bursts AS (
    SELECT ad_id, ip_address, COUNT(*) AS clicks_in_minute
    FROM raw_clicks
    WHERE click_time >= NOW() - INTERVAL '1 minute'  -- click_time est TIMESTAMP si insertion via pandas
    GROUP BY ad_id, ip_address
),
performance_flags AS (
    SELECT ad_id, CASE WHEN ctr > 0.5 THEN 1 ELSE 0 END AS suspicious_ctr
    FROM ad_performance
)
SELECT
    rc.ad_id,
    rc.ip_address,
    COUNT(*) AS total_clicks_ip,
    MAX(pf.suspicious_ctr) AS suspicious_ctr,
    CASE WHEN ia.total_clicks > 10 THEN 'High IP usage' END AS reason_ip,
    CASE WHEN cb.clicks_in_minute > 5 THEN 'Burst clicks' END AS reason_burst,
    CASE WHEN rc.device_type IS NULL THEN 'Missing device info' END AS reason_missing_device,
    CASE WHEN rc.ip_address = '255.255.255.255' THEN 'Edge IP' END AS reason_edge_ip
FROM raw_clicks rc
LEFT JOIN ip_activity ia ON rc.ip_address = ia.ip_address
LEFT JOIN click_bursts cb ON rc.ip_address = cb.ip_address AND rc.ad_id = cb.ad_id
LEFT JOIN performance_flags pf ON rc.ad_id = pf.ad_id
GROUP BY rc.ad_id, rc.ip_address, ia.total_clicks, cb.clicks_in_minute, rc.device_type
HAVING 
    ia.total_clicks > 10
    OR cb.clicks_in_minute > 5
    OR rc.device_type IS NULL
    OR rc.ip_address = '255.255.255.255'
    OR MAX(pf.suspicious_ctr) = 1
ORDER BY total_clicks_ip DESC
LIMIT 20;
"""

preview = pd.read_sql(fraud_sql, engine)
preview.head(20)


Unnamed: 0,ad_id,ip_address,total_clicks_ip,suspicious_ctr,reason_ip,reason_burst,reason_missing_device,reason_edge_ip
0,AD009,255.255.255.255,3480,1,High IP usage,,,Edge IP
1,AD009,999.999.999.999,3330,1,High IP usage,,,
2,AD002,999.999.999.999,3240,1,High IP usage,,,
3,AD010,255.255.255.255,3210,1,High IP usage,,,Edge IP
4,AD007,999.999.999.999,3120,1,High IP usage,,,
5,AD001,999.999.999.999,3120,1,High IP usage,,,
6,AD001,255.255.255.255,3000,1,High IP usage,,,Edge IP
7,AD010,999.999.999.999,2940,1,High IP usage,,,
8,AD008,999.999.999.999,2880,1,High IP usage,,,
9,AD003,999.999.999.999,2850,1,High IP usage,,,


In [16]:
# %%
print("🚀 Notebook exécuté sans recréer les tables. Données chargées et requête OK.")


🚀 Notebook exécuté sans recréer les tables. Données chargées et requête OK.
