# Phishing Simulation & Incident Analytics — Mock Data Generator

This Colab notebook generates the same synthetic datasets found in the repo.
You can modify parameters and re-export CSVs for Tableau.

## How to Use
1. Run the cell below to generate CSVs.
2. Download `data/*.csv` and push to GitHub.
3. Mirror each CSV into Google Sheets with `=IMPORTDATA("<raw GitHub URL of the CSV>")`, then connect Tableau Public to those sheets.


In [None]:
import math
import os
import random
from datetime import datetime, timedelta
from pathlib import Path

import numpy as np
import pandas as pd

# --- Parameters ---
# Both RNGs are seeded because the script mixes `random` and `np.random`
# draws; the order of draws below determines the generated data.
random.seed(42)
np.random.seed(42)
OUT = Path('data')
OUT.mkdir(exist_ok=True)

DEPARTMENTS = [
    'Finance', 'Sales', 'Marketing', 'HR', 'IT', 'Operations', 'Legal',
    'Customer Support', 'Product', 'Procurement',
]
REGIONS = ['Ontario', 'Quebec', 'BC', 'Alberta', 'Prairies', 'Atlantic', 'US Remote']
VECTORS = ['Email', 'SMS', 'TeamsChat', 'Voice', 'USB']
THEMES = [
    'Payroll Update', 'Account Suspension', 'Package Delivery',
    'Conference Invitation', 'VPN Password Reset', 'MFA Enrollment',
    'Security Alert', 'Gift Card', 'Travel Itinerary', 'Benefits Enrollment',
]

# Sampling weights, positionally aligned with DEPARTMENTS / REGIONS / VECTORS.
DEPARTMENT_WEIGHTS = [12, 15, 13, 8, 20, 12, 5, 10, 10, 5]
REGION_WEIGHTS = [40, 18, 12, 12, 8, 6, 4]
VECTOR_WEIGHTS = [70, 8, 12, 7, 3]

# Mean baseline risk per department; departments not listed use 50.
BASE_RISK_BY_DEPT = {
    'IT': 30, 'Finance': 45, 'Sales': 50, 'Marketing': 55, 'Customer Support': 60,
}
# Mean open probability per delivery vector.
BASE_OPEN_BY_VECTOR = {
    'Email': 0.78, 'SMS': 0.65, 'TeamsChat': 0.72, 'Voice': 0.40, 'USB': 0.30,
}
# Additive click-probability adjustment per campaign difficulty.
CLICK_ADJ_BY_DIFFICULTY = {'Easy': -0.05, 'Medium': 0.0, 'Hard': 0.05}
# Base probability of submitting credentials after a click, per difficulty.
CRED_BASE_BY_DIFFICULTY = {'Easy': 0.20, 'Medium': 0.28, 'Hard': 0.35}
# Mean time-to-resolve (hours) per incident severity.
MTTR_MEAN_BY_SEVERITY = {'Low': 12, 'Medium': 24, 'High': 48, 'Critical': 72}


def random_date(start, end):
    """Return a random datetime between ``start`` and ``end`` (inclusive).

    The offset is drawn with whole-second resolution from the ``random`` module.
    """
    delta = end - start
    return start + timedelta(seconds=random.randint(0, int(delta.total_seconds())))


def pick_weighted(items, weights):
    """Return one element of ``items`` drawn according to ``weights``."""
    return random.choices(items, weights=weights, k=1)[0]


# 1) Employees
n_employees = 600
hire_start, hire_end = datetime(2021, 1, 1), datetime(2025, 9, 1)
employees = []
for i in range(1, n_employees + 1):
    dept = pick_weighted(DEPARTMENTS, DEPARTMENT_WEIGHTS)
    region = pick_weighted(REGIONS, REGION_WEIGHTS)
    hire_dt = random_date(hire_start, hire_end).date()
    # Risk score: normal around the department mean, clipped to [5, 95].
    base_risk = np.clip(
        np.random.normal(loc=BASE_RISK_BY_DEPT.get(dept, 50), scale=12), 5, 95
    )
    employees.append((i, dept, region, hire_dt, round(float(base_risk), 1)))
df_employees = pd.DataFrame(
    employees,
    columns=['employee_id', 'department', 'region', 'hire_date', 'baseline_risk'],
)

# 2) Campaigns
campaigns = []
start_campaigns = datetime(2024, 4, 1)
# 18 regular campaigns spaced ~30 days apart, with up to 6 days of jitter.
for m in range(18):
    launch = start_campaigns + timedelta(days=30 * m + random.randint(0, 6))
    theme = random.choice(THEMES)
    vector = pick_weighted(VECTORS, VECTOR_WEIGHTS)
    difficulty = random.choice(['Easy', 'Medium', 'Hard'])
    target_group = random.choice(['All', 'High-Risk', 'New Hires', 'Finance+HR', 'IT Only'])
    campaigns.append((m + 1, launch.date(), theme, vector, difficulty, target_group))
# 6 additional hard, email-only spear-phishing campaigns at random dates.
for k in range(6):
    launch = start_campaigns + timedelta(days=random.randint(20, 520))
    spear_theme = 'Spear-Phish: ' + random.choice(['VIP Impersonation', 'Invoice Fraud', 'CFO Urgent'])
    spear_target = random.choice(['Finance+HR', 'Executive Assistants', 'All'])
    campaigns.append((len(campaigns) + 1, launch.date(), spear_theme, 'Email', 'Hard', spear_target))
df_campaigns = (
    pd.DataFrame(
        campaigns,
        columns=['campaign_id', 'launch_date', 'theme', 'vector', 'difficulty', 'target_group'],
    )
    .sort_values('launch_date')
    .reset_index(drop=True)
)
# Renumber so campaign_id follows chronological launch order.
df_campaigns['campaign_id'] = range(1, len(df_campaigns) + 1)

# 3) Training
training_modules = [
    'Phishing Basics', 'Advanced Phishing Tactics', 'Report Suspicious Emails',
    'Spear-Phishing Awareness', 'Credential Theft Prevention',
]
training_records = []
for emp in df_employees.itertuples(index=False):
    # Lower-risk employees complete more modules (0-3); success prob floor is 0.2.
    n_modules = np.random.binomial(n=3, p=max(0.2, (80 - emp.baseline_risk) / 100))
    completed = np.random.choice(training_modules, size=n_modules, replace=False) if n_modules > 0 else []
    for mod in completed:
        assign_dt = random_date(datetime(2024, 1, 1), datetime(2025, 8, 1))
        complete_dt = assign_dt + timedelta(days=random.randint(0, 20))
        score = int(np.clip(np.random.normal(loc=85, scale=10), 40, 100))
        training_records.append(
            (len(training_records) + 1, emp.employee_id, mod, assign_dt.date(), complete_dt.date(), score)
        )
df_training = pd.DataFrame(
    training_records,
    columns=['training_id', 'employee_id', 'module_name', 'assign_date', 'complete_date', 'score'],
)


def employee_in_target(emp_row, group):
    """Return True if ``emp_row`` belongs to the campaign target ``group``.

    ``emp_row`` must expose ``baseline_risk``, ``hire_date`` and ``department``
    as attributes. Unknown group names match every employee. The
    'Executive Assistants' group is sampled: eligible HR/Operations rows are
    included with probability 0.2, consuming one ``random`` draw each.
    """
    if group == 'All':
        return True
    if group == 'High-Risk':
        return emp_row.baseline_risk >= 60
    if group == 'New Hires':
        return pd.to_datetime(emp_row.hire_date) >= pd.Timestamp('2024-06-01')
    if group == 'Finance+HR':
        return emp_row.department in {'Finance', 'HR'}
    if group == 'IT Only':
        return emp_row.department == 'IT'
    if group == 'Executive Assistants':
        return emp_row.department in {'HR', 'Operations'} and random.random() < 0.2
    return True


# Employees with at least one completed training module.
trained_emp = set(df_training['employee_id'].unique())

# Simulation events: delivered -> opened -> clicked -> submitted_credentials,
# plus an independent 'reported' event for employees who opened the message.
events = []
eid = 1
for camp in df_campaigns.itertuples(index=False):
    launch_dt = pd.to_datetime(camp.launch_date)
    # Rows are visited in frame order so random draws stay in a fixed sequence.
    in_audience = [
        employee_in_target(row, camp.target_group)
        for row in df_employees.itertuples(index=False)
    ]
    audience = df_employees[in_audience]
    base_open = BASE_OPEN_BY_VECTOR[camp.vector]
    diff_adj = CLICK_ADJ_BY_DIFFICULTY.get(camp.difficulty, 0)
    for emp in audience.itertuples(index=False):
        is_trained = emp.employee_id in trained_emp
        delivered_ts = launch_dt + timedelta(hours=random.randint(8, 72))
        events.append((eid, emp.employee_id, camp.campaign_id, 'delivered', delivered_ts))
        eid += 1

        open_p = float(np.clip(np.random.normal(base_open, 0.05), 0.2, 0.98))
        if random.random() >= open_p:
            continue  # never opened: no further events for this employee

        opened_ts = delivered_ts + timedelta(minutes=random.randint(1, 600))
        events.append((eid, emp.employee_id, camp.campaign_id, 'opened', opened_ts))
        eid += 1

        risk_factor = (emp.baseline_risk - 50) / 100.0
        trained_adj = -0.07 if is_trained else 0.0
        base_click = 0.18 + diff_adj + risk_factor + trained_adj
        click_p = float(np.clip(base_click, 0.01, 0.60))
        clicked = random.random() < click_p
        if clicked:
            clicked_ts = opened_ts + timedelta(minutes=int(np.random.exponential(scale=60)))
            events.append((eid, emp.employee_id, camp.campaign_id, 'clicked', clicked_ts))
            eid += 1
            # Plain dict lookup: a doubled-brace literal here would be a set
            # containing a dict and raise TypeError (dicts are unhashable).
            cred_p = CRED_BASE_BY_DIFFICULTY[camp.difficulty]
            cred_p = max(0.05, min(0.45, cred_p + risk_factor * 0.3 - (0.05 if is_trained else 0)))
            if random.random() < cred_p:
                submit_ts = clicked_ts + timedelta(minutes=random.randint(1, 30))
                events.append((eid, emp.employee_id, camp.campaign_id, 'submitted_credentials', submit_ts))
                eid += 1

        # Trained employees report more often; employees who clicked report less.
        reported_base = 0.35 + (0.15 if is_trained else 0.0) - (0.20 if clicked else 0)
        reported_p = float(np.clip(reported_base, 0.02, 0.85))
        if random.random() < reported_p:
            rt_scale = 45 if is_trained else 120
            reported_ts = opened_ts + timedelta(minutes=int(np.random.exponential(scale=rt_scale)))
            # A zero-minute delay is discarded rather than recorded.
            if reported_ts > opened_ts:
                events.append((eid, emp.employee_id, camp.campaign_id, 'reported', reported_ts))
                eid += 1

df_events = pd.DataFrame(
    events,
    columns=['event_id', 'employee_id', 'campaign_id', 'event_type', 'timestamp'],
).sort_values('timestamp')

# 4) Real Incidents
sev_levels = ['Low', 'Medium', 'High', 'Critical']
sev_weights = [0.45, 0.35, 0.16, 0.04]
# Incident window: 30 days before the first campaign to 60 days after the last.
start_inc = pd.to_datetime(df_campaigns['launch_date'].min()) - pd.Timedelta(days=30)
end_inc = pd.to_datetime(df_campaigns['launch_date'].max()) + pd.Timedelta(days=60)
incident_records = []
date_cursor = start_inc
iid = 1
while date_cursor < end_inc:
    # Poisson-distributed number of incidents for each 7-day step.
    n = np.random.poisson(lam=1.4)
    for _ in range(n):
        when = (
            date_cursor
            + pd.to_timedelta(np.random.randint(0, 7), unit='D')
            + pd.to_timedelta(np.random.randint(7, 21), unit='h')
        )
        vector = np.random.choice(['Email', 'SMS', 'Web', 'USB', 'Insider'], p=[0.65, 0.08, 0.20, 0.03, 0.04])
        dept = np.random.choice(DEPARTMENTS)
        sev = np.random.choice(sev_levels, p=sev_weights)
        mttr_mu = MTTR_MEAN_BY_SEVERITY[sev]
        # Resolution time: normal around the severity mean, floored at 2 hours.
        mttr = max(2, int(np.random.normal(loc=mttr_mu, scale=mttr_mu * 0.3)))
        source = np.random.choice(['External', 'External', 'External', 'Internal'])
        incident_records.append((iid, when.date(), vector, sev, dept, source, mttr))
        iid += 1
    date_cursor += pd.Timedelta(days=7)
df_incidents = pd.DataFrame(
    incident_records,
    columns=['incident_id', 'date', 'vector', 'severity', 'department', 'source', 'mttr_hours'],
)

# Save
df_employees.to_csv(OUT / 'employees.csv', index=False)
df_campaigns.to_csv(OUT / 'campaigns.csv', index=False)
df_events.to_csv(OUT / 'simulation_events.csv', index=False)
df_training.to_csv(OUT / 'training_events.csv', index=False)
df_incidents.to_csv(OUT / 'incidents.csv', index=False)
print('Wrote CSVs to ./data')