# Experimenting with ProbeRequest-IE Fingerprinting

### Load libraries and helper functions

In [4]:
from collections import defaultdict
import matplotlib.pyplot as plt
from tqdm import tqdm
from scapy.all import PcapReader, Dot11Elt
import pandas as pd
import hashlib

# My library
from analyzer import ProbeRequestAnalyzer, BitAnalyzer

def plot_experiment_results(total_devices, unique_identifiers, stable_devices, unique_identifiers_count,
            stability_threshold, suitability_threshold, filename=None):
    """Draw a bar chart summarising one identification experiment.

    Each bar is a percentage: the first is fixed at 100, the second and third
    are relative to ``total_devices`` and the fourth is relative to
    ``unique_identifiers``. A zero denominator yields a 0% bar instead of
    raising ``ZeroDivisionError``.

    Args:
        total_devices: Number of distinct devices (MAC addresses) observed.
        unique_identifiers: Number of distinct identifiers counted.
        stable_devices: Number of devices counted as having one identifier.
        unique_identifiers_count: Number of identifiers counted as belonging
            to a single device.
        stability_threshold: Stability threshold used; shown in the title and
            embedded in the output file name.
        suitability_threshold: Suitability threshold used; shown in the title
            and embedded in the output file name.
        filename: Path of the analysed capture. Its base name (text after the
            last ``/`` and before the first ``.``) is used in the output file
            name. When ``None`` the figure is drawn but not written to disk.
    """
    import os  # local import: only needed to make sure the output directory exists

    def _percent(part, whole):
        # A missing denominator means "nothing to compare against": plot 0%.
        return (part / whole) * 100 if whole else 0.0

    metrics = ['Total Devices', 'Unique Identifiers', 'Devices with only 1 identifier', 'Identifiers with only 1 device']
    percentages = [
        100,
        _percent(unique_identifiers, total_devices),
        _percent(stable_devices, total_devices),
        _percent(unique_identifiers_count, unique_identifiers),
    ]

    plt.figure(figsize=(10, 6))
    plt.bar(metrics, percentages, color=['blue', 'green', 'orange', 'red'])
    plt.title(f"Identification Experiment Results\n"
        f"Stability Threshold={stability_threshold}, Suitability Threshold={suitability_threshold}")
    plt.xlabel("Metrics")
    plt.ylabel("Percentages (%)")

    # Display percentages on top of bars
    for i, v in enumerate(percentages):
        plt.text(i, v + 0.5, f"{v:.1f}%", ha='center', fontweight='bold')

    plt.tight_layout()

    # Without a source file name there is nothing to derive the output name
    # from, so leave the figure open for display and skip saving.
    if filename is None:
        return

    # Save the plot (the figure is intentionally left open so that notebook
    # front-ends can still render it after the cell finishes).
    base_filename = filename.split('/')[-1].split('.')[0]
    os.makedirs("./outputs", exist_ok=True)
    plt.savefig(f"./outputs/fingerprint_{base_filename}_sta{stability_threshold}_sui{suitability_threshold}.png")

def _safe_ratio(numerator, denominator):
    """Return ``numerator / denominator``, or 0.0 when the denominator is zero."""
    return numerator / denominator if denominator else 0.0


def _identifier_tag(identifier):
    """Return a short display tag: the last 4 hex digits of the identifier's MD5, upper-cased."""
    # NOTE(review): assumes identifiers are str (``.encode()`` is called on them) - confirm
    # against BitAnalyzer.extract_identifier.
    return hashlib.md5(identifier.encode()).hexdigest()[-4:].upper()


def experiment_with_identification(analyzer: ProbeRequestAnalyzer, stability_threshold: float = 0.95, suitability_threshold: float = 0.9):
    """Run experiment with detailed analysis of identification patterns.

    Builds bit masks from the analyzer's suitability data, re-reads the capture
    in ``analyzer.filename``, extracts an identifier from every frame that has
    a ``Dot11Elt`` layer and a non-``None`` ``addr2``, and then reports how
    identifiers and MAC addresses map onto each other (printed tables, printed
    summary and a bar chart via ``plot_experiment_results``).

    Args:
        analyzer: Analyzer providing ``bit_analyzer``, ``filename`` and
            ``num_samples`` (the latter is only used as the progress-bar total).
        stability_threshold: Passed to ``bit_analyzer.calculate_suitability``.
        suitability_threshold: Passed to ``bit_analyzer.create_masks``.

    Returns:
        A tuple ``(mac_to_identifier_counts, identifier_to_mac_counts)`` of
        nested ``defaultdict`` counters: MAC -> {identifier: count} and
        identifier -> {MAC: count}. Both are empty when no usable frame was
        found, in which case no tables or plot are produced.
    """
    # An identifier counts as "significant" if it makes up at least this share
    # of some device's packets; rarer ones are treated as noise.
    significance_ratio = 0.1
    # A device counts as "stable" if its most frequent identifier covers at
    # least this share of its packets.
    stable_ratio = 0.9

    print(f"\nRunning detailed identification experiment with stability={stability_threshold}, suitability={suitability_threshold}")

    # Calculate suitability and create masks
    suitability_data = analyzer.bit_analyzer.calculate_suitability(stability_threshold)
    masks = analyzer.bit_analyzer.create_masks(suitability_data, suitability_threshold)

    # print mask so that it can be copied and used in another script
    print(masks)

    # Enhanced tracking structures
    mac_to_identifier_counts = defaultdict(lambda: defaultdict(int))  # MAC -> {identifier: count}
    identifier_to_mac_counts = defaultdict(lambda: defaultdict(int))  # identifier -> {MAC: count}
    total_packets_per_mac = defaultdict(int)  # MAC -> total_packets

    # Process packets
    with PcapReader(analyzer.filename) as pcap_reader:
        for packet in tqdm(pcap_reader, desc="Testing identification", unit="frames", total=analyzer.num_samples):
            if not packet.haslayer(Dot11Elt):
                continue

            mac_addr = packet.addr2
            if mac_addr is None:
                continue

            identifier = analyzer.bit_analyzer.extract_identifier(packet, masks)
            mac_to_identifier_counts[mac_addr][identifier] += 1
            identifier_to_mac_counts[identifier][mac_addr] += 1
            total_packets_per_mac[mac_addr] += 1

    total_devices = len(mac_to_identifier_counts)
    if total_devices == 0:
        # Nothing matched: every ratio below would divide by zero and the
        # tables/plot would be empty, so report and stop here.
        print("\n=== Summary Statistics ===")
        print(f"📱 Total devices: {total_devices}")
        return mac_to_identifier_counts, identifier_to_mac_counts

    # Hash each identifier once instead of once per table row.
    identifier_tags = {identifier: _identifier_tag(identifier) for identifier in identifier_to_mac_counts}

    # Create analysis tables
    mac_analysis = []
    for mac, id_counts in mac_to_identifier_counts.items():
        mac_short = mac[-5:]  # Last five characters, i.e. the last two octets ("xx:xx")
        total_packets = total_packets_per_mac[mac]
        for identifier, count in id_counts.items():
            percentage = (count / total_packets) * 100
            mac_analysis.append({
                'MAC': mac_short,
                'Identifier': identifier_tags[identifier],
                'Occurrences': count,
                'Total_Packets': total_packets,
                'Percentage': f'{percentage:.1f}%'
            })

    identifier_analysis = []
    for identifier, mac_counts in identifier_to_mac_counts.items():
        total_occurrences = sum(mac_counts.values())
        for mac, count in mac_counts.items():
            percentage = (count / total_occurrences) * 100
            identifier_analysis.append({
                'Identifier': identifier_tags[identifier],
                'MAC': mac[-5:],
                'Occurrences': count,
                'Total_Occurrences': total_occurrences,
                'Percentage': f'{percentage:.1f}%'
            })

    # Convert to DataFrames for better display
    mac_df = pd.DataFrame(mac_analysis)
    identifier_df = pd.DataFrame(identifier_analysis)

    # Keep only identifiers that reach the significance share for at least one device
    significant_identifiers = {
        identifier
        for identifier, mac_counts in identifier_to_mac_counts.items()
        if any(count / total_packets_per_mac[mac] >= significance_ratio for mac, count in mac_counts.items())
    }
    unique_identifiers = len(significant_identifiers)

    stable_devices = sum(
        1 for mac, id_counts in mac_to_identifier_counts.items()
        if max(id_counts.values()) / total_packets_per_mac[mac] >= stable_ratio
    )

    # Significant identifiers that were only ever seen with a single MAC address
    unique_identifiers_count = 0
    for mac_counts in identifier_to_mac_counts.values():
        if len(mac_counts) != 1:
            continue
        (only_mac, only_count), = mac_counts.items()
        if only_count / total_packets_per_mac[only_mac] >= significance_ratio:
            unique_identifiers_count += 1

    unstable_devices = total_devices - stable_devices
    shared_identifiers = unique_identifiers - unique_identifiers_count

    # Print summary statistics (ratios fall back to 0 when there is no
    # significant identifier at all, instead of raising ZeroDivisionError)
    print("\n=== Summary Statistics ===")
    print(f"📱 Total devices: {total_devices}")
    print(f"🔑 Unique identifiers generated: {unique_identifiers}")
    print(f"🙂 Devices with stable identifier: {stable_devices} ({_safe_ratio(stable_devices, total_devices):.2%})")
    print(f"😥 Devices with multiple identifiers: {unstable_devices} ({_safe_ratio(unstable_devices, total_devices):.2%})")
    print(f"🙂 Identifiers matching unique device: {unique_identifiers_count} ({_safe_ratio(unique_identifiers_count, unique_identifiers):.2%})")
    print(f"😥 Identifiers matching multiple devices: {shared_identifiers} ({_safe_ratio(shared_identifiers, unique_identifiers):.2%})")

    print("\n=== MAC Address Analysis ===")
    print("Shows how many times each identifier was seen for each MAC address:")
    print(mac_df.to_string(index=False))

    print("\n=== Identifier Analysis ===")
    print("Shows how many times each MAC address was seen for each identifier:")
    print(identifier_df.to_string(index=False))

    # Plot results
    plot_experiment_results(
        total_devices, unique_identifiers, stable_devices, unique_identifiers_count,
        stability_threshold, suitability_threshold, analyzer.filename
    )

    return mac_to_identifier_counts, identifier_to_mac_counts

### Initialize analyzer

In [5]:
# Capture file analysed by this notebook.
FILENAME = "./pcaps/rand_1000.pcap"
NUM_SAMPLES = 69700 # Expected frame count; only used as the progress-bar total

# Shared analyzer instance used by the experiment cells.
analyzer = ProbeRequestAnalyzer(filename=FILENAME, num_samples=NUM_SAMPLES)

In [6]:
# Threshold configuration: one list for the heatmap plots, two for the
# identification experiments (every stability/suitability pair is run).
PLOT_STABILITY_THRESHOLDS = [0.9]
EXPERIMENT_STABILITY_THRESHOLDS = [0.85]
EXPERIMENT_SUITABILITY_THRESHOLDS = [0.4]


# Gather per-bit data first, then render the heatmaps from it.
analyzer.extract_information_elements()
analyzer.render_stability_suitability_heatmaps(
    stability_thresholds=PLOT_STABILITY_THRESHOLDS,
)

# Every (stability, suitability) combination, stability varying slowest.
threshold_grid = [
    (stability_value, suitability_value)
    for stability_value in EXPERIMENT_STABILITY_THRESHOLDS
    for suitability_value in EXPERIMENT_SUITABILITY_THRESHOLDS
]

# Run one identification experiment per threshold combination.
for stability, suitability in threshold_grid:
    print(f"\nRunning experiment with stability={stability}, suitability={suitability}")
    experiment_with_identification(analyzer, stability, suitability)

Analyzing packets:   1%|▏         | 1007/69700 [00:03<03:54, 293.27frames/s]


Loaded and analyzed packets from ./pcaps/rand_1000.pcap with 22 unique MAC addresses
Saving heatmap to heatmap_sta_rand_1000
Saving heatmap to heatmap_sui0.9_rand_1000

Running experiment with stability=0.85, suitability=0.4

Running detailed identification experiment with stability=0.85, suitability=0.4
{1: [32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63], 50: [2, 3, 4, 5, 9, 11, 12, 14, 17, 18, 19, 20, 25, 28, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63], 45: [0, 1, 4, 5, 6, 7, 9, 11, 15, 20, 21, 32, 33, 34, 35, 36, 37, 38, 39], 127: [12, 21, 22, 23, 24, 29, 30, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 58, 59, 60, 61, 62, 63, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79], 253: [26, 27, 31, 32, 33, 34, 35, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60,

Testing identification: 100%|██████████| 1007/1007 [00:02<00:00, 422.11frames/s]



=== Summary Statistics ===
📱 Total devices: 22
🔑 Unique identifiers generated: 27
🙂 Devices with stable identifier: 16 (72.73%)
😥 Devices with multiple identifiers: 6 (27.27%)
🙂 Identifiers matching unique device: 24 (88.89%)
😥 Identifiers matching multiple devices: 3 (11.11%)

=== MAC Address Analysis ===
Shows how many times each identifier was seen for each MAC address:
  MAC Identifier  Occurrences  Total_Packets Percentage
00:00       B527           20             20     100.0%
00:01       6403            8              9      88.9%
00:01       DF3F            1              9      11.1%
00:02       E455            3             70       4.3%
00:02       CEBA           67             70      95.7%
00:03       73EE            5              5     100.0%
00:04       3168          219            219     100.0%
00:05       FB30            2              2     100.0%
00:06       F8B0            4              4     100.0%
00:07       A412            5             11      45.5%
00:07  