In [None]:
import os
import json
import csv
import pandas as pd
import pickle
import hashlib
import base64
import math

# Define folder paths
FOLDER_PATH = "YOUR_PREPARED_DATA_DIR"
# Select last inserted fingerprint for each device
SELECTED_FINGERPRINTS = "YOUR_SELECTED_FINGERPRINTS_FILE"
# Selected fingerprints for computing stability (devices having at least 2 fingerprints)
STABILILITY_FINGERPRINTS = "YOUR_SELECTED_FINGERPRINTS_FOR_STABILITY_FILE"

def update_attribute_stats(attribute_stats, key, value):
    """Updates attribute statistics for a given key-value pair."""
    if key not in attribute_stats:
        attribute_stats[key] = {"coverage": 0, "values": set()}
    
    attribute_stats[key]["coverage"] += 1
    attribute_stats[key]["values"].add(value)

def hash_value(d, length=12):
    h = hashlib.sha256(pickle.dumps(d)).digest()
    return base64.urlsafe_b64encode(h).decode()[:length]
def load_json_file(file_path):
    """Helper function to load JSON data from a file."""
    with open(file_path, 'r') as file:
        return json.load(file)

selected_fingerprints = load_json_file(SELECTED_FINGERPRINTS)
print(len(selected_fingerprints))
stability_fingerprints = load_json_file(STABILILITY_FINGERPRINTS)
print(len(stability_fingerprints))

def compute_entropy(values, total_devices):
    """Computes Shannon entropy and normalized entropy."""
    #total = sum(values.values())
    entropy = -sum((count / total_devices) * math.log2(count / total_devices) for count in values.values() if count > 0)
    max_entropy = math.log2(total_devices)
    normalized_entropy = entropy / max_entropy if max_entropy > 0 else 0
    return entropy, normalized_entropy

def compute_unique_values(values): 
    return sum(1 for _,v in values.items() if v == 1)

def remove_redudants_from_attribute_stats(attribute_stats):
    all = {}
    filtered = {}
    for attribute, info in attribute_stats.items():
        hashed_info = hash_value(info)
        if hashed_info not in all : 
            all[hashed_info] = set()
        all[hashed_info].add(attribute)
    
    # select only first one 
    for info, attributes in all.items():
        # get first attribute 
        attribute = attribute = next(iter(attributes))
        filtered[attribute] = attribute_stats[attribute]
    
    return filtered

833
157


In [2]:
# Step 1 : Get attributes distrubution accross all fingerprints
attributes_stats = {}
total_fingerprints = 0
# Process JSON files
for filename in os.listdir(FOLDER_PATH):
    # Take one fingerprint from every device

    file_path = os.path.join(FOLDER_PATH, filename)
    with open(file_path, 'r') as json_file:
        data = json.load(json_file)
        total_fingerprints += 1
        print(f"{total_fingerprints} -- {filename}")
        
        for key, value in data.items():
            update_attribute_stats(attributes_stats,key, hash_value(value))

1 -- 17b10c6c-7a0f-4635-87e4-808da519c947_1740410161317.json
2 -- b8ababa0-eeb1-4c3f-b8f6-91e4a99daa19_1728651219867.json
3 -- e1e33cfd-f37d-4f84-9ae1-16b56b049aed_1728046124292.json
4 -- 75ac12ca-e2cb-4862-bd80-aff246e1702c_1732217802507.json
5 -- 301eb4f2-a2b8-43a8-92b6-83e2efaf3cdc_1730275336433.json
6 -- bccb6bc4-21bd-40f0-ae96-cc62231eb2ab_1730310518876.json
7 -- 4f40b489-16d5-4692-92ce-8a0570832f2e_1738605800539.json
8 -- 0c442fd0-16d1-468b-9042-7d8efa52764d_1731104909094.json
9 -- ecba60f2-9eac-4d64-9033-a83d8b45b99f_1732015678945.json
10 -- 8fd6db62-8f2d-4176-98a2-b6554f1d5c6f_1732123772437.json
11 -- 8b3599ec-7aa9-45ab-b4d9-70ce5a86aa48_1741051528295.json
12 -- 96f77468-b877-4189-85d8-2654d8723b70_1729153340357.json
13 -- db5370c7-297b-47b0-b9d0-523dd35c536a_1729842749373.json
14 -- 23f44631-5c38-4334-8579-797e2a25cc66_1733504722898.json
15 -- f98f9e18-c3cd-4179-830d-7cfa58c0effa_1732394991088.json
16 -- 9fe5d66e-23f8-40c0-b21d-3daa27fd03d9_1730018371949.json
17 -- 5758d9a5-79

In [3]:
# Step 2 : Remove constant attributs and unstable across all fingerprints
def write_csv(file_path, attribute_dict):
    """Writes the attribute statistics to a CSV file."""
    with open(file_path, mode='w', newline='') as csv_file:
        writer = csv.writer(csv_file)
        writer.writerow(["Attribute", "Cardinality", "Coverage Fingerprints"])
        
        for attribute, info in attribute_dict.items():
            cardinality = len(info["values"])
            coverage = info["coverage"]
            
            fixed = cardinality == 1 
            unstable = cardinality == total_fingerprints

            # Ignore Fixed attributes and Attributes that are not stable on all devices
            if (fixed or unstable): 
                continue
            writer.writerow([attribute, cardinality, coverage])

# Write statistics to CSV files
print(len(attributes_stats))
file_path = f'./all_cleaned_attributes.csv'
write_csv(file_path, attributes_stats)
print(f"CSV files generated at: {file_path}")

764254
CSV files generated at: ./all_cleaned_attributes.csv


In [4]:
# Step 3 : Remove unstable attributes across all fingerprints 
# We consider attribute unstable if his value change in all devices where he apear
print(f"CSV files generated at: {f'./all_cleaned_attributes.csv'}")
cleaned_attributes_df = pd.read_csv(f'./all_cleaned_attributes.csv')
print(f"Step 1 : {len(cleaned_attributes_df)} Cleaned Attributes")
cleaned_attributes = cleaned_attributes_df["Attribute"]

def compute_attribute_changes_and_save(output_csv='./stability_cleaned_attributes.csv'):
    top_changes = {}
    for device_id, fp_paths in stability_fingerprints.items():
        if len(fp_paths) < 2:
            continue  # Not enough data to compute changes
        fp = {}
        for path in fp_paths:
            file_path = os.path.join(FOLDER_PATH, path)
            data = load_json_file(file_path)
            for attribute in cleaned_attributes:
                value = data.get(attribute,None)
                if value : 
                    if attribute not in fp: 
                        fp[attribute] = set()
                    fp[attribute].add(hash_value(value))
                    
        print(f"{device_id} -- {len(fp_paths)} -- {len(fp)}")
        # Compute changes for this device
        changes = {}
        for key,value in fp.items():
            changes[key] = len(value)-1

        # Prepare DataFrame with fingerprint_count column
        df_changes = pd.DataFrame({
            'Attribute': list(changes.keys()),
            'Change_Count': list(changes.values()),
            'Fingerprint_count': len(fp_paths)
        })
        
        for _, row in df_changes.iterrows():
                attr = row['Attribute']
                if attr not in top_changes : 
                    top_changes[attr] = {
                        "Change_values_Count": 0,
                        "Change_devices_Count": 0,
                        "Fingerprint_Count": 0,
                        "Devices_count": 0
                    }
                top_changes[attr]["Change_values_Count"] += row['Change_Count']  
                # Devices that have changed the value 
                if row['Change_Count'] > 0:
                    top_changes[attr]["Change_devices_Count"] += 1
                top_changes[attr]["Fingerprint_Count"] += row['Fingerprint_count']
                top_changes[attr]["Devices_count"] += 1
    # Build the summary DataFrame
    
    with open(output_csv, mode='w', newline='') as csv_file:
        writer = csv.writer(csv_file)
        writer.writerow(["Attribute", "Total Value Changes", "Total Device Changes","Coverage Fingerprints", "Coverage Devices", "IsStable", "IsAllStable"])
        
        for attribute, info in top_changes.items():
            # Ignore instable 
            stable = info["Change_devices_Count"] < info["Devices_count"]
            stableAll = info["Change_devices_Count"] == 0

            writer.writerow([attribute, info["Change_values_Count"], info["Change_devices_Count"], info["Fingerprint_Count"], info["Devices_count"], stable, stableAll])
    
    print(f"✅ Saved global summary to {output_csv}")
        
compute_attribute_changes_and_save()


# Get cleaned stable attributes 
stable_cleaned_attributes_df = pd.read_csv('./stability_cleaned_attributes.csv')
print(stable_cleaned_attributes_df.shape)

stable_cleaned_attributes = stable_cleaned_attributes_df[stable_cleaned_attributes_df["IsStable"] == True]["Attribute"].values
print(len(stable_cleaned_attributes))

CSV files generated at: ./all_cleaned_attributes.csv
Step 1 : 34809 Cleaned Attributes
c4e7c2f46a9828d9 -- 5 -- 21288
11604a98ac8992c4 -- 3 -- 3075
0b3c77d9ca8cabc9 -- 2 -- 17570
ee686cbfeccc5c1a -- 5 -- 21841
bb5310a7b1b340fa -- 3 -- 21348
f7e65377c5dbc163 -- 3 -- 20534
e230ddf5b610d9b0 -- 2 -- 21606
4ceca8ad395f395c -- 2 -- 3123
014256e749879cdd -- 3 -- 21831
74c4a9980196afdf -- 2 -- 20561
74769766774609c5 -- 4 -- 21459
9d4f8a5dab304707 -- 3 -- 20931
ae6921fc0592c655 -- 2 -- 3021
31417463edd94613 -- 16 -- 20912
777fac22ca38de65 -- 24 -- 2315
fffbac3d6668a8fa -- 2 -- 18249
6071525399e88e71 -- 2 -- 3097
1731ab17fac9551a -- 2 -- 3257
39708de2542d655d -- 2 -- 2887
eb3011797d96bcea -- 8 -- 21130
72f605dfb6f83c2a -- 2 -- 21679
0041881c96496c1b -- 3 -- 21919
c1e65c15e4b41b75 -- 2 -- 2887
0cefaa133a9838d9 -- 7 -- 21803
9655cb8a4749fcc7 -- 6 -- 20894
a07139523e3cafda -- 3 -- 2877
1ce9a06628090cbb -- 2 -- 19407
eb407f7de7bc3175 -- 4 -- 21821
2bca14aaca1b1372 -- 2 -- 21069
80db3118aa72d3d4 -- 5

In [5]:
# Step 4 : Remove redudant attributes having same values and distrubutions
device_ids = set()
cleaned_stable_attribute_stats = {}
def update_attribute_distrubutions(attribute_stats, key, value):
    """Updates attribute statistics for a given key-value pair."""
    if key not in attribute_stats:
        attribute_stats[key] = {"coverage": 0, "values": {}}
    
    attribute_stats[key]["coverage"] += 1
    if value not in attribute_stats[key]["values"]:
        attribute_stats[key]["values"][value] = 0
    attribute_stats[key]["values"][value] += 1
for filename in os.listdir(FOLDER_PATH):
    # Take one fingerprint from every device
    if filename in selected_fingerprints:
        file_path = os.path.join(FOLDER_PATH, filename)
        with open(file_path, 'r') as json_file:
            data = json.load(json_file)
            
            # get android_id 
            device_id = data.get("content://settings/secure.android_id", "")
            device_ids.add(device_id)
            print(f"{len(device_ids)} -- {device_id}")
            
            for attribute in stable_cleaned_attributes : 
                value = data.get(attribute, None)
                if value:
                    update_attribute_distrubutions(cleaned_stable_attribute_stats, attribute, hash_value(value))

print(len(cleaned_stable_attribute_stats))                  
cleaned_stable_attribute_stats = remove_redudants_from_attribute_stats(cleaned_stable_attribute_stats)
print(len(cleaned_stable_attribute_stats))

1 -- 36ea87422b95a823
2 -- 0b3c77d9ca8cabc9
3 -- 57571ea41207757a
4 -- af88e81c4bf3409b
5 -- 6bd24a7212b8bb93
6 -- e230ddf5b610d9b0
7 -- f02d55178d365b4b
8 -- 74c4a9980196afdf
9 -- 6863f1373e74f53c
10 -- a15bd9baa35ecbca
11 -- e61964810c9d7b97
12 -- 3981dc6ef096d3d6
13 -- 810361693c989cfb
14 -- e08be7920fbec969
15 -- 74769766774609c5
16 -- 598a50664a1abf2e
17 -- bdabb162cd729a26
18 -- 147a4f59fcb85c58
19 -- 1bb70b3600154c67
20 -- a00afcaf2f9a76f4
21 -- 7b27eab0fd3d38ef
22 -- 31417463edd94613
23 -- d3baff958096f496
24 -- 049812edbb5c107a
25 -- 908fd3af35114156
26 -- a1ea709f350f8a4e
27 -- e5c2cf2dc5d17528
28 -- 2352b11e8521dbaa
29 -- 1731ab17fac9551a
30 -- bd497cebbfee3adf
31 -- 5737068164565fe7
32 -- a1cbbb4bf53bad20
33 -- 72f605dfb6f83c2a
34 -- e0ef788ff4aa0dac
35 -- 0041881c96496c1b
36 -- dd9608799425d017
37 -- 0d76a4fb88ea8196
38 -- e07c87f3ba3fef9b
39 -- fce8b61a5417a6a8
40 -- b6d64135a7168373
41 -- ff133ca40e510bc3
42 -- ba88f82dc38ed531
43 -- 1b54dc8a534d3bbe
44 -- e6c17a9db1c05d

In [6]:
# Step 5 : compute entropies and keep only ones having >0.5

def write_csv(file_path, attribute_dict):
    """Writes the attribute statistics to a CSV file."""
    with open(file_path, mode='w', newline='') as csv_file:
        writer = csv.writer(csv_file)
        writer.writerow(["Attribute", "Cardinality", "Unique Values","Coverage", "Shannon Entropy", "Normalized Entropy"])
        
        for attribute, info in attribute_dict.items():
            cardinality = len(info["values"])
            coverage = info["coverage"]
            unique_values = compute_unique_values(info["values"])
            
            entropy, normalized_entropy = compute_entropy(info["values"], len(device_ids))
            if normalized_entropy >= 0.5:
                writer.writerow([attribute, cardinality, unique_values, coverage, entropy, normalized_entropy])
write_csv("top_cleaned_stable_attribute_entropies.csv", cleaned_stable_attribute_stats)

In [7]:
# Step 6 : Now we compute Top cleaned and stable attributes over all devices 
# We consider attribute unstable if his value change at least in one device
# Step 3: Get cleaned stable attributes 
stable_cleaned_attributes_df = pd.read_csv('./stability_cleaned_attributes.csv')
print(stable_cleaned_attributes_df.shape)

all_stable_cleaned_attributes = stable_cleaned_attributes_df[stable_cleaned_attributes_df["IsAllStable"] == True]["Attribute"].values
print(len(all_stable_cleaned_attributes))

(32810, 7)
16175


In [8]:
# Step 6 : compute entropies and keep only ones having >0.5
device_ids = set()
cleaned_all_stable_attribute_stats = {}
def update_attribute_distrubutions(attribute_stats, key, value):
    """Updates attribute statistics for a given key-value pair."""
    if key not in attribute_stats:
        attribute_stats[key] = {"coverage": 0, "values": {}}
    
    attribute_stats[key]["coverage"] += 1
    if value not in attribute_stats[key]["values"]:
        attribute_stats[key]["values"][value] = 0
    attribute_stats[key]["values"][value] += 1
for filename in os.listdir(FOLDER_PATH):
    # Take one fingerprint from every device
    if filename in selected_fingerprints:
        file_path = os.path.join(FOLDER_PATH, filename)
        with open(file_path, 'r') as json_file:
            data = json.load(json_file)
            
            # get android_id 
            device_id = data.get("content://settings/secure.android_id", "")
            device_ids.add(device_id)
            print(f"{len(device_ids)} -- {device_id}")
            
            for attribute in all_stable_cleaned_attributes : 
                value = data.get(attribute, None)
                if value:
                    update_attribute_distrubutions(cleaned_all_stable_attribute_stats, attribute, hash_value(value))
                    
cleaned_all_stable_attribute_stats = remove_redudants_from_attribute_stats(cleaned_all_stable_attribute_stats)

def write_csv(file_path, attribute_dict):
    """Writes the attribute statistics to a CSV file."""
    with open(file_path, mode='w', newline='') as csv_file:
        writer = csv.writer(csv_file)
        writer.writerow(["Attribute", "Cardinality", "Unique Values","Coverage", "Shannon Entropy", "Normalized Entropy"])
        
        for attribute, info in attribute_dict.items():
            cardinality = len(info["values"])
            coverage = info["coverage"]
            unique_values = compute_unique_values(info["values"])
            
            entropy, normalized_entropy = compute_entropy(info["values"], len(device_ids))
            if normalized_entropy >= 0.1:
                writer.writerow([attribute, cardinality, unique_values, coverage, entropy, normalized_entropy])

write_csv("top_cleaned_all_stable_attribute_entropies.csv", cleaned_all_stable_attribute_stats)

1 -- 36ea87422b95a823
2 -- 0b3c77d9ca8cabc9
3 -- 57571ea41207757a
4 -- af88e81c4bf3409b
5 -- 6bd24a7212b8bb93
6 -- e230ddf5b610d9b0
7 -- f02d55178d365b4b
8 -- 74c4a9980196afdf
9 -- 6863f1373e74f53c
10 -- a15bd9baa35ecbca
11 -- e61964810c9d7b97
12 -- 3981dc6ef096d3d6
13 -- 810361693c989cfb
14 -- e08be7920fbec969
15 -- 74769766774609c5
16 -- 598a50664a1abf2e
17 -- bdabb162cd729a26
18 -- 147a4f59fcb85c58
19 -- 1bb70b3600154c67
20 -- a00afcaf2f9a76f4
21 -- 7b27eab0fd3d38ef
22 -- 31417463edd94613
23 -- d3baff958096f496
24 -- 049812edbb5c107a
25 -- 908fd3af35114156
26 -- a1ea709f350f8a4e
27 -- e5c2cf2dc5d17528
28 -- 2352b11e8521dbaa
29 -- 1731ab17fac9551a
30 -- bd497cebbfee3adf
31 -- 5737068164565fe7
32 -- a1cbbb4bf53bad20
33 -- 72f605dfb6f83c2a
34 -- e0ef788ff4aa0dac
35 -- 0041881c96496c1b
36 -- dd9608799425d017
37 -- 0d76a4fb88ea8196
38 -- e07c87f3ba3fef9b
39 -- fce8b61a5417a6a8
40 -- b6d64135a7168373
41 -- ff133ca40e510bc3
42 -- ba88f82dc38ed531
43 -- 1b54dc8a534d3bbe
44 -- e6c17a9db1c05d