
<h1 align="center">Assignment 1</h1>
<h3 style="display:block; margin-top:5px;" align="center">Online Tracking and Privacy</h3>    
<br>

- Sabah Serhir Serhir

#### Step 1: Pick a website from the following list (online pharmacies and shops)


I have chosen a Spanish website, www.mediamarkt.es, which is an online shop that sells technology products.

#### Step 2: Capture the HTTP traffic while accepting cookies (and personal data processing)


Done in mediamarkt.es_accept.har

#### Step 3: Capture the HTTP traffic while rejecting cookies (and personal data processing)

Done in mediamarkt.es_reject.har

#### Step 4: Analyze the HAR Data

In [13]:
import subprocess
def check_installation(package):
    """
    Check whether `package` can be imported and install it with pip if not.

    The same name is used as the module name for the import attempt and as
    the requirement handed to pip, so this only works for packages whose
    import name and distribution name are identical (as is the case for
    tldextract).

    A message describing the outcome is printed. If the pip command fails,
    subprocess.CalledProcessError is raised.
    """
    import importlib
    import sys

    try:
        importlib.import_module(package)
    except ImportError:
        print(f"{package} is not installed. Installing...")
        # Run pip through the interpreter that is executing this code; a bare
        # `pip` found on PATH may belong to a different Python installation,
        # in which case the package would still be missing here afterwards.
        subprocess.check_call([sys.executable, '-m', 'pip', 'install', package])
        # Let the import system notice the files that were just installed.
        importlib.invalidate_caches()
    else:
        print(f"{package} is already installed.")
        
# Make sure tldextract can be imported before the analysis code imports it.
check_installation('tldextract')


tldextract is already installed.


In [15]:
import json
import tldextract
from datetime import datetime

def load_file(path):
    """
    Read the UTF-8 encoded JSON file at `path` and return its parsed content.

    It is used for the HAR captures as well as for the domain map.
    """
    with open(path, encoding='utf-8') as handle:
        parsed = json.load(handle)
    return parsed

def extract_entity_name(domain, entity_map):
    """
    Return the name of the entity that owns the domain found in `domain`.

    `domain` (a domain name or a full URL) is reduced by tldextract to its
    registered domain, which is the key looked up in `entity_map`. The value
    stored under the "entityName" key of the matching record is returned, or
    "Unknown" when the map has no record for that registered domain.
    """
    registered_domain = tldextract.extract(domain).registered_domain
    if registered_domain in entity_map:
        return entity_map[registered_domain]["entityName"]
    return "Unknown"

def extract_domain_info(url):
    """
    Return the registered domain that tldextract finds in `url`.
    """
    return tldextract.extract(url).registered_domain

def extract_requests_details_from_har(har_data, entity_map):
    """
    Build one summary dictionary per request/response entry of a HAR capture.

    The title of the first page in the capture is passed through
    extract_domain_info to obtain the domain of the visited website, which is
    the reference for deciding whether a request is third-party.

    Each returned dictionary has the keys:
      - "url_first_128_char": the first 128 characters of the request URL
      - "url_domain": the registered domain of the request URL
      - "is_third_party": True if that domain differs from the website's
      - "has_set_cookie": True if the response has a non-empty Set-Cookie header
      - "entity_name": the entity owning the domain according to `entity_map`,
        or "Unknown"

    The dictionaries are returned in the same order as the HAR entries.
    """
    main_title = har_data['log']['pages'][0]['title']
    website_etld = extract_domain_info(main_title)
    requests_list = []

    for entry in har_data['log']['entries']:
        request_url = entry['request']['url']
        url_domain = extract_domain_info(request_url)

        # Only count the header when it actually carries a value, as the
        # description above promises; a blank Set-Cookie sets nothing.
        has_set_cookie = any(
            header['name'].lower() == 'set-cookie'
            and (header.get('value') or '').strip()
            for header in entry['response']['headers']
        )

        requests_list.append({
            "url_first_128_char": request_url[:128],
            "url_domain": url_domain,
            # A URL without a registered domain yields an empty string here.
            # It is not reported as third-party, which matches how
            # analyze_har_file collects third-party domains.
            "is_third_party": bool(url_domain) and url_domain != website_etld,
            "has_set_cookie": has_set_cookie,
            "entity_name": extract_entity_name(request_url, entity_map),
        })

    return requests_list


# Minimum lifespan (60 days, in seconds) for a cookie to count as a tracker.
_TRACKER_COOKIE_MIN_LIFESPAN_SECONDS = 60 * 24 * 60 * 60


def _has_non_empty_header(headers, name):
    """Tell whether `headers` contains a header called `name` (lower case) with a non-blank value."""
    return any(
        header['name'].lower() == name and (header.get('value') or '').strip()
        for header in headers
    )


def _parse_har_timestamp(value):
    """Parse an ISO 8601 timestamp into a timezone-aware datetime, or return None if it cannot be parsed."""
    from datetime import timezone

    if not value or not isinstance(value, str):
        return None
    text = value.strip()
    # datetime.fromisoformat only understands a trailing "Z" from Python 3.11
    # on, so spell the UTC offset out explicitly.
    iso_text = text[:-1] + '+00:00' if text.endswith(('Z', 'z')) else text
    try:
        parsed = datetime.fromisoformat(iso_text)
    except ValueError:
        try:
            # Format that the original implementation relied on.
            parsed = datetime.strptime(text, '%Y-%m-%dT%H:%M:%S.%fZ')
        except ValueError:
            return None
    if parsed.tzinfo is None:
        # Timestamps without an offset are treated as UTC.
        parsed = parsed.replace(tzinfo=timezone.utc)
    return parsed


def _cookie_lifespan_seconds(cookie, set_at):
    """Return how long `cookie` lives after `set_at`, in seconds, or None if that cannot be determined."""
    expires = _parse_har_timestamp(cookie.get('expires'))
    if expires is not None:
        return (expires - set_at).total_seconds()
    try:
        # "maxAge" is read as a number of seconds.
        return float(cookie.get('maxAge'))
    except (TypeError, ValueError):
        return None


def analyze_har_file(har_data, entity_map):
    """
    Compute summary statistics for a HAR capture.

    The title of the first page in the capture is passed through
    extract_domain_info to obtain the domain of the visited website.

    Returns a dictionary with the keys:
      - "num_reqs": total number of entries in the capture
      - "num_requests_w_cookies": requests with a non-empty Cookie header
      - "num_responses_w_cookies": responses with a non-empty Set-Cookie header
      - "third_party_domains": distinct registered domains of the request
        URLs that differ from the website's domain
      - "tracker_cookie_domains": distinct domains of response cookies that
        can be used for cross-site tracking, i.e. cookies with
        SameSite=None, the Secure flag and a lifespan of at least 60 days
      - "third_party_entities": distinct entities that own the domains of the
        request URLs according to `entity_map` (unknown owners are left out)

    The three lists are sorted so that the output is the same on every run.
    """
    from datetime import timezone

    req_non_empty_cookies_count = 0
    res_non_empty_cookies_count = 0
    third_party_domains = set()
    tracker_cookie_domains = set()
    third_party_entities = set()

    entries = har_data['log']['entries']
    main_title = har_data['log']['pages'][0]['title']
    main_domain = extract_domain_info(main_title)

    for entry in entries:
        if _has_non_empty_header(entry['request']['headers'], 'cookie'):
            req_non_empty_cookies_count += 1
        if _has_non_empty_header(entry['response']['headers'], 'set-cookie'):
            res_non_empty_cookies_count += 1

        url = entry['request']['url']
        domain = extract_domain_info(url)
        if domain and domain != main_domain:
            third_party_domains.add(domain)

        # Measure cookie lifespans from the moment the request was made, so
        # the result does not depend on when the capture is analyzed. The
        # current time is only a fallback for entries without a usable
        # "startedDateTime".
        set_at = (
            _parse_har_timestamp(entry.get('startedDateTime'))
            or datetime.now(timezone.utc)
        )

        for cookie in entry['response'].get('cookies', []):
            same_site = str(cookie.get('sameSite') or '').lower()
            if same_site != 'none' or not cookie.get('secure'):
                continue
            lifespan = _cookie_lifespan_seconds(cookie, set_at)
            if lifespan is None or lifespan < _TRACKER_COOKIE_MIN_LIFESPAN_SECONDS:
                continue
            # A cookie recorded without a domain is attributed to the
            # registered domain of the request that received it, so that no
            # None value ends up in the result.
            cookie_domain = cookie.get('domain') or domain
            if cookie_domain:
                tracker_cookie_domains.add(cookie_domain)

        entity_name = extract_entity_name(url, entity_map)
        if entity_name != 'Unknown':
            third_party_entities.add(entity_name)

    return {
        "num_reqs": len(entries),
        "num_requests_w_cookies": req_non_empty_cookies_count,
        "num_responses_w_cookies": res_non_empty_cookies_count,
        "third_party_domains": sorted(third_party_domains),
        "tracker_cookie_domains": sorted(tracker_cookie_domains),
        "third_party_entities": sorted(third_party_entities)
    }


def _write_report(path, summary, requests_details):
    """Write the summary and the per-request details to `path` as a single JSON document."""
    # Dumping the two structures one after the other into the same file
    # yields two concatenated documents, which no JSON parser accepts. They
    # are therefore merged: the summary keys stay at the top level and the
    # per-request list is stored under "requests".
    report = dict(summary)
    report["requests"] = requests_details
    with open(path, 'w', encoding='utf-8') as file:
        json.dump(report, file, indent=4)


def main():
    """
    Analyze the accept and reject HAR captures and write one JSON report each.

    The domain map is read from 'domain_map.json'. For each capture, the
    summary from analyze_har_file and the per-request details from
    extract_requests_details_from_har are written together to
    'mediamarkt.es_accept.json' and 'mediamarkt.es_reject.json'.
    """
    # Paths to the HAR files and domain map file
    har_path_accept = 'mediamarkt.es_accept.har'
    har_path_reject = 'mediamarkt.es_reject.har'
    entity_map_path = 'domain_map.json'

    # Load domain map
    entity_map = load_file(entity_map_path)

    # Load HAR data
    har_accept = load_file(har_path_accept)
    har_reject = load_file(har_path_reject)

    # Analyze HAR files
    results_accept = analyze_har_file(har_accept, entity_map)
    results_reject = analyze_har_file(har_reject, entity_map)

    # Extract request/response details from HAR files
    requests_details_accept = extract_requests_details_from_har(har_accept, entity_map)
    requests_details_reject = extract_requests_details_from_har(har_reject, entity_map)

    # Write results to JSON files
    _write_report('mediamarkt.es_accept.json', results_accept, requests_details_accept)
    _write_report('mediamarkt.es_reject.json', results_reject, requests_details_reject)


if __name__ == "__main__":
    main()
