In [None]:
import os
import sqlite3
import pandas as pd
import json

In [None]:
# Root directory that the input paths in this notebook are joined onto.
prefix = "" # adjust to your location
# Root directory for files written by this notebook (used for the report JSON path).
output_prefix = ""

# database files
# JSON file loaded into `matching_apps`; the helper docstrings in this notebook
# describe it as a mapping from iOS app name to Android app name.
matching_file = "../../dataset/apps/matching.json"
# SQLite databases queried by get_infos_secret(), one per snapshot year.
database_2023 = os.path.join(prefix, "results/matching_2023.db")
database_2024 = os.path.join(prefix, "results/matching_2024.db")


In [None]:
# Dependency confusion results

# Folders with one JSON file per app; read by load_available_pod_names().
pod_results_2023 = os.path.join(prefix, "results/dependency_results_2023/pod_results/")
pod_results_2024 = os.path.join(prefix, "results/dependency_results_2024/pod_results/")

# output directories
# (these are passed as `dependency_path` to load_confused_files(), which scans
# every file below them for the flagged dependency names)
dependencies_2023_dir = os.path.join(prefix, "results/dependency_files_2023/")
dependencies_2024_dir = os.path.join(prefix, "results/dependency_files_2024/")


# output of https://github.com/visma-prodsec/confused/tree/v0.5
# One log per ecosystem and snapshot year; load_confused_files() keeps the lines
# that contain the "[!]" marker.
npm_23_log = os.path.join(prefix,"results/dependency_results_2023/npm.log")
mvn_23_log = os.path.join(prefix,"results/dependency_results_2023/mvn.log")
composer_23_log = os.path.join(prefix,"results/dependency_results_2023/composer.log")
python_23_log = os.path.join(prefix,"results/dependency_results_2023/python.log")
ruby_23_log = os.path.join(prefix,"results/dependency_results_2023/ruby.log")
ruby2_23_log = os.path.join(prefix,"results/dependency_results_2023/ruby2.log")


npm_24_log = os.path.join(prefix,"results/dependency_results_2024/npm.log")
mvn_24_log = os.path.join(prefix,"results/dependency_results_2024/mvn.log")
composer_24_log = os.path.join(prefix,"results/dependency_results_2024/composer.log")
python_24_log = os.path.join(prefix,"results/dependency_results_2024/python.log")
ruby_24_log = os.path.join(prefix,"results/dependency_results_2024/ruby.log")
ruby2_24_log = os.path.join(prefix,"results/dependency_results_2024/ruby2.log")

# Destination of the JSON report written in the last cell of the notebook.
output_reports = os.path.join(output_prefix, "report_infos_dm.json")



In [None]:
# SQL query to select app name, platform, and file path for a given file name.
# NOTE(review): get_infos_secret() builds its own local `query`, which shadows
# this name inside the function; no code visible in this notebook reads this
# module-level string -- confirm before removing it.
query = "SELECT app_name, platform, file_path FROM apps join files on apps.id = files.app_id where file_name = ?;"

def get_infos_secret(filename, database_path):
    """
    Retrieve information about apps and their files from the database.

    The file name is always passed as a bound parameter, never interpolated
    into the SQL text.

    Args:
        filename (str): The name of the file to search for. Supports wildcards using '*'.
            A name containing '*' is matched with SQL ``LIKE`` ('*' becomes '%').
            Note that SQLite's ``LIKE`` is case-insensitive for ASCII by default
            and also treats '_' and '%' in the name as wildcards.
        database_path (str): Path to the SQLite database.

    Returns:
        list of tuples: Each tuple contains (app_name, platform, file_path, app_id).

    Raises:
        sqlite3.Error: If the database cannot be queried (e.g. missing tables).
    """
    # Build the SQL query, using LIKE if filename contains a wildcard
    query = """SELECT app_name, platform, file_path, apps.id FROM apps join files on apps.id = files.app_id where file_name """
    if "*" in filename:
        query += "like ?;"
        filename = filename.replace("*", "%")
    else:
        query += "= ?;"

    conn = sqlite3.connect(database_path)
    try:
        cursor = conn.cursor()
        cursor.execute(query, (filename,))
        return cursor.fetchall()
    finally:
        # Close the connection even when the query raises, so a failing lookup
        # does not leak the database handle for the rest of the kernel session.
        conn.close()


In [None]:
def load_matching_apps(file_name):
    """
    Load the app-matching JSON file.

    Args:
        file_name (str): Path to the JSON file.

    Returns:
        object: The parsed JSON content. Other helpers in this notebook document
        it as a mapping from iOS app name to Android app name.
    """
    # JSON is UTF-8 by specification; do not depend on the platform's locale
    # encoding, which would mis-decode non-ASCII names on some systems.
    with open(file_name, "r", encoding="utf-8") as f:
        return json.load(f)

In [None]:
# Lookup table: technology -> package manager / build tool -> file names to search for.
# Entries containing '*' are wildcard patterns (get_infos_secret turns them into
# SQL LIKE patterns). Insertion order matters: it determines the row order of the
# statistics tables built from this dictionary.
cluster = {
    "Swift": {
        "CocoaPods": ["Podfile", "Podfile.lock", "*.podspec"],
        "Carthage": ["Cartfile", "Cartfile.resolved"],
        "SwiftPM": ["Package.swift", "Package.resolved"],
    },
    "Web": {
        "NPM": ["package.json", "package-lock.json"],
        "Yarn": ["yarn.lock"],
        "PNPM": ["pnpm-lock.yaml"],
        "Bun": ["bun.lockb"],
        "Bower": ["bower.json", ".bower.json"],
        "Composer": ["composer.json", "composer.lock"],
    },
    "Python": {
        "Pip": ["requirements.txt"],
        "Pipenv": ["Pipfile", "Pipfile.lock"],
        "Poetry": ["pyproject.toml", "poetry.lock"],
        "Setuptools": ["setup.py"],
    },
    "Java": {
        "Maven": ["pom.xml"],
        "Gradle": ["build.gradle", "build.gradle.kts", "gradle.lockfile"],
    },
    "C/C++": {
        "CMake": ["CMakeLists.txt", "*.cmake"],
        "vcpkg": ["vcpkg.json"],
        "Conan": ["conanfile.txt", "conanfile.py"],
        "Meson": ["meson.build"],
        "NuGet": ["packages.config", "*.csproj", "Directory.Packages.props"],
        "Autotools": ["configure.in", "configure", "configure.ac"],
        "OpenSSL": ["opensslv.h"],
    },
    "Rust": {
        "Cargo": ["Cargo.toml", "Cargo.lock"],
    },
    "Go": {
        "GoModules": ["go.mod", "go.sum"],
        "Dep": ["Gopkg.toml"],
    },
    "Ruby": {
        "Bundler": ["Gemfile", "Gemfile.lock"],
        "RubyGems": ["*.gemspec"],
        "Rake": ["Rakefile"],
    },
    "Dart": {
        "Pub": ["pubspec.yaml", "pubspec.yaml.dist", "pubspec.lock"],
    },
    "Haskell": {
        "Stack": ["stack.yaml"],
        "Cabal": ["cabal.project"],
    },
    "Perl": {
        "CPAN": ["cpanfile"],
    },
}


In [None]:
def get_number_of_files(temp_result, platform):
    """
    Count how many rows of temp_result belong to the given platform.

    Args:
        temp_result (list): Rows (tuples/lists) whose second element (index 1) is the platform.
        platform (str): Platform name to count, e.g., 'android' or 'ios'.

    Returns:
        int: Number of rows whose platform equals `platform`.
    """
    return sum(1 for row in temp_result if row[1] == platform)

def get_number_of_apps(temp_result):
    """
    Group the unique app names in temp_result by platform.

    Despite the name, this returns the sets themselves; callers take len() of
    the per-platform set to obtain a count.

    Args:
        temp_result (list): Rows (tuples/lists) with the app name at index 0 and
            the platform at index 1.

    Returns:
        dict: Mapping from platform to the set of unique app names seen for it.
    """
    apps_by_platform = {}
    for row in temp_result:
        platform = row[1]
        apps_by_platform.setdefault(platform, set()).add(row[0])
    return apps_by_platform

def get_number_of_cross_platform(android, ios, matching_apps):
    """
    Count the iOS apps whose matched Android app is present in `android`.

    Both inputs are deduplicated first. get_stat_table() passes concatenated
    lists that contain an app once per matching file type; without the
    deduplication such an app would be counted several times.

    Args:
        android (iterable): Android app names (set or list; duplicates are ignored).
        ios (iterable): iOS app names (set or list; duplicates are ignored).
        matching_apps (dict): Mapping from ios app name to android app name.

    Returns:
        int: Number of distinct iOS apps whose mapped Android name is in `android`.
    """
    android_apps = set(android)
    # Apps without an entry in matching_apps map to None, which is never an app name.
    return sum(1 for app in set(ios) if matching_apps.get(app) in android_apps)

def get_platform_numbers(results, matching_apps, total_apps=10331, total_ios=0, total_android=0):
    """
    Calculate platform statistics: number and percentage of android, ios, and cross-platform apps.

    Keys of `results` ending in "_android" / "_ios" are sorted into the
    respective platform with that suffix removed; all other keys are ignored.

    Args:
        results (dict): Mapping from app name (with platform suffix) to some value;
            only the keys are used.
        matching_apps (dict): Mapping from ios app name to android app name.
        total_apps (int): Total number of apps; used as denominator for every
            platform whose own total is not given.
        total_ios (int): Total number of ios apps (optional). Also the denominator
            of the cross-platform share when given.
        total_android (int): Total number of android apps (optional).

    Returns:
        tuple: Formatted strings "<count> (<percent> %)" for android, ios, and
        cross-platform apps, in that order.
    """
    android_suffix = "_android"
    ios_suffix = "_ios"
    android = set()
    ios = set()
    for app in results:
        # Strip only the trailing platform suffix; str.replace would also remove
        # the marker from the middle of a name such as "my_android_app_android".
        if app.endswith(android_suffix):
            android.add(app[:-len(android_suffix)])
        elif app.endswith(ios_suffix):
            ios.add(app[:-len(ios_suffix)])

    # A platform total of 0 means "not provided": fall back to total_apps for
    # that platform instead of dividing by zero when only one total is passed.
    android_denominator = total_android or total_apps
    ios_denominator = total_ios or total_apps

    cross_platform = get_number_of_cross_platform(android, ios, matching_apps)

    def _with_share(count, denominator):
        return f"{count} ({(count / denominator * 100):.2f} %)"

    return (
        _with_share(len(android), android_denominator),
        _with_share(len(ios), ios_denominator),
        _with_share(cross_platform, ios_denominator),
    )

def _aggregate_stats(query_results, matching_apps):
    """Summarize one or more get_infos_secret() result lists into a single stats row."""
    android_files = 0
    ios_files = 0
    android_apps = set()
    ios_apps = set()
    for rows in query_results:
        android_files += get_number_of_files(rows, "android")
        ios_files += get_number_of_files(rows, "ios")
        apps_by_platform = get_number_of_apps(rows)
        android_apps |= apps_by_platform.get("android", set())
        ios_apps |= apps_by_platform.get("ios", set())
    # Key order is kept stable because it determines the DataFrame column order.
    return {
        "Android_files": android_files,
        "iOS_files": ios_files,
        "iOS_apps": len(ios_apps),
        "Android_apps": len(android_apps),
        # Computed on the deduplicated sets so that an app matching several file
        # types is counted once, consistent with the app counts above.
        "cross-platform": get_number_of_cross_platform(android_apps, ios_apps, matching_apps),
    }


def get_stat_table(database_path, matching_apps, full=True, min_files=25):
    """
    Generate a statistics table (as a DataFrame) summarizing file and app counts per technology/manager.

    One row is produced per file pattern in `cluster`, one per technology, an
    optional "<technology>_other" row, and a final "Total" row. Each row holds
    the columns Android_files, iOS_files, iOS_apps, Android_apps and
    cross-platform.

    Args:
        database_path (str): Path to the database.
        matching_apps (dict): Mapping from ios app name to android app name.
        full (bool): If False, file patterns with fewer than `min_files` files on
            both platforms get no row of their own and are summed into the
            "<technology>_other" row instead.
        min_files (int): Threshold used when `full` is False (default 25).

    Returns:
        pd.DataFrame: DataFrame with statistics per file, technology, and total.
    """
    results = {}
    all_rows = []
    for technology, manager_dict in cluster.items():
        technology_rows = []
        other_rows = []

        for files in manager_dict.values():
            for file in files:
                print(file)  # progress output: one line per queried file pattern
                current = get_infos_secret(file, database_path)
                all_rows.append(current)
                technology_rows.append(current)
                # Group as 'other' if not enough files and full==False
                if (
                    not full
                    and get_number_of_files(current, "android") < min_files
                    and get_number_of_files(current, "ios") < min_files
                ):
                    other_rows.append(current)
                    continue
                results[file] = _aggregate_stats([current], matching_apps)

        # Aggregate 'other' group if any
        if other_rows:
            results[technology + "_other"] = _aggregate_stats(other_rows, matching_apps)

        # Aggregate all for current technology
        if technology_rows:
            results[technology] = _aggregate_stats(technology_rows, matching_apps)

    # Aggregate totals for all technologies
    if all_rows:
        results["Total"] = _aggregate_stats(all_rows, matching_apps)

    # Convert results to DataFrame for easier analysis
    return pd.DataFrame.from_dict(results, orient='index')

In [None]:
# Parsed content of `matching_file`; passed as `matching_apps` to the statistics helpers.
matching_apps = load_matching_apps(matching_file)

In [None]:
# Grouped statistics per snapshot: with full=False, file patterns with fewer than
# 25 files on both platforms are summed into a "<technology>_other" row.
stat_2023 = get_stat_table(database_2023, matching_apps, full=False)
stat_2024  = get_stat_table(database_2024, matching_apps, full=False)

In [None]:
# Keep only the rows that contain at least one non-zero value.
stat_2023 = stat_2023.loc[(stat_2023 != 0).any(axis=1)]
stat_2024 = stat_2024.loc[(stat_2024 != 0).any(axis=1)]




In [None]:
# Put the 2023 and 2024 statistics side by side; identical column names receive
# the "_2023" / "_2024" suffixes.
# NOTE(review): DataFrame.join defaults to a left join, so rows that exist only
# in stat_2024 are dropped here, while 2023-only rows are kept and zero-filled
# below. Confirm this asymmetry is intended; how="outer" would keep both.
dependencies = stat_2023.join(stat_2024, lsuffix="_2023", rsuffix="_2024")

# Rows missing from the 2024 table come back as NaN; treat them as 0 and restore integers.
dependencies =  dependencies.fillna(0).astype(int)


In [None]:
total_apps = 10331
ios_2024_number = 9212
android_2024_number = 8702

# Column -> number of apps its percentage refers to. 2023 columns use the overall
# total; 2024 columns use the per-platform 2024 totals (cross-platform: iOS total).
share_denominators = {
    'iOS_apps_2023': total_apps,
    'Android_apps_2023': total_apps,
    'cross-platform_2023': total_apps,
    'iOS_apps_2024': ios_2024_number,
    'Android_apps_2024': android_2024_number,
    'cross-platform_2024': ios_2024_number,
}

# Format the values as "<count> (<percent>%)".
# This overwrites the integer columns with strings, so the cell cannot be re-run
# without re-creating `dependencies` first.
for column, denominator in share_denominators.items():
    dependencies[column] = dependencies[column].apply(
        lambda x, denominator=denominator: f"{x} ({(x / denominator * 100):.2f}%)"
    )


In [None]:
# Remove the file-count and cross-platform columns; only the app counts are reported.
dependencies = dependencies.drop(
    columns=[
        'Android_files_2023',
        'iOS_files_2023',
        'Android_files_2024',
        'iOS_files_2024',
        "cross-platform_2023",
        "cross-platform_2024",
    ]
)


In [None]:
# Fix the column order of the final table: 2023 before 2024, Android before iOS.
dependencies = dependencies.loc[
    :,
    ['Android_apps_2023', 'iOS_apps_2023', 'Android_apps_2024', 'iOS_apps_2024'],
]

In [None]:
# The cells contain '%' and the column names contain '_', both of which are
# special characters in LaTeX. Request escaping explicitly: it was the default
# before pandas 2.0 but is no longer applied by default in newer versions, which
# would emit a table that does not compile (everything after '%' is a comment).
print(dependencies.to_latex(escape=True))

In [None]:
# Full statistics: with full=True every file pattern keeps its own row.
stat_2023_full = get_stat_table(database_2023, matching_apps, full=True)
stat_2024_full = get_stat_table(database_2024, matching_apps, full=True)

# Keep only the rows that contain at least one non-zero value.
stat_2023_full = stat_2023_full.loc[(stat_2023_full != 0).any(axis=1)]
stat_2024_full = stat_2024_full.loc[(stat_2024_full != 0).any(axis=1)]

# NOTE(review): join() defaults to a left join, so rows present only in the 2024
# table are dropped here -- confirm this is intended.
dependencies_full = (
    stat_2023_full
    .join(stat_2024_full, lsuffix="_2023", rsuffix="_2024")
    .fillna(0)
    .astype(int)
)

total_apps = 10331
# Format the values as "<count> (<percent> %)".
# ios_2024_number / android_2024_number come from an earlier cell of this notebook.
for column, denominator in (
    ('iOS_apps_2023', total_apps),
    ('Android_apps_2023', total_apps),
    ('cross-platform_2023', total_apps),
    ('iOS_apps_2024', ios_2024_number),
    ('Android_apps_2024', android_2024_number),
    ('cross-platform_2024', ios_2024_number),
):
    dependencies_full[column] = dependencies_full[column].apply(
        lambda x, denominator=denominator: f"{x} ({(x / denominator * 100):.2f} %)"
    )
dependencies_full = dependencies_full.sort_index()

In [None]:
# Display the full per-file table (rows sorted by index label).
dependencies_full

In [None]:
def load_json_file(file):
    """
    Load a JSON file and return its contents.

    Args:
        file (str): Path to the JSON file.

    Returns:
        object: Parsed JSON data.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file does not contain valid JSON.
    """
    # JSON is UTF-8 by specification; do not depend on the locale encoding.
    # (The former trailing `return []` could never execute and was removed.)
    with open(file, "r", encoding="utf-8") as f:
        return json.load(f)

def load_available_pod_names(folder_path):
    """
    Load pod dependencies for each app from JSON files in a folder.

    Every directory entry is parsed as JSON. The app name is the file name with a
    trailing ".log" removed.

    Args:
        folder_path (str): Path to the folder containing pod result files.

    Returns:
        tuple:
            - app_to_pods (dict): Mapping from app name to the parsed JSON content
              of its file (presumably a list of pod names -- verify against the
              files that produce it).
            - pod_to_apps (dict): Mapping from pod name to set of app names.
    """
    log_suffix = ".log"
    app_to_pods = {}
    pod_to_apps = {}
    for file in os.listdir(folder_path):
        pods = load_json_file(os.path.join(folder_path, file))
        # Remove only the extension. str.replace(".log", "") would also delete
        # ".log" inside the name (e.g. "com.logitech.x_ios.log" -> "comitech.x_ios").
        app_name = file[:-len(log_suffix)] if file.endswith(log_suffix) else file
        app_to_pods[app_name] = pods
        for pod in pods:
            pod_to_apps.setdefault(pod, set()).add(app_name)
    return app_to_pods, pod_to_apps

def extract_app_name(file_path):
    """
    Return the first '/'-separated component of a path that carries a platform suffix.

    Args:
        file_path (str): The file path to inspect.

    Returns:
        str: The first path component ending in "_ios" or "_android" (suffix
        included), or an empty string if no component matches.
    """
    platform_suffixes = ("_ios", "_android")
    return next(
        (part for part in file_path.split("/") if part.endswith(platform_suffixes)),
        "",
    )

def search_for_app_name(dependency_name, path):
    """
    Find the apps whose files mention a dependency.

    Every file below `path` is read as text (undecodable bytes are ignored) and
    checked for `dependency_name` by plain substring containment.

    Args:
        dependency_name (str): The dependency to search for.
        path (str): Directory path to search in (walked recursively).

    Returns:
        list: One entry per matching file, as returned by extract_app_name() for
        that file's path. An app appears once per matching file, and the entry
        is "" when the path has no platform-suffixed component.
    """
    matches = []
    for root, _dirs, files in os.walk(path):
        for candidate in (os.path.join(root, name) for name in files):
            with open(candidate, "r", errors="ignore") as handle:
                found = dependency_name in handle.read()
            if found:
                matches.append(extract_app_name(candidate))
    return matches

def load_confused_files(file_path, dependency_path):
    """
    Load dependencies that are marked as 'confused' and map them to apps.

    A dependency is taken from every log line containing "[!]" (the marker is
    removed and the rest stripped). A file "uses" a dependency when the name
    occurs anywhere in its text; this is plain substring matching, so a short
    name also matches longer names that contain it.

    Args:
        file_path (str): Path to the log file listing confused dependencies.
        dependency_path (str): Path to the directory containing dependency files.

    Returns:
        tuple:
            - app_to_dependencies (dict): Mapping from app name to set of dependencies.
            - dependency_to_apps (dict): Mapping from dependency name to set of app names.
          App names are whatever extract_app_name() returns for the file path
          ("" when the path has no platform-suffixed component).
    """
    to_register = set()
    # Parse the log file to get confused dependency names
    with open(file_path, "r") as f:
        for line in f:
            if "[!]" in line:
                name = line.replace("[!]", "").strip()
                # An empty name is a substring of every file and would attribute
                # a bogus dependency to every app, so skip it.
                if name:
                    to_register.add(name)

    app_to_dependencies = {}
    dependency_to_apps = {}
    if not to_register:
        return app_to_dependencies, dependency_to_apps

    # Read each dependency file once and test all names against it, instead of
    # re-reading the whole directory tree for every single dependency.
    for root, _dirs, files in os.walk(dependency_path):
        for file in files:
            dependency_file = os.path.join(root, file)
            with open(dependency_file, "r", errors="ignore") as f:
                content = f.read()
            app_name = extract_app_name(dependency_file)
            for dependency in to_register:
                if dependency in content:
                    app_to_dependencies.setdefault(app_name, set()).add(dependency)
                    dependency_to_apps.setdefault(dependency, set()).add(app_name)
    return app_to_dependencies, dependency_to_apps

In [None]:
# Pod results per snapshot: unsuffixed names hold the 2024 data, "_23" names the 2023 data.
app_to_pod, pod_to_app =  load_available_pod_names(pod_results_2024)
app_to_pod_23, pod_to_app_23 =  load_available_pod_names(pod_results_2023)

In [None]:

# 2023: for every ecosystem log, map the flagged dependencies to the apps whose
# dependency files mention them. "*_apps" is app -> dependencies, "*_dep" is
# dependency -> apps.
npm_23_apps, npm_23_dep = load_confused_files(npm_23_log, dependencies_2023_dir)
mvn_23_apps, mvn_23_dep = load_confused_files(mvn_23_log, dependencies_2023_dir)
composer_23_apps, composer_23_dep = load_confused_files(composer_23_log, dependencies_2023_dir)
python_23_apps, python_23_dep = load_confused_files(python_23_log, dependencies_2023_dir)
ruby_23_apps, ruby_23_dep = load_confused_files(ruby_23_log, dependencies_2023_dir)
ruby2_23_apps, ruby2_23_dep = load_confused_files(ruby2_23_log, dependencies_2023_dir)


In [None]:


# 2024: for every ecosystem log, map the flagged dependencies to the apps whose
# dependency files mention them. "*_apps" is app -> dependencies, "*_dep" is
# dependency -> apps.
npm_24_apps, npm_24_dep = load_confused_files(npm_24_log, dependencies_2024_dir)
mvn_24_apps, mvn_24_dep = load_confused_files(mvn_24_log, dependencies_2024_dir)
composer_24_apps, composer_24_dep = load_confused_files(composer_24_log, dependencies_2024_dir)
python_24_apps, python_24_dep = load_confused_files(python_24_log, dependencies_2024_dir)
ruby_24_apps, ruby_24_dep = load_confused_files(ruby_24_log, dependencies_2024_dir)
ruby2_24_apps, ruby2_24_dep = load_confused_files(ruby2_24_log, dependencies_2024_dir)


In [None]:
# 2024 platform numbers, printed in the order: npm, mvn, composer, python, ruby,
# ruby2, pods. Each line is (android, ios, cross-platform).
# The shares are taken of the 2024 per-platform totals (ios_2024_number /
# android_2024_number, defined in the table-formatting cell earlier in this
# notebook), the same denominators the 2024 columns of the statistics table
# use. Previously these calls fell back to the default total of 10331 apps.
for apps_2024 in (
    npm_24_apps,
    mvn_24_apps,
    composer_24_apps,
    python_24_apps,
    ruby_24_apps,
    ruby2_24_apps,
    app_to_pod,
):
    print(
        get_platform_numbers(
            apps_2024,
            matching_apps,
            total_ios=ios_2024_number,
            total_android=android_2024_number,
        )
    )

In [None]:
# 2023 platform numbers, printed in the order: npm, mvn, composer, python, ruby,
# ruby2, pods. Each line is (android, ios, cross-platform), with shares taken of
# the default total number of apps.
for apps_2023 in (
    npm_23_apps,
    mvn_23_apps,
    composer_23_apps,
    python_23_apps,
    ruby_23_apps,
    ruby2_23_apps,
    app_to_pod_23,
):
    print(get_platform_numbers(apps_2023, matching_apps))

In [None]:
# Inspect the 2023 npm mapping (app name -> set of flagged dependencies); displays the whole dict.
npm_23_apps

In [None]:
# Number of flagged npm dependencies that were found in at least one 2023 dependency file.
len(npm_23_dep)

In [None]:
# Number of distinct pod names across the 2023 pod result files.
len(pod_to_app_23)

In [None]:
# Inspect the 2024 mapping from app name to the content of its pod result file; displays the whole dict.
app_to_pod

In [None]:
def create_report_file(database_path):
    """
    Collect, per file pattern in `cluster`, the database hits for that pattern.

    The "Java" technology is skipped entirely (NOTE(review): the reason is not
    visible in this notebook). Patterns without any hit are left out of the
    result.

    Args:
        database_path (str): Path to the SQLite database to query.

    Returns:
        dict: Mapping from file pattern to a list of [app_id, platform, file_path]
        entries, one per row returned by get_infos_secret().
    """
    report = {}
    for technology, managers in cluster.items():
        if technology == "Java":
            continue
        for file_patterns in managers.values():
            for pattern in file_patterns:
                rows = get_infos_secret(pattern, database_path)
                if not rows:
                    continue
                report[pattern] = [
                    [app_id, platform, file_path]
                    for _app_name, platform, file_path, app_id in rows
                ]
    return report
        

In [None]:
# Build the report from the 2024 database only.
report_infos = create_report_file(database_2024)

In [None]:
# Write the report as indented JSON to `output_reports` (an existing file is overwritten).
with open(output_reports, "w") as f:
    json.dump(report_infos, f, indent=4)
    # Printed while the file is still open, right after json.dump returned.
    print("Report file created successfully.")