In [2]:
import json
import sqlite3
import os
import pandas as pd
import jwt
import importlib
import statistics

importlib.reload(jwt)
from scipy import stats
import researchpy
import plotly
importlib.reload(plotly)
import plotly.graph_objects as go


In [None]:
# Base directory that every path below is joined onto.
prefix = "" # adjust to your location

# SQLite databases, one per year; they are opened with sqlite3.connect by the helper functions.
db_2023 = os.path.join(prefix, "results/matching_2023.db")
db_2024 = os.path.join(prefix, "results/matching_2024.db")

# Directories whose files are read one by one by get_results().
verified_secrets = os.path.join(prefix,"results/verified/")
re_evaluated_secrets = os.path.join(prefix,"results/re_evaluated/")

# JSON inputs; matching_file_path is loaded into `matching` further down.
android_metadata_path = os.path.join(prefix,"results/dataset/android_metadata.json")
matching_file_path = os.path.join(prefix,"results/dataset/matching.json")

In [None]:
# exported from the evaluate metadata notebook
# One JSON path per metadata kind and year, all joined onto `prefix`.

code_2023_path = os.path.join(prefix,"results/metadata/code_2023.json")
code_2024_path = os.path.join(prefix,"results/metadata/code_2024.json")
scripts_2023_path = os.path.join(prefix,"results/metadata/scripts_2023.json")
scripts_2024_path = os.path.join(prefix,"results/metadata/scripts_2024.json")
podfile_2023_path = os.path.join(prefix,"results/metadata/podfile_2023.json")
podfile_2024_path = os.path.join(prefix,"results/metadata/podfile_2024.json")
hidden_directory_2023_path = os.path.join(prefix,"results/metadata/hidden_directory_stat_2023.json")
hidden_directory_2024_path = os.path.join(prefix,"results/metadata/hidden_directory_stat_2024.json")
hidden_files_2023_path = os.path.join(prefix,"results/metadata/hidden_files_2023.json")
hidden_files_2024_path = os.path.join(prefix,"results/metadata/hidden_files_2024.json")


In [None]:
# Directory the CSV exports are written to; the empty string makes os.path.join
# return the bare file name, i.e. a path relative to the working directory.
output_dir = ""

In [None]:
from typing import List


class Secret:
    """Mutable record that collects everything known about one secret.

    All fields start empty; the loader and post-processing functions fill
    them in afterwards.
    """

    def __init__(self):
        # The secret string and the detector that reported it.
        self.secret = ""
        self.detector = ""

        # Verification outcome and the recorded reason for it.
        self.verified = False
        self.reason = ""

        # Occurrences: apps and file names (lists), file paths (set).
        self.apps: List[AppInfo] = []
        self.files = []
        self.file_paths = set()

        # Parsed finding dictionaries, one per occurrence.
        self.full_details = []

class AppInfo:
    """Value object identifying an app by name and platform.

    Two instances are equal (and hash alike) when both name and platform
    match, so instances can be de-duplicated with ``set``.
    """

    def __init__(self, name="", platform=""):
        self.name, self.platform = name, platform

    def __hash__(self):
        return hash((self.name, self.platform))

    def __eq__(self, other):
        # Anything that is not an AppInfo is never equal.
        if not isinstance(other, AppInfo):
            return False
        return self.name == other.name and self.platform == other.platform

    def __str__(self):
        return f"AppInfo(name={self.name}, platform={self.platform})"

    def __repr__(self):
        # Same text as __str__ so containers print readably.
        return self.__str__()


In [None]:
def load_json(file_path):
    """Read a JSON file and return the decoded object.

    The file is opened explicitly as UTF-8 so the result does not depend on
    the platform's default text encoding.
    """
    with open(file_path, encoding="utf-8") as f:
        return json.load(f)
    



In [None]:
def get_infos_secret(secret, database_path):
    """Return every database row whose stored secret contains ``secret``.

    Each row is ``(app_name, platform, file_name, secret, file_path)``.
    If nothing matches, the lookup is retried with the ``;-|`` separator
    removed from ``secret``.

    NOTE(review): ``secret`` is embedded in a LIKE pattern unescaped, so any
    ``%`` or ``_`` inside it acts as a wildcard.
    """
    query = """
    SELECT apps.app_name, apps.platform, files.file_name, secrets.secret, files.file_path FROM apps JOIN files on apps.id = files.app_id JOIN secrets on secrets.file_id = files.id where secrets.secret like ?;
    """
    conn = sqlite3.connect(database_path)
    try:
        cursor = conn.cursor()
        cursor.execute(query, ("%" + secret + "%",))
        results = cursor.fetchall()
        if len(results) == 0:
            without_separator = secret.replace(";-|", "")
            # Retrying with an identical pattern cannot find anything new.
            if without_separator != secret:
                cursor.execute(query, ("%" + without_separator + "%",))
                results = cursor.fetchall()
        return results
    finally:
        # Close the connection even when a query raises.
        conn.close()


def get_all_apps(database_path):
    """Return one ``AppInfo`` per row of the ``apps`` table.

    The list contains apps of every platform stored in the database.
    """
    query = """
    SELECT app_name, platform FROM apps;
    """
    conn = sqlite3.connect(database_path)
    try:
        rows = conn.cursor().execute(query).fetchall()
    finally:
        # Close the connection even when the query raises.
        conn.close()
    return [AppInfo(name=app_name, platform=platform) for app_name, platform in rows]

def get_all_infos(database_path):
    """Return every secret finding joined with its file and app.

    Each row is ``(app_name, platform, file_name, secret, file_path)``.
    """
    query = """
    SELECT apps.app_name, apps.platform, files.file_name, secrets.secret, files.file_path FROM apps JOIN files on apps.id = files.app_id JOIN secrets on secrets.file_id = files.id;
    """
    conn = sqlite3.connect(database_path)
    try:
        return conn.cursor().execute(query).fetchall()
    finally:
        # Close the connection even when the query raises.
        conn.close()

def get_private_key_secrets(database):
    """Collect all findings with ``DetectorType`` 15, keyed by raw secret.

    Returns a dict mapping the finding's ``Raw`` value to a ``Secret`` whose
    ``apps`` and ``files`` lists are de-duplicated.
    """
    found = {}
    for app_name, platform, file_name, raw_json, _file_path in get_all_infos(database):
        finding = json.loads(raw_json)
        if finding.get("DetectorType", -1) != 15:
            continue

        raw = finding["Raw"]
        entry = found.setdefault(raw, Secret())
        entry.secret = raw
        entry.detector = "Private Key"
        # Not actually verified; flagged as verified to simplify later steps.
        entry.verified = True
        entry.files.append(file_name)
        entry.apps.append(AppInfo(name=app_name, platform=platform))

    for entry in found.values():
        entry.apps = list(set(entry.apps))
        entry.files = list(set(entry.files))
    return found

def get_jwt_secrets(database):
    """Collect all JSON Web Token findings that decode, keyed by raw token.

    A finding counts as a JWT when its ``DetectorDescription`` matches the
    JWT description text. Tokens that cannot be decoded (signature is not
    checked) are skipped. Returns a dict mapping the raw token to a
    ``Secret`` whose ``apps`` and ``files`` lists are de-duplicated.
    """
    results = get_all_infos(database)
    jwt_secrets = {}
    for info in results:
        json_secret = json.loads(info[3])
        if json_secret.get("DetectorDescription","") == "Uncovered a JSON Web Token, which may lead to unauthorized access to web applications and sensitive user data.":
            current = jwt_secrets.get(json_secret["Raw"], Secret())
            current.secret = json_secret["Raw"]
            try:
                # Decoding is only a well-formedness check; the payload is unused.
                jwt.decode(current.secret, options={"verify_signature": False})
            except Exception:
                # Skip malformed tokens, but let KeyboardInterrupt/SystemExit
                # propagate (a bare ``except:`` would swallow them too).
                print("exception")
                continue
            current.detector = "JWT"
            current.files.append(info[2])
            current.apps.append(AppInfo(name=info[0], platform=info[1]))
            current.verified = True # They are not really verified but for simplicity later we set it to verified
            jwt_secrets[json_secret["Raw"]] = current

    for secret_info in jwt_secrets.values():
        secret_info.apps = list(set(secret_info.apps))
        secret_info.files = list(set(secret_info.files))
    return jwt_secrets

def prepare_trufflehog(secret):
    """Build the normalised lookup key for one finding dictionary.

    Returns the stripped ``Raw`` value when ``RawV2`` is missing/empty or
    ``DetectorType`` is 17. Otherwise a ``RawV2`` that already contains
    ``;-|`` is returned stripped; if it does not, the key is
    ``Raw + ";-|" + (RawV2 with Raw removed)``, with a trailing separator
    dropped when nothing remains after it.
    """
    separator = ";-|"
    primary = secret["Raw"].strip()

    has_second_part = secret.get("RawV2", "") != ""
    if not has_second_part or secret.get("DetectorType", -1) == 17:
        return primary

    secondary = secret["RawV2"].strip()
    if separator in secondary:
        return secondary

    combined = primary + separator + secondary.replace(primary, "")
    if combined.endswith(separator):
        return combined[:-3]
    return combined

def create_trufflehog_dict(infos):
    """Group database rows by their normalised secret key.

    ``infos`` rows carry the finding JSON at index 3. Returns a dict mapping
    the key from ``prepare_trufflehog`` to the list of rows sharing it.
    """
    grouped = {}
    for row in infos:
        key = prepare_trufflehog(json.loads(row[3]))
        grouped.setdefault(key, []).append(row)
    return grouped



def add_infos_to_secrets(results, database_path):
    """Attach database occurrences to each ``Secret`` in ``results``.

    For every row whose normalised key equals a secret's key, the app, file
    name, parsed finding and file path are appended to that ``Secret``.
    The ``Secret`` objects are mutated in place and the same dict is
    returned.
    """
    rows_by_secret = create_trufflehog_dict(get_all_infos(database_path))
    for secret, current_info in results.items():
        for row in rows_by_secret.get(secret, []):
            current_info.apps.append(AppInfo(name=row[0], platform=row[1]))
            current_info.files.append(row[2])
            current_info.full_details.append(json.loads(row[3]))
            current_info.file_paths.add(row[4])
    return results


def post_process(secrets, database_path, remove=False):
    """Enrich ``secrets`` from the database and keep those found in an app.

    Prints the number of input secrets. With ``remove=True`` a secret is
    dropped when it has no database finding, or when its first finding is a
    JWT, an e-mail address, or has one of the excluded detector types.
    ``apps`` and ``files`` of the kept secrets are de-duplicated.

    Note: the ``Secret`` objects passed in are mutated.
    """
    jwt_description = "Uncovered a JSON Web Token, which may lead to unauthorized access to web applications and sensitive user data."
    email_description = "Identified an Email address."
    excluded_types = [1031, 1021, 1010, 1012, 1025, 1002, 727, 15]

    secrets = add_infos_to_secrets(secrets, database_path)
    print(len(secrets))

    kept = {}
    for secret, secret_info in secrets.items():
        if remove:
            if len(secret_info.full_details) == 0:
                continue
            first_finding = secret_info.full_details[0]
            description = first_finding.get("DetectorDescription", "")
            if (description == jwt_description or
                    first_finding.get("DetectorType", -1) in excluded_types or
                    description == email_description):
                continue
        if len(secret_info.apps) == 0:
            continue
        secret_info.apps = list(set(secret_info.apps))
        secret_info.files = list(set(secret_info.files))
        kept[secret] = secret_info
    return kept


def join_dict(results_1, results_2, only_verified=False):
    """Merge two result dicts; entries of ``results_1`` win on key clashes.

    With ``only_verified=True`` only values whose ``verified`` attribute is
    truthy are copied, so a key that is unverified in ``results_1`` can still
    be taken from ``results_2``.
    """
    merged = {}
    for source in (results_1, results_2):
        for key, info in source.items():
            if key in merged:
                continue
            if only_verified and not info.verified:
                continue
            merged[key] = info
    return merged


In [None]:
def get_unique_secrets(all_secrets):
    """Index findings by secret, detector type and file.

    ``all_secrets`` rows are
    ``(app_name, platform, file_name, secret_json, file_path)``.
    Findings with an excluded detector type, JWTs and e-mail addresses are
    skipped.

    Returns a tuple of three dicts:
      * secret key -> set of ``"<name>_<platform>"`` app ids,
      * secret key -> detector type (last one seen),
      * file path -> {detector type -> set of secret keys}.
    """
    skipped_type_names = ["1002", "727", "1021", "1025", "1012", "1021", "1031", "1010", "4", "1008"]
    skipped_type_ids = [1031, 1021, 1010, 1012, 1025, 1002, 727, 15]
    jwt_description = "Uncovered a JSON Web Token, which may lead to unauthorized access to web applications and sensitive user data."
    email_description = "Identified an Email address."

    apps_by_secret = {}
    detector_by_secret = {}
    secrets_by_file = {}
    # apps.app_name, apps.platform, files.file_name, secrets.secret, files.file_path
    for name, platform, file_name, secret, file_path in all_secrets:
        app_id = f"{name}_{platform}"
        finding = json.loads(secret)
        key = prepare_trufflehog(finding)

        detector_type = finding.get("DetectorType", "")
        if str(detector_type) in skipped_type_names:
            continue
        description = finding.get("DetectorDescription", "")
        if (description == jwt_description or
                finding.get("DetectorType", -1) in skipped_type_ids or
                description == email_description):
            continue

        detector_by_secret[key] = detector_type
        apps_by_secret.setdefault(key, set()).add(app_id)
        secrets_by_file.setdefault(file_path, {}).setdefault(detector_type, set()).add(key)

    return apps_by_secret, detector_by_secret, secrets_by_file




In [None]:
def get_results(folder_path):
    """Load verification results from every file in ``folder_path``.

    Each file is parsed as JSON. Files without a ``verified`` key, or whose
    ``detector`` is one of the ignored ids, are skipped. Returns a dict
    mapping the secret string to a ``Secret``; if two files hold the same
    secret the one listed later wins.

    NOTE(review): the ignored ids are compared as strings here, while other
    functions in this notebook compare ``DetectorType`` as integers; confirm
    the type of ``detector`` in these files.
    """
    ignored_detectors = ["1031", "1021", "1010", "1012", "1025", "1002", "727"]
    loaded = {}
    for entry in os.listdir(folder_path):
        with open(os.path.join(folder_path, entry), "r") as handle:
            data = json.load(handle)

        if "verified" not in data:
            continue
        if data.get("detector", 0) in ignored_detectors:
            continue

        item = Secret()
        item.secret = data["secret"]
        item.verified = data["verified"]
        # Entries without an explicit reason are attributed to "trufflehog".
        item.reason = data.get("reason", "trufflehog")
        item.detector = data["detector"]
        loaded[item.secret] = item

    return loaded

In [None]:

# Every app stored in each yearly database, as AppInfo objects.
all_apps_2023 = get_all_apps(db_2023)
all_apps_2024 = get_all_apps(db_2024)

In [None]:
# Private-key findings (DetectorType 15) per year, keyed by raw secret.
private_keys_2023 = get_private_key_secrets(db_2023)
private_keys_2024 = get_private_key_secrets(db_2024)

In [None]:
# Union of both years; for a key present in both, the 2023 entry is kept.
private_keys_both = join_dict(private_keys_2023, private_keys_2024)

In [None]:
# Decodable JWT findings per year, keyed by raw token.
jwt_2023 = get_jwt_secrets(db_2023)
jwt_2024 = get_jwt_secrets(db_2024)

In [None]:
# Union of both years; for a key present in both, the 2023 entry is kept.
jwt_both = join_dict(jwt_2023, jwt_2024)

In [None]:
# Load the verification results and enrich them with the 2023 database.
# post_process mutates the Secret objects held in all_results.
all_results = get_results(verified_secrets)
results_2023 = post_process(all_results, db_2023, remove=True)

In [None]:
# post_process mutates the Secret objects it receives (apps, files and
# full_details are appended). The re-evaluated results are therefore loaded
# fresh for each year; sharing one set of objects would mix the 2023 and
# 2024 occurrences into both per-year dicts.
all_results_after_disclosure = get_results(re_evaluated_secrets)
results_2023_after_disclosure = post_process(all_results_after_disclosure, db_2023, remove=True)

all_results_after_disclosure = get_results(re_evaluated_secrets)
results_2024_after_disclosure = post_process(all_results_after_disclosure, db_2024, remove=True)

In [None]:
# Reload the verification results so the 2024 enrichment starts from fresh
# Secret objects (post_process mutates them), then enrich with the 2024 database.
all_results = get_results(verified_secrets)
results_2024 = post_process(all_results, db_2024, remove=True)

In [None]:
# Per-year subsets: a secret is kept when at least one of its apps is on the
# given platform. The Secret objects are shared with the source dicts and
# still list the apps of every platform.
results_2023_android = {
    secret: info for secret, info in results_2023.items()
    if any(app.platform == "android" for app in info.apps)
}
results_2023_ios = {
    secret: info for secret, info in results_2023.items()
    if any(app.platform == "ios" for app in info.apps)
}
results_2024_android = {
    secret: info for secret, info in results_2024.items()
    if any(app.platform == "android" for app in info.apps)
}
results_2024_ios = {
    secret: info for secret, info in results_2024.items()
    if any(app.platform == "ios" for app in info.apps)
}

In [None]:
# Merge both years (the 2023 entry wins when a key exists in both), once
# restricted to verified secrets and once unrestricted.
results_both_verified = join_dict(results_2023, results_2024, only_verified=True)
results_both = join_dict(results_2023, results_2024)

# Platform subsets of the merged results: kept when any app is on the platform.
results_both_android = {
    secret: info for secret, info in results_both.items()
    if any(app.platform == "android" for app in info.apps)
}
results_both_ios = {
    secret: info for secret, info in results_both.items()
    if any(app.platform == "ios" for app in info.apps)
}

In [None]:
# Secrets still marked verified in the re-evaluated results, both years merged.
results_both_verified_retest = join_dict(results_2023_after_disclosure, results_2024_after_disclosure, only_verified=True)


In [None]:
# Number of secrets still verified in the re-evaluation.
len(results_both_verified_retest)

In [None]:
# Difference between the originally verified count and the re-evaluated count.
len(results_both_verified) - len(results_both_verified_retest)

In [None]:
# The same difference as a percentage of the originally verified secrets.
(len(results_both_verified) - len(results_both_verified_retest)) / len(results_both_verified) * 100

In [None]:
# Complement of the previous value: percentage that is still verified.
100 - (len(results_both_verified) - len(results_both_verified_retest)) / len(results_both_verified) * 100

In [None]:
# Number of distinct secrets in the merged results, verified or not.
len(results_both)

In [None]:
def get_detector_map(results, verified = True):
    """Group secrets by detector name for one verification status.

    Only secrets whose ``verified`` attribute *is* the given boolean are
    included. The detector name is the ``DetectorName`` of the secret's first
    finding (empty string when absent). Returns ``{detector name: [secret]}``.
    """
    by_detector = {}
    for secret, secret_info in results.items():
        if secret_info.verified is not verified:
            continue
        name = secret_info.full_details[0].get("DetectorName", "")
        by_detector.setdefault(name, []).append(secret)
    return by_detector


def get_only_secrets_by_detector_name(results, names, verified = True):
    """Return the secrets whose first finding's detector name is in ``names``.

    Only secrets whose ``verified`` attribute *is* the given boolean are
    considered. The result maps secret -> the same info object.
    """
    return {
        secret: secret_info
        for secret, secret_info in results.items()
        if secret_info.verified is verified
        and secret_info.full_details[0].get("DetectorName", "") in names
    }

In [None]:
def create_stat_arrays_platfroms(results, android_ids, matching_apps=None):
    """Count verified secrets per app pair on Android and on iOS.

    Rows are indexed by Android app name; an iOS app is filed under the
    Android name it maps to in ``matching_apps``.

    Parameters
    ----------
    results : dict
        secret -> info object with ``verified`` and ``apps``.
    android_ids : iterable
        Apps that must appear in the table even with zero secrets. Items may
        be plain Android app names or objects with ``name``/``platform``;
        objects on another platform are ignored.
    matching_apps : dict, optional
        iOS app name -> Android app name. Defaults to the notebook-level
        ``matching`` variable.

    Returns
    -------
    pandas.DataFrame with integer columns ``Android`` and ``iOS``.

    Raises
    ------
    KeyError if an iOS app has no entry in ``matching_apps``.
    """
    if matching_apps is None:
        # Backward compatible fallback to the global mapping.
        matching_apps = matching

    android = {}
    ios = {}
    for secret, secret_info in results.items():
        if not secret_info.verified:
            continue
        for app in secret_info.apps:
            if app.platform == "android":
                android.setdefault(app.name, []).append(secret)
            else:
                ios.setdefault(matching_apps[app.name], []).append(secret)

    # Zero-fill by app *name*. Using the AppInfo object itself as the key
    # never matches the name-keyed dicts and would add one extra all-zero row
    # per app, inflating the sample of the paired test.
    for app in android_ids:
        if isinstance(app, str):
            app_name = app
        elif app.platform == "android":
            app_name = app.name
        else:
            continue
        android.setdefault(app_name, [])
        ios.setdefault(app_name, [])

    android_counts = {name: len(found) for name, found in android.items()}
    ios_counts = {name: len(found) for name, found in ios.items()}

    df = pd.DataFrame({'Android': android_counts, 'iOS': ios_counts})
    df.fillna(0, inplace=True)
    df = df.astype(int)
    return df

def get_dttest_platforms(results, all_android_ids):
    """Run a paired t-test of per-app secret counts, iOS against Android.

    Returns the ``results`` values at rows 3 and 6 of researchpy's test table.

    NOTE(review): rows 3 and 6 are presumably the two-sided p-value and an
    effect size; confirm against the installed researchpy version.
    """
    counts = create_stat_arrays_platfroms(results, all_android_ids)
    _summary, test_table = researchpy.ttest(counts['iOS'], counts['Android'], paired=True)
    first_value = test_table.loc[3]['results']
    second_value = test_table.loc[6]["results"]
    return first_value, second_value #stats.ttest_rel(df['Android'], df['iOS'])

def create_tmp_array(current_result, android_ids, matching_apps):
    """Count verified secrets per ``"<app name>_<platform>"`` key.

    Parameters
    ----------
    current_result : dict
        secret -> info object with ``verified`` and ``apps``.
    android_ids : iterable
        Objects with a ``name`` attribute; their names are taken as the known
        Android apps.
    matching_apps : dict
        iOS app name -> Android app name.

    Returns
    -------
    dict mapping each key to its count. For every matched pair whose Android
    app is known, both the ``_ios`` and the ``_android`` key are present,
    with 0 when the app has no verified secret.
    """
    counts = {}
    for secret_info in current_result.values():
        if not secret_info.verified:
            continue
        for app in secret_info.apps:
            key = app.name + "_" + app.platform
            counts[key] = counts.get(key, 0) + 1

    known_android = {x.name + "_android" for x in android_ids}
    for ios_name, android_name in matching_apps.items():
        # Compare the suffixed key: the bare Android name never equals an
        # "<name>_android" string, which left the zero-filling dead.
        if android_name + "_android" in known_android:
            counts.setdefault(ios_name + "_ios", 0)
            counts.setdefault(android_name + "_android", 0)
    return counts



def create_stat_array_years(results_1, results_2, android_ids, matching_apps, only_Android = True, only_iOS = False):
    """Build a table of per-app secret counts for two result sets.

    Rows are the ``"<app>_<platform>"`` keys from ``create_tmp_array``; the
    integer columns are ``results_1`` and ``results_2``. ``only_Android``
    keeps keys ending in ``_android``, ``only_iOS`` keeps keys ending in
    ``_ios``; both filters are applied in that order when both are set.
    """
    per_result = [
        create_tmp_array(result, android_ids, matching_apps)
        for result in (results_1, results_2)
    ]

    suffixes = []
    if only_Android:
        suffixes.append("_android")
    if only_iOS:
        suffixes.append("_ios")
    for suffix in suffixes:
        per_result = [
            {key: count for key, count in counts.items() if key.endswith(suffix)}
            for counts in per_result
        ]

    df = pd.DataFrame({"results_1": per_result[0], "results_2": per_result[1]})
    df.fillna(0, inplace=True)
    df = df.astype(int)
    return df
                


def get_dttest_years(results_1, results_2, android_ids, matching_apps, only_Android, only_iOS):
    """Run a paired t-test of per-app secret counts between two result sets.

    Returns the ``results`` values at rows 3 and 6 of researchpy's test table.

    NOTE(review): rows 3 and 6 are presumably the two-sided p-value and an
    effect size; confirm against the installed researchpy version.
    """
    counts = create_stat_array_years(
        results_1, results_2, android_ids, matching_apps,
        only_Android=only_Android, only_iOS=only_iOS,
    )
    _summary, test_table = researchpy.ttest(counts['results_1'], counts['results_2'], paired=True)
    first_value = test_table.loc[3]['results']
    second_value = test_table.loc[6]["results"]
    return first_value, second_value #stats.ttest_rel(df['Android'], df['iOS'])



In [None]:
def get_stats_detector(results, verified = True, only_selected = None, categories = None):
    """Summarise valid/invalid secret counts per detector.

    Parameters
    ----------
    results : dict
        secret -> info object (see ``get_detector_map``).
    verified : bool
        When true, detectors without any valid secret are left out.
    only_selected : container, optional
        When given, only these detector names are reported.
    categories : dict, optional
        ``{category name: [detector names]}``; the listed detectors are
        summed into one column per category and removed individually.

    Returns
    -------
    pandas.DataFrame with one column per detector/category and the rows
    ``valid``, ``invalid`` and ``percentage`` (share of valid, as text).
    A category with no findings reports ``"0.00%"``.
    """
    def _percentage(valid_count, invalid_count):
        # Share of valid findings; 0/0 is reported as 0% instead of raising.
        total = valid_count + invalid_count
        if total == 0:
            return "0.00%"
        return f"{((valid_count / total)*100):.2f}%"

    valid = get_detector_map(results, True)
    invalid = get_detector_map(results, False)
    detector_stats = {}

    # Sorted so the column order does not depend on set iteration order.
    for key in sorted(set(valid.keys()).union(set(invalid.keys())), key=str):
        if verified and key not in valid:
            continue
        if only_selected is not None and key not in only_selected:
            continue

        valid_count = len(valid.get(key, []))
        invalid_count = len(invalid.get(key, []))
        detector_stats[key] = {
            "valid": valid_count,
            "invalid": invalid_count,
            "percentage": _percentage(valid_count, invalid_count),
        }

    if categories is not None:
        for category, members in categories.items():
            current = {"valid": 0, "invalid": 0}
            for member in members:
                if member not in detector_stats:
                    continue
                current["valid"] += detector_stats[member].get("valid", 0)
                current["invalid"] += detector_stats[member].get("invalid", 0)
                del detector_stats[member]

            current["percentage"] = _percentage(current["valid"], current["invalid"])
            detector_stats[category] = current

    return pd.DataFrame(detector_stats)


In [None]:
def get_stats_detector_platform(results, matching_apps, verified = True, only_selected = None, categories = None, app_ids =None):
    """Summarise valid/invalid secret counts per detector and platform.

    Three views are computed: secrets with an Android app, secrets with an
    iOS app, and secrets found in an iOS app *and* its matched Android app
    ("both").

    Parameters
    ----------
    results : dict
        secret -> info object with ``verified``, ``apps``, ``full_details``.
    matching_apps : dict
        iOS app name -> Android app name.
    verified : bool
        When true, detectors without a valid secret in any view are left out.
    only_selected : container, optional
        When given, only these detector names are reported.
    categories : dict, optional
        ``{category name: [detector names]}``; listed detectors are summed
        into one row per category and removed individually.
    app_ids : optional
        When given, a paired platform t-test is run per row (and printed) via
        ``get_dttest_platforms`` and stored as ``ttest_p`` and
        ``ttest_statistic``.

    Returns
    -------
    pandas.DataFrame with one row per detector/category plus a ``total`` row.
    A view with no findings reports ``"0.00%"``.

    Raises
    ------
    KeyError if an iOS app has no entry in ``matching_apps``.
    """
    def _percentage(valid_count, invalid_count):
        # Share of valid findings; 0/0 is reported as 0% instead of raising.
        total = valid_count + invalid_count
        if total == 0:
            return "0.00%"
        return f"{((valid_count / total)*100):.2f}%"

    results_android = {k: v for k, v in results.items() if any(app.platform == "android" for app in v.apps)}
    results_ios = {k: v for k, v in results.items() if any(app.platform == "ios" for app in v.apps)}
    results_both = {
        k: v for k, v in results.items()
        if any(app.platform == "ios" and AppInfo(matching_apps[app.name], "android") in v.apps for app in v.apps)
    }

    # (column suffix, valid map, invalid map) for each view.
    views = (
        ("android", get_detector_map(results_android, True), get_detector_map(results_android, False)),
        ("ios", get_detector_map(results_ios, True), get_detector_map(results_ios, False)),
        ("both", get_detector_map(results_both, True), get_detector_map(results_both, False)),
    )

    all_keys = set()
    for _suffix, valid_map, invalid_map in views:
        all_keys.update(valid_map.keys())
        all_keys.update(invalid_map.keys())

    stats = {}
    # Sorted so row/column order does not depend on set iteration order.
    for key in sorted(all_keys, key=str):
        if verified and not any(key in valid_map for _suffix, valid_map, _invalid in views):
            continue
        if only_selected is not None and key not in only_selected:
            continue

        entry = {}
        for suffix, valid_map, invalid_map in views:
            if key in valid_map or key in invalid_map:
                valid_count = len(valid_map.get(key, []))
                invalid_count = len(invalid_map.get(key, []))
                entry[f"valid_{suffix}"] = valid_count
                entry[f"invalid_{suffix}"] = invalid_count
                entry[f"percentage_{suffix}"] = _percentage(valid_count, invalid_count)
        stats[key] = entry

    if categories is not None:
        count_fields = ("valid_android", "invalid_android", "valid_ios", "invalid_ios", "valid_both", "invalid_both")
        for category, members in categories.items():
            current = {field: 0 for field in count_fields}
            for member in members:
                if member not in stats:
                    continue
                for field in count_fields:
                    current[field] += stats[member].get(field, 0)
                del stats[member]

            for suffix in ("android", "ios", "both"):
                current[f"percentage_{suffix}"] = _percentage(current[f"valid_{suffix}"], current[f"invalid_{suffix}"])

            stats[category] = current

    if app_ids is not None:
        for k in stats:
            print(k)
            # A category row is tested over all of its member detectors.
            detector_names = categories[k] if categories and k in categories else [k]
            detector_results = get_only_secrets_by_detector_name(results, detector_names, verified = True)
            ttest_result = get_dttest_platforms(detector_results, app_ids)
            print(ttest_result)
            if ttest_result[0] < 0.01:
                stats[k]["ttest_p"] = "<0.01"
            else:
                stats[k]["ttest_p"] = f"{ttest_result[0]:.2f}"
            stats[k]["ttest_statistic"] = f"{ttest_result[1]:.2F}"

    df = pd.DataFrame(stats).transpose().sort_index().fillna(0)

    # The total row sums the numeric columns only.
    df.loc["total"] = df.select_dtypes(exclude=['object']).sum(axis=0)
    df = df.apply(lambda x: x.astype(int) if x.dtype == 'float' else x)

    return df

In [None]:
# Overall numbers across every detector (verified=False keeps detectors
# without any valid secret).
# NOTE(review): `all` shadows the Python builtin for the rest of the session.
print("total_credentials")
all = get_stats_detector(results_both, verified=False).transpose().sum()
print(all["valid"] + all["invalid"])

# NOTE(review): hard-coded total; get_formated_results defaults to 10331 - confirm which is correct.
total_extracted = 10328

print("total categories")
print(len(get_stats_detector(results_both, verified=False).transpose()))

print("total valid")
print(all["valid"])
print(f"{all['valid'] / total_extracted * 100:.2f}%")

# Only detectors with at least one valid secret.
print("total categories verified")
print(len(get_stats_detector(results_both, verified=True).transpose()))

In [None]:
# Per-detector statistics, including detectors without any valid secret.
get_stats_detector(results_both, verified=False)

In [None]:

# Detector names grouped into three risk levels; used below to build the "Other" category.
high_risk = ["Alibaba", "Github", "Yelp", "Squareup", "AWS",  "RazorPay", "Azure",  "Stripe", "GCP", "FTP", "Dockerhub"]
medium_risk = ["Alchemy", "Vercel", "Moralis", "ZendeskApi", "DatadogToken","SendGrid", "BoxOauth", "AutoPilot", "Crowdin", "SentryToken", "URI", "PubNubPublishKey", "PubNubSubscriptionKey", "BrowserStack", "Honeycomb", "Slack", "FourSquare", "MediaStack", "Mailgun", "LaunchDarkly", "HuggingFace", "Statuspage", "PrivateKey", "CustomerIO", "Mailchimp", "Notion", "TrelloApiKey", "Disqus", "LokaliseToken", "Box", "Freshdesk", "Twilio"]
low_risk = ["Tomtom", "Pixabay", "ZipCodeAPI", "BitLyAccessToken", "OpenWeather", "HereAPI", "Graphhopper", "TwitterConsumerkey", "Flickr", "ExchangeRatesAPI", "LocationIQ", "WorldWeather", "IpStack", "Unsplash", "WeatherStack", "OpenAI", "SlackWebhook", "YoutubeApiKey", "IPGeolocation", "MaxMindLicense", "Geoapify", "VisualCrossing", "Replicate", "Infura", "Etherscan","BscScan",]

In [None]:
# LaTeX table of the per-detector statistics (all detectors), sorted by name.
print(get_stats_detector(results_both, verified=False).transpose().sort_index().to_latex())


In [None]:
# Detectors with at least one valid secret; medium- and low-risk detectors are merged into "Other".
get_stats_detector(results_both, categories={"Other": medium_risk + low_risk}).transpose().sort_index()


In [None]:
def compare_two_results(result_1, result_2):
    """Put the valid counts per detector of two result sets side by side.

    For each result set the detector statistics are taken (detectors with at
    least one valid secret), the ``invalid`` column is dropped and
    ``percentage_v2`` is added: the detector's share of all valid secrets in
    that set. The two tables are joined on the detector name with
    ``pd.merge``'s default join, suffixing columns with ``_result_1`` and
    ``_result_2``.
    """
    def _valid_share_table(result):
        # Per-detector table without the invalid column, plus the share column.
        table = get_stats_detector(result).transpose().sort_index()
        table = table.drop(columns=['invalid'])
        share = (table['valid'] / table['valid'].sum()) * 100
        table['percentage_v2'] = share.map('{:.2f} %'.format)
        return table

    first = _valid_share_table(result_1)
    second = _valid_share_table(result_2)
    return pd.merge(first, second, left_index=True, right_index=True, suffixes=('_result_1', '_result_2'))


In [None]:
# App mapping loaded from JSON; looked up by iOS app name in the functions above.
matching = load_json(matching_file_path)

In [None]:
# All detector names except the seven removed below; these are merged into "Other".
all_without_selected = low_risk +medium_risk + high_risk 
all_without_selected.remove("Infura")
all_without_selected.remove("OpenAI")
all_without_selected.remove("OpenWeather")
all_without_selected.remove("SlackWebhook")
all_without_selected.remove("AWS")
all_without_selected.remove("Github")
all_without_selected.remove("Flickr")

#Note: 3 bitbucket = 2 android and 1 ios; jenkins = both
# android + 3
# ios + 2
# both + 1


# Per-platform statistics with a t-test per row (app_ids is given), then
# reduced to the valid counts and t-test columns and printed as LaTeX.
stats_both_platforms = get_stats_detector_platform(results_both, matching, categories={"Other": all_without_selected}, app_ids=all_apps_2023).sort_index()
stats_both_platforms.drop(columns=[col for col in stats_both_platforms.columns if col.startswith('invalid')], inplace=True)
stats_both_platforms.drop(columns=[col for col in stats_both_platforms.columns if col.startswith('percentage_')], inplace=True)
print(stats_both_platforms.to_latex())


In [None]:
# Paired platform t-test over all merged results.
print("Total")
get_dttest_platforms(results_both, all_apps_2023)


In [None]:
# Display the reduced per-platform table.
stats_both_platforms

In [None]:
# Same LaTeX output as printed when the table was built.
print(stats_both_platforms.to_latex())

In [None]:
# Per-platform statistics for every detector, without categories or t-tests.
print(get_stats_detector_platform(results_both, matching).to_latex())


In [None]:
# Valid counts per detector, 2023 (result_1) next to 2024 (result_2).
compare_two_results(results_2023, results_2024)

In [None]:
# Number of detector names merged into the "Other" category.
len(medium_risk + low_risk)

In [None]:
# LaTeX version of the table with medium- and low-risk detectors merged into "Other".
print(get_stats_detector(results_both, categories={"Other": medium_risk + low_risk}).transpose().sort_index().to_latex())


In [None]:
# Export the statistics of detectors with at least one valid secret to CSV.
get_stats_detector(results_both).transpose().to_csv(os.path.join(output_dir, "valid_services.csv"))

In [None]:
# 2024 statistics for detectors with at least one valid secret.
get_stats_detector(results_2024).transpose()

In [None]:
from typing import Dict


def stats_verify(results):
    """Count secrets by verification outcome.

    Verified secrets count as ``valid``. Unverified ones count as
    ``invalid_trufflehog`` when their ``reason`` is ``"trufflehog"`` and as
    ``invalid_llama`` otherwise.
    """
    counts = {"valid": 0, "invalid_trufflehog": 0, "invalid_llama": 0}
    for secret_info in results.values():
        if secret_info.verified:
            bucket = "valid"
        elif secret_info.reason == "trufflehog":
            bucket = "invalid_trufflehog"
        else:
            bucket = "invalid_llama"
        counts[bucket] += 1
    return counts

def compare_results(results1, results2):
    """Count how the secrets of two result dicts overlap.

    Returns a dict containing only the counters that are non-zero:

    * ``both_verified`` / ``both_not_verified`` - key is in both dicts and
      the two entries agree on ``verified``;
    * ``verification_mismatch`` - key is in both dicts but only one entry is
      verified (such keys used to be counted as ``both_verified``);
    * ``results_1_verified`` / ``results_1_not_verified`` - key only in
      ``results1``;
    * ``results_2_verified`` / ``results_2_not_verified`` - key only in
      ``results2``.
    """
    counters = {}

    def _bump(name):
        counters[name] = counters.get(name, 0) + 1

    for key in set(results1.keys()).union(set(results2.keys())):
        in_first = key in results1
        in_second = key in results2
        if in_first and in_second:
            first_ok = bool(results1[key].verified)
            second_ok = bool(results2[key].verified)
            if first_ok and second_ok:
                _bump("both_verified")
            elif not first_ok and not second_ok:
                _bump("both_not_verified")
            else:
                _bump("verification_mismatch")
        elif in_first:
            _bump("results_1_verified" if results1[key].verified else "results_1_not_verified")
        else:
            # Keys come from the union, so the key must be in results2.
            _bump("results_2_verified" if results2[key].verified else "results_2_not_verified")
    return counters


def compare_android_and_ios(results: Dict[str,Secret], matching_apps):
    """Count (secret, app) occurrences by platform over verified secrets.

    For each verified secret, an iOS app whose matched Android app (via
    ``matching_apps``) carries the same secret forms a cross-platform pair
    and adds one to ``both``. Every remaining app adds one to ``only_ios``
    or ``only_android``. ``android`` and ``ios`` are the respective
    ``only_*`` count plus ``both``.
    """
    tally = {"both": 0, "only_android": 0, "only_ios": 0}
    for secret_info in results.values():
        if not secret_info.verified:
            continue

        # Holds both members of every cross-platform pair for this secret.
        paired = []
        for app in secret_info.apps:
            if app.platform != "ios":
                continue
            counterpart = AppInfo(name=matching_apps.get(app.name, ''), platform="android")
            if counterpart in secret_info.apps:
                paired.extend([counterpart, app])
        tally["both"] += int(len(paired) / 2)

        for app in secret_info.apps:
            if app in paired:
                continue
            if app.platform == "ios":
                tally["only_ios"] += 1
            elif app.platform == "android":
                tally["only_android"] += 1

    tally["android"] = tally["only_android"] + tally["both"]
    tally["ios"] = tally["only_ios"] + tally["both"]
    return tally


def get_app_stats(results):
    """Count distinct app names with a verified secret, per platform.

    Returns ``{"android": n, "ios": m}``; apps on other platforms are ignored.
    """
    names = {"android": set(), "ios": set()}
    for secret_info in results.values():
        if not secret_info.verified:
            continue
        for app in secret_info.apps:
            if app.platform == "android":
                names["android"].add(app.name)
            elif app.platform == "ios":
                names["ios"].add(app.name)
    return {platform: len(found) for platform, found in names.items()}

def get_number_of_apps_with_secret(results, matching_apps, remove = False):
    """Return, per verified secret, the number of apps that contain it.

    Unless ``remove`` is true, an iOS app whose matched Android app (via
    ``matching_apps``) also contains the secret is not counted separately,
    so a cross-platform pair counts once.
    """
    counts = {}
    for secret, secret_info in results.items():
        if not secret_info.verified:
            continue

        apps = secret_info.apps
        paired_ios = 0
        if not remove:
            for app in apps:
                if app.platform != "ios":
                    continue
                counterpart = AppInfo(name=matching_apps.get(app.name, ''), platform="android")
                if counterpart in apps:
                    paired_ios += 1
        counts[secret] = len(apps) - paired_ios
    return counts


def count_diff(result_1, result_2):
    """Sum ``result_1[k] - result_2.get(k, 0)`` over the keys of ``result_1``.

    Keys that exist only in ``result_2`` are ignored.
    """
    running = 0
    for key, value in result_1.items():
        running = running + value - result_2.get(key, 0)
    return running
        
def get_formated_results(results, total = 10331):
    """Format each count as ``"<count> (<share>%)"``.

    ``total`` is the denominator of the share. When it is ``None`` or 0 the
    sum of all counts is used instead; if that sum is 0 too, an empty dict
    is returned.

    NOTE(review): the default of 10331 differs from ``total_extracted = 10328``
    used elsewhere in this notebook; confirm which is intended.
    """
    if total is None or total == 0:
        total = sum(results.values())
    if total == 0:
        return {}
    return {
        name: f"{count} ({(count/total)*100:.2f}%)"
        for name, count in results.items()
    }

def get_android_ios_app_stats(results, matching_apps):
    """Count apps with verified secrets per platform and across platforms.

    Args:
        results: Mapping of secret -> secret info (``verified``, ``apps``).
        matching_apps: Mapping of iOS app name -> Android app name.

    Returns:
        dict with the keys ``android`` and ``ios`` (unique app names per
        platform), ``both`` (iOS apps whose Android counterpart is also
        affected), ``only_ios`` and ``only_android`` (platform count minus
        ``both``).
    """
    android_names = set()
    ios_names = set()
    for secret_info in results.values():
        if not secret_info.verified:
            continue
        for app in secret_info.apps:
            if app.platform == "android":
                android_names.add(app.name)
            elif app.platform == "ios":
                ios_names.add(app.name)

    # .get(): an iOS app without a known Android counterpart is simply not
    # cross-platform; it must not abort the whole computation with a KeyError.
    both = sum(1 for name in ios_names if matching_apps.get(name) in android_names)

    return {
        "android": len(android_names),
        "ios": len(ios_names),
        "both": both,
        "only_ios": len(ios_names) - both,
        "only_android": len(android_names) - both,
    }



def get_stat_values(result):
    """Summarise the numeric values of ``result``.

    Args:
        result: Mapping of key -> number.

    Returns:
        tuple: ``(mean, median, stdev, variance, max, more_than_one)`` where
        ``more_than_one`` is the number of values greater than 1.

    Raises:
        statistics.StatisticsError: if ``result`` holds fewer than two values
            (sample stdev/variance are undefined).
    """
    values = list(result.values())
    above_one = sum(1 for value in values if value > 1)
    return (
        statistics.mean(values),
        statistics.median(values),
        statistics.stdev(values),
        statistics.variance(values),
        max(values),
        above_one,
    )




In [None]:
get_formated_results(stats_verify(results_both), None)

In [None]:
stats_verify(results_2023)["valid"] + stats_verify(results_2023)["invalid_trufflehog"] + stats_verify(results_2023)["invalid_llama"]

In [None]:
get_stats_detector(results_2023, False).to_csv(os.path.join(output_dir,"2023_invalid.csv"))

In [None]:
get_stats_detector(results_2023)

In [None]:
sum(stats_verify(results_2023).values())

In [None]:
sum(stats_verify(results_2024).values())

In [None]:
get_stats_detector(results_2024, False)

In [None]:
get_formated_results(stats_verify(results_both_android), None)

In [None]:
get_formated_results(stats_verify(results_both_ios), None)

In [None]:
compare_results(results_both_android, results_both_ios)

In [None]:
get_formated_results(stats_verify(results_both), None)

In [None]:
get_formated_results(stats_verify(results_2024), None)

In [None]:
compare_results(results_2023, results_2024)

In [None]:
# Compare the number of credentials found on Android vs. iOS
compare_android_and_ios(results_2024, matching)

In [None]:
compare_android_and_ios(results_2023, matching)

In [None]:
compare_android_and_ios(results_both, matching)

In [None]:
get_formated_results(get_android_ios_app_stats(results_both,matching))

In [None]:
# Secrets that appear in multiple apps
# get_stat_values returns: (mean, median, stdev, variance, max, more_than_one)
get_stat_values(get_number_of_apps_with_secret(results_both, matching, remove=False))

In [None]:
get_stat_values(get_number_of_apps_with_secret(results_2023, matching, remove=True))

In [None]:

get_stat_values(get_number_of_apps_with_secret(results_2023, matching, remove=False))

In [None]:
def sort_dict_desc(dictionary):
    """Return a new dict ordered by value, largest first (ties keep input order)."""
    ordered_pairs = sorted(dictionary.items(), key=lambda pair: pair[1], reverse=True)
    return {key: value for key, value in ordered_pairs}


def sort_dict_keys_desc(dictionary):
    """Return a new dict ordered by key, largest key first."""
    ordered_pairs = sorted(dictionary.items(), key=lambda pair: pair[0], reverse=True)
    return {key: value for key, value in ordered_pairs}


sort_dict_desc(get_number_of_apps_with_secret(results_2023, matching))

In [None]:
sort_dict_desc(get_number_of_apps_with_secret(results_2024, matching))


In [None]:
# Sanity check: print every verified secret from the "/to_verify_v2" result set
# that appears in neither the 2023 nor the 2024 results.
all_results = get_results("/to_verify_v2")
for k,v in all_results.items():
    if k not in results_2023 and k not in results_2024 and v.verified:
        print(k)
        print(v)

In [None]:
android_metadata = load_json(android_metadata_path)


In [None]:
def get_installation_id(app, platform):
    """Look up the install count recorded for an app in the Android metadata.

    iOS apps are resolved through ``matching`` to their Android counterpart,
    because the count is only read from ``android_metadata``.

    Args:
        app: App identifier (iOS id when ``platform`` is "ios", otherwise the
            Android id).
        platform: Platform name; compared case-insensitively against "ios".

    Returns:
        int: value stored under "minInstal", or 10 when the key is absent.
        NOTE(review): the key is spelled "minInstal" -- confirm it matches the
        metadata export.
    """
    android_id = matching[app] if platform.lower() == "ios" else app
    return int(android_metadata[android_id].get("minInstal", 10))
    
def get_all_installation_stats(all_apps):
    """Describe the install-count distribution of the Android apps in ``all_apps``.

    Args:
        all_apps: Iterable of app objects with ``name`` and ``platform``;
            only entries whose platform is "android" are used.

    Returns:
        dict with ``avg``, ``median``, ``stdev``, ``variance``, ``mad``
        (median absolute deviation), ``max``, ``min`` and ``100_mio`` (number
        of apps with at least 100,000,000 installs).
    """
    installs = [
        get_installation_id(app.name, "android")
        for app in all_apps
        if app.platform == "android"
    ]
    huge_apps = sum(1 for count in installs if count >= 100000000)

    return {
        "avg": statistics.mean(installs),
        "median": statistics.median(installs),
        "stdev": statistics.stdev(installs),
        "variance": statistics.variance(installs),
        "mad": float(stats.median_abs_deviation(installs)),
        "max": max(installs),
        "min": min(installs),
        "100_mio": huge_apps,
    }


In [None]:
def get_secrets_per_android_app_id(results, valid= True):
    """Group secrets by Android app id and detector name.

    Secrets found in a non-Android app are attributed to the Android
    counterpart looked up in ``matching``.

    Args:
        results: Mapping of secret -> secret info (``verified``, ``apps``,
            ``full_details``).
        valid: Only secrets whose ``verified`` flag is this value are kept.

    Returns:
        dict: Android app id -> {detector name -> set of secrets}.
    """
    per_app = {}
    for secret, info in results.items():
        if info.verified is not valid:
            continue
        for app in info.apps:
            # The detector name is taken from the first detail record.
            detector_name = info.full_details[0].get("DetectorName", "")
            is_android = app.platform.lower() == "android"
            android_id = app.name if is_android else matching[app.name]
            per_app.setdefault(android_id, {}).setdefault(detector_name, set()).add(secret)
    return per_app


def get_secrets_per_app_id(results, valid= True):
    """Group secrets by ``"<app name>_<platform>"`` and detector name.

    Args:
        results: Mapping of secret -> secret info (``verified``, ``apps``,
            ``full_details``).
        valid: Only secrets whose ``verified`` flag is this value are kept.

    Returns:
        dict: ``"<name>_<platform>"`` -> {detector name -> set of secrets}.
    """
    per_app = {}
    for secret, info in results.items():
        if info.verified is not valid:
            continue
        for app in info.apps:
            # The detector name is taken from the first detail record.
            detector_name = info.full_details[0].get("DetectorName", "")
            app_key = app.name + "_" + app.platform
            per_app.setdefault(app_key, {}).setdefault(detector_name, set()).add(secret)
    return per_app


def get_per_app_id(results, valid= True):
    """List the secrets found in each app, keyed by ``"<app name>_<platform>"``.

    Args:
        results: Mapping of secret -> secret info (``verified``, ``apps``).
        valid: Only secrets whose ``verified`` flag is this value are kept.

    Returns:
        dict: ``"<name>_<platform>"`` -> list of secrets, in iteration order.
    """
    per_app = {}
    for secret, info in results.items():
        if info.verified is not valid:
            continue
        for app in info.apps:
            app_key = app.name + "_" + app.platform
            per_app.setdefault(app_key, []).append(secret)
    return per_app

def create_installation_table_platform(results, already_per_app = False):
    """Build install-count histograms of affected apps per platform.

    Args:
        results: Either secret results (converted with ``get_per_app_id``) or,
            when ``already_per_app`` is True, a mapping whose keys are
            ``"<id>_ios"`` / ``"<id>_android"``.
        already_per_app: Skip the conversion step.

    Returns:
        dict with the keys ``android``, ``ios`` and ``both``; each maps an
        install count to the number of apps, ordered by install count
        descending. ``both`` counts every distinct Android id once, with iOS
        apps mapped to their Android counterpart through ``matching``.
    """
    per_app = results if already_per_app else get_per_app_id(results)

    android_counts = {}
    ios_counts = {}
    both_counts = {}
    android_ids = set()

    for app_key in per_app.keys():
        if app_key.endswith("_ios"):
            name = app_key[:-4]
            installs = get_installation_id(name, "ios")
            ios_counts[installs] = ios_counts.get(installs, 0) + 1
            android_ids.add(matching[name])
        else:
            # Every other key is treated as "<id>_android" (8-char suffix).
            name = app_key[:-8]
            installs = get_installation_id(name, "android")
            android_counts[installs] = android_counts.get(installs, 0) + 1
            android_ids.add(name)

    for android_id in android_ids:
        installs = get_installation_id(android_id, "android")
        both_counts[installs] = both_counts.get(installs, 0) + 1

    return {
        "android": sort_dict_keys_desc(android_counts),
        "ios": sort_dict_keys_desc(ios_counts),
        "both": sort_dict_keys_desc(both_counts),
    }



def only_per_android_id(result):
    """Re-key a per-app mapping so that every key is ``"<android id>_android"``.

    Keys already ending in "_android" are kept; every other key is treated as
    ``"<ios id>_ios"`` and translated through ``matching``. When an iOS app
    and its Android counterpart are both present, the entry seen later wins.
    """
    normalized = {}
    for key, value in result.items():
        if not key.endswith("_android"):
            # Strip the 4-character "_ios" suffix before the lookup.
            key = f"{matching[key[:-4]]}_android"
        normalized[key] = value
    return normalized



def create_installation_table_years(result_1, result_2, already_per_app = False):
    """Build install-count histograms of affected apps for two years.

    Both inputs are first normalised with ``only_per_android_id`` so that an
    app found on iOS and Android is counted once (cross-platform merge).

    Args:
        result_1: Findings reported under the key "2023".
        result_2: Findings reported under the key "2024".
        already_per_app: When False the inputs are secret results and are
            converted with ``get_per_app_id`` first.

    Returns:
        dict: ``{"2023": {...}, "2024": {...}}``; each inner dict maps an
        install count to the number of apps, ordered by install count
        descending.
    """
    # TODO: merge - cross-platform yes or no? currently yes
    if not already_per_app:
        result_1 = get_per_app_id(result_1)
        result_2 = get_per_app_id(result_2)
    return {
        "2023": _count_android_installations(only_per_android_id(result_1)),
        "2024": _count_android_installations(only_per_android_id(result_2)),
    }


def _count_android_installations(per_app):
    """Histogram of install counts for keys of the form ``"<android id>_android"``."""
    histogram = {}
    for app_key in per_app:
        # After only_per_android_id every key carries the "_android" suffix,
        # so no iOS branch is needed here.
        installs = get_installation_id(app_key[:-8], "android")
        histogram[installs] = histogram.get(installs, 0) + 1
    return sort_dict_keys_desc(histogram)



def get_dict_installation(results):
    """Group the Android app ids that hold verified secrets by install count.

    Args:
        results: Mapping of secret -> secret info, as accepted by
            ``get_secrets_per_android_app_id``.

    Returns:
        dict: install count -> set of Android app ids. Apps for which the
        lookup yields ``None`` are collected under ``-1``.
    """
    installation = {}
    for app in get_secrets_per_android_app_id(results):
        installs = get_installation_id(app, "android")
        bucket = -1 if installs is None else int(installs)
        installation.setdefault(bucket, set()).add(app)
    return installation

def get_all_android_installation_ids_above(all_apps, min_installation=100000000):
    """Select the non-iOS apps with at least ``min_installation`` installs.

    Args:
        all_apps: Iterable of hashable app objects with ``name`` and
            ``platform``.
        min_installation: Inclusive lower bound on the install count.

    Returns:
        set: the qualifying app objects (not just their names). An app whose
        lookup yields ``None`` is treated as having ``-1`` installs.
    """
    selected = set()
    for app in all_apps:
        if app.platform == "ios":
            continue
        installs = get_installation_id(app.name, app.platform)
        if installs is None:
            installs = -1
        if int(installs) >= min_installation:
            selected.add(app)
    return selected



def get_verified_credentials_above_installation(results, min_installation= 100000000):
    """Keep the secrets found in at least one app above an install threshold.

    Despite the name, the ``verified`` flag is not checked here; callers
    filter on it afterwards.

    Args:
        results: Mapping of secret -> secret info exposing ``apps`` (objects
            with ``name`` and ``platform``).
        min_installation: Inclusive lower bound on the install count.

    Returns:
        dict: the subset of ``results`` with a qualifying app, in input order.
    """
    selected = {}
    for key, secret in results.items():
        for app in secret.apps:
            installs = get_installation_id(app.name, app.platform)
            if installs is None:
                installs = -1
            if installs >= min_installation:
                selected[key] = secret
                # One qualifying app is enough; skip the remaining lookups.
                break
    return selected


def get_number_secrets_per_app_id(results):
    """Count the secrets per app, summed over all detectors.

    Args:
        results: Mapping of app -> {detector -> collection of secrets}.

    Returns:
        dict: app -> total number of secrets.
    """
    return {
        app: sum(len(found) for found in by_detector.values())
        for app, by_detector in results.items()
    }







In [None]:
android_app_above_100_mio = get_all_android_installation_ids_above(all_apps_2023)


In [None]:
get_secrets_per_app_id(results_both_verified)

In [None]:
get_stat_values(get_number_secrets_per_app_id(get_secrets_per_app_id(results_both)))

In [None]:
get_secrets_per_android_app_id(results_2023)

In [None]:
get_dict_installation(results_2024)

In [None]:
sort_dict_desc(get_number_of_apps_with_secret(jwt_2023, matching))


In [None]:
sort_dict_desc(get_number_of_apps_with_secret(jwt_2024, matching))


In [None]:

sort_dict_desc(get_number_of_apps_with_secret(private_keys_2023, matching))


In [None]:
sort_dict_desc(get_number_of_apps_with_secret(private_keys_2024, matching))


In [None]:
compare_results(private_keys_2023, private_keys_2024)

In [None]:
compare_results(jwt_2023, jwt_2024)

In [None]:
stats_verify(private_keys_both)

In [None]:
stats_verify(private_keys_2023)

In [None]:
stats_verify(private_keys_2024)

In [None]:
stats_verify(jwt_2023)

In [None]:
stats_verify(jwt_2024)

In [None]:
stats_verify(jwt_both)

In [None]:
get_formated_results(get_app_stats(private_keys_both))

In [None]:
get_formated_results(get_app_stats(private_keys_2023))

In [None]:
get_formated_results(get_app_stats(jwt_both))

In [None]:

jwt_both_without_unity_token = {k:v for k,v in jwt_both.items()}
jwt_both_without_unity_token.pop("eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJhcHAiOjc3LCJhcHBMZXZlbENvcHBhIjp0cnVlLCJhdHQiOjAsImF1YyI6MjE4LCJhdWkiOjI0OSwiYXpwIjoiNTAwNjA5MWEtZWRjOC00ODExLTlmZTAtMTE1YTgwZjA1YThhIiwiY2FsY3VsYXRlZENvcHBhIjp0cnVlLCJjb25zZW50IjpmYWxzZSwiY29udGV4dHVhbE9ubHkiOmZhbHNlLCJjcGkiOjU1LCJjcmVhdGVkIjoxNjM2MDQzMzQ3MDAwLCJkbHQiOjAsImR0IjowLCJldHQiOlsyMTQ3NDgzOTQ5LDUzNDksNzM0Niw1MzQ1LDg0MDcsODg5MSw3NjUxLDU3NTYsODg2MSw4NTUyXSwiZXhwIjoxNjY4MTY2MDYxLCJpYXAiOjEzLCJpYXQiOjE2NjY5NTY0NjEsImlzcyI6ImFkcy1zZGstY29uZmlndXJhdGlvbi51bml0eWFkcy51bml0eTNkLmNvbSIsImxlZ2FsVGVycml0b3J5IjoxLCJsdHYiOjEyOCwibWl4ZWQiOmZhbHNlLCJwcm8iOjExNSwicHJveGllZCI6ZmFsc2UsInNzIjoiZWFiZWFhMjktMDM1Ni00MjIzLWJjYWYtYTU0ZGM4NDAxMDFkIiwic3ViIjoibVN0UkVqZEYyZDFVWmFiSW1xUUt6NG5BM2kzTU1mMEpNMy9XOG9xcXJyMWdSWk95dGdvOXhTR1JYMEl4YmtKU1l4ZTZSZz09IiwidGd0IjoxNTksInhwciI6Mn0.zz9nRbt5z8HF6_3RvY9EdV8YQkTY1F_UbyAYGI9pmok")
get_formated_results(get_app_stats(jwt_both_without_unity_token))

In [None]:
get_formated_results(get_app_stats(jwt_2023))

In [None]:
get_formated_results(get_app_stats(jwt_2024))

In [None]:
get_formated_results(get_app_stats(results_2023))

In [None]:
get_app_stats(results_2024)

In [None]:
get_app_stats(private_keys_2024)

In [None]:
def more_than_x(results, x = 1):
    """Return how many values of ``results`` are strictly greater than ``x``."""
    return sum(1 for amount in results.values() if amount > x)




In [None]:
print(more_than_x(get_number_of_apps_with_secret(private_keys_both, matching)))
print(more_than_x(get_number_of_apps_with_secret(private_keys_both, matching), x = 100))

In [None]:
print(more_than_x(get_number_of_apps_with_secret(private_keys_2023, matching)))
print(more_than_x(get_number_of_apps_with_secret(private_keys_2023, matching), x = 100))

In [None]:
more_than_x(get_number_of_apps_with_secret(private_keys_2024, matching))

In [None]:
import time

def jwt_token_stats(results, debug = False):
    """Classify JWTs by expiry and by whether their payload mentions "admin".

    Tokens are decoded without signature verification. A token that cannot be
    decoded, or whose ``exp`` claim is not an integer, is only counted in
    ``total`` and otherwise skipped.

    Args:
        results: Mapping of JWT string -> secret info (``apps`` and ``files``
            are only read when ``debug`` is set).
        debug: Print every decoded token together with its apps and files.

    Returns:
        dict with the counters ``no_exp``, ``long_valid`` (expires more than
        10 * 12 * 30 days from now), ``short_valid`` (expires within
        6 * 30 days), ``expired``, ``admin``, ``admin_expired`` and ``total``.
        Tokens expiring between those two horizons fall into no validity
        bucket.
    """
    result = {"no_exp": 0, "long_valid":0 , "short_valid": 0, "expired": 0, 'admin': 0, 'admin_expired': 0, 'total': 0}
    six_months = 6 * 30 * 24 * 60 * 60
    ten_years = 10 * 12 * 30 * 24 * 60 * 60

    for secret, secret_info in results.items():
        result["total"] += 1

        try:
            decoded = jwt.decode(secret, options={"verify_signature": False})
            exp = int(decoded["exp"]) if "exp" in decoded else None
        except (ValueError, TypeError, jwt.exceptions.InvalidTokenError):
            # PyJWT signals malformed tokens with InvalidTokenError subclasses
            # (e.g. DecodeError), which are not ValueError.
            continue

        now = int(time.time())
        is_expired = exp is not None and exp < now

        if debug:
            print(secret)
            print(decoded)
            print(secret_info.apps)
            print(secret_info.files)
            if is_expired:
                print("Expired")

        # Crude heuristic: "admin" anywhere in the payload's string form.
        if "admin" in str(decoded):
            result["admin"] += 1
            if is_expired:
                result["admin_expired"] += 1

        if exp is None:
            result["no_exp"] += 1
        elif is_expired:
            result["expired"] += 1
        elif exp < now + six_months:
            result["short_valid"] += 1
        elif exp > now + ten_years:
            result["long_valid"] += 1

    return result

In [None]:
get_formated_results(jwt_token_stats(jwt_both),jwt_token_stats(jwt_both)["total"])

In [None]:
len(jwt_2023)

In [None]:
jwt_token_stats(jwt_2023, debug=True)

In [None]:
print(more_than_x(get_number_of_apps_with_secret(jwt_both, matching), x = 1))
print(more_than_x(get_number_of_apps_with_secret(jwt_both, matching), x = 100))

In [None]:
sorted_jwt_both = sort_dict_desc(get_number_of_apps_with_secret(jwt_both, matching))
sorted_jwt_both

In [None]:
def categorize_file(file):
    """Map a file name to a coarse category.

    The rules are checked in order and the first match wins; anything that
    matches no rule is reported as "Binary".

    Args:
        file: File name (callers pass the base name of a path).

    Returns:
        str: "Binary", "Resources", "Web", "Unintended Code", "Configuration"
        or "Library".
    """
    if file.endswith((".dex", ".so")):
        return "Binary"
    if file.endswith((".arsc", "resources.assets")):
        return "Resources"
    if file == "index.android.bundle" or file.endswith((".jsbundle", ".js", "js.map")):
        return "Web"
    if "." not in file:
        # Files without any extension are treated as binaries.
        return "Binary"
    if file.endswith("Info.plist"):
        return "Resources"
    if file == ".gitlab-ci.yml" or file.endswith((".xcconfig", ".swift", ".java", ".sh")):
        # These files are printed so they can be inspected by hand.
        print(file)
        return "Unintended Code"
    if file == "resource.corona-archive" or file.endswith(
        (".json", ".xml", ".properties", ".config", ".bundle", ".plist")
    ):
        return "Configuration"
    library_names = (
        "global-metadata.dat",
        "INTLConfig.ini",
        "MSDKConfig.ini",
        "Settings.rb",
        "netease_global.data",
        "login.cfg",
    )
    if file in library_names or file.endswith(("unity3d", ".dart")):
        return "Library"
    # Manually checked before
    return "Binary"




def classify_files(results, format= False):
    """Count, per file category, the verified secrets found in such files.

    A secret counts once per category, however many of its files fall into
    it. A path is attributed to iOS when it contains "_ios", otherwise to
    Android.

    Args:
        results: Mapping of secret -> secret info (``verified``,
            ``file_paths``).
        format: When True every count is rendered through
            ``get_formated_results`` relative to the respective "total".

    Returns:
        tuple: ``(all, android, ios, differents)``. The first three map
        category -> count plus a "total" entry (number of secrets seen on
        that platform) and are ordered by count descending. ``differents``
        covers the secrets seen on both platforms: "total", "both" (same
        category set on both platforms) and "different". Dicts without any
        data are empty.
    """
    result = {}
    resultsAndroid = {}
    resultsiOS = {}
    result_differents = {}

    def bump(counter, key):
        counter[key] = counter.get(key, 0) + 1

    for secret_info in results.values():
        if not secret_info.verified:
            continue

        current = set()
        current_android = set()
        current_ios = set()
        for file in secret_info.file_paths:
            # Categorise once per file and reuse the result for both views.
            category = categorize_file(os.path.basename(file))
            current.add(category)
            if "_ios" in file:
                current_ios.add(category)
            else:
                current_android.add(category)

        for category in current:
            bump(result, category)
        for category in current_android:
            bump(resultsAndroid, category)
        for category in current_ios:
            bump(resultsiOS, category)

        if current_android and current_ios:
            bump(result_differents, "total")
            if current_android == current_ios:
                bump(result_differents, "both")
            else:
                bump(result_differents, "different")
                print("different android and ios")

        bump(result, "total")
        if current_android:
            bump(resultsAndroid, "total")
        if current_ios:
            bump(resultsiOS, "total")

    result = sort_dict_desc(result)
    resultsAndroid = sort_dict_desc(resultsAndroid)
    resultsiOS = sort_dict_desc(resultsiOS)

    if format:
        # .get(): a platform (or the cross-platform set) may have no secrets
        # at all, in which case there is no "total" entry to index.
        return (
            get_formated_results(result, total=result.get("total", 0)),
            get_formated_results(resultsAndroid, total=resultsAndroid.get("total", 0)),
            get_formated_results(resultsiOS, total=resultsiOS.get("total", 0)),
            get_formated_results(result_differents, total=result_differents.get("total", 0)),
        )

    return result, resultsAndroid, resultsiOS, result_differents

In [None]:
def create_classified_file_table(results, format = False):
    """Tabulate the file-category counts of ``classify_files``.

    Args:
        results: Mapping of secret -> secret info, passed to
            ``classify_files``.
        format: Forwarded to ``classify_files``.

    Returns:
        pandas.DataFrame: one row per category, with the columns "Android",
        "iOS" and "Both" (all platforms combined).
    """
    overall, android, ios, _ = classify_files(results, format=format)
    overall = sort_dict_desc(overall)
    table = pd.DataFrame([android, ios, overall], index=["Android", "iOS", "Both"])
    return table.transpose()

In [None]:
classify_files(results_both, format=True)

In [None]:
_,_,_, different = classify_files(results_both, format=True)

In [None]:
different

In [None]:
create_classified_file_table(results_both, format=True)

In [None]:
print(create_classified_file_table(results_both, format=True).to_latex())

In [None]:
create_classified_file_table(results_2023, format=True)

In [None]:
create_classified_file_table(results_2024, format=True)

In [None]:
classify_files(results_both, format=True)

In [None]:
classify_files(results_2023, format=True)

In [None]:
def cacluate_stats(df_result):
    """Compute distribution statistics per column of a histogram table.

    Every column is read as a histogram: the index label is the observed
    value and the cell the number of observations. The histogram is expanded
    into individual observations before the statistics are taken.

    Args:
        df_result: DataFrame with numeric index labels and non-negative
            integer cells.

    Returns:
        dict: column -> ``{"avg", "median", "stdev", "variance", "mad"}``.
        Columns without observations are omitted. With a single observation
        the sample ``stdev`` and ``variance`` are undefined and reported as
        NaN.
    """
    undefined = float("nan")
    result = {}
    for column in df_result.columns:
        current = []
        for idx, value in df_result[column].items():
            current.extend([idx] * int(value))
        if not current:
            continue
        # statistics.stdev/variance need at least two data points.
        has_spread = len(current) > 1
        result[column] = {
            "avg": statistics.mean(current),
            "median": statistics.median(current),
            "stdev": statistics.stdev(current) if has_spread else undefined,
            "variance": statistics.variance(current) if has_spread else undefined,
            "mad": float(stats.median_abs_deviation(current)),
        }
    return result

def create_table_installation_counts(results, total):
    """Build a formatted table of app counts per install-count bucket.

    Args:
        results: Nested mapping finding name -> category -> install count ->
            number of apps.
        total: Denominator used for the percentage shown in every cell.

    Returns:
        pandas.DataFrame: one row per install-count bucket (labelled
        ``"<count>+"``) plus a "Total" row; the columns are grouped by
        category, then finding. Every cell is a string
        ``"<n> (<share>%)"``.

    Side effects:
        Prints the per-column statistics computed by ``cacluate_stats``.
    """
    # Input layout: finding_name -> category -> count -> number.
    # Re-nest it as category -> finding_name -> count -> number.
    category_table = {}
    for finding, category_map in results.items():
        for category, count_map in category_map.items():
            current = category_table.get(category, {})
            current_finding = current.get(finding, {})
            for count, number in count_map.items():
                current_finding[count] = current_finding.get(count, 0) + number
            current[finding] = current_finding
            category_table[category] = current
    # One DataFrame per category (columns = findings, index = install counts).
    names = []
    all_tables = []
    for category, finding_map in category_table.items():
        all_tables.append(pd.DataFrame(finding_map))
        names.append(category)

    # Side-by-side merge; the category names become the outer column level.
    merged_df = pd.concat(all_tables, keys=names, axis=1)
    merged_df = merged_df.fillna(0)
    merged_df = merged_df.astype(int)
    merged_df = merged_df.sort_index(ascending=False)
    # Collapse every bucket below 1000 installs into a single row labelled 0.
    tmp =  merged_df.loc[merged_df.index.astype(int) < 1000].sum()
    merged_df = merged_df.drop(merged_df[merged_df.index.astype(int) < 1000].index)
    merged_df.loc[0] = tmp
    # Statistics are taken while the index is still numeric.
    print(cacluate_stats(merged_df))


    # Turn the numeric buckets into labels such as "1,000+".
    merged_df.index = merged_df.index.map(lambda x: f"{x:,}+")

    merged_df.loc['Total'] = merged_df.sum()

    #merged_df["0-999"] = tmp

    # Render each count together with its share of `total`.
    merged_df = merged_df.map(lambda x: f"{x} ({(x/total)*100:.2f}%)")
    return merged_df


In [None]:

# Load the per-year metadata findings (exported from the evaluate metadata
# notebook) for 2023 and 2024.
code_23 = load_json(code_2023_path)
code_24 = load_json(code_2024_path)
script_23 = load_json(scripts_2023_path)
script_24 = load_json(scripts_2024_path)
podfiles_23 = load_json(podfile_2023_path)
podfiles_24 = load_json(podfile_2024_path)
hidden_directory_23 = load_json(hidden_directory_2023_path)
hidden_directory_24 = load_json(hidden_directory_2024_path)
hidden_files_23 = load_json(hidden_files_2023_path)
hidden_files_24 = load_json(hidden_files_2024_path)


In [None]:
def create_table_platforms(result, matching, df_name, total_android = 0, total_ios = 0):
    """Summarise how many Android, iOS and cross-platform apps have a finding.

    Args:
        result: Mapping whose keys are ``"<id>_ios"`` or ``"<id>_android"``.
        matching: Mapping of iOS app id -> Android app id.
        df_name: Row label of the returned frame.
        total_android: Denominator for the Android percentage.
        total_ios: Denominator for the iOS and Matching percentages.
            When both totals are 0 the default total of
            ``get_formated_results`` is used for all three columns.

    Returns:
        pandas.DataFrame: a single row labelled ``df_name`` with the columns
        "Android", "iOS" and "Matching" holding formatted counts.
    """
    android = []
    ios = []
    for key in result:
        if key.endswith("_ios"):
            ios.append(key[:-4])
        else:
            # Every other key is treated as "<id>_android" (8-char suffix).
            android.append(key[:-8])

    # Set lookup keeps the matching step linear; .get() tolerates iOS apps
    # that have no known Android counterpart.
    android_lookup = set(android)
    matching_apps = [app for app in ios if matching.get(app) in android_lookup]

    if total_android == 0 and total_ios == 0:
        formatted = get_formated_results({"Android": len(android), "iOS": len(ios), "Matching": len(matching_apps)})
    else:
        formatted = get_formated_results({"iOS": len(ios), "Matching": len(matching_apps)}, total=total_ios)
        formatted.update(get_formated_results({"Android": len(android)}, total=total_android))

    return pd.DataFrame(formatted, index=[df_name])

def comparison_years(result_1, result_2, matching, name, total_android = 0, total_ios = 0):
    """Place the platform summaries of two years side by side.

    The first year's table is built with the default totals; only the second
    year's table uses ``total_android`` / ``total_ios``.

    Returns:
        pandas.DataFrame: one row labelled ``name``; the columns of the first
        year are followed by the columns of the second year.
    """
    first_year = create_table_platforms(result_1, matching, name)
    second_year = create_table_platforms(
        result_2,
        matching,
        name,
        total_android=total_android,
        total_ios=total_ios,
    )
    return pd.concat((first_year, second_year), axis=1)


def comparison_years_concat(results, matching, names, total_android = 0, total_ios = 0):
    """Stack the year-over-year comparison rows of several finding types.

    Args:
        results: Sequence of pairs ``(first_year, second_year)``.
        matching: Mapping of iOS app id -> Android app id.
        names: Row label for each pair, in the same order as ``results``.
        total_android: Forwarded to ``comparison_years``.
        total_ios: Forwarded to ``comparison_years``.

    Returns:
        pandas.DataFrame with one row per pair, or ``None`` when ``results``
        is empty.
    """
    tables = [
        comparison_years(
            pair[0],
            pair[1],
            matching,
            names[position],
            total_android=total_android,
            total_ios=total_ios,
        )
        for position, pair in enumerate(results)
    ]
    if not tables:
        return None
    # Concatenate once instead of growing the frame inside the loop.
    return pd.concat(tables, axis=0)


In [None]:
years_concat = comparison_years_concat([(code_23, code_24), (script_23, script_24), (podfiles_23, podfiles_24), (hidden_directory_23, hidden_directory_24), (hidden_files_23, hidden_files_24)], matching, ["Code", "Scripts", "Podfiles", "Hidden Directories", "Hidden Files"], total_android= 8702, total_ios= 9212)

In [None]:
years_concat

In [None]:
years_concat.drop(columns=['Matching'], inplace=True)


In [None]:
years_concat

In [None]:
print(years_concat.to_latex())

In [None]:
install_table_code = create_installation_table_platform(join_dict(code_23, code_24), already_per_app=True)
install_table_script = create_installation_table_platform(join_dict(script_23, script_24), already_per_app=True)
install_table_podfiles = create_installation_table_platform(join_dict(podfiles_23, podfiles_24), already_per_app=True)
install_table_secrets = create_installation_table_platform(results_both)
install_table_jwt = create_installation_table_platform(jwt_both)
install_table_pk = create_installation_table_platform(private_keys_both)
# "Code": install_table_code, "Script": install_table_script, "Podfiles": install_table_podfiles,
install_map = { "Secrets": install_table_secrets, "JWT": install_table_jwt, "Private Keys": install_table_pk}

In [None]:
install_table_code_2023 = create_installation_table_platform(code_23, already_per_app=True)
install_table_script_2023 = create_installation_table_platform(script_23, already_per_app=True)
install_table_podfiles_2023 = create_installation_table_platform(podfiles_23, already_per_app=True)
install_table_secrets_2023 = create_installation_table_platform(results_2023)
install_table_jwt_2023 = create_installation_table_platform(jwt_2023)
install_table_pk_2023 = create_installation_table_platform(private_keys_2023)
# "Code": install_table_code, "Script": install_table_script, "Podfiles": install_table_podfiles,
install_map_2023 = { "Secrets": install_table_secrets_2023, "JWT": install_table_jwt_2023, "Private Keys": install_table_pk_2023}


install_table_code_2024 = create_installation_table_platform(code_24, already_per_app=True)
install_table_script_2024 = create_installation_table_platform(script_24, already_per_app=True)
install_table_podfiles_2024 = create_installation_table_platform(podfiles_24, already_per_app=True)
install_table_secrets_2024 = create_installation_table_platform(results_2024)
install_table_jwt_2024 = create_installation_table_platform(jwt_2024)
install_table_pk_2024 = create_installation_table_platform(private_keys_2024)
# "Code": install_table_code, "Script": install_table_script, "Podfiles": install_table_podfiles,
install_map_2024 = { "Secrets": install_table_secrets_2024, "JWT": install_table_jwt_2024, "Private Keys": install_table_pk_2024}

In [None]:
install_table_code_year = create_installation_table_years(code_23, code_24, already_per_app=True)
install_table_script_year = create_installation_table_years(script_23, script_24, already_per_app=True)
install_table_podfiles_year = create_installation_table_years(podfiles_23, podfiles_24, already_per_app=True)
install_table_secrets_year = create_installation_table_years(results_2023, results_2024) 
install_table_jwt_year = create_installation_table_years(jwt_2023, jwt_2024)
install_table_pk_year = create_installation_table_years(private_keys_2023, private_keys_2024)
install_map_year = {"Code": install_table_code_year, "Script": install_table_script_year, "Podfiles": install_table_podfiles_year, "Secrets": install_table_secrets_year, "JWT": install_table_jwt_year, "Private Keys": install_table_pk_year}

In [None]:
get_all_installation_stats(all_apps_2023)

In [None]:
get_all_installation_stats(all_apps_2024)

In [None]:
install_count_table = create_table_installation_counts(install_map, 10331)

In [None]:
install_count_table

In [None]:
install_count_table.drop(columns=[("android", 'JWT'), ("android", 'Private Keys'), ("ios", 'JWT'), ("ios", 'Private Keys'), ("both", 'JWT'), ("both", 'Private Keys')], inplace=True)

print(install_count_table.to_latex())

In [None]:
install_count_table.to_csv(os.path.join(output_dir,"install_count_table.csv"))

In [None]:
install_db_year = create_table_installation_counts(install_map_year, 10331)

In [None]:
print(install_db_year.to_latex())

In [None]:
install_db_year

In [None]:
above_100_mio_2023 = get_verified_credentials_above_installation(results_2023)
above_100_mio_2023_verified = {k:v for k,v in above_100_mio_2023.items() if v.verified}
get_dttest_platforms(above_100_mio_2023_verified, android_app_above_100_mio)


In [None]:
above_100_mio_2024 = get_verified_credentials_above_installation(results_2024)
above_100_mio_2024_verified = {k:v for k,v in above_100_mio_2024.items() if v.verified}
get_dttest_platforms(above_100_mio_2024_verified, android_app_above_100_mio)


In [None]:
above_100_mio = get_verified_credentials_above_installation(results_2023)
above_100_mio_verified = {k:v for k,v in above_100_mio.items() if v.verified}

In [None]:
above_100_mio_24 = get_verified_credentials_above_installation(results_2024)
above_100_mio_verified_24 = {k:v for k,v in above_100_mio_24.items() if v.verified}

In [None]:
get_dttest_years(above_100_mio_verified, above_100_mio_verified_24, android_app_above_100_mio, matching, only_Android=True, only_iOS=False)

In [None]:
get_dttest_years(above_100_mio_verified, above_100_mio_verified_24, android_app_above_100_mio, matching, only_Android=False, only_iOS=True)

In [None]:
get_dttest_years(above_100_mio_verified, above_100_mio_verified_24, android_app_above_100_mio, matching, only_Android=False, only_iOS=False)

In [None]:
get_dttest_platforms(above_100_mio_verified, android_app_above_100_mio)


In [None]:
def create_install_map_above_100(current_install_map, min_installation=100000000):
    """Keep only the install-count buckets at or above a threshold.

    Args:
        current_install_map: Nested mapping finding -> platform ->
            {install count -> number of apps}.
        min_installation: Inclusive lower bound on the install count;
            defaults to 100,000,000.

    Returns:
        dict: a new mapping with the same nesting in which every innermost
        dict only holds the qualifying buckets. The input is not modified.
    """
    return {
        finding: {
            platform: {
                count: apps for count, apps in installs.items() if count >= min_installation
            }
            for platform, installs in per_platform.items()
        }
        for finding, per_platform in current_install_map.items()
    }



In [None]:
install_map_above_100_mio = create_install_map_above_100(install_map)

In [None]:
install_count_table = create_table_installation_counts(install_map_above_100_mio, 464)

In [None]:
install_count_table

In [None]:
install_map_above_100_mio_23 = create_install_map_above_100(install_map_2023)
install_count_table_23 = create_table_installation_counts(install_map_above_100_mio_23, 464)
install_map_above_100_mio_24 = create_install_map_above_100(install_map_2024)
install_count_table_24 = create_table_installation_counts(install_map_above_100_mio_24, 464)


In [None]:
install_count_table_23

In [None]:
install_count_table_24

In [None]:
create_classified_file_table(above_100_mio, format=True)

In [None]:
detector_above_100_mio = get_stats_detector_platform(above_100_mio, matching, app_ids=android_app_above_100_mio)

In [None]:
detector_above_100_mio.drop(columns=[col for col in detector_above_100_mio.columns if col.startswith('invalid')], inplace=True)
detector_above_100_mio.drop(columns=[col for col in detector_above_100_mio.columns if col.startswith('percentage_')], inplace=True)

print(detector_above_100_mio.to_latex())

In [None]:
detector_above_100_mio

In [None]:
get_dttest_platforms(above_100_mio_verified, android_app_above_100_mio)

In [None]:
# Manual inspection: list every entry of above_100_mio_verified together
# with its apps, files and detector.
for k,v in above_100_mio_verified.items():
    print(k)
    print(v.apps)
    print(v.files)
    print(v.detector)

In [None]:
compare_android_and_ios(above_100_mio_verified, matching)


In [None]:
get_formated_results(get_android_ios_app_stats(above_100_mio_verified,matching))

In [None]:
# Manual inspection: print the apps and files recorded for each entry in jwt_2024.
for k,v in jwt_2024.items():
    print(v.apps)
    print(v.files)



In [None]:
def create_current_results_year(results_1, matching_apps):
    """Count the verified secrets of one result set by platform.

    Only entries with a truthy ``verified`` attribute are considered.

    Args:
        results_1: Mapping of secret key -> object exposing ``verified`` and
            ``apps`` (each app exposing ``name`` and ``platform``).
        matching_apps: Mapping that is indexed with the name of every iOS app
            found; the value is used as the name of the Android counterpart.

    Returns:
        Dict with three counts:
        ``"both"``    - secrets having at least one iOS app whose Android
                        counterpart ``AppInfo(matching_apps[name], "android")``
                        is also contained in the secret's ``apps``;
        ``"android"`` - secrets with at least one Android app, minus ``both``;
        ``"ios"``     - secrets with at least one iOS app, minus ``both``.

    Raises:
        KeyError: if an iOS app name is missing from ``matching_apps``.
    """
    verified_secrets = {key: secret for key, secret in results_1.items() if secret.verified}

    # Secrets found in an iOS app *and* in that app's matched Android app.
    # The containment test relies on how AppInfo compares for equality.
    both_keys = set()
    for key, secret in verified_secrets.items():
        for app in secret.apps:
            if app.platform != "ios":
                continue
            android_counterpart = AppInfo(matching_apps[app.name], "android")
            if android_counterpart in secret.apps:
                both_keys.add(key)

    def count_with_platform(platform):
        # Number of verified secrets that occur in at least one app of `platform`.
        return sum(
            1
            for secret in verified_secrets.values()
            if any(app.platform == platform for app in secret.apps)
        )

    both_total = len(both_keys)
    android_total = count_with_platform("android")
    ios_total = count_with_platform("ios")
    return {"both": both_total, "android": android_total - both_total, "ios": ios_total - both_total}

def compare_verified_years(results_1, results_2):
    """Compare which verified secrets appear in two result sets.

    Only entries with a truthy ``verified`` attribute are considered, and
    secrets are identified by their dict key.

    Args:
        results_1: Mapping of secret key -> object exposing ``verified``
            (the earlier result set).
        results_2: Mapping of the same shape (the later result set).

    Returns:
        Dict that always contains all three counters, 0 when a category is empty:
        ``"old"``       - verified in both result sets;
        ``"old_valid"`` - verified in ``results_1`` but not verified (or not
                          present) in ``results_2``;
        ``"new"``       - verified in ``results_2`` but not in ``results_1``.
    """
    verified_1 = {key for key, secret in results_1.items() if secret.verified}
    verified_2 = {key for key, secret in results_2.items() if secret.verified}

    # Every counter is always present: callers index the result directly
    # (e.g. result["old_valid"]), which raised KeyError when a category was
    # empty and its key had therefore never been created.
    return {
        "old": len(verified_1 & verified_2),
        "old_valid": len(verified_1 - verified_2),
        "new": len(verified_2 - verified_1),
    }


def create_sankey_comparison(results_1, results_2, matching_apps):
    """Build a Plotly Sankey figure comparing the verified findings of two result sets.

    Node layout (index: label):
        0-2  "Both" / "Android" / "iOS"  - platform split of ``results_1``
        3    "2023"
        4    "New"
        5    "2024"
        6    "Removed"
        7-9  "Both" / "Android" / "iOS"  - platform split of ``results_2``

    The platform splits come from ``create_current_results_year`` and the
    "old" / "new" / "old_valid" flows from ``compare_verified_years``. The
    total of every node is added as a text annotation at a hand-tuned position.

    Args:
        results_1: Result mapping shown as "2023".
        results_2: Result mapping shown as "2024".
        matching_apps: App matching forwarded to ``create_current_results_year``.

    Returns:
        The ``plotly.graph_objects.Figure`` containing the Sankey trace.
    """
    stats_2023 = create_current_results_year(results_1, matching_apps)
    stats_2024 = create_current_results_year(results_2, matching_apps)
    comparison = compare_verified_years(results_1, results_2)

    labels = ["Both", "Android", "iOS", "2023", "New", "2024", "Removed", "Both", "Android", "iOS"]

    # One (source node, target node, value) triple per link.
    links = [
        # findings_platform -> 2023
        (0, 3, stats_2023['both']),
        (1, 3, stats_2023['android']),
        (2, 3, stats_2023['ios']),
        # 2023 -> 2024, New -> 2024, 2023 -> Removed (removed but valid)
        (3, 5, comparison['old']),
        (4, 5, comparison['new']),
        (3, 6, comparison['old_valid']),
        # 2024 -> findings_platform
        (5, 7, stats_2024['both']),
        (5, 8, stats_2024['android']),
        (5, 9, stats_2024['ios']),
    ]
    sources = [source for source, _, _ in links]
    targets = [target for _, target, _ in links]
    values = [value for _, _, value in links]

    color_for_nodes = ["#808080", "#a4c639", "#313131", "#A9A9A9", "#FFA500", "#A9A9A9", "#d10b0b", "#808080", "#a4c639", "#313131"]

    # Total per node = everything flowing in plus everything flowing out.
    total_values = [0] * len(labels)
    for source, target, value in links:
        total_values[target] += value
        total_values[source] += value

    # The two year nodes have links on both sides, so the sum above is halved for them.
    for year_node in (3, 5):
        total_values[year_node] = int(total_values[year_node] / 2)

    x = [
        0.12 - 0.1, 0.12 - 0.1, 0.12 - 0.1,
        0.22,
        0.29 + 0.1,
        0.36 + 0.15,
        0.29 + 0.1,
        0.46 + 0.25, 0.46 + 0.25, 0.46 + 0.25,
    ]
    y = [base + 0.1 for base in [0.35, 0.07, 0.68, 0.50, 0.85, 0.50, 0.08, 0.35, 0.07, 0.68]]

    sankey = go.Sankey(
        node={
            "pad": 15,
            "thickness": 20,
            "line": {"color": "black", "width": 0.5},
            "label": labels,
            "x": x,
            "y": y,
        },
        link={"source": sources, "target": targets, "value": values},
    )
    fig = go.Figure(sankey)
    fig.update_traces(node_color=color_for_nodes)

    # Position of the total-value annotation for each node, in node-index order.
    annotation_positions = [
        (0.102 - 0.1, 0.715),   # both (left)
        (0.102 - 0.1, 0.985),   # android (left)
        (0.095 - 0.1, 0.4),     # ios (left)
        (0.195, 0.80),          # 2023
        (0.29 + 0.1, 0.15),     # new
        (0.36 + 0.15, 0.80),    # 2024
        (0.29 + 0.1, 0.985),    # removed
        (0.46 + 0.27, 0.69),    # both (right)
        (0.46 + 0.27, 0.99),    # android (right)
        (0.46 + 0.275, 0.42),   # ios (right)
    ]
    for node_index, (pos_x, pos_y) in enumerate(annotation_positions):
        fig.add_annotation(dict(
            font=dict(color="black", size=15),
            x=pos_x,
            y=pos_y,
            showarrow=False,
            text=total_values[node_index],
        ))
    return fig


In [None]:
# Build the Sankey figure with results_2023 as the first and results_2024 as the second result set.
fig = create_sankey_comparison(results_2023, results_2024, matching)

In [None]:
# Export the Sankey figure as a PDF named 'sankey_comparison.pdf' (relative path).
# NOTE(review): the path does not use the notebook's output_dir setting — confirm that is intended.
# NOTE(review): static image export presumably needs an export engine (e.g. kaleido) installed — confirm for the target environment.
import plotly.io as pio

pio.write_image(fig, 'sankey_comparison.pdf')

In [None]:
# Filtering stats
# Load the candidate information from the two databases (db_2023 and db_2024).
# NOTE(review): "canditaes" is a typo for "candidates"; the names are used by later cells, so rename them together.
all_secret_canditaes = get_all_infos(db_2023)
all_secret_canditaes_2024 = get_all_infos(db_2024)

In [None]:
# Split each year's candidates into the three values returned by get_unique_secrets.
# results_secrets* and unique_per_file* are the inputs of the filter tables built below.
results_secrets, rule_ids, unique_per_file = get_unique_secrets(all_secret_canditaes)
results_secrets_24, rule_ids_24, unique_per_file_24 = get_unique_secrets(all_secret_canditaes_2024)

In [None]:
# results_secrets - secret - to apps

def create_filter_app_number_table(secrets_apps, valid, cutoff = 25):
    """Tabulate valid/invalid secrets by the number of apps each secret occurs in.

    Args:
        secrets_apps: Mapping of secret -> sized collection of apps.
        valid: Container of secrets that count as valid (membership is tested
            with ``in``).
        cutoff: Accepted for compatibility but currently not used; no secret
            is filtered out.

    Returns:
        DataFrame indexed by app count (ascending) with the columns ``valid``,
        ``invalid`` and ``percentage``, where ``percentage`` is the valid share
        formatted as a string such as ``"50.00%"``.
    """
    buckets = {}
    for secret, apps in secrets_apps.items():
        bucket = buckets.setdefault(len(apps), {})
        label = "valid" if secret in valid else "invalid"
        bucket[label] = bucket.get(label, 0) + 1

    for bucket in buckets.values():
        # Make sure both counters exist before deriving the share.
        bucket.setdefault("valid", 0)
        bucket.setdefault("invalid", 0)
        valid_share = bucket["valid"] / (bucket["valid"] + bucket["invalid"])
        bucket["percentage"] = f"{valid_share * 100:.2f}%"

    return pd.DataFrame(buckets).transpose().sort_index()


def create_filter_file_number_table(files_detector_secrets, valid, cutoff = 30):
    """Tabulate valid/invalid secrets by the smallest per-file, per-detector group they occur in.

    For every file and detector the size of that detector's secret collection
    is taken; each secret is then assigned the smallest such size over all the
    groups it appears in.

    Args:
        files_detector_secrets: Mapping of file -> {detector -> sized
            collection of secrets}.
        valid: Container of secrets that count as valid (membership is tested
            with ``in``).
        cutoff: Groups larger than this are skipped entirely; a falsy value
            (``None`` or ``0``) disables the limit.

    Returns:
        DataFrame indexed by group size (ascending) with the columns ``valid``,
        ``invalid`` and ``percentage``, where ``percentage`` is the valid share
        formatted as a string such as ``"50.00%"``.
    """
    smallest_group = {}
    for detector_map in files_detector_secrets.values():
        for secrets in detector_map.values():
            group_size = len(secrets)
            if cutoff and group_size > cutoff:
                continue
            for secret in secrets:
                known = smallest_group.get(secret, group_size)
                smallest_group[secret] = group_size if group_size < known else known

    buckets = {}
    for secret, group_size in smallest_group.items():
        bucket = buckets.setdefault(group_size, {})
        label = "valid" if secret in valid else "invalid"
        bucket[label] = bucket.get(label, 0) + 1

    for bucket in buckets.values():
        # Make sure both counters exist before deriving the share.
        bucket.setdefault("valid", 0)
        bucket.setdefault("invalid", 0)
        valid_share = bucket["valid"] / (bucket["valid"] + bucket["invalid"])
        bucket["percentage"] = f"{valid_share * 100:.2f}%"

    return pd.DataFrame(buckets).transpose().sort_index()



In [None]:
def _sum_rows_with_percentage(rows):
    """Sum the rows of a valid/invalid table and recompute the percentage.

    The ``percentage`` column holds formatted strings, so summing it directly
    concatenates them (e.g. ``"25.00%100.00%"``). It is therefore left out of
    the sum and derived again from the summed ``valid`` / ``invalid`` counts.

    Args:
        rows: DataFrame slice with (optionally) the columns ``valid``,
            ``invalid`` and ``percentage``.

    Returns:
        Series with one entry per column of ``rows``. ``percentage`` is
        ``"0.00%"`` when the slice contains no secrets at all.
    """
    if "percentage" not in rows.columns:
        return rows.sum()

    # astype(object) so that the percentage string can be stored next to the counts.
    totals = rows.drop(columns=["percentage"]).sum().astype(object)
    valid_count = totals.get("valid", 0)
    invalid_count = totals.get("invalid", 0)
    secret_count = valid_count + invalid_count
    share = (valid_count / secret_count) * 100 if secret_count else 0.0
    totals["percentage"] = f"{share:.2f}%"
    return totals


def create_combined_app_number_table(results_secrets_1, results_secrets_2, results_both_verified):
    """Join the per-app-count tables of two result sets side by side.

    Each table gets an extra ``'>=15'`` row aggregating all rows with an app
    count of 15 or more. The columns are suffixed ``_2023`` / ``_2024``.

    Args:
        results_secrets_1: Input for ``create_filter_app_number_table`` (first set).
        results_secrets_2: Input for ``create_filter_app_number_table`` (second set).
        results_both_verified: Container of valid secrets.

    Returns:
        The joined DataFrame with missing cells filled with 0.
    """
    df_1 = create_filter_app_number_table(results_secrets_1, results_both_verified)
    df_2 = create_filter_app_number_table(results_secrets_2, results_both_verified)
    # NOTE(review): unlike create_combined_file_number_table, the individual
    # rows >= 15 are kept next to the summary row — confirm this is intended.
    df_1.loc['>=15'] = _sum_rows_with_percentage(df_1[df_1.index >= 15])
    df_2.loc['>=15'] = _sum_rows_with_percentage(df_2[df_2.index >= 15])

    # NOTE(review): join() defaults to a left join, so index values that only
    # exist in the second table are not part of the result.
    df = df_1.join(df_2, lsuffix="_2023", rsuffix="_2024")
    df = df.fillna(0)
    return df


def create_combined_file_number_table(results_secrets_1, results_secrets_2, results_both_verified, cutt_off = 15):
    """Join the per-group-size tables of two result sets side by side.

    Rows with a group size of ``cutt_off`` or more are replaced by a single
    ``'>=<cutt_off>'`` summary row in each table. The columns are suffixed
    ``_2023`` / ``_2024``.

    Args:
        results_secrets_1: Input for ``create_filter_file_number_table`` (first set).
        results_secrets_2: Input for ``create_filter_file_number_table`` (second set).
        results_both_verified: Container of valid secrets.
        cutt_off: First group size that is folded into the summary row.

    Returns:
        The joined DataFrame with missing cells filled with 0.
    """
    df_1 = create_filter_file_number_table(results_secrets_1, results_both_verified, cutoff=1000000)
    df_2 = create_filter_file_number_table(results_secrets_2, results_both_verified, cutoff=1000000)
    df_1_sum = _sum_rows_with_percentage(df_1[df_1.index >= cutt_off])
    df_2_sum = _sum_rows_with_percentage(df_2[df_2.index >= cutt_off])

    df_1 = df_1.drop(df_1[df_1.index >= cutt_off].index)
    df_2 = df_2.drop(df_2[df_2.index >= cutt_off].index)
    df_1.loc[f'>={cutt_off}'] = df_1_sum
    df_2.loc[f'>={cutt_off}'] = df_2_sum
    # NOTE(review): left join — index values only present in the second table are dropped.
    df = df_1.join(df_2, lsuffix="_2023", rsuffix="_2024")
    df = df.fillna(0)
    return df


def create_total_file_number_table(results_secrets_1, results_secrets_2, results_both_verified, cutt_off = 15):
    """Build one per-group-size table over both result sets combined.

    The two inputs are merged with ``join_dict`` and tabulated without a size
    limit. Group sizes 10..cutt_off-1 are folded into a ``'>9 & <cutt_off'``
    row, sizes >= cutt_off into a ``'>cutt_off-1'`` row, and a ``'Total'`` row
    over all original rows is appended.

    Args:
        results_secrets_1: First input for ``join_dict``.
        results_secrets_2: Second input for ``join_dict``.
        results_both_verified: Container of valid secrets.
        cutt_off: First group size that belongs to the top summary row.

    Returns:
        The resulting DataFrame with missing cells filled with 0.
    """
    new_results = join_dict(results_secrets_1, results_secrets_2)

    df_1 = create_filter_file_number_table(new_results, results_both_verified, cutoff=None)
    # All three aggregates are taken before any row is dropped.
    df_1_total = _sum_rows_with_percentage(df_1)

    df_1_sum = _sum_rows_with_percentage(df_1[df_1.index >= cutt_off])
    df_1_sum_v2 = _sum_rows_with_percentage(df_1[(df_1.index > 9) & (df_1.index < cutt_off)])

    df_1 = df_1.drop(df_1[df_1.index >= cutt_off].index)
    df_1 = df_1.drop(df_1[(df_1.index > 9) & (df_1.index < cutt_off)].index)
    df_1.loc[f'>9 & <{cutt_off}'] = df_1_sum_v2
    df_1.loc[f'>{cutt_off-1}'] = df_1_sum
    df_1.loc['Total'] = df_1_total
    df_1 = df_1.fillna(0)
    return df_1

In [None]:
# Ad-hoc arithmetic; evaluates to 9559.
# NOTE(review): the operands are hard-coded and their origin is not shown in this section — confirm what they represent or derive them from the data.
23730+388 - 14559

In [None]:
# Group-size table over both years combined, with sizes >= 15 folded into one row.
# NOTE(review): "heurisitc" is a typo for "heuristic"; the name is used by the next cell, so rename both together.
heurisitc_1 = create_total_file_number_table(unique_per_file, unique_per_file_24, results_both_verified, cutt_off=15)

In [None]:
# Print the combined table as LaTeX source.
print(heurisitc_1.to_latex())

In [None]:
# Side-by-side app-count table: results_secrets gets the _2023 suffix, results_secrets_24 the _2024 suffix.
combined_apps = create_combined_app_number_table(results_secrets, results_secrets_24, results_both_verified)

In [None]:
# Display the side-by-side group-size table for both years, with sizes >= 15 folded into one row.
create_combined_file_number_table(unique_per_file, unique_per_file_24, results_both_verified, cutt_off=15)

In [None]:
# Display the side-by-side app-count table built two cells above.
combined_apps