In [None]:
import pandas as pd
from sklearn.cluster import DBSCAN
from sklearn.ensemble import IsolationForest
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import LocalOutlierFactor
import numpy as np
from sklearn.metrics import roc_auc_score,roc_curve
import matplotlib.pyplot as plt

In [None]:
# Read the certificate fingerprint table and preview its first rows.
df = pd.read_csv(filepath_or_buffer="./cert_fingerprint.csv")
print(df.head(n=5))

In [None]:
# Load the per-certificate verification results. The columns this notebook
# reads from the table are 'cert_id', 'info', 'warn', 'error' and 'fatal'.
df_verify = pd.read_csv("./cert_pem_results.csv")
# Preview the frame that was just loaded (not the fingerprint table `df`).
print(df_verify.head())

In [None]:
# Report, for each verification severity column, how many rows carry a
# non-zero value and what share of all rows that represents.
columns = ['info', 'warn', 'error', 'fatal']
total_rows = len(df_verify)

# Count the non-zero entries of every listed column in one vectorised pass.
non_zero_by_column = df_verify[columns].ne(0).sum()

for col, non_zero_count in non_zero_by_column.items():
    proportion = non_zero_count / total_rows
    print(f'Column {col}: {non_zero_count}/{total_rows} {proportion}')

In [None]:
# Keep the certificate identifiers aside and gather every column whose name
# starts with 'FINGERPRINT_' into a NumPy feature matrix.
cert_id = df['CERT_ID']
is_fingerprint = df.columns.str.startswith('FINGERPRINT_')
fingerprint_cols = df.columns[is_fingerprint]
fingerprint_data = df.loc[:, fingerprint_cols]
data = fingerprint_data.to_numpy()

In [None]:
# Fit the scaler on the feature matrix, then transform that same matrix.
scaler = StandardScaler().fit(data)
data_scaled = scaler.transform(data)

In [None]:
# Fit an Isolation Forest on the scaled features, then store the per-row
# decision score and predicted label on `df` (later cells treat -1 as anomalous).
iso_forest = IsolationForest(contamination=0.03, random_state=42).fit(data_scaled)
if_scores = iso_forest.decision_function(data_scaled)
if_labels = iso_forest.predict(data_scaled)
df['IF_SCORE'] = if_scores
df['IF_LABEL'] = if_labels

In [None]:
# List the certificates the Isolation Forest labelled -1, followed by their count.
is_if_anomaly = df['IF_LABEL'].eq(-1)
anomalies_if = df.loc[is_if_anomaly]
print("Isolation Forest detected anomalies:")
print(anomalies_if.loc[:, ['CERT_ID']])
print(anomalies_if.shape[0])

In [None]:
# Show every verification row whose 'error' value is non-zero.
has_error = df_verify['error'].ne(0)
error_cert = df_verify.loc[has_error]
print(error_cert)

In [None]:
# Export the certificate ids together with their Isolation Forest labels.
iso_forest_df = df[['CERT_ID', 'IF_LABEL']].copy()
iso_forest_df.to_csv(path_or_buf="cert_if.csv", index=False)

In [None]:
def _safe_rate(numerator, denominator):
    """Return numerator / denominator, or NaN when the denominator is zero."""
    return numerator / denominator if denominator else float('nan')


def get_accuracy(base_df, test_df, test_col):
    """Print and return confusion-matrix statistics for a detector's labels.

    The two frames are inner-joined on 'cert_id'. A row counts as an actual
    positive when its 'error' value is non-zero, and as a predicted positive
    when its ``test_col`` value equals -1.

    Parameters
    ----------
    base_df : pandas.DataFrame
        Reference frame; must contain the columns 'cert_id' and 'error'.
    test_df : pandas.DataFrame
        Detector output; must contain 'cert_id' and the column ``test_col``.
    test_col : str
        Name of the label column in ``test_df`` (-1 marks an anomaly).

    Returns
    -------
    tuple
        ``(TN, TP, FN, FP, FPR, TPR)``. A rate is NaN when its denominator is
        zero (no actual negatives for FPR, no actual positives for TPR), so an
        empty join or a one-class ground truth no longer raises
        ZeroDivisionError.
    """
    merged_df = pd.merge(base_df, test_df, on='cert_id', how='inner')
    base_col = 'error'

    # Build each mask once instead of re-evaluating it for every cell of the matrix.
    actual_pos = merged_df[base_col] != 0
    predicted_pos = merged_df[test_col] == -1

    # TP: base_col positive, test_col positive
    TP = int((actual_pos & predicted_pos).sum())

    # TN: base_col negative, test_col negative
    TN = int((~actual_pos & ~predicted_pos).sum())

    # FP: base_col negative, test_col positive
    FP = int((~actual_pos & predicted_pos).sum())

    # FN: base_col positive, test_col negative
    FN = int((actual_pos & ~predicted_pos).sum())

    tpr = _safe_rate(TP, TP + FN)
    tnr = _safe_rate(TN, TN + FP)
    fpr = _safe_rate(FP, TN + FP)
    fnr = _safe_rate(FN, TP + FN)

    print("TN:",TN,"FP:",FP)
    print("FN:",FN,"TP:",TP)
    print ("TPR:",tpr)
    print ("TNR:",tnr)
    print ("FPR:",fpr)
    print ("FNR:",fnr)

    return (TN, TP, FN, FP, fpr, tpr)

# Score the Isolation Forest labels against the verification 'error' column.
test_df = iso_forest_df.rename(columns={'CERT_ID': 'cert_id'})
tn, tp, fn, fp, _, _ = get_accuracy(
    base_df=df_verify[['cert_id', 'error']],
    test_df=test_df[['cert_id', 'IF_LABEL']],
    test_col='IF_LABEL',
)


In [None]:
# Local Outlier Factor: score every row, then flag the highest-scoring share.
LOF_CONTAMINATION = 0.03  # share of rows to flag as anomalous
lof = LocalOutlierFactor(n_neighbors=10, contamination=LOF_CONTAMINATION)
# Only the fit is needed here; the per-row scores are read from the fitted
# estimator below, so the labels returned by fit_predict are not stored.
lof.fit(data_scaled)
# Negate negative_outlier_factor_ so that a larger LOF_SCORE means more outlying.
df['LOF_SCORE'] = -lof.negative_outlier_factor_

# Flag the n_flagged highest-scoring rows. The threshold is the n_flagged-th
# largest score and the comparison is inclusive, so that row is flagged too
# (a strict '>' would flag one row fewer). With n_flagged == 0 nothing is
# flagged; indexing with [-1] there would pick the smallest score and flag
# almost every row.
n_flagged = int(len(df) * LOF_CONTAMINATION)
if n_flagged > 0:
    threshold = sorted(df['LOF_SCORE'], reverse=True)[n_flagged - 1]
else:
    threshold = float('inf')
df['LOF_LABEL'] = df['LOF_SCORE'] >= threshold

In [None]:
# Export the certificate ids with their LOF labels, encoded like the
# Isolation Forest labels: -1 for a flagged row, 1 otherwise.
lof_df = df[['CERT_ID']].copy()
# Build the integer labels in one step rather than writing ints into the
# boolean column through .loc, which newer pandas releases deprecate.
lof_df['LOF_LABEL'] = np.where(df['LOF_LABEL'], -1, 1)
lof_df.to_csv("cert_lof.csv", index=False)

In [None]:
# Score the LOF labels against the verification 'error' column.
test_df = lof_df.rename(columns={'CERT_ID': 'cert_id'})
tn, tp, fn, fp, _, _ = get_accuracy(
    base_df=df_verify[['cert_id', 'error']],
    test_df=test_df[['cert_id', 'LOF_LABEL']],
    test_col='LOF_LABEL',
)