# Test of the clasification pipeline

In [11]:
import os

# Function to change to the parent directory
def change_to_parent_directory():
    # Check if the directory has already been changed
    if not os.environ.get('DIR_CHANGED'):
        try:
            current_dir = os.path.dirname(os.path.abspath(__file__))
        except NameError:
            current_dir = os.getcwd()
        parent_dir = os.path.abspath(os.path.join(current_dir, os.pardir))
        os.chdir(parent_dir)
        os.environ['DIR_CHANGED'] = '1'
        print(f"Current working directory changed to: {os.getcwd()}")
    else:
        print("Directory has already been changed.")

# Call the function to change the working directory
change_to_parent_directory()

Directory has already been changed.


## Optional: Create testing dataset
Note: If you want do to this, set create_test_parquet to **True**

In [12]:
create_test_parquet = True

if create_test_parquet:
    import pandas as pd
    import numpy as np

    # List of input Parquet files along with their maximum rows and desired labels
    input_files = [
        {'file': 'testdata/2405_clftest_benign_filtered.parquet', 'max_rows': 4000, 'label': 'benign'},
        {'file': 'testdata/2405_clftest_phishing_filtered.parquet', 'max_rows': 480, 'label': 'phishing'},
        {'file': 'testdata/2405_clftest_malware_filtered.parquet', 'max_rows': 292, 'label': 'malware'},
        {'file': 'testdata/dga_2310.parquet', 'max_rows': 300, 'label': 'dga'},
    ]


    # Number of rows to select in total
    n_rows = 5072
    
    # Read the first file to get the initial columns and create the first dataframe
    first_file_info = input_files[0]
    combined_df = pd.read_parquet(first_file_info['file'])
    
    # Limit the number of rows if necessary for the first file
    if len(combined_df) > first_file_info['max_rows']:
        combined_df = combined_df.sample(n=first_file_info['max_rows'], random_state=1)
    
    # Overwrite the "label" column with the specified label for the first file
    combined_df['label'] = first_file_info['label']
    
    # Get the columns from the first dataframe
    all_columns = combined_df.columns.tolist()

    # Process the remaining files
    for file_info in input_files[1:]:
        df = pd.read_parquet(file_info['file'])
        
        # Limit the number of rows if necessary
        if len(df) > file_info['max_rows']:
            df = df.sample(n=file_info['max_rows'], random_state=1)
        
        # Overwrite the "label" column with the specified label
        df['label'] = file_info['label']
        
        # Ensure all columns from the first dataframe are present
        for col in all_columns:
            if col not in df.columns:
                df[col] = None
        
        # Align the dataframe to the columns of the first dataframe
        df = df[all_columns]
        
        # Append the dataframe to the combined dataframe
        combined_df = pd.concat([combined_df, df], ignore_index=True)
    
    # Randomly select n_rows rows from the combined DataFrame
    selected_rows = combined_df.sample(n=n_rows, random_state=1)  # random_state for reproducibility
    
    # Save the selected rows to a new Parquet file
    selected_rows.to_parquet('testdata/ver.parquet')


## Run classification

In [3]:
# Specify the parquet file with the dataset for classification
test_dataset = 'testdata/ver.parquet'

# Number of domain names to classify with each run of the pipeline (0 = classify all)
CHUNK_SIZE = 30

In [4]:
import pandas as pd
import numpy as np
import shap
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from classifiers.pipeline import Pipeline
from classifiers.options import PipelineOptions
#import matplotlib.pyplot as plt

# Initialize the classification pipeline
clf_options = PipelineOptions()
clf = Pipeline(clf_options)

# Read the input parquet file
input_df = pd.read_parquet(test_dataset)

# Function to map labels to 'benign' or 'malign'
def map_label(label):
    if label == 'benign':
        return 'benign'
    else:
        return f'malign ({label})'

# Function to convert labels to binary classes
def binary_label(label):
    return 'negative' if label == 'benign' else 'positive'

# Apply label mapping
input_df['mapped_label'] = input_df['label'].apply(map_label)
input_df['binary_label'] = input_df['label'].apply(binary_label)

# Ensure SHAP JavaScript initialization
#shap.initjs()

# Determine the number of chunks
num_chunks = (len(input_df) + CHUNK_SIZE - 1) // CHUNK_SIZE if CHUNK_SIZE > 0 else 1

# Initialize counters for overall statistics
total_true_labels = []
total_pred_labels = []

# Format string for aligned output
header_format_str = "{:<3} | {:<50} | {:<18} | {:<10} | {:<10}"
data_format_str = "{:<3} | {:<50} | {:<18} | {:<10} | {:.6f}"

# Process the dataframe in chunks
for i in range(num_chunks):
    if CHUNK_SIZE > 0:
        start_idx = i * CHUNK_SIZE
        end_idx = start_idx + CHUNK_SIZE
        chunk_df = input_df[start_idx:end_idx]
    else:
        chunk_df = input_df

    # Perform your classification or processing on the working_df here
    print(f"===== Processing chunk {i+1}/{num_chunks} =====")

    chunk_without_label = chunk_df.drop(columns=['label', 'mapped_label', 'binary_label']) # Label should not be known to classifiers
    chunk_results = clf.classify_domains(chunk_without_label)

    # Collect predictions and true labels
    true_labels = chunk_df['binary_label'].values
    pred_labels = []
    for result in chunk_results:
        pred_label = 'negative' if result['aggregate_probability'] < 0.5 else 'positive'
        pred_labels.append(pred_label)

    # Update overall statistics
    total_true_labels.extend(true_labels)
    total_pred_labels.extend(pred_labels)

    # Display header for results
    print(header_format_str.format("Res", "Domain Name", "Actual Label", "Predicted", "Probability"))

    # Display results for each domain
    for idx, result in enumerate(chunk_results):
        actual_label = chunk_df.iloc[idx]['mapped_label']
        predicted_label = 'benign' if pred_labels[idx] == 'negative' else 'malign'
        domain_name = result['domain_name']
        aggregate_probability = result['aggregate_probability']
        status = "OK" if pred_labels[idx] == true_labels[idx] else "ER"
        print(data_format_str.format(status, domain_name[:50], actual_label, predicted_label, aggregate_probability))
        
        # Run debug_domain method for misclassified domains
        #if status == "ER":
            #print(f"Debugging misclassified domain: {domain_name}")
            #ndf_data = clf.pp.df_to_NDF(chunk_df, "phishing")  # Convert to NDF
            #debug_data = clf.clf_phishing_cnn.debug_domain(domain_name, ndf_data, chunk_df, n_top_features=10)
            
            # Print out the top n feature importances and values for each classifier
            #for classifier, data in debug_data.items():
            #    print(f"\nClassifier: {classifier}")
            #    print(f"Top {len(data['top_features'])} features for domain '{domain_name}':")
            #    for feature_info in data['top_features']:
            #        print(f"Feature: {feature_info['feature']}, Value: {feature_info['value']}, SHAP Value: {feature_info['shap_value']}")
                
                # Display the force plot for phishing_lgbm
                #if classifier == "phishing_cnn":
                #    base_value, shap_values, domain_row = data['force_plot_data']
                #    shap.force_plot(base_value, shap_values, domain_row)
                #    plt.show()

    # Calculate metrics for the current chunk
    accuracy = accuracy_score(true_labels, pred_labels)
    precision = precision_score(true_labels, pred_labels, pos_label='positive', average='binary')
    recall = recall_score(true_labels, pred_labels, pos_label='positive', average='binary')
    f1 = f1_score(true_labels, pred_labels, pos_label='positive', average='binary')

    tn, fp, fn, tp = confusion_matrix(true_labels, pred_labels, labels=['negative', 'positive']).ravel()
    false_positives = fp
    false_negatives = fn
    total_positives = tp + fp
    total_negatives = tn + fn

    fp_ratio = (false_positives / total_positives) if total_positives > 0 else 0
    fn_ratio = (false_negatives / total_negatives) if total_negatives > 0 else 0

    print(f"Chunk {i+1}/{num_chunks} metrics:")
    print(f"Accuracy: {accuracy}")
    print(f"Precision: {precision}")
    print(f"Recall: {recall}")
    print(f"F1 Score: {f1}")
    print(f"False Positives: {false_positives} ({fp_ratio * 100:.2f}%)")
    print(f"False Negatives: {false_negatives} ({fn_ratio * 100:.2f}%)")
    print(f"===== Chunk {i+1}/{num_chunks} completed. =====")

# Calculate overall metrics
overall_accuracy = accuracy_score(total_true_labels, total_pred_labels)
overall_precision = precision_score(total_true_labels, total_pred_labels, pos_label='positive', average='binary')
overall_recall = recall_score(total_true_labels, total_pred_labels, pos_label='positive', average='binary')
overall_f1 = f1_score(total_true_labels, total_pred_labels, pos_label='positive', average='binary')

overall_tn, overall_fp, overall_fn, overall_tp = confusion_matrix(total_true_labels, total_pred_labels, labels=['negative', 'positive']).ravel()
overall_false_positives = overall_fp
overall_false_negatives = overall_fn
overall_total_positives = overall_tp + overall_fp
overall_total_negatives = overall_tn + overall_fn

overall_fp_ratio = (overall_false_positives / overall_total_positives) if overall_total_positives > 0 else 0
overall_fn_ratio = (overall_false_negatives / overall_total_negatives) if overall_total_negatives > 0 else 0

print("Overall metrics:")
print(f"Overall Accuracy: {overall_accuracy}")
print(f"Overall Precision: {overall_precision}")
print(f"Overall Recall: {overall_recall}")
print(f"Overall F1 Score: {overall_f1}")
print(f"Overall False Positives: {overall_false_positives} ({overall_fp_ratio * 100:.2f}%)")
print(f"Overall False Negatives: {overall_false_negatives} ({overall_fn_ratio * 100:.2f}%)")


  from .autonotebook import tqdm as notebook_tqdm
2024-08-19 12:14:52.115471: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


CNN model created
===== Processing chunk 1/40 =====
Res | Domain Name                                        | Actual Label       | Predicted  | Probability
OK  | www.everyoneloveswood.com                          | malign (malware)   | malign     | 0.659568
OK  | cpanel.rocketsmm.net                               | malign (malware)   | malign     | 0.574511
OK  | agkohezrh.info                                     | malign (dga)       | malign     | 0.561479
OK  | www.enjoylifeworks.co.jp                           | benign             | benign     | 0.000000
OK  | 1pia4srtfabx1ubr2s1fh3uth.net                      | malign (dga)       | malign     | 0.620326
OK  | virze.cz                                           | benign             | benign     | 0.000000
OK  | www.shopinportdouglas.com                          | malign (malware)   | malign     | 0.635465
OK  | s69.bwgteamstar.com                                | malign (phishing)  | malign     | 0.684637
OK  | sde77744.pages.dev   

KeyboardInterrupt: 

Res | Domain Name                                        | Actual Label       | Predicted  | Probability
OK  | bustrip-images.bushikaku.net                       | benign             | benign     | 0.000000
OK  | ingposts.top                                       | malign (malware)   | malign     | 0.749935
OK  | 860567204.tapecontent.net                          | benign             | benign     | 0.000000
ER  | vidmate.network                                    | benign             | malign     | 0.716517
OK  | mbubebxogjudywh.mn                                 | malign (dga)       | malign     | 0.591484
OK  | losgjcdjtgxuwaxd.kz                                | malign (dga)       | malign     | 0.617779
OK  | j0hnuf1o3un26167kk5t1jbqlqq.biz                    | malign (dga)       | malign     | 0.645681
OK  | 212-222-29-200.edge.agora.io                       | benign             | benign     | 0.000000
OK  | alonesafety.net                                    | malign (dga)       |

Res | Domain Name                                        | Actual Label       | Predicted  | Probability
OK  | t.me-join.com                                      | malign (phishing)  | malign     | 0.715045
ER  | uni.pangdao.org                                    | malign (phishing)  | benign     | 0.000000
OK  | v1.oddsserve.com                                   | benign             | benign     | 0.000000
ER  | www.shevvstudios.com                               | malign (malware)   | benign     | 0.000000
ER  | w611ut5uo.hier-im-netz.de                          | malign (phishing)  | benign     | 0.000000
ER  | configserver-drcn.platform.hihonorcloud.com        | benign             | malign     | 0.564372
OK  | okolicznosciwypadku.click                          | malign (malware)   | malign     | 0.750293
OK  | qhzepe.com                                         | malign (dga)       | malign     | 0.611728
OK  | nptnhxyjhxvkoot.biz                                | malign (dga)       |

Res | Domain Name                                        | Actual Label       | Predicted  | Probability
OK  | www.hledampraci.cz                                 | benign             | benign     | 0.000000
OK  | idvtnkyboupid.net                                  | malign (dga)       | malign     | 0.631900
ER  | outlander-lb.shop                                  | benign             | malign     | 0.504027
OK  | bafybeidvkboe27s67yrcdr74gfgwh26fhiidh2l53rmh55mvg | malign (phishing)  | malign     | 0.704089
OK  | poly.skoda-auto.cz                                 | benign             | benign     | 0.000000
OK  | justt-watch-now.xyz                                | malign (malware)   | malign     | 0.739149
OK  | starlancercenter.com                               | malign (malware)   | malign     | 0.754605
OK  | ayicsu.com                                         | malign (dga)       | malign     | 0.600150
OK  | item-list22119.atwebpages.com                      | malign (phishing)  |

Res | Domain Name                                        | Actual Label       | Predicted  | Probability
OK  | 1xgkpcuwpf99f34050011gky86.org                     | malign (dga)       | malign     | 0.656312
ER  | transportkazan.ru                                  | benign             | malign     | 0.549271
OK  | agyncivbkee.work                                   | malign (dga)       | malign     | 0.579991
OK  | www.sposobzyciowy.click                            | malign (malware)   | malign     | 0.854492
OK  | jxjydzlvmzk67eqnygxnqaypui15l18d60o61.com          | malign (dga)       | malign     | 0.689876
OK  | c93ec95a80b21090fd7e34e305abcca9.nitwos.pl         | malign (malware)   | malign     | 0.852340
ER  | fjz.fm                                             | malign (dga)       | benign     | 0.000000
OK  | taghleefpack.ignacyklaudiusz.pl                    | malign (malware)   | malign     | 0.687742
OK  | cwzqkxrvlmifrqq.com                                | malign (dga)       |

Res | Domain Name                                        | Actual Label       | Predicted  | Probability
OK  | rutabagacalculusar.pro                             | malign (malware)   | malign     | 0.859254
OK  | nakitel.com                                        | benign             | benign     | 0.000000
OK  | pixel.jobteaser.com                                | benign             | benign     | 0.000000
OK  | 19rgz8h16w8khxhfcv6p10vhsam.com                    | malign (dga)       | malign     | 0.657988
OK  | aspb1.cdn.asset.aparat.com                         | benign             | benign     | 0.000000
OK  | www.technikapoznani.cz                             | benign             | benign     | 0.000000
ER  | www.fahuwk.com                                     | benign             | malign     | 0.728230
OK  | pacificobdpec.ac-page.com                          | malign (phishing)  | malign     | 0.692917
OK  | pub-e5c2c5311c064a7bb152dc2d6b4b0d49.r2.dev        | malign (malware)   |

Res | Domain Name                                        | Actual Label       | Predicted  | Probability
OK  | bt-business-5-22-2024.webflow.io                   | malign (phishing)  | malign     | 0.693447
OK  | hmxfsdrvxfq.ga                                     | malign (dga)       | malign     | 0.652697
ER  | promptly-complete-bay-renewable.trycloudflare.com  | malign (phishing)  | benign     | 0.000000
OK  | bt-business-5-21-2024.webflow.io                   | malign (phishing)  | malign     | 0.697082
OK  | 1jiltjpvihzbmfn5stckufmtv.com                      | malign (dga)       | malign     | 0.659230
OK  | www.olomoucdnes.cz                                 | benign             | benign     | 0.000000
OK  | inncome.manasan.top                                | malign (malware)   | malign     | 0.627488
OK  | mehakramzan.github.io                              | malign (phishing)  | malign     | 0.712318
OK  | www.graphpad.com                                   | benign             |

Res | Domain Name                                        | Actual Label       | Predicted  | Probability
OK  | pxqmxxmkzdbddfiedgqbx.net                          | malign (dga)       | malign     | 0.653313
OK  | hw-sh-pcdn-17.biliapi.net                          | benign             | benign     | 0.000000
OK  | exchange.cryptex.net                               | benign             | benign     | 0.000000
ER  | smalldawn.hn                                       | malign (dga)       | benign     | 0.000000
ER  | 0x-code.financialmarketsworld.com                  | malign (malware)   | benign     | 0.463569
ER  | foxmodeq.com                                       | benign             | malign     | 0.738716
OK  | vsosyyuxlvvfpxr.org                                | malign (dga)       | malign     | 0.666033
OK  | ridertools.metrarail.com                           | benign             | benign     | 0.000000
OK  | urbanabodes.homes                                  | malign (malware)   |

Res | Domain Name                                        | Actual Label       | Predicted  | Probability
OK  | attnet-106864.weeblysite.com                       | malign (phishing)  | malign     | 0.704819
OK  | www.windguru.cz                                    | benign             | benign     | 0.000000
ER  | waitthere.net                                      | malign (dga)       | benign     | 0.282252
OK  | www.fibstaff.com                                   | malign (malware)   | malign     | 0.847769
OK  | 0503c3bdb2699f8d02db98d87d97f4ea.loftx.pl          | malign (malware)   | malign     | 0.739372
ER  | wepik.com                                          | malign (phishing)  | benign     | 0.000000
OK  | temperate-kangaroo-kw0hgq.mystrikingly.com         | malign (phishing)  | malign     | 0.640199
OK  | pikolmnfjihjuu.com                                 | malign (dga)       | malign     | 0.653120
OK  | www.habitatetjardin.com                            | benign             |

## Optional: Generate preliminary results for training the final aggregation classifier

In [5]:
import pandas as pd
import numpy as np
from classifiers.pipeline import Pipeline

# Initialize the classification pipeline
clf = Pipeline()

# Read the input parquet file
input_df = pd.read_parquet(test_dataset)

input_df = input_df.sample(frac=1).reset_index(drop=True)

#preliminary_results_df = clf.generate_preliminary_results(input_df, output_file="test_preliminary_results.parquet")
preliminary_results_df = clf.generate_preliminary_results(input_df, add_final=False)

CNN model created


In [6]:
pd.reset_option('display.max_rows')
pd.set_option('display.max_rows', 1000)
pd.set_option('display.max_columns', 50)
pd.options.display.float_format = '{:.6f}'.format
preliminary_results_df

Unnamed: 0,domain_name,dns_available,dns_nonzero,tls_available,tls_nonzero,ip_available,ip_nonzero,rdap_available,rdap_nonzero,geo_available,geo_nonzero,phishing_cnn_result,phishing_lgbm_result,phishing_xgboost_result,phishing_deepnn_result,phishing_dns_nn_result,phishing_rdap_nn_result,phishing_geo_nn_result,phishing_ip_nn_result,malware_lgbm_result,malware_xgboost_result,malware_deepnn_result,malware_dns_nn_result,malware_rdap_nn_result,malware_geo_nn_result,malware_ip_nn_result,malware_residual_result,malware_cnn_result,malware_gru_result,malware_deep_result,dga_binary_deepnn_result,dga_binary_lgbm_result,phishing_sum,phishing_avg,phishing_prod,malware_sum,malware_avg,malware_prod,dga_binary_sum,dga_binary_avg,dga_binary_prod,total_sum,total_avg,total_prod,label
0,goniecgminny.rent,1.000000,0.725000,1.000000,0.791667,1.000000,0.875000,1.000000,0.833333,1.000000,0.722222,0.441888,0.575147,0.302416,0.803104,0.028180,0.024105,0.521558,0.608652,0.908546,0.927555,0.848032,0.870210,0.240215,0.399883,0.484413,0.000012,0.000000,0.000000,0.000000,0.000006,0.000412,3.305049,0.413131,0.000013,4.678867,0.425352,0.000000,0.000419,0.000209,0.000000,7.984334,0.380206,0.000000,malware
1,payu.saleseekerpro.com,0.600000,0.250000,1.000000,0.833333,1.000000,0.750000,1.000000,0.625000,1.000000,1.000000,0.960558,0.994648,0.997657,0.999993,0.131234,0.285963,0.397401,0.493041,0.961419,0.876182,0.852390,0.326868,0.473563,0.688216,0.733579,0.000000,0.000000,0.000000,0.000000,0.000535,0.000045,5.260495,0.657562,0.007009,4.912216,0.446565,0.000000,0.000579,0.000290,0.000000,10.173290,0.484442,0.000000,malware
2,lpnsrmqwhsoqmlu.info,0.475000,0.000000,0.041667,0.000000,0.625000,0.000000,0.833333,0.000000,1.000000,0.000000,0.102486,0.003072,0.014435,0.852422,0.184629,0.020158,0.387495,0.414404,0.017808,0.097543,0.004060,0.394762,0.024412,0.604780,0.443381,0.946741,0.000000,0.000000,0.873985,1.000000,0.999995,1.979101,0.247388,0.000000,3.407473,0.309770,0.000000,1.999995,0.999997,0.999995,7.386569,0.351741,0.000000,dga
3,11saa8z1tek3iw3w9nf5znnqkf.net,0.475000,0.000000,0.041667,0.000000,0.625000,0.000000,0.833333,0.000000,1.000000,0.000000,0.834184,0.001429,0.001089,0.128223,0.184629,0.020158,0.387495,0.414404,0.018208,0.092381,0.013243,0.394762,0.024412,0.604780,0.443381,0.806388,0.000000,0.000000,1.000000,0.999944,0.999961,1.971611,0.246451,0.000000,3.397557,0.308869,0.000000,1.999904,0.999952,0.999904,7.369072,0.350908,0.000000,dga
4,at-t-servers-dandy-site.webflow.io,0.600000,0.225000,1.000000,0.750000,1.000000,0.375000,1.000000,0.625000,1.000000,0.666667,0.977560,0.525113,0.936627,0.945796,0.518518,0.962041,0.067348,0.244332,0.000476,0.002408,0.000000,0.029114,0.002633,0.245350,0.005845,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,5.177335,0.647167,0.003733,0.285826,0.025984,0.000000,0.000000,0.000000,0.000000,5.463161,0.260151,0.000000,phishing
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1195,1qh6lud1x68wn7koenp518x5xky.biz,0.475000,0.000000,0.041667,0.000000,0.625000,0.000000,0.833333,0.000000,1.000000,0.000000,0.905300,0.001358,0.002286,0.234898,0.184629,0.020158,0.387495,0.414404,0.005901,0.010048,0.004117,0.394762,0.024412,0.604780,0.443381,0.954437,0.000000,0.000000,0.950294,1.000000,0.999994,2.150528,0.268816,0.000000,3.392132,0.308376,0.000000,1.999994,0.999997,0.999994,7.542655,0.359174,0.000000,dga
1196,wielkatajemnica.lat,1.000000,0.725000,1.000000,0.833333,1.000000,0.875000,1.000000,0.833333,1.000000,0.722222,0.789184,0.783868,0.159832,0.871470,0.033671,0.024996,0.521558,0.576096,0.919937,0.961809,0.864291,0.936081,0.217307,0.399883,0.903021,0.748247,0.000000,0.000000,0.153696,0.296606,0.001248,3.760674,0.470084,0.000022,6.104272,0.554934,0.000000,0.297854,0.148927,0.000370,10.162800,0.483943,0.000000,malware
1197,smalldawn.hn,0.475000,0.000000,0.041667,0.000000,0.625000,0.000000,0.833333,0.000000,1.000000,0.000000,0.108064,0.000777,0.000843,0.031383,0.184629,0.020158,0.387495,0.414404,0.005832,0.008991,0.004214,0.394762,0.024412,0.604780,0.443381,0.509361,0.000000,0.000000,0.269366,0.017727,0.000348,1.147754,0.143469,0.000000,2.265099,0.205918,0.000000,0.018075,0.009038,0.000006,3.430928,0.163378,0.000000,dga
1198,secureatt-101972-106707.weeblysite.com,0.600000,0.200000,1.000000,0.750000,1.000000,0.625000,1.000000,0.708333,1.000000,0.722222,1.000000,0.998665,0.998047,0.999926,0.987409,0.995463,0.390085,0.209453,0.001653,0.002626,0.001997,0.149622,0.008672,0.307506,0.277922,0.993954,0.000000,0.000000,1.000000,0.000000,0.000001,6.579046,0.822381,0.080040,2.743951,0.249450,0.000000,0.000001,0.000000,0.000000,9.322998,0.443952,0.000000,phishing


In [7]:
pd.set_option('display.max_rows', None)
#preliminary_results_df[preliminary_results_df["label"].isin(["benign", "phishing"])][["domain_name", "label", "phishing_cnn_result", "phishing_deepnn_result", "phishing_lgbm_result", "phishing_xgboost_result", "phishing_dns_nn_result", "phishing_rdap_nn_result", "phishing_geo_nn_result", "phishing_ip_nn_result"]]
preliminary_results_df[preliminary_results_df["label"].isin(["benign", "phishing"])][["domain_name", "label", "phishing_cnn_result", "phishing_xgboost_result", "phishing_dns_nn_result", "phishing_rdap_nn_result", "phishing_geo_nn_result", "phishing_ip_nn_result"]]

Unnamed: 0,domain_name,label,phishing_cnn_result,phishing_xgboost_result,phishing_dns_nn_result,phishing_rdap_nn_result,phishing_geo_nn_result,phishing_ip_nn_result
4,at-t-servers-dandy-site.webflow.io,phishing,0.97756,0.936627,0.518518,0.962041,0.067348,0.244332
7,nwk.yxynmsl.tk,phishing,0.890197,0.001014,0.214652,0.049558,0.397401,0.534321
15,specializedlink.com,benign,0.114907,0.284938,0.207434,0.060965,0.16914,0.18133
16,pavanhyundai.com,phishing,0.96215,0.130164,0.056306,0.01289,0.390085,0.155399
18,2969zsgc.computer-mod-s.tech,phishing,0.978077,0.00501,0.098541,0.033686,0.521558,0.604779
20,coibaseaausignin.gitbook.io,phishing,0.986389,0.989648,0.487484,0.793998,0.512537,0.658995
21,aniketvish0.github.io,phishing,0.980461,0.76669,0.967532,0.874906,0.700693,0.779685
25,beleza-naturais-kosmetikshop.de,phishing,0.084134,0.003053,0.034776,0.00196,0.172503,0.040757
26,www.transfermarkt.com.tr,benign,0.0062,0.002746,0.024699,0.000669,0.390085,0.025997
28,rowman.com,benign,0.001152,0.00067,0.001723,0.001162,0.081688,0.221449


In [11]:
preliminary_results_df[["domain_name", "label", "malware_deepnn_result", "malware_residual_result", "malware_gru_result", "malware_deep_result", "malware_cnn_result"]]

Unnamed: 0,domain_name,label,malware_deepnn_result,malware_residual_result,malware_gru_result,malware_deep_result,malware_cnn_result
0,goniecgminny.rent,malware,0.848032,1.2e-05,0.0,0.0,0.0
1,payu.saleseekerpro.com,malware,0.85239,0.0,0.0,0.0,0.0
2,lpnsrmqwhsoqmlu.info,dga,0.00406,0.946741,0.0,0.873985,0.0
3,11saa8z1tek3iw3w9nf5znnqkf.net,dga,0.013243,0.806388,0.0,1.0,0.0
4,at-t-servers-dandy-site.webflow.io,phishing,0.0,0.0,0.0,0.0,0.0
5,fmgajwiaogxlbympnmak.cx,dga,0.01219,0.965517,0.0,0.003435,0.0
6,futurebuy.pl,malware,0.197853,0.000207,0.0,0.001446,0.0
7,nwk.yxynmsl.tk,phishing,0.165783,0.0,0.0,0.0,0.0
8,abstraction.bit,dga,0.000724,0.091952,0.0,0.0,0.0
9,familylifeconnections.com,malware,0.83097,0.999994,0.0,1.0,0.0


In [8]:
preliminary_results_df[["domain_name", "label", "malware_deepnn_result", "malware_lgbm_result", "malware_xgboost_result"]]

Unnamed: 0,domain_name,label,malware_deepnn_result,malware_lgbm_result,malware_xgboost_result
0,olkano.com,dga,0.001556,0.01397,0.047478
1,brandstorm.loreal.com,benign,0.000122,0.000514,0.002626
2,wytypowany-zwyciezca.com,malware,0.759562,0.383536,0.286979
3,oopt.fr,malware,0.12285,0.277118,0.445704
4,16chan.org,malware,0.627538,0.708378,0.589509
5,down.t0kbnpobket.biz,phishing,0.000658,0.010174,0.005187
6,bolsadetrabajosgt.com,phishing,0.0,6.7e-05,0.002233
7,www.sprawozdania24.click,malware,0.849272,0.677257,0.90705
8,discord.writemall.top,phishing,0.000252,0.000411,0.002364
9,patasblancasvet.com,malware,0.794451,0.881981,0.884218


In [9]:
preliminary_results_df[preliminary_results_df["label"].isin(["benign", "malware"])][
    ["domain_name", "label", "malware_deepnn_result", "malware_lgbm_result", "malware_xgboost_result", "malware_dns_nn_result", "malware_rdap_nn_result", "malware_geo_nn_result"]
]


Unnamed: 0,domain_name,label,malware_deepnn_result,malware_lgbm_result,malware_xgboost_result,malware_dns_nn_result,malware_rdap_nn_result,malware_geo_nn_result
1,brandstorm.loreal.com,benign,0.000122,0.000514,0.002626,0.184353,0.001344,0.399883
2,wytypowany-zwyciezca.com,malware,0.759562,0.383536,0.286979,0.775626,0.021642,0.399883
3,oopt.fr,malware,0.12285,0.277118,0.445704,0.070304,0.024303,0.209001
4,16chan.org,malware,0.627538,0.708378,0.589509,0.941982,0.04969,0.605018
7,www.sprawozdania24.click,malware,0.849272,0.677257,0.90705,0.589228,0.398529,0.399883
9,patasblancasvet.com,malware,0.794451,0.881981,0.884218,0.888655,0.061971,0.689959
10,www.ipop.at,benign,2.1e-05,0.000861,0.003839,0.032429,0.175987,0.129151
13,vurbont.xyz,malware,0.889243,0.986667,0.955447,0.398776,0.310197,0.399883
14,srebny23.aleksytymon.com.pl,malware,0.000637,0.001621,0.002364,0.33502,0.000711,0.399883
15,tools.pinpoll.com,benign,8e-06,0.000501,0.002408,0.076731,0.047117,0.016066


In [10]:
preliminary_results_df[preliminary_results_df["label"].isin(["benign", "malware"])][
    ["domain_name", "label", "malware_deepnn_result", "malware_dns_nn_result", "malware_rdap_nn_result", "malware_geo_nn_result", "malware_ip_nn_result"]
]


Unnamed: 0,domain_name,label,malware_deepnn_result,malware_dns_nn_result,malware_rdap_nn_result,malware_geo_nn_result,malware_ip_nn_result
1,brandstorm.loreal.com,benign,0.000122,0.184353,0.001344,0.399883,0.548636
2,wytypowany-zwyciezca.com,malware,0.759562,0.775626,0.021642,0.399883,0.212463
3,oopt.fr,malware,0.12285,0.070304,0.024303,0.209001,0.450995
4,16chan.org,malware,0.627538,0.941982,0.04969,0.605018,0.279973
7,www.sprawozdania24.click,malware,0.849272,0.589228,0.398529,0.399883,0.452909
9,patasblancasvet.com,malware,0.794451,0.888655,0.061971,0.689959,0.330984
10,www.ipop.at,benign,2.1e-05,0.032429,0.175987,0.129151,0.00627
13,vurbont.xyz,malware,0.889243,0.398776,0.310197,0.399883,0.462252
14,srebny23.aleksytymon.com.pl,malware,0.000637,0.33502,0.000711,0.399883,0.637277
15,tools.pinpoll.com,benign,8e-06,0.076731,0.047117,0.016066,0.005857


In [11]:
#print(preliminary_results_df['malware_geo_nn_result'].unique())
#for value in preliminary_results_df['malware_geo_nn_result'].unique():
#    print(value)

In [12]:
preliminary_results_df[["label", "dga_binary_deepnn_result", "dga_binary_lgbm_result"]]

Unnamed: 0,label,dga_binary_deepnn_result,dga_binary_lgbm_result
0,dga,0.922759,0.195645
1,benign,0.0,2e-06
2,malware,0.156716,0.007199
3,malware,0.136516,0.001036
4,malware,0.027879,0.00108
5,phishing,1e-06,0.001439
6,phishing,0.482309,0.006404
7,malware,0.0,0.01045
8,phishing,0.0,6e-06
9,malware,0.226647,0.003


In [13]:
preliminary_results_df[["domain_name", "label", "phishing_cnn_result", "phishing_deepnn_result", "phishing_lgbm_result", "phishing_xgboost_result", "phishing_dns_nn_result", "phishing_rdap_nn_result"]]

Unnamed: 0,domain_name,label,phishing_cnn_result,phishing_deepnn_result,phishing_lgbm_result,phishing_xgboost_result,phishing_dns_nn_result,phishing_rdap_nn_result
0,olkano.com,dga,0.993279,3e-06,0.00061,0.000632,0.184629,0.020158
1,brandstorm.loreal.com,benign,0.012323,0.000363,0.004557,0.001432,0.370768,0.004173
2,wytypowany-zwyciezca.com,malware,0.017531,0.851645,0.02322,0.003387,0.235347,0.003177
3,oopt.fr,malware,0.001619,0.006011,0.007934,0.021717,0.022058,0.006285
4,16chan.org,malware,0.080616,0.004772,0.019884,0.053516,0.027071,0.025906
5,down.t0kbnpobket.biz,phishing,0.972836,0.999845,0.931129,0.930527,0.079166,0.243867
6,bolsadetrabajosgt.com,phishing,0.963146,0.99474,0.032518,0.115738,0.023512,0.003566
7,www.sprawozdania24.click,malware,0.166446,0.974665,0.025775,0.063857,0.268165,0.006818
8,discord.writemall.top,phishing,0.514197,0.897723,0.313745,0.154308,0.153307,0.01844
9,patasblancasvet.com,malware,0.176411,0.966612,0.01673,0.104643,0.541051,0.017393


In [14]:
preliminary_results_df[["domain_name", "label", "malware_deepnn_result", "malware_lgbm_result", "malware_xgboost_result"]]

Unnamed: 0,domain_name,label,malware_deepnn_result,malware_lgbm_result,malware_xgboost_result
0,olkano.com,dga,0.001556,0.01397,0.047478
1,brandstorm.loreal.com,benign,0.000122,0.000514,0.002626
2,wytypowany-zwyciezca.com,malware,0.759562,0.383536,0.286979
3,oopt.fr,malware,0.12285,0.277118,0.445704
4,16chan.org,malware,0.627538,0.708378,0.589509
5,down.t0kbnpobket.biz,phishing,0.000658,0.010174,0.005187
6,bolsadetrabajosgt.com,phishing,0.0,6.7e-05,0.002233
7,www.sprawozdania24.click,malware,0.849272,0.677257,0.90705
8,discord.writemall.top,phishing,0.000252,0.000411,0.002364
9,patasblancasvet.com,malware,0.794451,0.881981,0.884218


In [15]:
preliminary_results_df[["domain_name", "label", "badness_probability"]]

KeyError: "['badness_probability'] not in index"

In [None]:
preliminary_results_df