# Test of the clasification pipeline

In [1]:
import os

# Function to change to the parent directory
def change_to_parent_directory():
    # Check if the directory has already been changed
    if not os.environ.get('DIR_CHANGED'):
        try:
            current_dir = os.path.dirname(os.path.abspath(__file__))
        except NameError:
            current_dir = os.getcwd()
        parent_dir = os.path.abspath(os.path.join(current_dir, os.pardir))
        os.chdir(parent_dir)
        os.environ['DIR_CHANGED'] = '1'
        print(f"Current working directory changed to: {os.getcwd()}")
    else:
        print("Directory has already been changed.")

# Call the function to change the working directory
change_to_parent_directory()

Current working directory changed to: /home/ihranicky/git/domainradar-clf


## Optional: Create testing dataset
Note: If you want do to this, set create_test_parquet to **True**

In [2]:
create_test_parquet = False

if create_test_parquet:
    import pandas as pd
    import numpy as np

    # List of input Parquet files along with their maximum rows and desired labels
    input_files = [
        {'file': 'testdata/2405_clftest_benign_filtered.parquet', 'max_rows': 4000, 'label': 'benign'},
        {'file': 'testdata/2405_clftest_phishing_filtered.parquet', 'max_rows': 480, 'label': 'phishing'},
        {'file': 'testdata/2405_clftest_malware_filtered.parquet', 'max_rows': 292, 'label': 'malware'},
        {'file': 'testdata/dga_2310.parquet', 'max_rows': 300, 'label': 'dga'},
    ]
    
    input_files = [
        {'file': 'testdata/2405_clftest_benign_filtered.parquet', 'max_rows': 300, 'label': 'benign'},
        {'file': 'testdata/2405_clftest_phishing_filtered.parquet', 'max_rows': 308, 'label': 'phishing'},
        {'file': 'testdata/2405_clftest_malware_filtered.parquet', 'max_rows': 292, 'label': 'malware'},
        {'file': 'testdata/dga_2310.parquet', 'max_rows': 300, 'label': 'dga'},
    ]


    # Number of rows to select in total
    #n_rows = 5072
    n_rows = 1200
    
    # Read the first file to get the initial columns and create the first dataframe
    first_file_info = input_files[0]
    combined_df = pd.read_parquet(first_file_info['file'])
    
    # Limit the number of rows if necessary for the first file
    if len(combined_df) > first_file_info['max_rows']:
        combined_df = combined_df.sample(n=first_file_info['max_rows'], random_state=1)
    
    # Overwrite the "label" column with the specified label for the first file
    combined_df['label'] = first_file_info['label']
    
    # Get the columns from the first dataframe
    all_columns = combined_df.columns.tolist()

    # Process the remaining files
    for file_info in input_files[1:]:
        df = pd.read_parquet(file_info['file'])
        
        # Limit the number of rows if necessary
        if len(df) > file_info['max_rows']:
            df = df.sample(n=file_info['max_rows'], random_state=1)
        
        # Overwrite the "label" column with the specified label
        df['label'] = file_info['label']
        
        # Ensure all columns from the first dataframe are present
        for col in all_columns:
            if col not in df.columns:
                df[col] = None
        
        # Align the dataframe to the columns of the first dataframe
        df = df[all_columns]
        
        # Append the dataframe to the combined dataframe
        combined_df = pd.concat([combined_df, df], ignore_index=True)
    
    # Randomly select n_rows rows from the combined DataFrame
    selected_rows = combined_df.sample(n=n_rows, random_state=1)  # random_state for reproducibility
    
    # Save the selected rows to a new Parquet file
    selected_rows.to_parquet('testdata/ver.parquet')


## Run classification

In [3]:
# Specify the parquet file with the dataset for classification
test_dataset = 'testdata/ver.parquet'

# Number of domain names to classify with each run of the pipeline (0 = classify all)
CHUNK_SIZE = 30

In [4]:
import pandas as pd
import numpy as np
import shap
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from classifiers.pipeline import Pipeline
from classifiers.options import PipelineOptions
#import matplotlib.pyplot as plt

# Initialize the classification pipeline
clf_options = PipelineOptions()
clf = Pipeline(clf_options)

# Read the input parquet file
input_df = pd.read_parquet(test_dataset)

# Function to map labels to 'benign' or 'malign'
def map_label(label):
    if label == 'benign':
        return 'benign'
    else:
        return f'malign ({label})'

# Function to convert labels to binary classes
def binary_label(label):
    return 'negative' if label == 'benign' else 'positive'

# Apply label mapping
input_df['mapped_label'] = input_df['label'].apply(map_label)
input_df['binary_label'] = input_df['label'].apply(binary_label)

# Ensure SHAP JavaScript initialization
#shap.initjs()

# Determine the number of chunks
num_chunks = (len(input_df) + CHUNK_SIZE - 1) // CHUNK_SIZE if CHUNK_SIZE > 0 else 1

# Initialize counters for overall statistics
total_true_labels = []
total_pred_labels = []

# Format string for aligned output
header_format_str = "{:<3} | {:<50} | {:<18} | {:<10} | {:<10}"
data_format_str = "{:<3} | {:<50} | {:<18} | {:<10} | {:.6f}"

# Process the dataframe in chunks
for i in range(num_chunks):
    if CHUNK_SIZE > 0:
        start_idx = i * CHUNK_SIZE
        end_idx = start_idx + CHUNK_SIZE
        chunk_df = input_df[start_idx:end_idx]
    else:
        chunk_df = input_df

    # Perform your classification or processing on the working_df here
    print(f"===== Processing chunk {i+1}/{num_chunks} =====")

    chunk_without_label = chunk_df.drop(columns=['label', 'mapped_label', 'binary_label']) # Label should not be known to classifiers
    chunk_results = clf.classify_domains(chunk_without_label)

    # Collect predictions and true labels
    true_labels = chunk_df['binary_label'].values
    pred_labels = []
    for result in chunk_results:
        pred_label = 'negative' if result['aggregate_probability'] < 0.5 else 'positive'
        pred_labels.append(pred_label)

    # Update overall statistics
    total_true_labels.extend(true_labels)
    total_pred_labels.extend(pred_labels)

    # Display header for results
    print(header_format_str.format("Res", "Domain Name", "Actual Label", "Predicted", "Probability"))

    # Display results for each domain
    for idx, result in enumerate(chunk_results):
        actual_label = chunk_df.iloc[idx]['mapped_label']
        predicted_label = 'benign' if pred_labels[idx] == 'negative' else 'malign'
        domain_name = result['domain_name']
        aggregate_probability = result['aggregate_probability']
        status = "OK" if pred_labels[idx] == true_labels[idx] else "ER"
        print(data_format_str.format(status, domain_name[:50], actual_label, predicted_label, aggregate_probability))
        
        # Run debug_domain method for misclassified domains
        #if status == "ER":
            #print(f"Debugging misclassified domain: {domain_name}")
            #ndf_data = clf.pp.df_to_NDF(chunk_df, "phishing")  # Convert to NDF
            #debug_data = clf.clf_phishing_cnn.debug_domain(domain_name, ndf_data, chunk_df, n_top_features=10)
            
            # Print out the top n feature importances and values for each classifier
            #for classifier, data in debug_data.items():
            #    print(f"\nClassifier: {classifier}")
            #    print(f"Top {len(data['top_features'])} features for domain '{domain_name}':")
            #    for feature_info in data['top_features']:
            #        print(f"Feature: {feature_info['feature']}, Value: {feature_info['value']}, SHAP Value: {feature_info['shap_value']}")
                
                # Display the force plot for phishing_lgbm
                #if classifier == "phishing_cnn":
                #    base_value, shap_values, domain_row = data['force_plot_data']
                #    shap.force_plot(base_value, shap_values, domain_row)
                #    plt.show()

    # Calculate metrics for the current chunk
    accuracy = accuracy_score(true_labels, pred_labels)
    precision = precision_score(true_labels, pred_labels, pos_label='positive', average='binary')
    recall = recall_score(true_labels, pred_labels, pos_label='positive', average='binary')
    f1 = f1_score(true_labels, pred_labels, pos_label='positive', average='binary')

    tn, fp, fn, tp = confusion_matrix(true_labels, pred_labels, labels=['negative', 'positive']).ravel()
    false_positives = fp
    false_negatives = fn
    total_positives = tp + fp
    total_negatives = tn + fn

    fp_ratio = (false_positives / total_positives) if total_positives > 0 else 0
    fn_ratio = (false_negatives / total_negatives) if total_negatives > 0 else 0

    print(f"Chunk {i+1}/{num_chunks} metrics:")
    print(f"Accuracy: {accuracy}")
    print(f"Precision: {precision}")
    print(f"Recall: {recall}")
    print(f"F1 Score: {f1}")
    print(f"False Positives: {false_positives} ({fp_ratio * 100:.2f}%)")
    print(f"False Negatives: {false_negatives} ({fn_ratio * 100:.2f}%)")
    print(f"===== Chunk {i+1}/{num_chunks} completed. =====")

# Calculate overall metrics
overall_accuracy = accuracy_score(total_true_labels, total_pred_labels)
overall_precision = precision_score(total_true_labels, total_pred_labels, pos_label='positive', average='binary')
overall_recall = recall_score(total_true_labels, total_pred_labels, pos_label='positive', average='binary')
overall_f1 = f1_score(total_true_labels, total_pred_labels, pos_label='positive', average='binary')

overall_tn, overall_fp, overall_fn, overall_tp = confusion_matrix(total_true_labels, total_pred_labels, labels=['negative', 'positive']).ravel()
overall_false_positives = overall_fp
overall_false_negatives = overall_fn
overall_total_positives = overall_tp + overall_fp
overall_total_negatives = overall_tn + overall_fn

overall_fp_ratio = (overall_false_positives / overall_total_positives) if overall_total_positives > 0 else 0
overall_fn_ratio = (overall_false_negatives / overall_total_negatives) if overall_total_negatives > 0 else 0

print("Overall metrics:")
print(f"Overall Accuracy: {overall_accuracy}")
print(f"Overall Precision: {overall_precision}")
print(f"Overall Recall: {overall_recall}")
print(f"Overall F1 Score: {overall_f1}")
print(f"Overall False Positives: {overall_false_positives} ({overall_fp_ratio * 100:.2f}%)")
print(f"Overall False Negatives: {overall_false_negatives} ({overall_fn_ratio * 100:.2f}%)")


Using `tqdm.autonotebook.tqdm` in notebook mode. Use `tqdm.tqdm` instead to force console mode (e.g. in jupyter console)
2024-07-03 15:48:54.812447: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-07-03 15:48:54.812472: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-07-03 15:48:54.813206: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-07-03 15:48:54.817372: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other 

CNN model created
===== Processing chunk 1/40 =====
Res | Domain Name                                        | Actual Label       | Predicted  | Probability
OK  | www.everyoneloveswood.com                          | malign (malware)   | malign     | 0.634493
OK  | cpanel.rocketsmm.net                               | malign (malware)   | malign     | 0.592318
ER  | agkohezrh.info                                     | malign (dga)       | benign     | 0.413436
OK  | www.enjoylifeworks.co.jp                           | benign             | benign     | 0.000000
OK  | 1pia4srtfabx1ubr2s1fh3uth.net                      | malign (dga)       | malign     | 0.541325
OK  | virze.cz                                           | benign             | benign     | 0.000000
OK  | www.shopinportdouglas.com                          | malign (malware)   | malign     | 0.698988
OK  | s69.bwgteamstar.com                                | malign (phishing)  | malign     | 0.754854
OK  | sde77744.pages.dev   

Res | Domain Name                                        | Actual Label       | Predicted  | Probability
OK  | dh.jeanyeung111.link                               | malign (phishing)  | malign     | 0.630041
OK  | sdk.dcmn.io                                        | benign             | benign     | 0.000000
OK  | turbologo.com                                      | benign             | benign     | 0.000000
OK  | blpost.top                                         | malign (malware)   | malign     | 0.756539
OK  | www.vlkoid.cz                                      | benign             | benign     | 0.000000
ER  | 3102jtnu.computer-mod-s.tech                       | malign (phishing)  | benign     | 0.000000
ER  | hwmiphf.sc                                         | malign (dga)       | benign     | 0.262938
ER  | nfivtldcsa.net                                     | malign (dga)       | benign     | 0.437629
ER  | vrehxlgskcv.work                                   | malign (dga)       |

Res | Domain Name                                        | Actual Label       | Predicted  | Probability
OK  | 588c9baba1a8c5c75e29aaf7154bb607.antoniowawr.warsz | malign (malware)   | malign     | 0.597535
OK  | page-risks-see-more.vercel.app                     | malign (phishing)  | malign     | 0.702223
OK  | en.supplypoland.com                                | malign (malware)   | malign     | 0.874276
OK  | openseaprojtsclaimz13.vercel.app                   | malign (phishing)  | malign     | 0.684641
OK  | snjqjwbswujlpgfbipafgnqn.cx                        | malign (dga)       | malign     | 0.574007
ER  | mj-api.botai.top                                   | malign (phishing)  | benign     | 0.081699
OK  | www.gepair.com                                     | benign             | benign     | 0.000000
OK  | lodgerbnbaproving550901.com                        | malign (phishing)  | malign     | 0.730218
OK  | api.bobole.net                                     | benign             |

Res | Domain Name                                        | Actual Label       | Predicted  | Probability
OK  | gvrjnwpnmdtlrll.info                               | malign (dga)       | malign     | 0.566494
OK  | vazhotipol.info                                    | malign (malware)   | malign     | 0.891068
OK  | atlokemin.wixsite.com                              | malign (phishing)  | malign     | 0.721204
OK  | telegramsexxxforyou.pages.dev                      | malign (phishing)  | malign     | 0.738380
OK  | youngtrailblazer.click                             | malign (malware)   | malign     | 0.757785
ER  | cccfnuarpe.info                                    | malign (dga)       | benign     | 0.320916
ER  | timlpoinzssvfoom.org                               | malign (dga)       | benign     | 0.427153
ER  | zuklin.marinadobrawa.pl                            | malign (malware)   | benign     | 0.218237
OK  | c460.despachodeabogadosmcr.com                     | malign (malware)   |

Res | Domain Name                                        | Actual Label       | Predicted  | Probability
OK  | oluchukwuabali.github.io                           | malign (phishing)  | malign     | 0.726201
ER  | readydesigns.com                                   | benign             | malign     | 0.544202
OK  | detacenested.com                                   | malign (malware)   | malign     | 0.885466
ER  | rabuvb.com                                         | malign (dga)       | benign     | 0.424964
OK  | lwnltxpmmmthkiv.com                                | malign (dga)       | malign     | 0.565089
OK  | lsewftpxypfpxexc.ug                                | malign (dga)       | malign     | 0.552340
OK  | fqbhsytlgvyjqnmhlaag.xxx                           | malign (dga)       | malign     | 0.564400
OK  | steel-confused-myrtle.glitch.me                    | malign (phishing)  | malign     | 0.648371
OK  | xy111x225x218x203xy.mcdn.bilivideo.cn              | benign             |

Res | Domain Name                                        | Actual Label       | Predicted  | Probability
ER  | dblink.qwer.men                                    | malign (phishing)  | benign     | 0.361965
OK  | zgktpslfgyflqi.com                                 | malign (dga)       | malign     | 0.559443
OK  | commonsenseafricasblog.com                         | malign (malware)   | malign     | 0.774262
OK  | renouvellement-mensuel.com                         | malign (phishing)  | malign     | 0.694444
OK  | gerasqyter.xyz                                     | malign (malware)   | malign     | 0.776080
OK  | fdge5vpb2ta0yzwymu8kwgbcuitpy2gz4uklaoil1vtg1903na | malign (phishing)  | malign     | 0.734173
OK  | resolvasaude.com.br                                | malign (phishing)  | malign     | 0.646964
OK  | stellarshine.live                                  | malign (malware)   | malign     | 0.876393
ER  | pop.lennoxparkandrec.com                           | malign (malware)   |

Res | Domain Name                                        | Actual Label       | Predicted  | Probability
OK  | bustrip-images.bushikaku.net                       | benign             | benign     | 0.000000
OK  | ingposts.top                                       | malign (malware)   | malign     | 0.755829
OK  | 860567204.tapecontent.net                          | benign             | benign     | 0.000000
ER  | vidmate.network                                    | benign             | malign     | 0.618111
ER  | mbubebxogjudywh.mn                                 | malign (dga)       | benign     | 0.309255
ER  | losgjcdjtgxuwaxd.kz                                | malign (dga)       | benign     | 0.449429
OK  | j0hnuf1o3un26167kk5t1jbqlqq.biz                    | malign (dga)       | malign     | 0.532033
OK  | 212-222-29-200.edge.agora.io                       | benign             | benign     | 0.000000
ER  | alonesafety.net                                    | malign (dga)       |

Res | Domain Name                                        | Actual Label       | Predicted  | Probability
OK  | t.me-join.com                                      | malign (phishing)  | malign     | 0.712874
ER  | uni.pangdao.org                                    | malign (phishing)  | benign     | 0.000000
OK  | v1.oddsserve.com                                   | benign             | benign     | 0.000000
ER  | www.shevvstudios.com                               | malign (malware)   | benign     | 0.000000
ER  | w611ut5uo.hier-im-netz.de                          | malign (phishing)  | benign     | 0.000000
OK  | configserver-drcn.platform.hihonorcloud.com        | benign             | benign     | 0.000000
OK  | okolicznosciwypadku.click                          | malign (malware)   | malign     | 0.782152
ER  | qhzepe.com                                         | malign (dga)       | benign     | 0.471524
OK  | nptnhxyjhxvkoot.biz                                | malign (dga)       |

Res | Domain Name                                        | Actual Label       | Predicted  | Probability
OK  | www.hledampraci.cz                                 | benign             | benign     | 0.000000
ER  | idvtnkyboupid.net                                  | malign (dga)       | benign     | 0.492162
OK  | outlander-lb.shop                                  | benign             | benign     | 0.080492
OK  | bafybeidvkboe27s67yrcdr74gfgwh26fhiidh2l53rmh55mvg | malign (phishing)  | malign     | 0.722543
OK  | poly.skoda-auto.cz                                 | benign             | benign     | 0.000000
OK  | justt-watch-now.xyz                                | malign (malware)   | malign     | 0.655597
OK  | starlancercenter.com                               | malign (malware)   | malign     | 0.765985
ER  | ayicsu.com                                         | malign (dga)       | benign     | 0.457488
OK  | item-list22119.atwebpages.com                      | malign (phishing)  |

Res | Domain Name                                        | Actual Label       | Predicted  | Probability
OK  | 1xgkpcuwpf99f34050011gky86.org                     | malign (dga)       | malign     | 0.540713
OK  | transportkazan.ru                                  | benign             | benign     | 0.307614
ER  | agyncivbkee.work                                   | malign (dga)       | benign     | 0.290173
OK  | www.sposobzyciowy.click                            | malign (malware)   | malign     | 0.862252
OK  | jxjydzlvmzk67eqnygxnqaypui15l18d60o61.com          | malign (dga)       | malign     | 0.691881
OK  | c93ec95a80b21090fd7e34e305abcca9.nitwos.pl         | malign (malware)   | malign     | 0.851181
ER  | fjz.fm                                             | malign (dga)       | benign     | 0.058376
OK  | taghleefpack.ignacyklaudiusz.pl                    | malign (malware)   | malign     | 0.539619
OK  | cwzqkxrvlmifrqq.com                                | malign (dga)       |

Res | Domain Name                                        | Actual Label       | Predicted  | Probability
OK  | rutabagacalculusar.pro                             | malign (malware)   | malign     | 0.880818
OK  | nakitel.com                                        | benign             | benign     | 0.000000
OK  | pixel.jobteaser.com                                | benign             | benign     | 0.000000
OK  | 19rgz8h16w8khxhfcv6p10vhsam.com                    | malign (dga)       | malign     | 0.579144
OK  | aspb1.cdn.asset.aparat.com                         | benign             | benign     | 0.000000
OK  | www.technikapoznani.cz                             | benign             | benign     | 0.000000
ER  | www.fahuwk.com                                     | benign             | malign     | 0.687804
ER  | pacificobdpec.ac-page.com                          | malign (phishing)  | benign     | 0.492176
OK  | pub-e5c2c5311c064a7bb152dc2d6b4b0d49.r2.dev        | malign (malware)   |

Res | Domain Name                                        | Actual Label       | Predicted  | Probability
OK  | bt-business-5-22-2024.webflow.io                   | malign (phishing)  | malign     | 0.707325
OK  | hmxfsdrvxfq.ga                                     | malign (dga)       | malign     | 0.546189
ER  | promptly-complete-bay-renewable.trycloudflare.com  | malign (phishing)  | benign     | 0.144298
OK  | bt-business-5-21-2024.webflow.io                   | malign (phishing)  | malign     | 0.711079
OK  | 1jiltjpvihzbmfn5stckufmtv.com                      | malign (dga)       | malign     | 0.573986
OK  | www.olomoucdnes.cz                                 | benign             | benign     | 0.000000
ER  | inncome.manasan.top                                | malign (malware)   | benign     | 0.447529
OK  | mehakramzan.github.io                              | malign (phishing)  | malign     | 0.722061
OK  | www.graphpad.com                                   | benign             |

Res | Domain Name                                        | Actual Label       | Predicted  | Probability
OK  | pxqmxxmkzdbddfiedgqbx.net                          | malign (dga)       | malign     | 0.553537
OK  | hw-sh-pcdn-17.biliapi.net                          | benign             | benign     | 0.000000
OK  | exchange.cryptex.net                               | benign             | benign     | 0.000000
ER  | smalldawn.hn                                       | malign (dga)       | benign     | 0.032010
ER  | 0x-code.financialmarketsworld.com                  | malign (malware)   | benign     | 0.196055
ER  | foxmodeq.com                                       | benign             | malign     | 0.724762
OK  | vsosyyuxlvvfpxr.org                                | malign (dga)       | malign     | 0.594934
OK  | ridertools.metrarail.com                           | benign             | benign     | 0.000000
OK  | urbanabodes.homes                                  | malign (malware)   |

Res | Domain Name                                        | Actual Label       | Predicted  | Probability
OK  | attnet-106864.weeblysite.com                       | malign (phishing)  | malign     | 0.734555
OK  | www.windguru.cz                                    | benign             | benign     | 0.000000
ER  | waitthere.net                                      | malign (dga)       | benign     | 0.303061
OK  | www.fibstaff.com                                   | malign (malware)   | malign     | 0.882098
OK  | 0503c3bdb2699f8d02db98d87d97f4ea.loftx.pl          | malign (malware)   | malign     | 0.743454
ER  | wepik.com                                          | malign (phishing)  | benign     | 0.000000
ER  | temperate-kangaroo-kw0hgq.mystrikingly.com         | malign (phishing)  | benign     | 0.459561
OK  | pikolmnfjihjuu.com                                 | malign (dga)       | malign     | 0.556550
OK  | www.habitatetjardin.com                            | benign             |

## Optional: Generate preliminary results for training the final aggregation classifier

In [4]:
import pandas as pd
import numpy as np
from classifiers.pipeline import Pipeline

# Initialize the classification pipeline
clf = Pipeline()

# Read the input parquet file
input_df = pd.read_parquet(test_dataset)

input_df = input_df.sample(frac=1).reset_index(drop=True)

#preliminary_results_df = clf.generate_preliminary_results(input_df, output_file="test_preliminary_results.parquet")
preliminary_results_df = clf.generate_preliminary_results(input_df, add_final=False)

2024-07-03 15:01:05.772085: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-07-03 15:01:05.772106: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-07-03 15:01:05.772856: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-07-03 15:01:05.777188: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
2024-07-03 15:01:06.938209: E external/local_xla/xla/

CNN model created


In [5]:
pd.reset_option('display.max_rows')
pd.set_option('display.max_rows', 1000)
pd.set_option('display.max_columns', 50)
pd.options.display.float_format = '{:.6f}'.format
preliminary_results_df

Unnamed: 0,domain_name,dns_available,dns_nonzero,tls_available,tls_nonzero,ip_available,ip_nonzero,rdap_available,rdap_nonzero,geo_available,geo_nonzero,phishing_cnn_result,phishing_lgbm_result,phishing_xgboost_result,phishing_deepnn_result,phishing_dns_nn_result,phishing_rdap_nn_result,phishing_geo_nn_result,phishing_ip_nn_result,malware_lgbm_result,malware_xgboost_result,malware_deepnn_result,malware_dns_nn_result,malware_rdap_nn_result,malware_geo_nn_result,malware_ip_nn_result,dga_binary_deepnn_result,dga_binary_lgbm_result,phishing_sum,phishing_avg,phishing_prod,malware_sum,malware_avg,malware_prod,dga_binary_sum,dga_binary_avg,dga_binary_prod,total_sum,total_avg,total_prod,label
0,wisdomscholars.com,1.000000,0.725000,1.000000,0.750000,1.000000,0.500000,1.000000,0.291667,1.000000,0.722222,0.935031,0.994439,0.991256,0.883857,0.057567,0.753173,0.093928,0.184353,0.956685,0.935702,0.732948,0.386309,0.182575,0.238424,0.279973,0.065536,0.000337,4.893604,0.611701,0.000612,3.712617,0.530374,0.003089,0.065873,0.032937,0.000022,8.672094,0.510123,0.000000,malware
1,stats.futureweb.at,0.650000,0.325000,1.000000,0.750000,1.000000,0.625000,0.833333,0.291667,1.000000,0.666667,0.001283,0.059505,0.002969,0.000659,0.152775,0.001936,0.006728,0.178120,0.000443,0.002364,0.000027,0.085749,0.001294,0.022765,0.341488,0.000000,0.000003,0.403977,0.050497,0.000000,0.454130,0.064876,0.000000,0.000003,0.000002,0.000000,0.858110,0.050477,0.000000,benign
2,woppinto.duckdns.org,0.675000,0.325000,0.041667,0.000000,1.000000,0.625000,1.000000,0.583333,1.000000,0.722222,0.993279,0.658288,0.895738,0.999964,0.999766,0.973423,0.390085,0.429732,0.013774,0.353591,0.676899,0.999450,0.031765,0.307506,0.387937,0.000000,0.000008,6.340275,0.792534,0.095545,2.770922,0.395846,0.000012,0.000008,0.000004,0.000000,9.111206,0.535953,0.000000,phishing
3,pacificobdpec.ac-page.com,0.600000,0.250000,1.000000,0.791667,1.000000,0.875000,1.000000,0.833333,1.000000,1.000000,0.814275,0.151813,0.024485,0.985616,0.240719,0.036080,0.512537,0.665002,0.000903,0.002626,0.033487,0.191421,0.004590,0.223286,0.312961,0.000000,0.000001,3.430527,0.428816,0.000009,0.769275,0.109896,0.000000,0.000001,0.000001,0.000000,4.199804,0.247047,0.000000,phishing
4,0503c3bdb2699f8d02db98d87d97f4ea.loftx.pl,0.600000,0.250000,1.000000,0.833333,1.000000,0.875000,0.958333,0.666667,1.000000,0.722222,0.993279,0.590505,0.847595,0.998712,0.207163,0.381047,0.521558,0.674459,0.256077,0.132916,0.797197,0.186566,0.427601,0.399883,0.342416,0.000000,0.001259,5.214318,0.651790,0.013787,2.542657,0.363237,0.000296,0.001259,0.000630,0.000000,7.758235,0.456367,0.000000,malware
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1195,sde77744.pages.dev,0.900000,0.600000,1.000000,0.833333,1.000000,0.875000,1.000000,0.833333,1.000000,1.000000,0.993279,0.608244,0.746095,0.999995,0.989056,0.928009,0.134651,0.527282,0.000072,0.002233,0.000000,0.414147,0.000036,0.000013,0.000059,0.000000,0.000001,5.926611,0.740826,0.029374,0.416562,0.059509,0.000000,0.000001,0.000001,0.000000,6.343174,0.373128,0.000000,phishing
1196,pic3.iqiyipic.com,0.600000,0.200000,1.000000,0.750000,1.000000,0.375000,1.000000,0.583333,1.000000,0.666667,0.005312,0.001055,0.001184,0.017804,0.005678,0.004933,0.004338,0.292419,0.000487,0.002408,0.000046,0.087584,0.051795,0.060200,0.006217,0.000031,0.000018,0.332722,0.041590,0.000000,0.208737,0.029820,0.000000,0.000049,0.000025,0.000000,0.541509,0.031853,0.000000,benign
1197,lopwkftvliagjpghtmj.us,0.475000,0.000000,0.041667,0.000000,0.625000,0.000000,0.833333,0.000000,1.000000,0.000000,0.329686,0.001121,0.002307,0.132502,0.184629,0.020158,0.387495,0.414404,0.005901,0.010848,0.010164,0.394762,0.024412,0.604780,0.443381,0.999813,0.999940,1.472302,0.184038,0.000000,1.494249,0.213464,0.000000,1.999753,0.999876,0.999753,4.966304,0.292136,0.000000,dga
1198,8650oped.computer-mod-s.tech,0.600000,0.300000,1.000000,0.833333,1.000000,0.875000,1.000000,0.833333,1.000000,0.722222,0.974122,0.105811,0.005161,0.158851,0.098541,0.033686,0.521558,0.604932,0.000850,0.002626,0.003297,0.056795,0.004575,0.399883,0.159036,0.000000,0.000000,2.502661,0.312833,0.000000,0.627063,0.089580,0.000000,0.000000,0.000000,0.000000,3.129724,0.184101,0.000000,phishing


In [6]:
pd.set_option('display.max_rows', None)
#preliminary_results_df[preliminary_results_df["label"].isin(["benign", "phishing"])][["domain_name", "label", "phishing_cnn_result", "phishing_deepnn_result", "phishing_lgbm_result", "phishing_xgboost_result", "phishing_dns_nn_result", "phishing_rdap_nn_result", "phishing_geo_nn_result", "phishing_ip_nn_result"]]
preliminary_results_df[preliminary_results_df["label"].isin(["benign", "phishing"])][["domain_name", "label", "phishing_cnn_result", "phishing_xgboost_result", "phishing_dns_nn_result", "phishing_rdap_nn_result", "phishing_geo_nn_result", "phishing_ip_nn_result"]]

Unnamed: 0,domain_name,label,phishing_cnn_result,phishing_xgboost_result,phishing_dns_nn_result,phishing_rdap_nn_result,phishing_geo_nn_result,phishing_ip_nn_result
1,stats.futureweb.at,benign,0.001283,0.002969,0.152775,0.001936,0.006728,0.17812
2,woppinto.duckdns.org,phishing,0.993279,0.895738,0.999766,0.973423,0.390085,0.429732
3,pacificobdpec.ac-page.com,phishing,0.814275,0.024485,0.240719,0.03608,0.512537,0.665002
6,attnet-106864.weeblysite.com,phishing,1.0,0.998485,0.987409,0.995463,0.390085,0.209473
7,cf.bmzk.link,phishing,0.883891,0.009202,0.214686,0.036916,0.397401,0.534836
8,8966msah.computer-mod-s.tech,phishing,0.971233,0.00501,0.098541,0.033686,0.521558,0.604665
10,barosan77.tripod.com,phishing,0.006805,0.001089,0.383864,0.003479,0.390085,0.11638
11,view.email.ralphlauren.co.uk,benign,0.002006,0.00047,0.092274,0.005583,0.09455,0.240045
12,mbmb.pages.dev,phishing,0.99024,0.455739,0.97736,0.611549,0.385592,0.578969
13,login-att-communication-109040.weeblysite.com,phishing,1.0,0.978927,0.987409,0.995463,0.390085,0.209436


In [7]:
preliminary_results_df[["domain_name", "label", "malware_deepnn_result", "malware_lgbm_result", "malware_xgboost_result"]]

Unnamed: 0,domain_name,label,malware_deepnn_result,malware_lgbm_result,malware_xgboost_result
0,wisdomscholars.com,malware,0.732948,0.956685,0.935702
1,stats.futureweb.at,benign,2.7e-05,0.000443,0.002364
2,woppinto.duckdns.org,phishing,0.676899,0.013774,0.353591
3,pacificobdpec.ac-page.com,phishing,0.033487,0.000903,0.002626
4,0503c3bdb2699f8d02db98d87d97f4ea.loftx.pl,malware,0.797197,0.256077,0.132916
5,cfd.hadl-cab.com,malware,0.761132,0.914706,0.851169
6,attnet-106864.weeblysite.com,phishing,0.000759,0.001629,0.002626
7,cf.bmzk.link,phishing,0.089197,0.003877,0.003996
8,8966msah.computer-mod-s.tech,phishing,0.018075,0.00085,0.002626
9,uwvgeroutklrums.info,dga,0.004608,0.017808,0.088723


In [8]:
preliminary_results_df[preliminary_results_df["label"].isin(["benign", "malware"])][
    ["domain_name", "label", "malware_deepnn_result", "malware_lgbm_result", "malware_xgboost_result", "malware_dns_nn_result", "malware_rdap_nn_result", "malware_geo_nn_result"]
]


Unnamed: 0,domain_name,label,malware_deepnn_result,malware_lgbm_result,malware_xgboost_result,malware_dns_nn_result,malware_rdap_nn_result,malware_geo_nn_result
0,wisdomscholars.com,malware,0.732948,0.956685,0.935702,0.386309,0.182575,0.238424
1,stats.futureweb.at,benign,2.7e-05,0.000443,0.002364,0.085749,0.001294,0.022765
4,0503c3bdb2699f8d02db98d87d97f4ea.loftx.pl,malware,0.797197,0.256077,0.132916,0.186566,0.427601,0.399883
5,cfd.hadl-cab.com,malware,0.761132,0.914706,0.851169,0.197743,0.434186,0.688216
11,view.email.ralphlauren.co.uk,benign,0.000131,0.000534,0.002664,0.20754,0.012542,0.275012
16,www.gepair.com,benign,0.033125,0.012946,0.089669,0.123352,0.099892,0.364639
18,www.vantenk.shop,malware,0.051647,0.273976,0.313256,0.118659,0.224325,0.60478
20,www.transfermarkt.com.tr,benign,1.4e-05,0.000164,0.002233,0.005151,0.000377,0.307506
21,storage.polycam.io,benign,5e-06,0.000371,0.002408,0.079055,0.007341,0.223286
24,www.dankor.co.uk,benign,6e-06,0.000689,0.002752,0.104519,0.046661,0.307506


In [9]:
preliminary_results_df[preliminary_results_df["label"].isin(["benign", "malware"])][
    ["domain_name", "label", "malware_deepnn_result", "malware_dns_nn_result", "malware_rdap_nn_result", "malware_geo_nn_result", "malware_ip_nn_result"]
]


Unnamed: 0,domain_name,label,malware_deepnn_result,malware_dns_nn_result,malware_rdap_nn_result,malware_geo_nn_result,malware_ip_nn_result
0,wisdomscholars.com,malware,0.732948,0.386309,0.182575,0.238424,0.279973
1,stats.futureweb.at,benign,2.7e-05,0.085749,0.001294,0.022765,0.341488
4,0503c3bdb2699f8d02db98d87d97f4ea.loftx.pl,malware,0.797197,0.186566,0.427601,0.399883,0.342416
5,cfd.hadl-cab.com,malware,0.761132,0.197743,0.434186,0.688216,0.78777
11,view.email.ralphlauren.co.uk,benign,0.000131,0.20754,0.012542,0.275012,0.329426
16,www.gepair.com,benign,0.033125,0.123352,0.099892,0.364639,0.333355
18,www.vantenk.shop,malware,0.051647,0.118659,0.224325,0.60478,0.444184
20,www.transfermarkt.com.tr,benign,1.4e-05,0.005151,0.000377,0.307506,0.000115
21,storage.polycam.io,benign,5e-06,0.079055,0.007341,0.223286,0.007017
24,www.dankor.co.uk,benign,6e-06,0.104519,0.046661,0.307506,0.004726


In [10]:
#print(preliminary_results_df['malware_geo_nn_result'].unique())
#for value in preliminary_results_df['malware_geo_nn_result'].unique():
#    print(value)

In [11]:
preliminary_results_df[["label", "dga_binary_deepnn_result", "dga_binary_lgbm_result"]]

Unnamed: 0,label,dga_binary_deepnn_result,dga_binary_lgbm_result
0,malware,0.065536,0.000337
1,benign,0.0,3e-06
2,phishing,0.0,8e-06
3,phishing,0.0,1e-06
4,malware,0.0,0.001259
5,malware,1e-06,2e-06
6,phishing,7.8e-05,1e-06
7,phishing,2.1e-05,5.4e-05
8,phishing,0.0,0.0
9,dga,0.999788,0.999845


In [12]:
preliminary_results_df[["domain_name", "label", "phishing_cnn_result", "phishing_deepnn_result", "phishing_lgbm_result", "phishing_xgboost_result", "phishing_dns_nn_result", "phishing_rdap_nn_result"]]

Unnamed: 0,domain_name,label,phishing_cnn_result,phishing_deepnn_result,phishing_lgbm_result,phishing_xgboost_result,phishing_dns_nn_result,phishing_rdap_nn_result
0,wisdomscholars.com,malware,0.935031,0.883857,0.994439,0.991256,0.057567,0.753173
1,stats.futureweb.at,benign,0.001283,0.000659,0.059505,0.002969,0.152775,0.001936
2,woppinto.duckdns.org,phishing,0.993279,0.999964,0.658288,0.895738,0.999766,0.973423
3,pacificobdpec.ac-page.com,phishing,0.814275,0.985616,0.151813,0.024485,0.240719,0.03608
4,0503c3bdb2699f8d02db98d87d97f4ea.loftx.pl,malware,0.993279,0.998712,0.590505,0.847595,0.207163,0.381047
5,cfd.hadl-cab.com,malware,0.970334,0.999943,0.995961,0.968935,0.173971,0.347343
6,attnet-106864.weeblysite.com,phishing,1.0,0.99999,0.999458,0.998485,0.987409,0.995463
7,cf.bmzk.link,phishing,0.883891,0.988083,0.021238,0.009202,0.214686,0.036916
8,8966msah.computer-mod-s.tech,phishing,0.971233,0.184894,0.107421,0.00501,0.098541,0.033686
9,uwvgeroutklrums.info,dga,0.019529,0.267421,0.002519,0.010781,0.184629,0.020158


In [13]:
preliminary_results_df[["domain_name", "label", "malware_deepnn_result", "malware_lgbm_result", "malware_xgboost_result"]]

Unnamed: 0,domain_name,label,malware_deepnn_result,malware_lgbm_result,malware_xgboost_result
0,wisdomscholars.com,malware,0.732948,0.956685,0.935702
1,stats.futureweb.at,benign,2.7e-05,0.000443,0.002364
2,woppinto.duckdns.org,phishing,0.676899,0.013774,0.353591
3,pacificobdpec.ac-page.com,phishing,0.033487,0.000903,0.002626
4,0503c3bdb2699f8d02db98d87d97f4ea.loftx.pl,malware,0.797197,0.256077,0.132916
5,cfd.hadl-cab.com,malware,0.761132,0.914706,0.851169
6,attnet-106864.weeblysite.com,phishing,0.000759,0.001629,0.002626
7,cf.bmzk.link,phishing,0.089197,0.003877,0.003996
8,8966msah.computer-mod-s.tech,phishing,0.018075,0.00085,0.002626
9,uwvgeroutklrums.info,dga,0.004608,0.017808,0.088723


In [14]:
preliminary_results_df[["domain_name", "label", "badness_probability"]]

KeyError: "['badness_probability'] not in index"

In [None]:
preliminary_results_df