In [15]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import StackingClassifier
from modules.PreProcess import PreProcessURLS
from modules.FeatureExtraction import FeatureExtractionURLS
import joblib
import pandas as pd

from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import f1_score
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split



In [16]:

rawUrlFilePath= 'dataset/te.csv'
preProcessedFilePath = 'dataset/newprePro.csv'

In [None]:
    
FeatureExtractionURLS(rawUrlFilePath)

In [5]:
PreProcessURLS().mergeFiles(merged_file_path = preProcessedFilePath)

Merging complete. Merged file saved as: dataset/newprePro.csv


In [2]:
# clean output chuncks
PreProcessURLS().deleteFilesInDirectory('output/')

In [17]:

file_path = preProcessedFilePath  
df = pd.read_csv(file_path)

In [18]:

df = df.drop('getDomain', axis=1)

predict_rows = df.sample(15, random_state=42)
df = df.drop(predict_rows.index)

df_Y = df['labels']
df_X = df.drop('labels', axis=1)

X_train, X_test, y_train, y_test = train_test_split(df_X, df_Y, test_size=0.20, random_state=42)

predict_features = predict_rows.drop('labels', axis=1)
predict_labels = predict_rows['labels']

In [19]:


base_learners_set1 = [('rf', RandomForestClassifier(criterion='entropy', max_features='sqrt', min_samples_leaf=1, min_samples_split=3, n_estimators=100)), 
                ('mlp', MLPClassifier(max_iter=500, activation='relu', alpha=0.001, hidden_layer_sizes=(20,), learning_rate='adaptive', solver='adam')),
                ('knn', KNeighborsClassifier(algorithm='auto', leaf_size=15, n_neighbors=20, p=1, weights='distance'))]

base_learners_set2 = [('rf', RandomForestClassifier(criterion='entropy', max_features='sqrt', min_samples_leaf=1, min_samples_split=3, n_estimators=100)), 
                ('mlp', MLPClassifier(max_iter=500, activation='relu', alpha=0.001, hidden_layer_sizes=(20,), learning_rate='adaptive', solver='adam')), 
                ('svm', SVC(C=10, kernel='rbf', tol=0.001))]

base_learners_set3 = [('rf', RandomForestClassifier(criterion='entropy', max_features='sqrt', min_samples_leaf=1, min_samples_split=3, n_estimators=100)),
                ('knn', KNeighborsClassifier(algorithm='auto', leaf_size=15, n_neighbors=20, p=1, weights='distance')), 
                ('svm', SVC(C=10, kernel='rbf', tol=0.001))]

base_learners_set4 = [('mlp', MLPClassifier(max_iter=500, activation='relu', alpha=0.001, hidden_layer_sizes=(20,), learning_rate='adaptive', solver='adam')),
                ('knn', KNeighborsClassifier(algorithm='auto', leaf_size=15, n_neighbors=20, p=1, weights='distance')), 
                ('svm', SVC(C=10, kernel='rbf', tol=0.001))]



In [22]:
best_precision_accuracy=0

base_learners = [base_learners_set1, base_learners_set2, base_learners_set3, base_learners_set4]

for idx, base_learner_group in enumerate(base_learners):
    meta_learner = LogisticRegression()
    clf = StackingClassifier(estimators=base_learner_group, final_estimator=meta_learner)

    # Train the model on the full training data
    clf.fit(X_train, y_train)
    
    test_predictions = clf.predict(X_test)
    precision = precision_score(y_test, test_predictions) * 100
    accuracy = accuracy_score(y_test, test_predictions) * 100
    combined_metric = (precision + accuracy) / 2 

    print(f'Model Configuration {idx + 1} - Test Set Evaluation:')
    print('Accuracy:', accuracy)
    print('Precision:', precision)
    print('F1 Score:', f1_score(y_test, test_predictions) * 100)
    print('Recall:', recall_score(y_test, test_predictions) * 100)
    print('ROC AUC:', roc_auc_score(y_test, test_predictions) * 100)
    print('Confusion Matrix:', confusion_matrix(y_test, test_predictions))
    print('Combined Metric:', combined_metric)
    print('-----------------------------------------\n')

    # Predict on the 15 rows and compare with actual labels
    predict_predictions = clf.predict(predict_features)
    print('Prediction on 15 Rows:')
    print('Predicted Labels:', predict_predictions)
    print('Actual Labels:', predict_labels.values)
    print('-----------------------------------------\n')

    # Save the best configuration and model based on the combined metric
    if combined_metric > best_precision_accuracy:
        best_precision_accuracy = combined_metric
        best_config = base_learner_group
        best_model = clf


print(f'The best model configuration is {best_config} with a combined metric of {best_precision_accuracy:.2f}')





Model Configuration 1 - Test Set Evaluation:
Accuracy: 94.82758620689656
Precision: 95.45454545454545
F1 Score: 87.5
Recall: 80.76923076923077
ROC AUC: 89.82905982905983
Confusion Matrix: [[89  1]
 [ 5 21]]
Combined Metric: 95.141065830721
-----------------------------------------

Prediction on 15 Rows:
Predicted Labels: [0 1 0 1 0 0 0 0 1 0 0 0 0 1 0]
Actual Labels: [0 1 0 1 0 0 0 0 1 0 0 0 1 0 0]
-----------------------------------------





Model Configuration 2 - Test Set Evaluation:
Accuracy: 93.96551724137932
Precision: 88.0
F1 Score: 86.27450980392157
Recall: 84.61538461538461
ROC AUC: 90.64102564102564
Confusion Matrix: [[87  3]
 [ 4 22]]
Combined Metric: 90.98275862068965
-----------------------------------------

Prediction on 15 Rows:
Predicted Labels: [0 1 0 0 0 0 0 0 1 0 0 0 1 1 0]
Actual Labels: [0 1 0 1 0 0 0 0 1 0 0 0 1 0 0]
-----------------------------------------

Model Configuration 3 - Test Set Evaluation:
Accuracy: 94.82758620689656
Precision: 95.45454545454545
F1 Score: 87.5
Recall: 80.76923076923077
ROC AUC: 89.82905982905983
Confusion Matrix: [[89  1]
 [ 5 21]]
Combined Metric: 95.141065830721
-----------------------------------------

Prediction on 15 Rows:
Predicted Labels: [0 1 0 1 0 0 0 0 1 0 0 0 0 1 0]
Actual Labels: [0 1 0 1 0 0 0 0 1 0 0 0 1 0 0]
-----------------------------------------





Model Configuration 4 - Test Set Evaluation:
Accuracy: 94.82758620689656
Precision: 95.45454545454545
F1 Score: 87.5
Recall: 80.76923076923077
ROC AUC: 89.82905982905983
Confusion Matrix: [[89  1]
 [ 5 21]]
Combined Metric: 95.141065830721
-----------------------------------------

Prediction on 15 Rows:
Predicted Labels: [0 1 0 1 0 0 0 0 1 0 0 0 0 1 0]
Actual Labels: [0 1 0 1 0 0 0 0 1 0 0 0 1 0 0]
-----------------------------------------

The best model configuration is [('rf', RandomForestClassifier(criterion='entropy', min_samples_split=3)), ('mlp', MLPClassifier(alpha=0.001, hidden_layer_sizes=(20,), learning_rate='adaptive',
              max_iter=500)), ('knn', KNeighborsClassifier(leaf_size=15, n_neighbors=20, p=1, weights='distance'))] with a combined metric of 95.14




In [23]:

joblib.dump(best_model, 'model/NewBest_model.pkl')

['model/NewBest_model.pkl']