In [47]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.utils import resample
import matplotlib.pyplot as plt 
from sklearn.preprocessing import MinMaxScaler

In [48]:
# Load the processed dataset; the path is relative to the notebook's working directory.
df = pd.read_csv(filepath_or_buffer="../../Data/processed/final.csv")

In [49]:
# Flag URLs that use the HTTPS scheme (1) versus anything else (0).
# URL schemes are case-insensitive (RFC 3986, section 3.1), so the URL is
# lower-cased before the prefix test: "HTTPS://..." must count as HTTPS too.
# na=False maps a missing URL to 0 rather than letting astype(int) fail on NaN.
df['has_https'] = (
    df['url']
    .str.lower()
    .str.startswith('https://', na=False)
    .astype(int)
)

In [50]:
# Print the frame summary (row count, column names, non-null counts, dtypes)
# as a quick sanity check on the loaded data.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 545484 entries, 0 to 545483
Data columns (total 37 columns):
 #   Column                     Non-Null Count   Dtype  
---  ------                     --------------   -----  
 0   url                        545484 non-null  object 
 1   label                      545484 non-null  object 
 2   num_dots                   545484 non-null  int64  
 3   num_hyphens                545484 non-null  int64  
 4   num_slashes                545484 non-null  int64  
 5   num_digits                 545484 non-null  int64  
 6   num_equals                 545484 non-null  int64  
 7   ip_in_url                  545484 non-null  int64  
 8   has_highly_susp_keyword    545484 non-null  int64  
 9   hostname_length            545484 non-null  int64  
 10  has_additional_susp_words  545484 non-null  int64  
 11  has_susp_word_in_path      545484 non-null  int64  
 12  has_susp_word_in_hostname  545484 non-null  int64  
 13  has_suspicious_tld         54

In [51]:
# Column holding the class label the model is trained to predict.
TARGET_COL = 'result'

# Columns fed to the model as inputs, in the order they appear in X.
# Not used as model inputs (commented out in the original list):
# 'has_https' and 'filename_length'.
feature_cols = [
    'num_dots', 'num_hyphens', 'num_slashes', 'num_digits', 'num_equals',
    'ip_in_url', 'has_highly_susp_keyword', 'hostname_length',
    'has_additional_susp_words', 'has_susp_word_in_path',
    'has_susp_word_in_hostname', 'has_suspicious_tld', 'has_uncommon_tld',
    'url_entropy', 'subdomain_count', 'no_www', 'tld_length', 'short_url',
    'consonant_pair_ratio', 'punctuation_density', 'hex_encoding_count',
    'num_subdirs', 'AtSymbol', 'NumUnderscore', 'HasQueryString',
    'path_length', 'max_dir_length',
    'hostname_uncommonness', 'longest_dir_uncommonness',
]

# Feature matrix and target vector, both taken from the same frame.
X = df.loc[:, feature_cols]
y = df.loc[:, TARGET_COL]

In [52]:
# Disabled experiment: every line in this cell is commented out, so nothing runs.
# If re-enabled it would run recursive feature elimination (RFE) around a
# LogisticRegression, keep 10 features, and print the selected column names.
# from sklearn.feature_selection import RFE
# from sklearn.linear_model import LogisticRegression

# model = LogisticRegression(max_iter=1000)

# rfe = RFE(model, n_features_to_select=10)

# rfe.fit(X, y)

# selected_features = rfe.support_

# selected_feature_names = X.columns[selected_features]
# print("Selected features by RFE:", selected_feature_names.tolist())

In [53]:
# Hold out 20% of the rows for evaluation; stratifying on y keeps the class
# proportions the same in both parts, and the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# Learn the scaling statistics from the training rows only, then apply the
# same transformation to both parts so no test information reaches the scaler.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [54]:
# 5-nearest-neighbour classifier with distance-weighted votes, fitted on the
# scaled training features (fit returns the estimator itself).
knn = KNeighborsClassifier(
    n_neighbors=5,
    weights='distance',
    metric='minkowski',
).fit(X_train_scaled, y_train)

# Predicted labels for the scaled held-out rows.
y_pred = knn.predict(X_test_scaled)

In [55]:
from IPython.display import Markdown, display
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Evaluate the held-out predictions: overall accuracy, the confusion matrix,
# and the per-class text report.
accuracy = accuracy_score(y_test, y_pred)
conf = confusion_matrix(y_test, y_pred)
report = classification_report(y_test, y_pred)

# Render the three results as Markdown; display() shows its arguments in order.
display(
    Markdown(f"**Accuracy:** `{accuracy:.4f}`"),
    Markdown(f"**Matrice de confusion:**\n```\n{conf}\n```"),
    Markdown(f"**Rapport de classification:**\n```\n{report}\n```"),
)

**Accuracy:** `0.9858`

**Matrice de confusion:**
```
[[68621   527]
 [ 1023 38926]]
```

**Rapport de classification:**
```
              precision    recall  f1-score   support

         0.0       0.99      0.99      0.99     69148
         1.0       0.99      0.97      0.98     39949

    accuracy                           0.99    109097
   macro avg       0.99      0.98      0.98    109097
weighted avg       0.99      0.99      0.99    109097

```

In [56]:
from sklearn.metrics import f1_score, precision_score, recall_score

# Headline scores for the held-out predictions. The calls use scikit-learn's
# default arguments; the rendered values correspond to treating label 1 as the
# positive class.
# The confusion matrix and classification report are deliberately not
# recomputed here: this cell never displayed them, and `conf` / `report`
# already hold the same values from the preceding evaluation cell.
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

# `display` and `Markdown` are imported from IPython.display in the preceding
# evaluation cell.
display(Markdown(f"**precision:** `{precision:.4f}`"))
display(Markdown(f"**recall:** `{recall:.4f}`"))
display(Markdown(f"**f1:** `{f1:.4f}`"))

**precision:** `0.9866`

**recall:** `0.9744`

**f1:** `0.9805`

In [57]:
# Disabled variant (whole cell is commented out): KNN with n_neighbors=5 and no
# explicit `weights` argument, followed by a printed confusion matrix and report.
# knn = KNeighborsClassifier(n_neighbors=5)
# knn.fit(X_train_scaled, y_train)
# y_pred = knn.predict(X_test_scaled)

# print(confusion_matrix(y_test, y_pred))
# print(classification_report(y_test, y_pred))

In [58]:
# Disabled variant (whole cell is commented out): KNN with n_neighbors=5 and
# weights='distance', followed by a printed confusion matrix and report.
# knn = KNeighborsClassifier(n_neighbors=5, weights='distance')
# knn.fit(X_train_scaled, y_train)

# y_pred = knn.predict(X_test_scaled)

# print(confusion_matrix(y_test, y_pred))
# print(classification_report(y_test, y_pred))


In [59]:
# Disabled analysis (whole cell is commented out, nothing runs).
# If re-enabled it would:
#   1. select the test rows where y_test == 1 but y_pred == 0 (false negatives),
#   2. print how many there are and a 10-row sample of selected columns,
#   3. compare the mean of several features between those rows and all rows
#      of df where 'result' == 1.
# false_negatives_mask = (y_test == 1) & (y_pred == 0)
# false_negatives_indices = X_test[false_negatives_mask].index

# false_negatives = df.loc[false_negatives_indices]

# print(f"Number of false negatives: {len(false_negatives)}")
# print("\nSample of false negative URLs:")
# print(false_negatives[['url', 'label', 'hostname_length', 'url_entropy', 'num_dots', 'num_slashes', 'num_digits']].head(10))

# malicious_mask = df['result'] == 1
# true_malicious = df[malicious_mask]

# print("\nFeature averages comparison:")
# print("\nFalse Negatives vs All Malicious URLs:")
# comparison_features = ['hostname_length', 'url_entropy', 'num_dots', 'num_slashes', 
#                       'num_digits', 'subdomain_count', 'consonant_pair_ratio']
                      
# for feature in comparison_features:
#     fn_avg = false_negatives[feature].mean()
#     mal_avg = true_malicious[feature].mean()
#     print(f"{feature}:")
#     print(f"  False Negatives: {fn_avg:.2f}")
#     print(f"  All Malicious:  {mal_avg:.2f}")

In [60]:
# Disabled variant (whole cell is commented out): KNN with n_neighbors=3,
# followed by a printed confusion matrix and report.
# knn = KNeighborsClassifier(n_neighbors=3)   
# knn.fit(X_train_scaled, y_train)

# y_pred = knn.predict(X_test_scaled)

# print(confusion_matrix(y_test, y_pred))
# print(classification_report(y_test, y_pred))
