In [2]:
import pandas as pd
import numpy as np
from urllib.parse import urlparse
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix

# --- 1. Load Data ---
# This assumes you have 'malicious_phish.csv' in your '../data/' folder
df = pd.read_csv('/Users/albertomartinez/Projects/python-ml-lab/data/malicious_phish.csv')
print(f"Dataset loaded with {len(df)} rows.")

# --- 2. Feature Engineering --- 
# Feature 2: Total URL Length
df['url_length'] = df['url'].apply(len)

# Feature 2: Count dots
df['num_dots'] = df['url'].apply(lambda url: url.count('.'))

# Feature 3: Presence of '@' symbol 
df['has_at_symbol'] = df['url'].apply(lambda x: 0 if x == 'benign' else 1)

# --- 3. Label Encoding
df['label'] = df['type'].apply(lambda x: 0 if x == 'benign' else 1)

# --- 4. Data Split for Modeling ---
features  = [ 'url_length', 'num_dots', 'has_at_symbol']
X = df[features]
y = df['label']

X_train, X_test, y_train, y_test, = train_test_split( X, y, test_size=0.3, random_state=42, stratify=y)

# --- 5. Model Training and Evaluation ---
model = LogisticRegression(random_state=42, solver='liblinear')
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

# --- 6. Results --- 
print("\n--- Evaluation Results ---")
print("Classificaation Reports:")
print(classification_report(y_test, y_pred))
print("\n Confusion Matrix: ")
print(confusion_matrix(y_test, y_pred))







                                    




Dataset loaded with 651191 rows.

--- Evaluation Results ---
Classificaation Reports:
              precision    recall  f1-score   support

           0       0.69      0.94      0.79    128431
           1       0.60      0.17      0.27     66927

    accuracy                           0.68    195358
   macro avg       0.64      0.56      0.53    195358
weighted avg       0.66      0.68      0.61    195358


 Confusion Matrix: 
[[120798   7633]
 [ 55365  11562]]
