In [63]:
!pip install numpy
!pip install pandas
!pip install scikit-learn



In [64]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score

In [68]:
def process_dataset(df):
    # Convert "type" to binary digits
    df['type'] = df['type'].apply(lambda x: 0 if x == 'benign' else 1)
    
    # Convert url to lowercase
    df['url'] = df['url'].str.lower()

    # Remove port from consideration
    df['url'] = df['url'].str.split(":").str[0]

    # Determine if the url is an HTTPS address (secure)
    df['is_https'] = df['url'].apply(lambda x: 1 if 'https://' in x else 0)

    # Purge http(s):// -- it's just noise.
    df['url'] = df['url'].str.replace('http://', '')
    df['url'] = df['url'].str.replace('https://', '')

    # Record the length of each URL
    df['url_length'] = df['url'].apply(len)

    # Find the full domain (including subdomains)
    df['full_domain'] = df['url'].str.split("/").str[0]

    # Count the number of subdomains
    df['subdomain_count'] = df['full_domain'].apply(lambda x: len(x.split('.')) - 1)
    
    # Count the number of special characters found in the address
    df['count_special_chars'] = df['url'].apply(lambda u: sum(u.count(c) for c in '!@#$%^&*()[]{};:,./<>?|`~-=+'))
    
    # Record the ratio between numerical digits and the total length of the URL
    df['digit_to_length_ratio'] = df['url'].apply(lambda u: sum(c.isdigit() for c in u) / len(u) if len(u) > 0 else 0)
    
    # Remove URL and other non-numeric artifacts
    df.drop(['url', 'full_domain'], axis=1, inplace=True)

In [69]:
DF_full = pd.read_csv("malicious_phish.csv")
process_dataset(DF_full)

X_full = DF_full.drop('type', axis=1)
y_full = DF_full['type']
X_train, X_test, y_train, y_test = train_test_split(X_full, y_full, test_size=0.2, random_state=42)


In [70]:
# Test my data prep strategy using pre-built libraries...

# Train logistic regression model
logreg = LogisticRegression(max_iter=1000)
logreg.fit(X_train, y_train)

# Predict and evaluate
y_pred = logreg.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

Accuracy: 0.8195394620658942
              precision    recall  f1-score   support

           0       0.88      0.84      0.86     85778
           1       0.72      0.78      0.75     44461

    accuracy                           0.82    130239
   macro avg       0.80      0.81      0.80    130239
weighted avg       0.82      0.82      0.82    130239

