<a href="https://colab.research.google.com/github/MrDev333/PhishingDetect/blob/main/notebooks/ResolveURL.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd

df = pd.read_csv('/content/malicious_phish.csv')
display(df.head())

Unnamed: 0,url,type
0,br-icloud.com.br,phishing
1,mp3raid.com/music/krizz_kaliko.html,benign
2,bopsecrets.org/rexroth/cr/1.htm,benign
3,http://www.garage-pirenne.be/index.php?option=...,defacement
4,http://adventure-nicaragua.net/index.php?optio...,defacement


In [2]:
# Data Preprocessing: Extract features from URLs
from urllib.parse import urlparse
import re

def extract_features(url):
    features = {}
    try:
        parsed_url = urlparse(url)
        features['scheme'] = parsed_url.scheme
        features['netloc'] = parsed_url.netloc
        features['path'] = parsed_url.path
        features['params'] = parsed_url.params
        features['query'] = parsed_url.query
        features['fragment'] = parsed_url.fragment
        features['domain'] = parsed_url.hostname
        features['length'] = len(url)
        features['num_dots'] = url.count('.')
        features['num_hyphens'] = url.count('-')
        features['num_at'] = url.count('@')
        features['num_question'] = url.count('?')
        features['num_ampersand'] = url.count('&')
        features['num_equals'] = url.count('=')
        features['num_exclamation'] = url.count('!')
        features['num_space'] = url.count(' ')
        features['num_tilde'] = url.count('~')
        features['num_comma'] = url.count(',')
        features['num_plus'] = url.count('+')
        features['num_asterisk'] = url.count('*')
        features['num_hash'] = url.count('#')
        features['num_dollar'] = url.count('$')
        features['num_percent'] = url.count('%')
        features['has_http'] = 'http' in parsed_url.scheme
        features['has_https'] = 'https' in parsed_url.scheme
        features['has_ftp'] = 'ftp' in parsed_url.scheme
        features['has_email'] = '@' in url

        # Check for IP address in domain
        if features['domain']:
            features['is_ip'] = bool(re.match(r'^\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}$', features['domain']))
        else:
            features['is_ip'] = False


    except:
        pass # Handle potential errors during parsing
    return features

df_features = df['url'].apply(lambda x: pd.Series(extract_features(x)))

# Combine original dataframe with features
df_processed = pd.concat([df, df_features], axis=1)

display(df_processed.head())

Unnamed: 0,url,type,scheme,netloc,path,params,query,fragment,domain,length,...,num_plus,num_asterisk,num_hash,num_dollar,num_percent,has_http,has_https,has_ftp,has_email,is_ip
0,br-icloud.com.br,phishing,,,br-icloud.com.br,,,,,16,...,0,0,0,0,0,False,False,False,False,False
1,mp3raid.com/music/krizz_kaliko.html,benign,,,mp3raid.com/music/krizz_kaliko.html,,,,,35,...,0,0,0,0,0,False,False,False,False,False
2,bopsecrets.org/rexroth/cr/1.htm,benign,,,bopsecrets.org/rexroth/cr/1.htm,,,,,31,...,0,0,0,0,0,False,False,False,False,False
3,http://www.garage-pirenne.be/index.php?option=...,defacement,http,www.garage-pirenne.be,/index.php,,option=com_content&view=article&id=70&vsig70_0=15,,www.garage-pirenne.be,88,...,0,0,0,0,0,True,False,False,False,False
4,http://adventure-nicaragua.net/index.php?optio...,defacement,http,adventure-nicaragua.net,/index.php,,option=com_mailto&tmpl=component&link=aHR0cDov...,,adventure-nicaragua.net,235,...,0,0,0,0,0,True,False,False,False,False


In [3]:
# Prepare data for classification
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score

# Select features and target variable
# Exclude non-numeric and potentially problematic columns
features = df_processed.drop(['url', 'type', 'scheme', 'netloc', 'path', 'params', 'query', 'fragment', 'domain'], axis=1)
target = df_processed['type']

# Convert boolean columns to numeric (0 or 1)
for col in features.columns:
    if features[col].dtype == 'bool':
        features[col] = features[col].astype(int)

# Handle potential missing values by filling with 0 (or another appropriate strategy)
features = features.fillna(0)


# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=42)

# Train a classification model (e.g., RandomForestClassifier)
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Predict on the test set
y_pred = model.predict(X_test)

# Evaluate the model
print("Classification Report:")
print(classification_report(y_test, y_pred))
print("Accuracy Score:", accuracy_score(y_test, y_pred))

Classification Report:
              precision    recall  f1-score   support

      benign       0.89      0.96      0.92     85778
  defacement       0.89      0.95      0.92     19104
     malware       0.92      0.84      0.88      6521
    phishing       0.69      0.39      0.50     18836

    accuracy                           0.87    130239
   macro avg       0.85      0.79      0.81    130239
weighted avg       0.86      0.87      0.86    130239

Accuracy Score: 0.8732637689171446
