# Malicious URL Predictor

# Step 1: Import Necessary Libraries

In [1]:
import pandas as pd
import numpy as np
import re
import string
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from scipy.sparse import hstack, csr_matrix

# 1. import pandas as pd
- Purpose: pandas is a Python library used for data manipulation and analysis.
- Usage in the project:
    - Reads the dataset (pd.read_csv()).
    - Handles and processes tabular data using DataFrame.
    - Extracts and organizes features for model training.

# 2. import numpy as np
- Purpose: numpy is used for numerical computations and handling arrays.
- Usage in the project:
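    - Available for converting data into numerical format and performing mathematical operations on dataset features.
    - Note: `np` is not referenced directly by any code cell shown in this notebook.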
 
# 3. import re
- Purpose: re is the built-in Python module for working with regular expressions (pattern matching in text).
- Usage in the project:
    - Counts special (non-alphanumeric) characters in each URL with a regular expression; digits are counted separately with str.isdigit().
    - Such character patterns might indicate malicious behavior.
 
# 4. import string
- Purpose: string is a built-in Python module that provides constants like alphabets and punctuation.
- Usage in the project:
    - Can be used to filter or manipulate textual data when preprocessing URLs.
    - Note: `string` is not referenced directly by any code cell shown in this notebook.
 
# 5. from sklearn.model_selection import train_test_split
- Purpose: train_test_split is used to split the dataset into training and testing subsets.
- Usage in the project:
    - Divides the data so the model is trained on one portion and tested on another to evaluate its performance.
 
# 6. from sklearn.feature_extraction.text import TfidfVectorizer
- Purpose: TfidfVectorizer converts text data (URLs) into numerical format using TF-IDF (Term Frequency - Inverse Document Frequency).
- Usage in the project:
    - Extracts important keywords from URLs to help in classification.
    - Reduces the impact of commonly occurring words.
 
# 7. from sklearn.ensemble import RandomForestClassifier
- Purpose: RandomForestClassifier is a machine learning algorithm that builds multiple decision trees and combines them to improve accuracy.
- Usage in the project:
    - Classifies URLs as benign, defacement, phishing, or malware (the four labels in the dataset).
    - Works on the combined feature set used here: sparse TF-IDF columns plus numeric count features.
 
# 8. from sklearn.metrics import accuracy_score, classification_report
- Purpose: These functions evaluate the performance of the trained model.
- Usage in the project:
    - accuracy_score(): Measures how many predictions were correct.
    - classification_report(): Provides precision, recall, and F1-score for each class
 
# 9. from scipy.sparse import hstack, csr_matrix
- Purpose: hstack and csr_matrix handle sparse matrices, which are memory-efficient representations of large datasets with many zero values.
- Usage in the project:
    - csr_matrix converts extracted URL features into a sparse matrix format.
    - hstack combines TF-IDF vectors with extracted numerical features.

In [2]:
# Load the labelled URL dataset; later cells read its 'url' and 'type' columns.
df = pd.read_csv("malicious.csv")

# Sanity-check the load: preview the first rows, then count rows per label.
preview = df.head()
print(preview)
label_counts = df['type'].value_counts()
print(label_counts)

                                                 url        type
0                                   br-icloud.com.br    phishing
1                mp3raid.com/music/krizz_kaliko.html      benign
2                    bopsecrets.org/rexroth/cr/1.htm      benign
3  http://www.garage-pirenne.be/index.php?option=...  defacement
4  http://adventure-nicaragua.net/index.php?optio...  defacement
benign        428103
defacement     96457
phishing       94111
malware        32520
Name: type, dtype: int64


# 1. df = pd.read_csv("malicious.csv")
- Purpose: This reads the CSV file "malicious.csv" into a pandas DataFrame (df).
- How it works:
    - pd.read_csv() is used to load data from a CSV file.
    - "malicious.csv" is the file containing the dataset (which we assume has a list of URLs along with their labels).
    - The result is stored in df, which is a pandas DataFrame (a tabular data structure similar to an Excel spreadsheet).

# 2. print(df.head())
- Purpose: Displays the first 5 rows of the dataset.
- How it works:
    - .head() retrieves the top 5 rows of the DataFrame.
    - Helps to quickly inspect the data structure and verify if it was loaded correctly.
 
# 3. print(df['type'].value_counts())
- Purpose: Counts the number of occurrences of each category in the type column.
- How it works:
    - df['type'] extracts the 'type' column (which contains labels like benign, phishing, malware).
    - .value_counts() counts how many times each unique value appears in that column.
    - This helps identify class imbalance in the dataset.

In [3]:
def extract_features(url):
    """Compute simple character-count features for a URL string.

    Args:
        url: The URL text to measure.

    Returns:
        dict: Three integer counts, in this key order: ``length`` (total
        number of characters), ``num_digits`` (characters for which
        ``str.isdigit`` is true) and ``num_special_chars`` (characters that
        are not in ``a-z``, ``A-Z`` or ``0-9``).
    """
    total_chars = len(url)
    digit_count = len([ch for ch in url if ch.isdigit()])
    # Every regex match is exactly one character outside the ASCII alphanumerics.
    special_count = sum(1 for _ in re.finditer(r'[^a-zA-Z0-9]', url))
    # Key order matters: it becomes the column order of the feature table.
    return {
        'length': total_chars,
        'num_digits': digit_count,
        'num_special_chars': special_count,
    }

# Build one feature dict per URL. Each value goes through str() first so that
# extract_features always receives a string.
feature_rows = [extract_features(str(raw_url)) for raw_url in df['url']]

# Turn the list of dicts into a table with one column per feature, then
# replace any missing (NaN) values with 0 so the table is fully numeric.
df_features = pd.DataFrame(feature_rows).fillna(0)

In [4]:
# TF-IDF over the raw URL text: English stop words are dropped and the
# vocabulary is capped at 5000 terms.
tfidf = TfidfVectorizer(
    max_features=5000,
    stop_words='english',
)

# Learn the vocabulary and IDF weights, and vectorize every URL, in one call.
# NOTE(review): this is fitted on all rows, before the train/test split done in
# a later cell, so test URLs also influence the vocabulary and IDF weights.
x_tfidf = tfidf.fit_transform(df['url'])

# 1. Create a TF-IDF Vectorizer Object
- TfidfVectorizer converts text (URLs) into numerical features using Term Frequency-Inverse Document Frequency (TF-IDF).
- stop_words='english' removes common English words (like "the", "and", "is") to reduce noise.
- max_features=5000 limits the number of unique words to 5,000, reducing memory usage.

# 2. Apply TF-IDF Transformation to the URL Column
- fit_transform(df['url']) processes the URLs by:
    - Tokenizing (splitting URLs into words).
    - Removing stop words.
    - Calculating the TF-IDF scores for words.
    - Converting the result into a sparse matrix (efficient memory storage).


In [5]:
from scipy.sparse import csr_matrix, hstack  # Ensure both are explicitly imported

# Wrap the count-feature table as a sparse matrix so it can be stacked next to
# the (already sparse) TF-IDF output.
df_features_sparse = csr_matrix(df_features.values)

# Column layout: TF-IDF columns first, then the count features. predict_url
# stacks its inputs in the same order.
X = hstack([x_tfidf, df_features_sparse])
y = df['type']

# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# 1. Import Required Functions
- csr_matrix (Compressed Sparse Row matrix) efficiently stores numerical data with many zero values, saving memory.
- hstack horizontally stacks (combines) two or more sparse matrices.

# 2. Convert Extracted Features into a Sparse Matrix
- csr_matrix(df_features.values) converts them into a sparse matrix for efficient memory storage.

# 3. Combine TF-IDF Features and Extracted Features
- hstack((x_tfidf, df_features_sparse)) merges:
    - x_tfidf (text-based TF-IDF features).
    - df_features_sparse (manually extracted numerical features).
- This ensures the model considers both textual and numerical attributes of URLs.

# 4. Define the Target Variable
- df['type'] contains the labels ("benign", "defacement", "phishing", "malware").
- This becomes the target variable (y) for model training.

# 5. Split Data into Training and Testing Sets
- train_test_split(X, y, test_size=0.2, random_state=42) splits the dataset into:
    - 80% training data (X_train, y_train) → Used for model training.
    - 20% testing data (X_test, y_test) → Used for model evaluation.
- random_state=42 ensures consistent results across multiple runs.


In [6]:
# Reduce dataset size if memory issues occur
X_train_small, _, y_train_small, _ = train_test_split(X_train, y_train, test_size=0.9, random_state=42)

model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train_small, y_train_small)

# 1. Reduce Dataset Size to Handle Memory Issues
- train_test_split(X_train, y_train, test_size=0.9, random_state=42) randomly selects a small subset (10%) of X_train and y_train for training.
- test_size=0.9 means 90% of X_train is discarded to reduce memory usage.
- The `_` (underscore) variables receive the discarded data, as it is not needed.
- This is useful when dealing with large datasets that cause memory overflow.

# 2. Initialize the Random Forest Classifier
- RandomForestClassifier(n_estimators=100, random_state=42) creates a Random Forest model with:
    - n_estimators=100 → Uses 100 decision trees for better accuracy.
    - random_state=42 → Ensures consistent model training across multiple runs.
- Random Forest is chosen because:
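    - It can cope with imbalanced datasets, although no class weighting (class_weight) is configured here, so minority classes may still be under-predicted.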
    - It reduces overfitting by averaging multiple decision trees.

# 3. Train the Model on the Smaller Dataset
- fit(X_train_small, y_train_small) trains the model using the reduced dataset.
- This step learns patterns from X_train_small and their corresponding labels y_train_small.
- Using a smaller dataset speeds up training but may slightly reduce accuracy.


In [7]:
# Predict labels for the held-out test split.
y_pred = model.predict(X_test)

# Overall fraction of correct predictions.
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

# Per-class precision, recall, F1-score and support.
report = classification_report(y_test, y_pred)
print("Classification Report:\n", report)

Accuracy: 0.9549520496932562
Classification Report:
               precision    recall  f1-score   support

      benign       0.96      0.99      0.97     85778
  defacement       0.96      0.99      0.97     19104
     malware       0.99      0.92      0.95      6521
    phishing       0.91      0.79      0.85     18836

    accuracy                           0.95    130239
   macro avg       0.95      0.92      0.94    130239
weighted avg       0.95      0.95      0.95    130239



# 1. Make Predictions on the Test Data
- model.predict(X_test) uses the trained Random Forest model to predict labels for X_test.
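- X_test contains URLs the classifier was not trained on, so this evaluates how well the model generalizes to new data. Note that the TF-IDF vectorizer was fitted on the full dataset before the split, so the test URLs are not completely unseen by the pipeline.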
- The predicted labels are stored in y_pred.

# 2. Calculate and Print Accuracy
- accuracy_score(y_test, y_pred) computes the proportion of correctly predicted labels.
- Formula:
    - **Accuracy = Number of correct predictions/Total number of predictions**
- The result is printed as "Accuracy: <value>".

# 3. Generate a Detailed Classification Report
- classification_report(y_test, y_pred) generates a detailed performance evaluation, including:
    - Precision: Out of all predicted positive cases, how many were actually positive?
    - Recall (Sensitivity): Out of all actual positive cases, how many were correctly predicted?
    - F1-score: A balance between precision and recall.
    - Support: The number of true instances for each class.
- This helps assess if the model is biased towards a particular class.

In [9]:
def predict_url(url):
    """Predict the label of a single URL with the trained model.

    Rebuilds the feature layout used for training: the TF-IDF vector of the
    URL followed by the count features from ``extract_features``.

    Args:
        url: The URL text to classify.

    Returns:
        The first (and only) element of ``model.predict`` for this URL.
    """
    # One-row table of count features, converted to sparse for stacking.
    count_row = pd.DataFrame([extract_features(url)])
    count_block = csr_matrix(count_row.values)

    # transform() reuses the already-fitted vectorizer; it takes a list of documents.
    text_block = tfidf.transform([url])

    # Column order must match training: TF-IDF columns first, then the counts.
    sample = hstack((text_block, count_block))
    return model.predict(sample)[0]

# Example URLs for a quick manual check of predict_url.
test_urls = [
    "http://safe-website.com",
    "http://phishing-example.com",
    "http://malicious-download.net",
]

# Print each URL next to the label the model assigns to it.
for url in test_urls:
    label = predict_url(url)
    print(url, "->", label)

http://safe-website.com -> defacement
http://phishing-example.com -> phishing
http://malicious-download.net -> phishing


# 1. Define a Function to Predict a URL's Category
- This function takes a URL string as input and predicts whether it is benign, defacement, phishing, or malware.

# 2. Extract Numerical Features from the URL
- Calls the extract_features() function to compute:
    - URL length
    - Number of digits
    - Number of special characters
- These features help detect suspicious patterns.

# 3. Convert Features into a Pandas DataFrame
- The extracted features are stored in a DataFrame.
- **[features]** wraps the dictionary into a list so it can be converted into a single-row DataFrame.

# 4. Convert the Feature DataFrame into a Sparse Matrix
- Why convert to a sparse matrix?
    - The hstack() function later requires both the feature matrix and TF-IDF matrix to be in sparse format.
    - Reduces memory consumption.

# 5. Convert the URL Text into a TF-IDF Vector
- The TF-IDF vectorizer (tfidf) transforms the text content of the URL into a numerical vector.
- The URL is wrapped in a list (transform([url])) because transform() expects an iterable of documents rather than a single string.

# 6. Combine TF-IDF and Extracted Features
- hstack() horizontally stacks:
    - TF-IDF representation of the URL.
    - Extracted numerical features (length, digits, special characters).
- This creates a complete feature set for prediction.

# 7. Make a Prediction Using the Model
- model.predict(X_new) returns an array with the predicted label.
- **[0]** extracts the first element since we predict for only one URL at a time.
- After the function definition, a loop iterates through the test_urls list.
- It calls predict_url(url) for each URL and prints the URL together with its predicted label.
