# Step 1: Import Necessary Libraries

In [None]:
import pandas as pd
import numpy as np
import re
import string
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

# Step 2: Load and Explore the Dataset

In [None]:
# Read the labelled URL dataset from disk into a DataFrame.
df = pd.read_csv("malicious.csv")

# Preview the first rows, then show how many samples each label has.
preview = df.head()
print(preview)
label_counts = df['type'].value_counts()
print(label_counts)

# Step 3: Preprocessing URLs
### Extract Features from URLs

In [None]:
def extract_features(url):
    """Compute simple character-count features for a URL string.

    Args:
        url: The URL text to measure.

    Returns:
        A dict with three integer entries, in this order: ``'length'``
        (total number of characters), ``'num_digits'`` (characters for
        which ``str.isdigit`` is true) and ``'num_special_chars'``
        (characters outside ``a-z``, ``A-Z`` and ``0-9``).
    """
    digit_count = len([ch for ch in url if ch.isdigit()])
    special_chars = re.findall(r'[^a-zA-Z0-9]', url)
    return {
        'length': len(url),
        'num_digits': digit_count,
        'num_special_chars': len(special_chars),
    }

# Build one feature dict per URL (coercing each value to str first), then
# assemble the dicts into a DataFrame with one column per feature.
feature_rows = [extract_features(str(raw_url)) for raw_url in df['url']]
df_features = pd.DataFrame(feature_rows)

# Step 4: Convert Text Data to Numerical Format

In [None]:
# Treat each URL as a text document and weight its tokens by TF-IDF,
# keeping at most the 5000 highest-frequency terms.
tfidf = TfidfVectorizer(stop_words='english', max_features=5000)

# Coerce every entry to str, exactly as the hand-crafted feature step does:
# a missing value (NaN) or other non-string entry would otherwise make the
# vectorizer raise instead of producing a row.
# NOTE(review): the vocabulary and IDF weights are fitted on the full
# dataset, before the train/test split later in the file, so held-out rows
# influence the features — consider fitting on the training rows only.
x_tfidf = tfidf.fit_transform(df['url'].map(str))

# Step 5: Prepare Training and Testing Data

In [None]:
from scipy.sparse import hstack, csr_matrix  # Ensure csr_matrix is explicitly imported

# Target labels come straight from the 'type' column.
y = df['type']

# Wrap the dense hand-crafted features in a sparse matrix so they can be
# column-stacked next to the sparse TF-IDF matrix (TF-IDF columns first).
df_features_sparse = csr_matrix(df_features.to_numpy())
X = hstack([x_tfidf, df_features_sparse])

# Hold out 20% of the rows for testing; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)

# Step 6: Train a Machine Learning Model

In [None]:
# Random forest of 100 trees; the fixed seed makes training repeatable.
model = RandomForestClassifier(
    random_state=42,
    n_estimators=100,
)

# Fit on the training portion only; the test portion is kept for evaluation.
model.fit(X=X_train, y=y_train)

# Step 7: Evaluate the Model

In [None]:
# Predict labels for the held-out rows.
y_pred = model.predict(X_test)

# Report overall accuracy first, then the per-class breakdown.
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)
report = classification_report(y_test, y_pred)
print("Classification Report:\n", report)

# Step 8: Predict New URLs

In [None]:
def predict_url(url):
    """Classify a single URL with the trained model.

    Builds the two feature groups the model was fitted on — TF-IDF terms
    followed by the hand-crafted counts — and returns the predicted label.

    Args:
        url: URL string to classify.

    Returns:
        The first (and only) element of ``model.predict`` for this URL.
    """
    # Hand-crafted counts as a one-row sparse matrix.
    handcrafted = pd.DataFrame([extract_features(url)])
    handcrafted_sparse = csr_matrix(handcrafted.values)

    # TF-IDF terms from the already-fitted vectorizer (transform, no refit).
    text_terms = tfidf.transform([url])

    # Column order must match training: TF-IDF block first, then the counts.
    combined = hstack((text_terms, handcrafted_sparse))
    predictions = model.predict(combined)
    return predictions[0]

# Smoke-test the prediction helper on one example URL.
sample_url = "http://malicious-example.com"
sample_prediction = predict_url(sample_url)
print(sample_prediction)