In [1]:
import joblib
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

In [2]:
# Load the dataset
# The CSV has two string columns: 'url' and 'label'. 'label' holds a class name
# (e.g. "benign", "phishing", "defacement"), not a 0/1 flag.
# Rows with a missing URL or label are dropped here, at load time, and the result is
# assigned: a bare `df["label"].dropna()` only displays a filtered copy and leaves df
# unchanged. The index is rebuilt so it stays contiguous after any dropped rows.
df = (
    pd.read_csv('malicious_phish.csv')
    .dropna(subset=['url', 'label'])
    .reset_index(drop=True)
)
df.info()
# Show the raw label column as the cell output
df['label']

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 651191 entries, 0 to 651190
Data columns (total 2 columns):
 #   Column  Non-Null Count   Dtype 
---  ------  --------------   ----- 
 0   url     651191 non-null  object
 1   label   651191 non-null  object
dtypes: object(2)
memory usage: 9.9+ MB


0           phishing
1             benign
2             benign
3         defacement
4         defacement
             ...    
651186      phishing
651187      phishing
651188      phishing
651189      phishing
651190      phishing
Name: label, Length: 651191, dtype: object

In [3]:
# Turn every URL string into a sparse TF-IDF feature row (default tokenisation settings).
# NOTE(review): the vectorizer is fitted on all URLs, before the train/test split done
# later in the notebook, so its vocabulary and IDF weights also see the test URLs.
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(raw_documents=df['url'])

In [4]:
# Encode the labels as a binary target: 1 for "phishing", 0 for every other class.
# The target is built as its own Series instead of overwriting df['label'], so the
# cell is idempotent: overwriting the column meant a second run compared the already
# encoded integers with "phishing" and silently produced an all-zero target.
y = (df['label'] == 'phishing').astype('int64')
# Preview the URLs next to their encoded label without mutating df
df.assign(label=y).head(50)

Unnamed: 0,url,label
0,br-icloud.com.br,1
1,mp3raid.com/music/krizz_kaliko.html,0
2,bopsecrets.org/rexroth/cr/1.htm,0
3,http://www.garage-pirenne.be/index.php?option=...,0
4,http://adventure-nicaragua.net/index.php?optio...,0
5,http://buzzfil.net/m/show-art/ils-etaient-loin...,0
6,espn.go.com/nba/player/_/id/3457/brandon-rush,0
7,yourbittorrent.com/?q=anthony-hamilton-soulife,0
8,http://www.pashminaonline.com/pure-pashminas,0
9,allmusic.com/album/crazy-from-the-heat-r16990,0


In [5]:
# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)

In [6]:
# Build and fit the random forest on the training split in a single expression
# (fit returns the estimator itself).
# NOTE(review): n_estimators=2 is a very small forest, presumably chosen to keep
# training time down. Confirm this is intended before relying on the metrics.
clf = RandomForestClassifier(random_state=42, n_estimators=2).fit(X_train, y_train)
clf

In [7]:
# Label every held-out URL with the fitted forest
y_pred = clf.predict(X=X_test)

In [8]:
# Score the predictions: overall accuracy first, then per-class precision/recall/F1
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(
    f'Accuracy: {accuracy:.2f}',
    classification_report(y_true=y_test, y_pred=y_pred),
    sep='\n',
)

Accuracy: 0.93
              precision    recall  f1-score   support

           0       0.94      0.99      0.96    111403
           1       0.93      0.59      0.72     18836

    accuracy                           0.93    130239
   macro avg       0.93      0.79      0.84    130239
weighted avg       0.93      0.93      0.93    130239



In [9]:
# Save the model for future use
# The fitted TF-IDF vectorizer is persisted too: the classifier only accepts feature
# rows produced by this exact vectorizer, so the saved model alone cannot score a
# URL in a fresh session.
joblib.dump(vectorizer, 'url_tfidf_vectorizer.pkl')
# Kept as the last expression so the cell still displays the model file path
joblib.dump(clf, 'url_botnet_detector.pkl')

['url_botnet_detector.pkl']

In [10]:
import joblib

# Restore the classifier that was written to disk
clf = joblib.load('url_botnet_detector.pkl')

# One-element batch holding the URL to classify.
# NOTE(review): the example is an empty string, so the verdict printed below is for
# a URL with no tokens. Replace it with a real URL for a meaningful check.
new_url = [""]

# Vectorise with the TF-IDF vectorizer fitted earlier in the notebook.
# It is taken from the running kernel, not loaded from disk.
new_url_vectorized = vectorizer.transform(new_url)

# Classify, then print the raw prediction array followed by a readable verdict
prediction = clf.predict(new_url_vectorized)
print(prediction)
if prediction[0] == 1:
    print("Phishing")
else:
    print("not phishing")


[1]
Phishing
