In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report


In [2]:
# Read the labelled URL dataset from a CSV file in the working directory.
data = pd.read_csv(filepath_or_buffer='dataset.csv')
# Leave the frame as the final expression so the notebook renders it.
data

Unnamed: 0,url,length_url,length_hostname,ip,nb_dots,nb_hyphens,nb_at,nb_qm,nb_and,nb_or,...,domain_in_title,domain_with_copyright,whois_registered_domain,domain_registration_length,domain_age,web_traffic,dns_record,google_index,page_rank,status
0,http://www.crestonwood.com/router.php,37,19,0,3,0,0,0,0,0,...,0,1,0,45,-1,0,1,1,4,legitimate
1,http://shadetreetechnology.com/V4/validation/a...,77,23,1,1,0,0,0,0,0,...,1,0,0,77,5767,0,0,1,2,phishing
2,https://support-appleld.com.secureupdate.duila...,126,50,1,4,1,0,1,2,0,...,1,0,0,14,4004,5828815,0,1,0,phishing
3,http://rgipt.ac.in,18,11,0,2,0,0,0,0,0,...,1,0,0,62,-1,107721,0,0,3,legitimate
4,http://www.iracing.com/tracks/gateway-motorspo...,55,15,0,2,2,0,0,0,0,...,0,1,0,224,8175,8725,0,0,6,legitimate
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11425,http://www.fontspace.com/category/blackletter,45,17,0,2,0,0,0,0,0,...,0,0,0,448,5396,3980,0,0,6,legitimate
11426,http://www.budgetbots.com/server.php/Server%20...,84,18,0,5,0,1,1,0,0,...,1,0,0,211,6728,0,0,1,0,phishing
11427,https://www.facebook.com/Interactive-Televisio...,105,16,1,2,6,0,1,0,0,...,0,0,0,2809,8515,8,0,1,10,legitimate
11428,http://www.mypublicdomainpictures.com/,38,30,0,2,0,0,0,0,0,...,1,0,0,85,2836,2455493,0,0,4,legitimate


In [3]:
class WebsiteClassifier:
    """Classify URLs using token counts fed into a multinomial Naive Bayes model.

    The raw URL string is tokenized and counted by ``self.vectorizer``
    (a ``CountVectorizer``); the resulting counts are the only features
    given to ``self.model`` (a ``MultinomialNB``).
    """

    def __init__(self):
        """Create an unfitted vectorizer and an unfitted model."""
        self.vectorizer = CountVectorizer()
        self.model = MultinomialNB()

    def preprocess_data(self, data, fit=True):
        """Turn a frame with ``url`` and ``status`` columns into ``(X, y)``.

        Parameters
        ----------
        data : mapping-like (e.g. pandas.DataFrame)
            Must support ``data['url']`` (documents) and ``data['status']``
            (labels).
        fit : bool, default True
            When True (the original behavior) the vectorizer vocabulary is
            learned from ``data['url']`` and any previous vocabulary is
            replaced. Pass False to reuse the already-fitted vocabulary, e.g.
            for held-out rows, so that test URLs do not influence the
            feature space.

        Returns
        -------
        tuple
            ``(X, y)`` where ``X`` is the vectorizer output for the URLs and
            ``y`` is ``data['status']`` unchanged.

        Raises
        ------
        KeyError
            Propagated from the column lookup when ``url`` or ``status`` is
            missing.
        """
        # Look both columns up before touching the vectorizer so a missing
        # column cannot leave it refitted while the call itself fails.
        urls = data['url']
        y = data['status']
        if fit:
            X = self.vectorizer.fit_transform(urls)
        else:
            X = self.vectorizer.transform(urls)
        return X, y

    def train(self, X_train, y_train):
        """Fit the model on already-vectorized features and their labels."""
        self.model.fit(X_train, y_train)

    def evaluate(self, X_test, y_test):
        """Predict on ``X_test`` and score the predictions against ``y_test``.

        Returns
        -------
        tuple
            ``(accuracy, report)``: the value of ``accuracy_score`` and the
            value of ``classification_report`` for the predictions.
        """
        y_pred = self.model.predict(X_test)
        accuracy = accuracy_score(y_test, y_pred)
        report = classification_report(y_test, y_pred)
        return accuracy, report

    def classify_website(self, website):
        """Return the predicted label for a single URL string.

        The URL is wrapped in a one-element list because the vectorizer
        works on a collection of documents; the vectorizer must already be
        fitted and the model already trained.
        """
        website_vectorized = self.vectorizer.transform([website])
        prediction = self.model.predict(website_vectorized)
        return prediction[0]
    


In [4]:
# Build the classifier, then convert the whole frame into features and labels.
# NOTE: preprocess_data fits the vectorizer on every row of `data`, i.e. before
# the train/test split done in a later cell, so the vocabulary includes test URLs.
classifier = WebsiteClassifier()
X, y = classifier.preprocess_data(data=data)

In [5]:
# Only the final expression of a cell is echoed automatically, so a bare `X`
# on an earlier line shows nothing; display it explicitly instead.
display(X)
# The labels remain the cell's result.
y

0        legitimate
1          phishing
2          phishing
3        legitimate
4        legitimate
            ...    
11425    legitimate
11426      phishing
11427    legitimate
11428    legitimate
11429      phishing
Name: status, Length: 11430, dtype: object

In [9]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Report the number of rows in each partition.
print("Size of training data:", X_train.shape[0])
print("Size of testing data:", X_test.shape[0])

Size of training data: 9144
Size of testing data: 2286


In [10]:
# Fit the classifier's model on the training partition.
classifier.train(X_train=X_train, y_train=y_train)

In [11]:
# Score the fitted model on the held-out partition and print both metrics.
accuracy, report = classifier.evaluate(X_test=X_test, y_test=y_test)
print("Accuracy:", accuracy)
print("Classification Report:\n", report)


Accuracy: 0.8989501312335958
Classification Report:
               precision    recall  f1-score   support

  legitimate       0.87      0.94      0.90      1157
    phishing       0.93      0.86      0.89      1129

    accuracy                           0.90      2286
   macro avg       0.90      0.90      0.90      2286
weighted avg       0.90      0.90      0.90      2286



In [12]:
# Spot check on a single URL. This URL is row 0 of the dataset (labelled
# legitimate there), so it is not an unseen example.
website = "http://www.crestonwood.com/router.php"
prediction = classifier.classify_website(website=website)
print("Prediction for", website, ":", prediction)

Prediction for http://www.crestonwood.com/router.php : legitimate


In [13]:
# Spot check on a second URL. It appears to match row 1 of the dataset
# (labelled phishing there; the displayed URL is truncated) - TODO confirm.
website = "http://shadetreetechnology.com/V4/validation/a111aedc8ae390eabcfa130e041a10a4"
prediction = classifier.classify_website(website=website)
print("Prediction for", website, ":", prediction)

Prediction for http://shadetreetechnology.com/V4/validation/a111aedc8ae390eabcfa130e041a10a4 : phishing


In [14]:
# Spot check on a hand-written URL that uses '@' in the authority part.
# The stray trailing space in the literal was removed so the printed line
# no longer shows a double space before the colon.
website = "http://google.com-redirect@valimail.com"
prediction = classifier.classify_website(website)
print("Prediction for", website, ":", prediction)

Prediction for http://google.com-redirect@valimail.com  : phishing
