In [253]:
import pandas as pd 
import numpy as np 
import time 

from sklearn.linear_model import LogisticRegression 
from sklearn.naive_bayes import MultinomialNB 

from sklearn.model_selection import train_test_split, cross_val_score 
from sklearn.metrics import classification_report 
from sklearn.metrics import confusion_matrix
from nltk.tokenize import RegexpTokenizer 
from nltk.stem.snowball import SnowballStemmer
from sklearn.feature_extraction.text import CountVectorizer  
from sklearn.pipeline import make_pipeline 
import pickle

import warnings  
warnings.filterwarnings('ignore')

In [254]:
phish_data = pd.read_csv('phishing_site_urls.csv')
phish_data.head()

Unnamed: 0,URL,Label
0,nobell.it/70ffb52d079109dca5664cce6f317373782/...,bad
1,www.dghjdgf.com/paypal.co.uk/cycgi-bin/webscrc...,bad
2,serviciosbys.com/paypal.cgi.bin.get-into.herf....,bad
3,mail.printakid.com/www.online.americanexpress....,bad
4,thewhiskeydregs.com/wp-content/themes/widescre...,bad


In [255]:
phish_data.tail()

Unnamed: 0,URL,Label
549341,23.227.196.215/,bad
549342,apple-checker.org/,bad
549343,apple-iclods.org/,bad
549344,apple-uptoday.org/,bad
549345,apple-search.info,bad


In [256]:
phish_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 549346 entries, 0 to 549345
Data columns (total 2 columns):
 #   Column  Non-Null Count   Dtype 
---  ------  --------------   ----- 
 0   URL     549346 non-null  object
 1   Label   549346 non-null  object
dtypes: object(2)
memory usage: 8.4+ MB


In [257]:
phish_data.isnull().sum()

URL      0
Label    0
dtype: int64

In [258]:
label_counts = pd.DataFrame(phish_data.Label.value_counts())
print(label_counts)  # Inspect the structure

# Ensure the DataFrame has appropriate column names
label_counts.reset_index(inplace=True)
label_counts.columns = ['Label', 'Counts']

        count
Label        
good   392924
bad    156422


In [259]:
tokenizer = RegexpTokenizer(r'[A-Za-z]+')

In [260]:
phish_data.URL[0]

'nobell.it/70ffb52d079109dca5664cce6f317373782/login.SkyPe.com/en/cgi-bin/verification/login/70ffb52d079109dca5664cce6f317373/index.php?cmd=_profile-ach&outdated_page_tmpl=p/gen/failed-to-load&nav=0.5.1&login_access=1322408526'

In [261]:
print('Getting words tokenized ...')
t0= time.perf_counter()
phish_data['text_tokenized'] = phish_data.URL.map(lambda t: tokenizer.tokenize(t)) # doing with all rows
t1 = time.perf_counter() - t0
print('Time taken',t1 ,'sec')

Getting words tokenized ...
Time taken 1.9779348000010941 sec


In [262]:
stemmer = SnowballStemmer("english")

In [263]:
print('Getting words stemmed ...')
t0= time.perf_counter()
phish_data['text_stemmed'] = phish_data['text_tokenized'].map(lambda l: [stemmer.stem(word) for word in l])
t1= time.perf_counter() - t0
print('Time taken',t1 ,'sec')

Getting words stemmed ...
Time taken 49.79863539998769 sec


In [264]:
print('Getting joining words ...')
t0= time.perf_counter()
phish_data['text_sent'] = phish_data['text_stemmed'].map(lambda l: ' '.join(l))
t1= time.perf_counter() - t0
print('Time taken',t1 ,'sec')

Getting joining words ...
Time taken 0.319253599998774 sec


In [265]:
phish_data.sample(5)

Unnamed: 0,URL,Label,text_tokenized,text_stemmed,text_sent
10411,www.intr-paypal.com/,bad,"[www, intr, paypal, com]","[www, intr, paypal, com]",www intr paypal com
116188,neoss9.com/listings/ipad/,bad,"[neoss, com, listings, ipad]","[neoss, com, list, ipad]",neoss com list ipad
335694,fantasticfiction.co.uk/m/norah-mcclintock/down...,good,"[fantasticfiction, co, uk, m, norah, mcclintoc...","[fantasticfict, co, uk, m, norah, mcclintock, ...",fantasticfict co uk m norah mcclintock down htm
52425,www.world-playground.com/com.htm,good,"[www, world, playground, com, com, htm]","[www, world, playground, com, com, htm]",www world playground com com htm
137739,vin-italy.com/passss/yahuu.html,bad,"[vin, italy, com, passss, yahuu, html]","[vin, itali, com, passss, yahuu, html]",vin itali com passss yahuu html


In [266]:
trainX, testX, trainY, testY = train_test_split(phish_data.text_sent, phish_data.Label, test_size=0.2, random_state=42)

In [267]:
cv = CountVectorizer()

In [268]:
pipeline_ls = make_pipeline(CountVectorizer(tokenizer = RegexpTokenizer(r'[A-Za-z]+').tokenize,stop_words='english'), LogisticRegression(C=1.0, penalty='l2', solver='liblinear'))

In [269]:
#trainX, testX, trainY, testY = train_test_split(phish_data.URL, phish_data.Label)

In [270]:
pipeline_ls.fit(trainX,trainY)

In [271]:
scores = cross_val_score(pipeline_ls, phish_data['text_sent'], phish_data['Label'], cv=5)
print("Cross-validation scores:", scores)
print("Mean cross-validation score:", scores.mean())

Cross-validation scores: [0.70573405 0.9594062  0.93211916 0.95927878 0.9609353 ]
Mean cross-validation score: 0.9034946968953038


In [272]:
print('Training Accuracy :',pipeline_ls.score(trainX,trainY))
print('Testing Accuracy :',pipeline_ls.score(testX,testY))

print('\nCLASSIFICATION REPORT\n')
print(classification_report(pipeline_ls.predict(testX), testY,
                            target_names =['Bad','Good']))


Training Accuracy : 0.9815530313373199
Testing Accuracy : 0.9668881405297169

CLASSIFICATION REPORT
              precision    recall  f1-score   support

         Bad       0.91      0.97      0.94     29486
        Good       0.99      0.97      0.98     80384

    accuracy                           0.97    109870
   macro avg       0.95      0.97      0.96    109870
weighted avg       0.97      0.97      0.97    109870


In [273]:
pipeline_mnb = make_pipeline(cv, MultinomialNB())
pipeline_mnb.fit(trainX, trainY)

In [274]:
print('Training Accuracy :',pipeline_mnb.score(trainX,trainY))
print('Testing Accuracy :',pipeline_mnb.score(testX,testY))

print('\nCLASSIFICATION REPORT\n')
print(classification_report(pipeline_mnb.predict(testX), testY,
                            target_names =['Bad','Good']))

Training Accuracy : 0.9742670817063958
Testing Accuracy : 0.9632838809502139

CLASSIFICATION REPORT
              precision    recall  f1-score   support

         Bad       0.91      0.96      0.93     29532
        Good       0.98      0.96      0.97     80338

    accuracy                           0.96    109870
   macro avg       0.95      0.96      0.95    109870
weighted avg       0.96      0.96      0.96    109870


In [275]:
pickle.dump(pipeline_ls,open('phishing_ls.pkl','wb'))
pickle.dump(pipeline_mnb,open('phishing_mnb.pkl','wb'))

In [276]:
loaded_model_ls = pickle.load(open('phishing_ls.pkl', 'rb'))
loaded_model_mnb = pickle.load(open('phishing_mnb.pkl','rb'))
result_ls = loaded_model_ls.score(testX,testY)
result_mnb = loaded_model_mnb.score(testX, testY)
print("result_ls = ",result_ls)
print("result_mnb = ",result_mnb)

result_ls =  0.9668881405297169
result_mnb =  0.9632838809502139


In [277]:
with open('phishing_ls.pkl', 'rb') as file1:
    df1 = pickle.load(file1)

# Load the second DataFrame from another pickle file
with open('phishing_mnb.pkl', 'rb') as file2:
    df2 = pickle.load(file2)

# Concatenate the DataFrames
combined_df = pd.concat([df1, df2], ignore_index=True)

# Save the combined DataFrame to a new pickle file
with open('phishing.pkl', 'wb') as file_out:
    pickle.dump(combined_df, file_out)

TypeError: cannot concatenate object of type '<class 'sklearn.pipeline.Pipeline'>'; only Series and DataFrame objs are valid