In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns  
import time 

from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from nltk.tokenize import RegexpTokenizer  
from nltk.stem.snowball import SnowballStemmer
from sklearn.feature_extraction.text import CountVectorizer  
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

from PIL import Image
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
import pickle 

In [2]:
# Load the labelled URL dataset (columns: URL, Label) from the working directory.
df = pd.read_csv("./phishing_site_urls.csv")

print(df.head(4))
# DataFrame.info() writes its report to stdout itself and returns None, so it
# must not be wrapped in print() -- doing so appends a stray "None" line.
df.info()

                                                 URL Label
0  nobell.it/70ffb52d079109dca5664cce6f317373782/...   bad
1  www.dghjdgf.com/paypal.co.uk/cycgi-bin/webscrc...   bad
2  serviciosbys.com/paypal.cgi.bin.get-into.herf....   bad
3  mail.printakid.com/www.online.americanexpress....   bad
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 549346 entries, 0 to 549345
Data columns (total 2 columns):
 #   Column  Non-Null Count   Dtype 
---  ------  --------------   ----- 
 0   URL     549346 non-null  object
 1   Label   549346 non-null  object
dtypes: object(2)
memory usage: 8.4+ MB
None


In [3]:
# Split every URL into runs of ASCII letters; digits, dots, slashes and other
# punctuation act as separators and are dropped.
tokenizer = RegexpTokenizer(r'[A-Za-z]+')
print('Getting words tokenized ...')
t0 = time.perf_counter()
# Pass the bound method directly: wrapping it in a lambda only adds one extra
# Python call per row.
df['text_tokenized'] = df.URL.map(tokenizer.tokenize)
t1 = time.perf_counter() - t0
print('Time taken',t1 ,'sec')

Getting words tokenized ...
Time taken 1.9474677999999699 sec


In [4]:
# Reduce every token to its English Snowball stem so that inflected variants
# of a word share one vocabulary entry.
stemmer = SnowballStemmer("english")
print('Getting words stemmed ...')
t0 = time.perf_counter()
df['text_stemmed'] = df['text_tokenized'].map(
    lambda tokens: [stemmer.stem(token) for token in tokens]
)
t1 = time.perf_counter() - t0
print('Time taken',t1 ,'sec')

# Re-join the stems into one space-separated string per URL; this is the text
# column that the vectorizer consumes later in the notebook.
print('Get joiningwords ...')
t0 = time.perf_counter()
df['text_sent'] = df['text_stemmed'].map(' '.join)
t1 = time.perf_counter() - t0
print('Time taken',t1 ,'sec')

Getting words stemmed ...
Time taken 27.28706649999998 sec
Get joiningwords ...
Time taken 0.14090870000001132 sec


In [5]:
# Partition the frame by class label so each group can be inspected on its own.
bad_sites = df.loc[df['Label'].eq('bad')]
good_sites = df.loc[df['Label'].eq('good')]

In [6]:
# Fixed seed so the train/test partition -- and therefore the accuracy reported
# below -- is the same on every Restart & Run All.
SPLIT_SEED = 42

# Bag-of-words token counts over the stemmed URL text (one sparse row per URL).
cv = CountVectorizer()
feature = cv.fit_transform(df.text_sent)

# Hold out part of the data for evaluation (train_test_split's default sizes).
trainX, testX, trainY, testY = train_test_split(
    feature, df.Label, random_state=SPLIT_SEED
)

# max_iter is raised above the library default to give the solver more
# iterations to converge on this large sparse matrix.
lr = LogisticRegression(max_iter=1000)
lr.fit(trainX,trainY)

In [24]:
# Mean accuracy of the fitted classifier on the held-out split.
test_accuracy = lr.score(testX, testY)
print(test_accuracy)

def preprocess_url(url):
    """Convert one raw URL string into a feature row for the classifier.

    Applies the same steps used on the training data, relying on the
    notebook-level ``tokenizer``, ``stemmer`` and fitted ``cv`` objects:
    tokenize the URL, stem each token, join the stems with single spaces,
    and vectorize the resulting sentence.

    Args:
        url: The URL text to encode.

    Returns:
        Whatever ``cv.transform`` yields for a one-document list containing
        the joined stems (for the fitted CountVectorizer, a one-row matrix).
    """
    stems = map(stemmer.stem, tokenizer.tokenize(url))
    # The vectorizer works on a collection of documents, hence the list wrapper.
    return cv.transform([' '.join(stems)])

# Classify a single unseen URL end to end: encode it, then ask the model.
new_url = 'torevisioncenters.com/html/technology.html'
processed_url = preprocess_url(new_url)
prediction = lr.predict(processed_url)
print(prediction)

# predict() returns an array with one label per input row; report the only one.
if prediction[0] == 'bad':
    verdict = "Bad"
else:
    verdict = "Good"
print("The URL is predicted as:", verdict)

0.9656975177847194
['good']
The URL is predicted as: Good
