<a href="https://colab.research.google.com/github/ACM-Research/fraudulent-website-detection/blob/main/SupportVectorMachine.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Import our data here
import pandas as pd
import numpy as np
import io
import requests
import re
from google.colab import files

# Load libraries
from sklearn import svm
from sklearn.model_selection import train_test_split # Import train_test_split function
from sklearn import metrics #Import scikit-learn metrics module for accuracy calculation

# Load the URL dataset used by the rest of the notebook. The file is read
# from the working directory; later cells read its `url` and `type` columns.
# Alternative inputs kept for reference:
#sampleData = pd.read_csv('sample.csv')
#df = pd.read_csv("malicious_phish.csv")
df = pd.read_csv('sample2.csv')
# The `url` column; the flag functions below are applied to each entry
# (presumably plain strings -- confirm against the CSV).
url_list = df.url

In [None]:
# Rules for us to implement, or "flags"
# 1.  If url contains "%00" 
def containsPercent00(str):
  """Flag URLs that contain the substring "%00".

  Args:
    str: URL text to inspect. (The parameter name shadows the builtin
      inside this function; it is kept so existing callers keep working.)

  Returns:
    1 if "%00" occurs anywhere in the URL, otherwise 0.
  """
  # str.find() returns -1 (truthy) on a miss and 0 (falsy) for a match at
  # index 0, so it cannot be used as a boolean; test membership instead.
  return 1 if "%00" in str else 0

# 2.  If url contains "%01"
def containsPercent01(str):
  """Flag URLs that contain the substring "%01".

  Args:
    str: URL text to inspect. (The parameter name shadows the builtin
      inside this function; it is kept so existing callers keep working.)

  Returns:
    1 if "%01" occurs anywhere in the URL, otherwise 0.
  """
  # str.find() returns -1 (truthy) when the substring is absent and 0
  # (falsy) when it is at the very start, so use a membership test.
  return 1 if "%01" in str else 0

# 3.  If url contains four or more '-' characters, then fraudulent
def containsFourDash(str):
  """Flag URLs that contain at least four '-' characters.

  Args:
    str: URL text to inspect.

  Returns:
    1 if the URL has four or more dashes, otherwise 0.
  """
  # The previous `== 4` test stopped flagging a URL as soon as it had a
  # fifth dash, which contradicts the rule; use a threshold instead.
  return 1 if str.count("-") >= 4 else 0


# 4.  If (perform get request) url returned != url sent 
def checkIf_URLsent_equals_getURL(str):
  """Flag URLs whose server redirects to a different address.

  Sends an HTTP HEAD request (redirects are not followed) and compares the
  `Location` response header, if any, with the URL that was sent. The
  scheme and any trailing "/" are ignored in the comparison.

  Args:
    str: URL to probe. "http://" is prepended only when the URL does not
      already start with "http://" or "https://".

  Returns:
    1 if the response carries a `Location` header that differs from the
    sent URL, otherwise 0. A response without a `Location` header, or a
    request that fails (timeout, DNS error, refused connection), yields 0
    so that one unreachable host does not abort feature extraction.
  """
  schemes = ("http://", "https://")

  def _strip(url):
    # Drop the scheme and trailing slashes so that "http://a.com/" and
    # "a.com" compare equal.
    for scheme in schemes:
      if url.startswith(scheme):
        url = url[len(scheme):]
        break
    return url.rstrip("/")

  target = str if str.startswith(schemes) else "http://" + str
  try:
    # A timeout keeps a single unresponsive host from hanging the notebook.
    response = requests.head(target, allow_redirects=False, timeout=5)
  except requests.RequestException:
    return 0

  # Non-redirect responses have no Location header; indexing it directly
  # raised KeyError for every URL that did not redirect.
  location = response.headers.get('location')
  if location is None:
    return 0
  return 1 if _strip(location) != _strip(str) else 0

# 5.  If url contains more than one instance of "http://"
def containsMoreHttp(str):
  """Return 1 when "http://" appears at least twice in the URL, else 0."""
  occurrences = str.count("http://")
  return int(occurrences >= 2)


# 6.  If url contains more than one instance of "https://"
def containsMoreHttps(str):
  """Return 1 when "https://" appears at least twice in the URL, else 0."""
  occurrences = str.count("https://")
  return int(occurrences >= 2)

# 7.  If url contains "http://" AND "https://"
def containsHttpAndHttps(str):
  """Return 1 when the URL contains both "http://" and "https://", else 0."""
  required = ("http://", "https://")
  return int(all(marker in str for marker in required))



# 8.  If url is over 54 characters
def over54Chars(str):
  """Return 1 when the URL is longer than 54 characters, else 0."""
  length = len(str)
  return int(length > 54)


# 9.  If url has more than one instance of a domain extension
def hasMoreExtension(str):
  """Flag URLs that contain more than one domain-extension occurrence.

  Counts occurrences of ".com", ".net" and ".org" (as plain substrings) and
  flags the URL when the total is greater than one, e.g. "paypal.com.x.net"
  or "a.com/redirect?to=b.com".

  Args:
    str: URL text to inspect.

  Returns:
    1 if the extensions occur more than once in total, otherwise 0.
  """
  # The previous check required .com AND .net AND .org to all be present,
  # so URLs with only two extensions were never flagged.
  extensions = (".com", ".net", ".org")
  total = sum(str.count(ext) for ext in extensions)
  return 1 if total > 1 else 0


# 10. If url does not contain "https://"
def doesNotContainHttps(str):
  """Return 1 when "https://" is absent from the URL, else 0."""
  has_https = "https://" in str
  return 0 if has_https else 1

# 11. If url contains "index.php"
def overIndexPhp(str):
  """Flag URLs that contain the substring "index.php".

  Args:
    str: URL text to inspect.

  Returns:
    1 if "index.php" occurs anywhere in the URL, otherwise 0.
  """
  # str.find() returns -1 (truthy) on a miss and 0 (falsy) for a match at
  # index 0, so it cannot be used as a boolean; test membership instead.
  return 1 if "index.php" in str else 0
# 12. If url has "@"
def hasAt(str):
  """Return 1 when the URL contains an "@" character, else 0."""
  return int("@" in str)

# 13. If url has "//" after seventh position
def hasTwoSlash(str):
  """Flag URLs that have a "//" starting after the seventh position.

  Args:
    str: URL text to inspect.

  Returns:
    1 if the last "//" in the URL starts at an index greater than 7,
    otherwise 0 (including when there is no "//" at all).
  """
  # find() reports the FIRST "//", which for any URL with a scheme is the
  # "://" separator (index 5 or 6), so a later "//" was never detected.
  # rfind() looks at the last occurrence; it returns -1 when absent, which
  # also fails the > 7 test.
  return 1 if str.rfind("//") > 7 else 0
  
# 14. If url has multiple subdomains  
def hasMultDom(str):
  """Flag URLs whose host part contains more than two dots.

  The host is taken as the text between "://" and the next "/" (or the end
  of the string).

  Args:
    str: URL text to inspect.

  Returns:
    1 if the host contains more than two "." characters, otherwise 0.
    URLs without a "://" separator always yield 0.
  """
  # Raw string: "\/" is not a valid escape in a normal string literal
  # (newer Python versions warn about it), and "/" needs no escaping in a
  # regular expression. The pattern matches exactly what it did before.
  dom = re.search(r"(?<=://)[^/]*", str)
  if dom is None:
    return 0
  return 1 if dom.group(0).count(".") > 2 else 0


In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import VarianceThreshold
from sklearn.pipeline import Pipeline
from scipy.sparse import csr_matrix, hstack

# Build the feature matrix X and the label list y from the dataframe loaded
# in the first cell (upload the CSV via the sidebar before running it).
#df = pd.read_csv('sample2.csv')
url_list = df.url

# Text features: token counts -> TF-IDF -> variance scaling -> drop
# zero-variance columns. with_mean=False because centring would destroy the
# sparsity of the matrix.
pipe = Pipeline(
    [
     ('count',CountVectorizer()),
     ('tfidf',TfidfTransformer()),
     ('scale',StandardScaler(with_mean=False)),
     ('feature selection', VarianceThreshold())
     ])
# NOTE(review): the pipeline is fitted on every URL before the train/test
# split, so its vocabulary and scaling statistics also see the test rows.
Xa = pipe.fit_transform(url_list)

# Hand-written rule flags (12 binary features), applied in this column order.
# checkIf_URLsent_equals_getURL is left out because it issues a network
# request for every URL.
flag_functions = [
    containsPercent00,
    containsPercent01,
    containsFourDash,
    containsMoreHttp,
    containsMoreHttps,
    over54Chars,
    hasMoreExtension,
    doesNotContainHttps,
    overIndexPhp,
    hasAt,
    hasTwoSlash,
    hasMultDom,
]
# Iterate over the values rather than indexing with url_list[i]: label-based
# lookup only works while the dataframe keeps its default 0..n-1 index.
Xb = [[flag(url) for flag in flag_functions] for url in url_list]

# Concatenate the text features and the rule flags column-wise.
X = hstack([Xa,csr_matrix(Xb)])

# Labels: 0 for "benign", 1 for every other value of the `type` column.
result = df.type
y = [0 if label == "benign" else 1 for label in result]

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score

# Hold out 30% of the rows for evaluation; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(X,y,random_state=0,test_size=.3)

# Fit a linear-kernel support vector classifier on the training split.
clf = svm.SVC(kernel='linear')
clf.fit(X_train, y_train)

# Predict once and reuse the result for every label-based metric.
y_pred = clf.predict(X_test)
# ROC AUC ranks samples by a continuous score; feeding it hard 0/1
# predictions collapses the curve to a single operating point, so use the
# signed distance to the separating hyperplane instead.
y_score = clf.decision_function(X_test)

print("accuracy of support vector machine:")
print(metrics.accuracy_score(y_test,y_pred))
print("precision:")
print(precision_score(y_test,y_pred))
print("AUC:")
print(roc_auc_score(y_test,y_score))
print("Recall:")
print(recall_score(y_test,y_pred))
print("F1:")
print(f1_score(y_test,y_pred))

accuracy of support vector machine:
0.9326666666666666
precision:
0.9925373134328358
AUC:
0.8635025043751132
Recall:
0.7287671232876712
F1:
0.8404423380726699


In [None]:
from sklearn.metrics import classification_report, confusion_matrix

# Predict on the held-out split, then print the confusion matrix followed
# by the per-class precision/recall/F1 report.
y_pred = clf.predict(X_test)
for summarise in (confusion_matrix, classification_report):
    print(summarise(y_test, y_pred))

[[1133    2]
 [  99  266]]
              precision    recall  f1-score   support

           0       0.92      1.00      0.96      1135
           1       0.99      0.73      0.84       365

    accuracy                           0.93      1500
   macro avg       0.96      0.86      0.90      1500
weighted avg       0.94      0.93      0.93      1500



In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn import svm, datasets
import seaborn as sns; sns.set()

from sklearn.decomposition import TruncatedSVD

# Decision regions can only be drawn in two dimensions, but X is a wide
# sparse matrix (and scipy's hstack returns COO format, which does not
# support slicing such as X[:, 0]). Project the features onto two SVD
# components and fit the demonstration classifiers on that projection, so
# the models and the plotting mesh live in the same 2-D space.
X_2d = TruncatedSVD(n_components=2, random_state=0).fit_transform(X)

# Number of mesh points per axis. A fixed count keeps the prediction cost
# bounded regardless of how wide the projected data range is.
MESH_STEPS = 200

C = 1.0  # SVM regularization parameter
svc = svm.SVC(kernel='linear', C=C).fit(X_2d, y)
rbf_svc = svm.SVC(kernel='rbf', gamma=0.7, C=C).fit(X_2d, y)
poly_svc = svm.SVC(kernel='poly', degree=3, C=C).fit(X_2d, y)
lin_svc = svm.LinearSVC(C=C).fit(X_2d, y)

# create a mesh to plot in, padded by one unit around the data
x_min, x_max = X_2d[:, 0].min() - 1, X_2d[:, 0].max() + 1
y_min, y_max = X_2d[:, 1].min() - 1, X_2d[:, 1].max() + 1
xx, yy = np.meshgrid(np.linspace(x_min, x_max, MESH_STEPS),
                     np.linspace(y_min, y_max, MESH_STEPS))

# title for the plots (same order as the models below)
titles = ['SVC with linear kernel',
          'LinearSVC (linear kernel)',
          'SVC with RBF kernel',
          'SVC with polynomial (degree 3) kernel']

fig, axes = plt.subplots(2, 2)
fig.subplots_adjust(wspace=0.4, hspace=0.4)

# The loop variable is named `model` so it does not overwrite the `clf`
# classifier trained on the full feature set in the earlier cell.
for ax, model, title in zip(axes.ravel(), (svc, lin_svc, rbf_svc, poly_svc), titles):
    # Plot the decision boundary: classify every mesh point and colour the
    # plane by predicted class.
    Z = model.predict(np.c_[xx.ravel(), yy.ravel()]).reshape(xx.shape)
    ax.contourf(xx, yy, Z, cmap=plt.cm.coolwarm, alpha=0.8)

    # Plot also the training points
    ax.scatter(X_2d[:, 0], X_2d[:, 1], c=y, cmap=plt.cm.coolwarm)
    ax.set_xlabel('SVD component 1')
    ax.set_ylabel('SVD component 2')
    ax.set_xlim(xx.min(), xx.max())
    ax.set_ylim(yy.min(), yy.max())
    ax.set_xticks(())
    ax.set_yticks(())
    ax.set_title(title)

plt.show()

TypeError: ignored