# DECISION TREE IMPLEMENTATION

In [31]:
#importing libraries
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.tree import DecisionTreeClassifier

In [32]:
#Loading the dataset from phishing.csv (relative path, resolved against the current working directory)
data = pd.read_csv(filepath_or_buffer="phishing.csv")

In [33]:
#Counting total number of rows and columns
#(shape is a (rows, columns) tuple; as the last expression of the cell it is shown as the output)
data.shape

(11054, 32)

In [34]:
#Viewing the column labels of data
#('Index' is removed a few cells later; 'class', the last column, is later used as the target)
data.columns

Index(['Index', 'UsingIP', 'LongURL', 'ShortURL', 'Symbol@', 'Redirecting//',
       'PrefixSuffix-', 'SubDomains', 'HTTPS', 'DomainRegLen', 'Favicon',
       'NonStdPort', 'HTTPSDomainURL', 'RequestURL', 'AnchorURL',
       'LinksInScriptTags', 'ServerFormHandler', 'InfoEmail', 'AbnormalURL',
       'WebsiteForwarding', 'StatusBarCust', 'DisableRightClick',
       'UsingPopupWindow', 'IframeRedirection', 'AgeofDomain', 'DNSRecording',
       'WebsiteTraffic', 'PageRank', 'GoogleIndex', 'LinksPointingToPage',
       'StatsReport', 'class'],
      dtype='object')

In [35]:
#Previewing the first 5 rows of data (the 'Index' column is still present at this point)
data.head(n=5)

Unnamed: 0,Index,UsingIP,LongURL,ShortURL,Symbol@,Redirecting//,PrefixSuffix-,SubDomains,HTTPS,DomainRegLen,...,UsingPopupWindow,IframeRedirection,AgeofDomain,DNSRecording,WebsiteTraffic,PageRank,GoogleIndex,LinksPointingToPage,StatsReport,class
0,0,1,1,1,1,1,-1,0,1,-1,...,1,1,-1,-1,0,-1,1,1,1,-1
1,1,1,0,1,1,1,-1,-1,-1,-1,...,1,1,1,-1,1,-1,1,0,-1,-1
2,2,1,0,1,1,1,-1,-1,-1,1,...,1,1,-1,-1,1,-1,1,-1,1,-1
3,3,1,0,-1,1,1,-1,1,1,-1,...,-1,1,-1,-1,0,-1,1,1,1,1
4,4,-1,0,-1,1,-1,-1,1,1,-1,...,1,1,1,1,1,-1,1,-1,-1,1


In [36]:
#Dropping the 'Index' column so it is not used as a feature
#(in the head() preview its values simply repeat the row labels 0,1,2,...)
#errors='ignore' makes this cell safe to run more than once: the previous
#del data['Index'] raised KeyError on a second run because the column was already gone
data=data.drop(columns=['Index'],errors='ignore')

In [6]:
#Counting total number of rows and columns again
#(expected: same number of rows, one column fewer now that 'Index' has been removed)
data.shape

(11054, 31)

In [37]:
#Viewing the column labels again to confirm that 'Index' is no longer present
data.columns

Index(['UsingIP', 'LongURL', 'ShortURL', 'Symbol@', 'Redirecting//',
       'PrefixSuffix-', 'SubDomains', 'HTTPS', 'DomainRegLen', 'Favicon',
       'NonStdPort', 'HTTPSDomainURL', 'RequestURL', 'AnchorURL',
       'LinksInScriptTags', 'ServerFormHandler', 'InfoEmail', 'AbnormalURL',
       'WebsiteForwarding', 'StatusBarCust', 'DisableRightClick',
       'UsingPopupWindow', 'IframeRedirection', 'AgeofDomain', 'DNSRecording',
       'WebsiteTraffic', 'PageRank', 'GoogleIndex', 'LinksPointingToPage',
       'StatsReport', 'class'],
      dtype='object')

In [38]:
#Previewing the first 5 rows of the dataset after the 'Index' column was removed
data.head(n=5)

Unnamed: 0,UsingIP,LongURL,ShortURL,Symbol@,Redirecting//,PrefixSuffix-,SubDomains,HTTPS,DomainRegLen,Favicon,...,UsingPopupWindow,IframeRedirection,AgeofDomain,DNSRecording,WebsiteTraffic,PageRank,GoogleIndex,LinksPointingToPage,StatsReport,class
0,1,1,1,1,1,-1,0,1,-1,1,...,1,1,-1,-1,0,-1,1,1,1,-1
1,1,0,1,1,1,-1,-1,-1,-1,1,...,1,1,1,-1,1,-1,1,0,-1,-1
2,1,0,1,1,1,-1,-1,-1,1,1,...,1,1,-1,-1,1,-1,1,-1,1,-1
3,1,0,-1,1,1,-1,1,1,-1,1,...,-1,1,-1,-1,0,-1,1,1,1,1
4,-1,0,-1,1,-1,-1,1,1,-1,1,...,1,1,1,1,1,-1,1,-1,-1,1


In [39]:
#Counting missing values per column (isna flags each missing cell, sum adds the flags column-wise)
data.isna().sum()

UsingIP                0
LongURL                0
ShortURL               0
Symbol@                0
Redirecting//          0
PrefixSuffix-          0
SubDomains             0
HTTPS                  0
DomainRegLen           0
Favicon                0
NonStdPort             0
HTTPSDomainURL         0
RequestURL             0
AnchorURL              0
LinksInScriptTags      0
ServerFormHandler      0
InfoEmail              0
AbnormalURL            0
WebsiteForwarding      0
StatusBarCust          0
DisableRightClick      0
UsingPopupWindow       0
IframeRedirection      0
AgeofDomain            0
DNSRecording           0
WebsiteTraffic         0
PageRank               0
GoogleIndex            0
LinksPointingToPage    0
StatsReport            0
class                  0
dtype: int64

In [129]:
#Separating features and target by column NAME rather than by position.
#The previous hard-coded position 30 only pointed at 'class' once 'Index' had been
#deleted; with 'Index' still present, position 30 is 'StatsReport', so the wrong
#column would silently have been used as the target.
x=data.drop(columns=['class']).values #Independent variables (every column except 'class')
y=data['class'].values               #Dependent variable (the 'class' label)

In [130]:
#Showing the feature matrix (NumPy abbreviates large arrays with '...')
print(x)

[[ 1  1  1 ...  1  1  1]
 [ 1  0  1 ...  1  0 -1]
 [ 1  0  1 ...  1 -1  1]
 ...
 [ 1 -1  1 ...  1  0  1]
 [-1 -1  1 ...  1  1  1]
 [-1 -1  1 ... -1  1 -1]]


In [131]:
#Showing the target vector (abbreviated by NumPy)
print(y)

[-1 -1 -1 ... -1 -1 -1]


In [132]:
#Holding out 25% of the rows as the test set; the fixed random_state makes the split repeatable
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test=train_test_split(
    x,
    y,
    random_state=0,
    test_size=0.25,
)

In [133]:
#Showing the training features produced by the split (not yet scaled)
print(x_train)

[[ 1  1 -1 ...  1 -1  1]
 [ 1 -1  1 ...  1 -1  1]
 [ 1 -1  1 ...  1  1  1]
 ...
 [ 1 -1  1 ...  1  1  1]
 [ 1 -1  1 ...  1  0 -1]
 [ 1 -1  1 ...  1  0  1]]


In [134]:
#Showing the training labels
print(y_train)

[1 1 1 ... 1 1 1]


In [135]:
#Showing the test features produced by the split (not yet scaled)
print(x_test)

[[ 1 -1  1 ...  1  1  1]
 [ 1 -1  1 ...  1  0  1]
 [ 1  1  1 ...  1  0  1]
 ...
 [ 1 -1  1 ...  1  1 -1]
 [ 1 -1  1 ...  1  0  1]
 [ 1 -1  1 ...  1  1  1]]


In [136]:
#Showing the test labels
print(y_test)

[ 1  1 -1 ...  1  1  1]


In [137]:
#Feature scaling
#The scaler is fitted on the training features only; the statistics learned there
#are then applied to both the training and the test features.
from sklearn.preprocessing import StandardScaler
sc_x=StandardScaler()
sc_x.fit(x_train)
x_train=sc_x.transform(x_train)
x_test=sc_x.transform(x_test)

In [138]:
#Showing the training features after scaling (values are now floats)
print(x_train)

[[ 0.72574341  2.12573205 -2.59206639 ...  0.39990904 -2.34805334
   0.40254308]
 [ 0.72574341 -0.48015611  0.38579259 ...  0.39990904 -2.34805334
   0.40254308]
 [ 0.72574341 -0.48015611  0.38579259 ...  0.39990904  1.14130082
   0.40254308]
 ...
 [ 0.72574341 -0.48015611  0.38579259 ...  0.39990904  1.14130082
   0.40254308]
 [ 0.72574341 -0.48015611  0.38579259 ...  0.39990904 -0.60337626
  -2.48420617]
 [ 0.72574341 -0.48015611  0.38579259 ...  0.39990904 -0.60337626
   0.40254308]]


In [139]:
#Showing the test features after scaling with the scaler fitted on the training set
print(x_test)

[[ 0.72574341 -0.48015611  0.38579259 ...  0.39990904  1.14130082
   0.40254308]
 [ 0.72574341 -0.48015611  0.38579259 ...  0.39990904 -0.60337626
   0.40254308]
 [ 0.72574341  2.12573205  0.38579259 ...  0.39990904 -0.60337626
   0.40254308]
 ...
 [ 0.72574341 -0.48015611  0.38579259 ...  0.39990904  1.14130082
  -2.48420617]
 [ 0.72574341 -0.48015611  0.38579259 ...  0.39990904 -0.60337626
   0.40254308]
 [ 0.72574341 -0.48015611  0.38579259 ...  0.39990904  1.14130082
   0.40254308]]


In [140]:
# Creating a decision tree classifier limited to 10 leaf nodes (fixed random_state for repeatability)
# and fitting it on the scaled training data
decision_classifier=DecisionTreeClassifier(
    random_state=0,
    max_leaf_nodes=10,
)
decision_classifier.fit(x_train,y_train)


DecisionTreeClassifier(max_leaf_nodes=10, random_state=0)

In [141]:
# Predicting with the fitted tree on both splits
# y_pred_train: predictions for the training rows (not used by the evaluation cells visible below)
y_pred_train=decision_classifier.predict(x_train)
# y_predicted: predictions for the held-out test rows, used for the confusion matrix and accuracy
y_predicted=decision_classifier.predict(x_test)

In [142]:
#Making the confusion matrix for the test-set predictions
#The sklearn function is imported under an alias: binding the result to the function's
#own name left the function unreachable afterwards (a later confusion_matrix(...) call
#would fail with "TypeError: 'numpy.ndarray' object is not callable").
#The result keeps the name confusion_matrix because the next cell prints it.
from sklearn.metrics import confusion_matrix as compute_confusion_matrix
confusion_matrix=compute_confusion_matrix(y_test,y_predicted)

In [143]:
#Showing the 2x2 confusion matrix computed from y_test and y_predicted
#NOTE(review): per the sklearn convention rows should be true labels and columns predicted labels - confirm
print(confusion_matrix)

[[1116   66]
 [ 169 1413]]


In [144]:
#Computing the accuracy of the test-set predictions
#The sklearn function is imported under an alias so that storing the result does not
#overwrite the function (after which accuracy_score(...) could no longer be called,
#e.g. to score the training predictions).
#The result keeps the name accuracy_score because the next cell prints it.
from sklearn.metrics import accuracy_score as compute_accuracy_score
accuracy_score=compute_accuracy_score(y_test, y_predicted)

In [145]:
#Showing the test accuracy as a percentage (accuracy_score here holds the computed value)
print(accuracy_score*100)

91.49782923299566
