In [2]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.ensemble import VotingClassifier
from sklearn.metrics import accuracy_score, classification_report

# Load the dataset
# Raw string literal: the Windows path contains backslashes (\C, \p, \c, \P)
# that a normal string literal would parse as (invalid) escape sequences.
DATA_PATH = r"D:\Codes\python_Research\code_research\Phishing_legitimate_full.csv"
data = pd.read_csv(DATA_PATH)

# Inspect the data
print(data.head())
print(data.columns)

# 'CLASS_LABEL' is the target variable; every other column is used as a feature.
# The features are assumed to be numerical -- the mean imputation and
# standardization applied later in this cell require numeric input.
TARGET_COLUMN = 'CLASS_LABEL'
X = data.drop(columns=[TARGET_COLUMN])
y = data[TARGET_COLUMN]

# Hold out 20% of the rows for testing; the seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Fill missing values with column means learned from the training rows only,
# then apply the same fill values to both partitions
imputer = SimpleImputer(strategy='mean')
imputer.fit(X_train)
X_train, X_test = imputer.transform(X_train), imputer.transform(X_test)

# Standardize with statistics learned from the training rows only,
# then apply the same scaling to both partitions
scaler = StandardScaler()
scaler.fit(X_train)
X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)

# Logistic Regression
lr = LogisticRegression(max_iter=1000)
lr.fit(X_train, y_train)

# Random Forest
# Seeded (same seed as the train/test split) so that the bootstrap samples and
# per-split feature sampling -- and therefore the reported scores -- are
# reproducible across kernel restarts.
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)

# SVM
# probability=True is needed for soft voting (it enables predict_proba).
# The probability estimates are fitted on shuffled data, so the estimator is
# seeded as well to keep the results reproducible.
svm = SVC(probability=True, random_state=42)
svm.fit(X_train, y_train)

# Soft-voting ensemble built from the three base models
base_estimators = [('lr', lr), ('rf', rf), ('svm', svm)]
voting_clf = VotingClassifier(estimators=base_estimators, voting='soft')

# Fit the ensemble on the training data
voting_clf.fit(X_train, y_train)

# Predict labels for the held-out test set
y_pred = voting_clf.predict(X_test)

# Report the ensemble's accuracy and the per-class precision/recall/F1 table
ensemble_accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", ensemble_accuracy)
print("Classification Report:")
ensemble_report = classification_report(y_test, y_pred)
print(ensemble_report)

   NumDots  SubdomainLevel  PathLevel  UrlLength  NumDash  NumDashInHostname  \
0      3.0             1.0        5.0       72.0      0.0                0.0   
1      3.0             1.0        3.0      144.0      0.0                0.0   
2      3.0             1.0        2.0       58.0      0.0                0.0   
3      3.0             1.0        6.0       79.0      1.0                0.0   
4      3.0             0.0        4.0       46.0      0.0                0.0   

   AtSymbol  TildeSymbol  NumUnderscore  NumPercent  ...  IframeOrFrame  \
0       0.0          0.0            0.0         0.0  ...            0.0   
1       0.0          0.0            2.0         0.0  ...            0.0   
2       0.0          0.0            0.0         0.0  ...            0.0   
3       0.0          0.0            0.0         0.0  ...            0.0   
4       0.0          0.0            0.0         0.0  ...            1.0   

   MissingTitle  ImagesOnlyInForm  SubdomainLevelRT  UrlLengthRT  \


In [3]:
from sklearn.metrics import f1_score

# Summarize the ensemble's test-set predictions: accuracy first, then F1
ensemble_accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", ensemble_accuracy)
ensemble_f1 = f1_score(y_test, y_pred)
print("F1-Score:", ensemble_f1)

Accuracy: 0.969
F1-Score: 0.9695181907571289


In [4]:
# Score each individually fitted base model on the test set so it can be
# compared against the voting ensemble.
lr_y_pred = lr.predict(X_test)
svm_y_pred = svm.predict(X_test)
rf_y_pred = rf.predict(X_test)

# Label every line with the model it belongs to: three bare "Accuracy:" lines
# cannot be told apart in the output.
for model_name, model_pred in (
    ("Logistic Regression", lr_y_pred),
    ("SVM", svm_y_pred),
    ("Random Forest", rf_y_pred),
):
    print(f"{model_name} Accuracy:", accuracy_score(y_test, model_pred))

Accuracy: 0.9385
Accuracy: 0.965
Accuracy: 0.984


In [5]:
# Show the first training sample after imputation and standardization
first_train_row = X_train[0]
print(first_train_row)

[-0.32813661  0.5685909  -0.6962499  -0.87718099 -0.58053287 -0.25507224
 -0.01936855 -0.1158789  -0.2895828  -0.11641987 -0.33563401 -0.24214215
 -0.04748762 -0.18582144  0.10726546  0.95023659 -0.13394482 -0.14954671
 -0.85906749  0.          0.6383688  -1.03746254 -0.34944941 -0.02959335
 -0.29602225 -0.24671291 -0.27183702 -0.66253311 -0.4489566   0.43359104
 -0.57888987  2.99169664 -0.24699789  0.15344374 -0.52489066 -0.07436678
 -0.1223553  -0.07436678 -0.38248664 -0.72194304 -0.18545508 -0.17586311
  0.17601172  1.19273622  0.72511203 -1.53004905  1.10140944 -0.34751353]


In [6]:
# Show the second training sample after imputation and standardization
second_train_row = X_train[1]
print(second_train_row)

[ 0.41467406  0.5685909  -0.16287834 -0.90740997 -0.58053287 -0.25507224
 -0.01936855 -0.1158789  -0.2895828   1.47564673 -0.33563401 -0.24214215
 -0.04748762 -0.29045799  0.10726546  0.95023659 -0.13394482 -0.14954671
  1.16405289  0.         -0.71946873 -0.63179371 -0.34944941 -0.02959335
 -0.29602225  4.0532942  -0.70467286  1.57407202  2.22738678  0.43359104
 -0.57888987 -0.33425849 -0.24699789 -0.43777913  1.90515869 -0.07436678
 -0.1223553  -0.07436678 -0.38248664 -0.72194304 -0.18545508 -0.17586311
  0.17601172  1.19273622 -1.52810073  0.39454095 -1.55659223  0.76586583]
