In [1]:
# Colab-only setup: mount Google Drive under /content/drive so files stored
# there are reachable from this runtime.
from google.colab import drive
drive.mount('/content/drive')
# Make the project folder the working directory; the later cells read the CSV
# and write the model using relative paths.
%cd /content/drive/My Drive/Phishing_project

Mounted at /content/drive
/content/drive/My Drive/Phishing_project


In [27]:
import ast

import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix
from sklearn.model_selection import train_test_split

# ---------------------------------------------------------------------------
# Configuration: everything a reader might want to tune lives here.
# ---------------------------------------------------------------------------
DATA_PATH = "new_(F1,F3-F15)super_df.csv"
MODEL_PATH = "xgb_model.json"
RANDOM_STATE = 42
TEST_SIZE = 0.2
# Width of the assembled table for this dataset: 213 feature columns
# (padded F1 vector + F3..F15) plus 1 label column.
EXPECTED_NUM_COLUMNS = 214

column_names = ['F1', 'F3', 'F4', 'F5', 'F6', 'F7', 'F8', 'F9', 'F10', 'F11', 'F12', 'F13', 'F14', 'F15', 'Label']
feature_columns = ['F3', 'F4', 'F5', 'F6', 'F7', 'F8', 'F9', 'F10', 'F11', 'F12', 'F13', 'F14', 'F15']

# ---------------------------------------------------------------------------
# Load
# ---------------------------------------------------------------------------
# header=0 tells pandas to consume the file's own header row and substitute
# `column_names` for it. This avoids reading the header as a data row (which
# forces every column to object dtype) and then slicing it off afterwards.
df = pd.read_csv(DATA_PATH, names=column_names, header=0)

# ---------------------------------------------------------------------------
# Clean / convert
# ---------------------------------------------------------------------------
# Cast to int first and then map -1 -> 0, so the remap works whether pandas
# parsed the column as numbers or as strings.
df['Label'] = df['Label'].astype(int).replace(-1, 0)

df[feature_columns] = df[feature_columns].astype(float)

# F1 is stored in the CSV as the text of a Python list. ast.literal_eval only
# accepts literals, so a corrupted or malicious cell cannot execute code the
# way eval() would.
df['F1'] = df['F1'].apply(ast.literal_eval)

# Determine the length of the longest F1 vector and right-pad the shorter
# ones with zeros so that every row contributes the same number of columns.
max_length = int(df['F1'].map(len).max())
df['F1'] = df['F1'].apply(lambda vec: list(vec) + [0] * (max_length - len(vec)))

# ---------------------------------------------------------------------------
# Assemble the flat feature table
# ---------------------------------------------------------------------------
# Expand F1 into one column per position and place F3..F15 next to it.
# ignore_index=True on axis=1 relabels the columns 0..n-1. This is done
# column-wise rather than with a row-by-row df.apply(axis=1).
f1_frame = pd.DataFrame(df['F1'].tolist())
other_frame = df[feature_columns].reset_index(drop=True)
flattened_df = pd.concat([f1_frame, other_frame], axis=1, ignore_index=True)
flattened_df['Label'] = df['Label'].to_numpy()

# Sanity check on the assembled width (213 features + 1 label column).
assert flattened_df.shape[1] == EXPECTED_NUM_COLUMNS, (
    f"expected {EXPECTED_NUM_COLUMNS} columns, got {flattened_df.shape[1]}"
)

# Check the distribution of each class
print(flattened_df['Label'].value_counts())

# A stratified split needs at least 2 members per class, so drop any class
# that has fewer.
class_counts = flattened_df['Label'].value_counts()
df_filtered = flattened_df[flattened_df['Label'].isin(class_counts[class_counts >= 2].index)]

# Re-check the distribution after filtering
print(df_filtered['Label'].value_counts())

# ---------------------------------------------------------------------------
# Split
# ---------------------------------------------------------------------------
X = df_filtered.drop('Label', axis=1)
y = df_filtered['Label']
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=TEST_SIZE, random_state=RANDOM_STATE, stratify=y
)

# ---------------------------------------------------------------------------
# Train
# ---------------------------------------------------------------------------
# `use_label_encoder` is deprecated in current XGBoost releases and the labels
# here are already 0/1, so it is no longer passed.
xgb_model = xgb.XGBClassifier(eval_metric='logloss')
xgb_model.fit(X_train, y_train)

# Save the trained model
xgb_model.save_model(MODEL_PATH)

# ---------------------------------------------------------------------------
# Evaluate on the held-out split
# ---------------------------------------------------------------------------
y_pred = xgb_model.predict(X_test)
# Column 1 of predict_proba is the probability of the positive class (label 1),
# which is what ROC AUC needs.
y_score = xgb_model.predict_proba(X_test)[:, 1]

accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, pos_label=1)
recall = recall_score(y_test, y_pred, pos_label=1)
f1 = f1_score(y_test, y_pred, pos_label=1)
roc_auc = roc_auc_score(y_test, y_score)
conf_matrix = confusion_matrix(y_test, y_pred)

print(f"Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1 Score: {f1}")
print(f"ROC AUC: {roc_auc}")
print(f"Confusion Matrix:\n{conf_matrix}")


Label
0    32971
1    27275
Name: count, dtype: int64
Label
0    32971
1    27275
Name: count, dtype: int64
Accuracy: 0.9583402489626556
Precision: 0.978735743282428
Recall: 0.9281393217231897
F1 Score: 0.9527662777568686
ROC AUC: 0.9886337801392466
Confusion Matrix:
[[6485  110]
 [ 392 5063]]


In [28]:
# Inspect the assembled feature table; as the last expression in the cell it
# is rendered as the cell's output.
flattened_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,204,205,206,207,208,209,210,211,212,Label
0,1,13,1,26,15,14,65,3,15,65,...,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.000000,0
1,1,13,5,18,9,3,1,14,85,2,...,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.000000,0
2,1,14,20,9,16,15,16,5,65,15,...,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.000000,0
3,1,14,4,18,15,9,4,11,9,14,...,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.000000,0
4,1,21,4,9,6,19,65,3,15,13,...,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.000000,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
60241,5,11,22,1,20,15,18,19,9,7,...,0.043478,0.804348,92.0,0.826087,0.173913,0.210526,0.0,0.0,0.000000,1
60242,5,11,22,1,20,15,18,19,9,7,...,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.000000,1
60243,5,12,85,13,1,18,11,65,3,15,...,0.000000,0.190476,84.0,0.130952,0.869048,6.636364,0.0,1.0,1.000000,1
60244,5,12,85,19,11,12,5,16,65,3,...,0.000000,0.759912,454.0,0.055066,0.944934,17.160000,0.0,2.0,0.500000,1


In [29]:
# Write the trained classifier to xgb_model.json in the current working directory.
# NOTE(review): the training cell already calls save_model with this same
# path, so this cell repeats that write — confirm whether it is still needed.
xgb_model.save_model("xgb_model.json")