In [1]:
# Mount Google Drive into the Colab filesystem at /content/drive so that
# later cells can read and write files stored there.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
# Work from the project folder on the mounted Drive: the CSV inputs and the
# model/feature-map outputs in later cells are all given as relative paths.
%cd /content/drive/My Drive/Phishing_project

/content/drive/My Drive/Phishing_project


In [3]:
import ast

import pandas as pd
import xgboost as xgb
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix
from sklearn.model_selection import train_test_split

In [4]:
column_names = ['F1', 'F3', 'F4', 'F5', 'F6', 'F7', 'F8', 'F9', 'F10', 'F11', 'F12', 'F13', 'F14', 'F15', 'Label']

# The file's first row is a header. header=0 makes pandas consume it as the
# header and substitute `column_names`, instead of parsing it as a data row
# (which would force every column to string dtype and require slicing the
# row off afterwards).
df = pd.read_csv("new_(F1,F3-F15)super_df.csv", names=column_names, header=0)

# Collapse the -1 class onto 0 so the target is {0, 1}. to_numeric makes this
# work whether the column was parsed as text or as numbers.
df['Label'] = pd.to_numeric(df['Label']).replace(-1, 0).astype(int)

In [5]:
# Scalar feature columns F3 through F15 (F1 is handled separately as a vector).
feature_columns = [f'F{i}' for i in range(3, 16)]

# Make sure every scalar feature is stored as a float.
df[feature_columns] = df[feature_columns].astype(float)

In [6]:
# Parse each F1 cell (presumably a list literal stored as text in the CSV)
# into a Python list. ast.literal_eval accepts only literals, so unlike
# eval() it cannot execute code that happens to be embedded in the data file.
f1_vectors = df['F1'].apply(ast.literal_eval)

# Length of the longest F1 vector. Kept as a plain int because the cell that
# scores new data reuses it to pad to the same width.
max_length = int(f1_vectors.map(len).max())

# Right-pad shorter vectors with zeros so every row has max_length entries.
f1_padded = f1_vectors.apply(lambda vec: vec + [0] * (max_length - len(vec)))

# Spread the vectors over one column per position: F1_0, F1_1, ...
F1_df = pd.DataFrame(f1_padded.tolist(), index=df.index)
F1_df.columns = [f'F1_{i}' for i in range(F1_df.shape[1])]

# Replace the original F1 column with the expanded columns (placed first).
df = pd.concat([F1_df, df.drop(columns=['F1'])], axis=1)

# Show how many rows each class has before filtering.
class_counts = df['Label'].value_counts()
print(class_counts)

# Keep only the classes that have at least two rows.
labels_to_keep = class_counts.index[class_counts >= 2]
df_filtered = df.loc[df['Label'].isin(labels_to_keep)]

# Show the class distribution again after filtering.
print(df_filtered['Label'].value_counts())

# Separate the target column from the predictors.
y = df_filtered['Label']
X = df_filtered.drop(columns=['Label'])

# Hold out 20% of the rows for testing, stratified on the label, with a
# fixed seed so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# Fit the XGBoost classifier on the training split.
xgb_model = xgb.XGBClassifier(eval_metric='logloss', use_label_encoder=False)
xgb_model.fit(X_train, y_train)

# Hard class predictions for the held-out rows.
y_pred = xgb_model.predict(X_test)

# Probability of class 1 for each held-out row; ROC AUC needs scores rather
# than hard predictions.
positive_scores = xgb_model.predict_proba(X_test)[:, 1]

# Threshold-based metrics, with class 1 treated as the positive class.
accuracy = accuracy_score(y_test, y_pred)
precision, recall, f1 = (
    scorer(y_test, y_pred, pos_label=1)
    for scorer in (precision_score, recall_score, f1_score)
)
roc_auc = roc_auc_score(y_test, positive_scores)
conf_matrix = confusion_matrix(y_test, y_pred)

# Report everything.
print(f"Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1 Score: {f1}")
print(f"ROC AUC: {roc_auc}")
print(f"Confusion Matrix:\n{conf_matrix}")

Label
0    32971
1    27275
Name: count, dtype: int64
Label
0    32971
1    27275
Name: count, dtype: int64
Accuracy: 0.9583402489626556
Precision: 0.978735743282428
Recall: 0.9281393217231897
F1 Score: 0.9527662777568686
ROC AUC: 0.9886337801392466
Confusion Matrix:
[[6485  110]
 [ 392 5063]]


In [None]:
# Write a JSON dump of the trained trees, including split statistics,
# to xgb.model.json in the current directory.
booster = xgb_model.get_booster()
booster.dump_model(fout='xgb.model.json', dump_format='json', with_stats=True)

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
import xgboost as xgb
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix
import os

In [None]:
# Write an XGBoost feature map: one "<index>\t<name>\t<type>" line per
# training column, with every feature marked as type "q".
feature_map_file = 'feature_map.txt'
feature_map_lines = [f"{i}\t{col}\tq\n" for i, col in enumerate(X.columns)]
with open(feature_map_file, 'w') as fmap_handle:
    fmap_handle.writelines(feature_map_lines)

# Dump the trees as JSON, labelling splits with the names from the feature map.
# NOTE: per the XGBoost docs, dump_model output is meant for inspection and
# cannot be loaded back; use save_model if a reloadable file is needed.
output_path = os.path.join(os.getcwd(), 'xgb_model.json')
booster = xgb_model.get_booster()
booster.dump_model(output_path, fmap=feature_map_file, dump_format='json')

print(f"Model and feature map saved to: {output_path} and {feature_map_file}")

Model and feature map saved to: /content/drive/MyDrive/Phishing_project/xgb_model.json and feature_map.txt


In [None]:
# Load the unseen samples to score.
new_data = pd.read_csv('benign_2.csv')

# Parse the F1 text into lists. ast.literal_eval accepts only literals, so
# unlike eval() it cannot execute code embedded in the CSV.
new_data['F1'] = new_data['F1'].apply(ast.literal_eval)

# Bring every vector to exactly the training width (max_length, computed when
# the training data was prepared): zero-pad short vectors and truncate long
# ones, since the model has no columns for positions beyond max_length.
new_data['F1'] = new_data['F1'].apply(
    lambda vec: (vec + [0] * (max_length - len(vec)))[:max_length]
)

# Expand the vectors into F1_0, F1_1, ... columns, as was done for training.
F1_new_df = pd.DataFrame(new_data['F1'].tolist(), index=new_data.index)
F1_new_df.columns = [f'F1_{i}' for i in range(F1_new_df.shape[1])]

new_data_processed = pd.concat([F1_new_df, new_data.drop(columns=['F1'])], axis=1)

# Select exactly the training columns, in training order; this also drops
# any extra columns (raises KeyError if a training column is missing).
new_data_processed = new_data_processed[X.columns]

# Hard predictions and the probability of class 1 for each new row.
new_predictions = xgb_model.predict(new_data_processed)
new_probabilities = xgb_model.predict_proba(new_data_processed)[:, 1]

print("Predictions:", new_predictions)
print("Probabilities:", new_probabilities)


Predictions: [1 1 0]
Probabilities: [0.77923745 0.64450586 0.04513134]
