In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.utils import to_categorical
import joblib
import pickle

# Load the dataset
df = pd.read_csv('malicious_phish.csv')
X = df['url']
y = df['type']

# Preprocess the labels
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)
y_one_hot = to_categorical(y_encoded)

# TF-IDF Vectorization
vectorizer = TfidfVectorizer(max_features=500)
X_tfidf = vectorizer.fit_transform(X)

# Save the vectorizer using joblib
joblib.dump(vectorizer, 'tfidf_vectorizer.joblib')

# Split the data into train, validation, and test sets
X_train, X_temp, y_train, y_temp = train_test_split(X_tfidf, y_one_hot, test_size=0.4, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.2, random_state=42)

# Build the neural network model for multi-class classification
model = Sequential()
model.add(Dense(256, input_shape=(X_train.shape[1],), activation='relu'))
model.add(Dropout(0.1))
model.add(Dense(128, activation='relu'))
model.add(Dropout(0.1))
model.add(Dense(64, activation='relu'))
model.add(Dense(len(label_encoder.classes_), activation='softmax'))  # Softmax for multi-class

# Compile the model with categorical crossentropy as the loss function
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Train the model with EarlyStopping callback
early_stopping = EarlyStopping(monitor='val_accuracy', patience=5, restore_best_weights=True, mode='max')  # Monitor val_accuracy and stop when it stops increasing
history = model.fit(X_train.toarray(), y_train, epochs=100, batch_size=1000, validation_data=(X_val.toarray(), y_val), callbacks=[early_stopping])

# Evaluate the model on the test set
test_loss, test_accuracy = model.evaluate(X_test.toarray(), y_test)
print(f'Test Loss: {test_loss}, Test Accuracy: {test_accuracy}')

# Save the trained model and label encoder using joblib
joblib.dump(label_encoder, 'label_encoder_multiclass.joblib')

# Save the model in .h5 format
model.save('neural_network_model_multiclass.h5')





Epoch 1/100


Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Test Loss: 0.16729767620563507, Test Accuracy: 0.9431151747703552


  saving_api.save_model(


In [2]:
import pandas as pd

# Read the existing CSV file
existing_data = pd.read_csv('malicious_phish.csv')

# Read the new CSV files
benign_df = pd.read_csv('Benign_list_big_final.csv', header=None, names=['url'])
defacement_df = pd.read_csv('DefacementSitesURLFiltered.csv', header=None, names=['url'])
malware_df = pd.read_csv('Malware_dataset.csv', header=None, names=['url'])
phishing_df = pd.read_csv('phishing_dataset.csv', header=None, names=['url'])
spam_df = pd.read_csv('spam_dataset.csv', header=None, names=['url'])

# Add a 'Type' column and assign the corresponding type
benign_df['type'] = 'benign'
defacement_df['type'] = 'defacement'
malware_df['type'] = 'malware'
phishing_df['type'] = 'phishing'
spam_df['type'] = 'spam'

# Concatenate all DataFrames
all_df = pd.concat([existing_data, benign_df, defacement_df, malware_df, phishing_df, spam_df], ignore_index=True)

# Save the merged DataFrame to the CSV file
all_df.to_csv('Combined_dataset.csv', index=False, header=False)


In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.utils import to_categorical
import joblib
import pickle

csv_files = ['Combined_dataset.csv', 'Webpages_Classification_test_data.csv','Webpages_Classification_train_data.csv']

# Read data from each CSV file and concatenate into a single DataFrame
dfs = []

for csv_file in csv_files:
    df = pd.read_csv(csv_file, header=None, names=['url','type'])
    dfs.append(df)

# Concatenate all DataFrames
df_combined = pd.concat(dfs, ignore_index=True)
#print(df_combined)
# Read the combined DataFrame
X = df_combined['url']
y = df_combined['type']

# Preprocess the labels
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)
y_one_hot = to_categorical(y_encoded)

# TF-IDF Vectorization
vectorizer = TfidfVectorizer(max_features=1000)
X_tfidf = vectorizer.fit_transform(X)

# Save the vectorizer using joblib
joblib.dump(vectorizer, 'tfidf_vectorizer.joblib')

# Split the data into train, validation, and test sets
X_train, X_temp, y_train, y_temp = train_test_split(X_tfidf, y_one_hot, test_size=0.3, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.2, random_state=42)

# Build the neural network model for multi-class classification
model = Sequential()
model.add(Dense(512, input_shape=(X_train.shape[1],), activation='relu'))
model.add(Dropout(0.1))
model.add(Dense(256, activation='relu'))
model.add(Dropout(0.1))
model.add(Dense(128, activation='relu'))
model.add(Dropout(0.1))
model.add(Dense(64, activation='relu'))
model.add(Dense(len(label_encoder.classes_), activation='softmax'))  # Softmax for multi-class

# Compile the model with categorical crossentropy as the loss function
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Train the model with EarlyStopping callback
early_stopping = EarlyStopping(monitor='val_accuracy', patience=20, restore_best_weights=True, mode='max')  # Monitor val_accuracy and stop when it stops increasing
history = model.fit(X_train.toarray(), y_train, epochs=100, batch_size=750, validation_data=(X_val.toarray(), y_val), callbacks=[early_stopping])

# Evaluate the model on the test set
test_loss, test_accuracy = model.evaluate(X_test.toarray(), y_test)
print(f'Test Loss: {test_loss}, Test Accuracy: {test_accuracy}')

# Save the trained model and label encoder using joblib
joblib.dump(label_encoder, 'label_encoder_multiclass.joblib')

# Save the model in .h5 format
model.save('neural_network_model_multiclass.h5')





Epoch 1/100


Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Test Loss: 0.1127631813287735, Test Accuracy: 0.9694477915763855


  saving_api.save_model(
