<a href="https://colab.research.google.com/github/Dineshdina-IT/Lab/blob/main/Untitled2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder

# Step 1: Load the dataset as a CSV file
file_path = 'dataset_phishing.csv'
data = pd.read_csv(file_path)

print("File loaded successfully as CSV!")
print(data.head())  # Display first 5 rows

# Step 2: Handle missing values (if any)
# dropna() removes every row that has a missing value in any column.
data = data.dropna()
print(f"Dataset after dropping missing values: {data.shape}")

# Step 3: Encode the labels as integers
# LabelEncoder assigns codes in sorted order of the class names, so with the
# two classes shown in the preview 'benign' becomes 0 and 'malicious' becomes 1.
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(data['status'])

# Step 4: Split the *raw* URLs into training and testing sets first.
# The split must happen before TF-IDF is fitted: fitting the vectorizer on the
# full dataset would let the test URLs influence the vocabulary and IDF
# weights (data leakage) and make the test score optimistic. The same
# test_size/random_state are used as before, so the same rows land in each set.
urls_train, urls_test, y_train, y_test = train_test_split(
    data['url'], y, test_size=0.2, random_state=42
)

# Step 5: Text feature extraction using TF-IDF Vectorizer
# - max_features=500 keeps only the 500 most frequent terms.
# - dtype=np.float32 halves the size of the dense arrays produced by
#   .toarray() compared with the default float64.
# - The vectorizer is fitted on the training URLs only; the test URLs are
#   transformed with that already-fitted vocabulary.
vectorizer = TfidfVectorizer(max_features=500, dtype=np.float32)
X_train = vectorizer.fit_transform(urls_train).toarray()
X_test = vectorizer.transform(urls_test).toarray()

print(f"TF-IDF transformation complete. Feature shape: {X_train.shape}")

print("Data Splitting Complete:")
print(f"Training Data: {X_train.shape}, Testing Data: {X_test.shape}")

# Step 6: Save the preprocessed data
# Written to the current working directory as NumPy .npy files.
np.save('X_train.npy', X_train)
np.save('X_test.npy', X_test)
np.save('y_train.npy', y_train)
np.save('y_test.npy', y_test)

print("Preprocessing Done! Files saved as X_train.npy, X_test.npy, y_train.npy, y_test.npy.")

File loaded successfully as CSV!
                                                 url     status
0                                   br-icloud.com.br  malicious
1                mp3raid.com/music/krizz_kaliko.html     benign
2                    bopsecrets.org/rexroth/cr/1.htm     benign
3  http://www.garage-pirenne.be/index.php?option=...  malicious
4  http://adventure-nicaragua.net/index.php?optio...  malicious
Dataset after dropping missing values: (651191, 2)
TF-IDF transformation complete. Feature shape: (651191, 500)
Data Splitting Complete:
Training Data: (520952, 500), Testing Data: (130239, 500)
Preprocessing Done! Files saved as X_train.npy, X_test.npy, y_train.npy, y_test.npy.


In [None]:
# Import necessary libraries
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, GRU, Dropout
from tensorflow.keras.callbacks import EarlyStopping

# Seed Python, NumPy and TensorFlow so that weight initialisation, dropout and
# batch shuffling are repeatable from one run of this cell to the next.
tf.keras.utils.set_random_seed(42)

# Step 1: Load the preprocessed data
# These .npy files are read from the current working directory.
X_train = np.load('X_train.npy')
X_test = np.load('X_test.npy')
y_train = np.load('y_train.npy')
y_test = np.load('y_test.npy')

print("Preprocessed data loaded successfully.")
print(f"Training data shape: {X_train.shape}, Testing data shape: {X_test.shape}")

# Step 2: Reshape the input for GRU
# GRU expects 3D input: (samples, timesteps, features). Each sample's feature
# vector is presented as a sequence of length 1.
# NOTE(review): with a single timestep the GRU never carries state from one
# step to the next; confirm that a recurrent layer is really what is wanted.
X_train = X_train.reshape((X_train.shape[0], 1, X_train.shape[1]))
X_test = X_test.reshape((X_test.shape[0], 1, X_test.shape[1]))

# Step 3: Build the GRU model
# The input shape is declared with an explicit Input object as the first entry
# instead of passing `input_shape=` to the GRU layer, which Keras warns against
# for Sequential models.
model = Sequential([
    tf.keras.Input(shape=(X_train.shape[1], X_train.shape[2])),
    GRU(64, return_sequences=False),  # Emit only the final hidden state
    Dropout(0.3),  # Prevent overfitting
    Dense(32, activation='relu'),
    Dense(1, activation='sigmoid')  # Binary classification output
])

# Step 4: Compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Step 5: Train the GRU model
# Stop once val_loss has not improved for 5 consecutive epochs and roll the
# model back to the weights of the best epoch.
early_stopping = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)

print("Training GRU model...")
history = model.fit(
    X_train, y_train,
    validation_split=0.2,  # Hold out 20% of the training data for validation
    epochs=20,
    batch_size=64,
    callbacks=[early_stopping],
    verbose=1
)

# Step 6: Evaluate the model
# evaluate() returns the loss followed by the metrics passed to compile().
print("\nEvaluating the model on test data...")
test_loss, test_accuracy = model.evaluate(X_test, y_test)
print(f"Test Loss: {test_loss:.4f}, Test Accuracy: {test_accuracy:.4f}")

Preprocessed data loaded successfully.
Training data shape: (520952, 500), Testing data shape: (130239, 500)


  super().__init__(**kwargs)


Training GRU model...
Epoch 1/20
[1m6512/6512[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m49s[0m 7ms/step - accuracy: 0.8996 - loss: 0.2507 - val_accuracy: 0.9333 - val_loss: 0.1791
Epoch 2/20
[1m6512/6512[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m81s[0m 7ms/step - accuracy: 0.9352 - loss: 0.1775 - val_accuracy: 0.9401 - val_loss: 0.1677
Epoch 3/20
[1m6512/6512[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m80s[0m 7ms/step - accuracy: 0.9397 - loss: 0.1673 - val_accuracy: 0.9422 - val_loss: 0.1620
Epoch 4/20
[1m6512/6512[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m82s[0m 7ms/step - accuracy: 0.9416 - loss: 0.1619 - val_accuracy: 0.9435 - val_loss: 0.1578
Epoch 5/20
[1m6512/6512[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m84s[0m 7ms/step - accuracy: 0.9439 - loss: 0.1564 - val_accuracy: 0.9448 - val_loss: 0.1544
Epoch 6/20
[1m6512/6512[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m47s[0m 7ms/step - accuracy: 0.9438 - loss: 0.1541 - val_accuracy: 0.9450 - val_l