In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Input, Embedding, Dense, LayerNormalization, Dropout
from tensorflow.keras.models import Model
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [2]:
df1 = pd.read_csv("dmzo_nomal.csv", usecols=[0], nrows=30000, header=None, names=["Sentence"])
df1["Label"] = 0  # Normal veriye etiket 0

df2 = pd.read_csv("xssed.csv", usecols=[0], nrows=30000, header=None, names=["Sentence"])
df2["Label"] = 1  # XSS zararlı veriye etiket 1

# Veri kümesini birleştir ve karıştır
df = pd.concat([df1, df2], ignore_index=True).sample(frac=1, random_state=42).reset_index(drop=True)

# Metin ve etiketleri listeye çevir
texts = df["Sentence"].astype(str).tolist()
labels = df["Label"].astype(int).tolist()  # Label'ları integer'a çevir

In [3]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)
word_index = tokenizer.word_index
vocab_size = len(word_index) + 1  # +1 for padding index
maxlen = 100
X = pad_sequences(sequences, maxlen=maxlen, padding='post')
y = np.array(labels)

# 2. Eğitim/test ayrımı
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [4]:
class TransformerBlock(tf.keras.layers.Layer):
    def __init__(self, embed_dim, num_heads, ff_dim, rate=0.1):
        super(TransformerBlock, self).__init__()
        self.att = tf.keras.layers.MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)
        self.ffn = tf.keras.Sequential(
            [Dense(ff_dim, activation="relu"), Dense(embed_dim)]
        )
        self.layernorm1 = LayerNormalization(epsilon=1e-6)
        self.layernorm2 = LayerNormalization(epsilon=1e-6)
        self.dropout1 = Dropout(rate)
        self.dropout2 = Dropout(rate)

    def call(self, inputs, training):
        attn_output = self.att(inputs, inputs)
        attn_output = self.dropout1(attn_output, training=training)
        out1 = self.layernorm1(inputs + attn_output)
        ffn_output = self.ffn(out1)
        ffn_output = self.dropout2(ffn_output, training=training)
        return self.layernorm2(out1 + ffn_output)


In [5]:
embed_dim = 64  # Embedding boyutu
num_heads = 2
ff_dim = 128  # Feed-forward katman boyutu

input_layer = Input(shape=(maxlen,))
embedding_layer = Embedding(input_dim=vocab_size, output_dim=embed_dim)(input_layer)
transformer_block = TransformerBlock(embed_dim, num_heads, ff_dim)(embedding_layer)
flattened = tf.keras.layers.GlobalAveragePooling1D()(transformer_block)
dense = Dense(64, activation="relu")(flattened)
dropout = Dropout(0.2)(dense)
output = Dense(1, activation="sigmoid")(dropout)

In [6]:
model = Model(inputs=input_layer, outputs=output)
model.compile(optimizer="adam", loss="binary_crossentropy", metrics=["accuracy"])

# 5. Eğit
model.summary()
model.fit(X_train, y_train, batch_size=32, epochs=5, validation_data=(X_test, y_test))

# 6. Kaydet
model.save("xss_transformer_model.h5")

Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 100)]             0         
                                                                 
 embedding (Embedding)       (None, 100, 64)           4235520   
                                                                 
 transformer_block (Transfo  (None, 100, 64)           50048     
 rmerBlock)                                                      
                                                                 
 global_average_pooling1d (  (None, 64)                0         
 GlobalAveragePooling1D)                                         
                                                                 
 dense_2 (Dense)             (None, 64)                4160      
                                                                 
 dropout_2 (Dropout)         (None, 64)                0     

  saving_api.save_model(


In [14]:
model = tf.keras.models.load_model("xss_transformer_model.h5", custom_objects={"TransformerBlock": TransformerBlock})
input_text = "<svg src=x onerror=alert(0)>"  # Test metni
input_seq = tokenizer.texts_to_sequences([input_text])
maxlen = 100   
input_seq_padded = pad_sequences(input_seq, maxlen=maxlen, padding='post')

output = model.predict(input_seq_padded)

print(f"Predicted output: {output}")

Predicted output: [[0.9999942]]


In [16]:
import pickle

with open('tokenizer.pkl', 'wb') as file:
    pickle.dump(tokenizer, file)