In [43]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
import faiss
import torch
from transformers import GPT2LMHeadModel, GPT2Tokenizer
from data_generator import DataGenerator

In [None]:
# Run the project-local data generator before the CSV is loaded in the next cell.
# NOTE(review): presumably this is what writes data/apache/data.csv — confirm in
# data_generator.py; nothing in this notebook shows what main() produces.
DataGenerator.main()

In [20]:
# Read the Apache access-log CSV into a DataFrame and list the columns it
# provides, so the cleaning step below can be checked against them.
csv_path = 'data/apache/data.csv'
data = pd.read_csv(csv_path)
print("Sütun Adları:", data.columns)

Sütun Adları: Index(['IP Address', 'Date', 'Request', 'Endpoint', 'Status Code',
       'Response Size', 'Referrer', 'User Agent', 'Time Taken'],
      dtype='object')


In [21]:
# Parse the bracketed Apache timestamp ("[20/Jul/2018:09:02:38 +0300]").
# The guard keeps this cell re-runnable: once 'Date' holds datetimes the .str
# accessor would raise, so the parse is skipped on a second execution.
if not pd.api.types.is_datetime64_any_dtype(data['Date']):
    data['Date'] = pd.to_datetime(data['Date'].str.strip('[]'), format='%d/%b/%Y:%H:%M:%S %z', errors='coerce')

# Report missing values per column (after date parsing, so unparseable dates
# show up here as missing).
missing_values = data.isnull().sum()
print("Eksik Veriler:", missing_values)

# These two columns are not used by the retrieval text built below.
data = data.drop(columns=['Referrer', 'User Agent'], errors='ignore')

# Break the timestamp into calendar/time components.
data['Year'] = data['Date'].dt.year
data['Month'] = data['Date'].dt.month
data['Day'] = data['Date'].dt.day
data['Hour'] = data['Date'].dt.hour
data['Minute'] = data['Date'].dt.minute

# Non-numeric status codes become NaN and are dropped with the other
# incomplete rows just below.
data['Status Code'] = pd.to_numeric(data['Status Code'], errors='coerce')

# Keep only rows that have a request, an endpoint and a status code, and whose
# text fields are not blank. The explicit .copy() makes the result an
# independent frame, so the column assignments that follow do not write into
# a slice of the previous frame (SettingWithCopyWarning).
data = data.dropna(subset=['Request', 'Endpoint', 'Status Code'])
has_text = (data['Request'].str.strip() != '') & (data['Endpoint'].str.strip() != '')
data = data[has_text].copy()

# If any status code was coerced to NaN above, the column is float64 and would
# render as "403.0"; cast back to integers so the indexed text reads "403".
data['Status Code'] = data['Status Code'].astype('int64')

# One text document per log line; this is what gets TF-IDF vectorised.
data['Combined'] = data['Request'] + ' ' + data['Endpoint'] + ' ' + data['Status Code'].astype(str)

print(data['Combined'].head())
print(data['Combined'].isnull().sum())


Eksik Veriler: IP Address       0
Date             0
Request          0
Endpoint         0
Status Code      0
Response Size    0
Referrer         0
User Agent       0
Time Taken       0
dtype: int64
0       GET /usr/register 403
1          PUT /usr/login 404
2      POST /usr/register 403
3         POST /usr/admin 502
4    DELETE /usr/register 404
Name: Combined, dtype: object
0


In [22]:
# Turn every combined log line into a TF-IDF vector; the fitted vectorizer is
# kept so queries can be projected into the same feature space later.
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(data['Combined'])
X = tfidf_matrix.toarray()

# Flat L2 FAISS index over the dense vectors. Rows are added in DataFrame
# order, so a FAISS result id is a positional row number into `data`.
n_features = X.shape[1]
index = faiss.IndexFlatL2(n_features)
index.add(X.astype('float32'))


In [23]:

def find_relevant_logs(query, k=5):
    """Return the log rows whose TF-IDF vectors are closest to ``query``.

    The query is projected with the module-level ``vectorizer``, the
    module-level FAISS ``index`` is searched, and the matching rows of the
    module-level ``data`` frame are returned in the order FAISS reports them.

    Args:
        query: Free-text question to match against the indexed log lines.
        k: Maximum number of rows to return (default 5, the value that was
            previously hard-coded).

    Returns:
        A DataFrame slice of ``data`` with at most ``k`` rows; it has fewer
        rows (possibly none) when the index cannot supply ``k`` neighbours.

    Raises:
        ValueError: If ``k`` is smaller than 1.
    """
    if k < 1:
        raise ValueError("k must be at least 1")

    query_vector = vectorizer.transform([query]).toarray().astype('float32')
    _, neighbour_ids = index.search(query_vector, k)

    # FAISS fills result slots it cannot satisfy with -1. Passing those to
    # .iloc would silently return the LAST row of `data`, so drop them.
    positions = [int(i) for i in neighbour_ids[0] if i >= 0]
    return data.iloc[positions]

def generate_response(logs, max_length=150, reserve_tokens=32):
    """Generate text with the language model from the retrieved log rows.

    Each row is rendered as "<Date> <Request> <Endpoint> <Status Code>", the
    rows are joined with spaces, and the result is used as the prompt for the
    module-level ``model`` / ``tokenizer``.

    Args:
        logs: DataFrame with 'Date', 'Request', 'Endpoint' and 'Status Code'
            columns.
        max_length: Total sequence length passed to ``model.generate``
            (default 150, the previously hard-coded value). This limit counts
            prompt tokens as well as generated ones.
        reserve_tokens: Number of tokens of ``max_length`` kept free for
            generation; the prompt is cut to ``max_length - reserve_tokens``
            tokens (never fewer than one).

    Returns:
        The decoded output sequence as a string (the notebook output shows it
        begins with the prompt text), or an empty string when ``logs`` has no
        rows.
    """
    # Nothing retrieved: row-wise apply on an empty frame does not yield a
    # Series, and there is no prompt to generate from.
    if logs.empty:
        return ""

    input_text = " ".join(
        logs.apply(lambda row: f"{row['Date']} {row['Request']} {row['Endpoint']} {row['Status Code']}",
                   axis=1).tolist())
    input_text = input_text[:1000]  # cap the prompt at 1000 characters

    input_ids = tokenizer.encode(input_text, return_tensors='pt')

    # A 1000-character prompt can tokenise to more than max_length tokens, in
    # which case generate() has no room left to produce anything. Trim the
    # prompt so that reserve_tokens positions stay available.
    max_prompt_tokens = max(1, max_length - reserve_tokens)
    input_ids = input_ids[:, :max_prompt_tokens]
    attention_mask = torch.ones(input_ids.shape, dtype=torch.long)

    output = model.generate(input_ids, attention_mask=attention_mask, max_length=max_length,
                            num_return_sequences=1, pad_token_id=tokenizer.eos_token_id)
    return tokenizer.decode(output[0], skip_special_tokens=True)

def answer_question(query):
    """Answer ``query`` from the logs: retrieve matching rows, then generate.

    The query is used only to select rows via ``find_relevant_logs``; the
    generator is given just those rows, not the query text itself.
    """
    return generate_response(find_relevant_logs(query))

In [24]:
# Load the pretrained "gpt2" checkpoint: the tokenizer and the language-model
# head that generate_response() looks up as module-level names.
model_name = "gpt2"
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
model = GPT2LMHeadModel.from_pretrained(model_name)



In [25]:
# List of test queries (Turkish, plus one English question at the end).
# NOTE(review): several questions ask about IP addresses, User Agent, Referrer,
# Response Size or Time Taken, but the retrieval text ('Combined') is built
# only from Request, Endpoint and Status Code, and Referrer / User Agent are
# dropped during cleaning — confirm these queries are expected to be answerable.
queries = [
    "Son 24 saatte hangi URL'ler 500 hatası aldı?",
    "Son bir ayda hangi IP adresleri en fazla 403 hatası aldı?",
    "Son bir yıl içinde en sık kullanılan POST isteklerinin listesi nedir?",
    "Son 30 gün içinde hangi tarayıcılar en fazla 404 hatası aldı?",
    "Son haftada hangi endpoint'ler en yüksek Response Size'a sahipti?",
    "En son 10 istekte hangi User Agent'lar kullanıldı?",
    "En yüksek zaman alımı (Time Taken) olan 5 istek nedir?",
    "Son 6 ayda hangi Referrer en çok ziyaret edildi?",
    "Son 24 saatte hangi Endpoint'lerde 502 hatası alındı?",
    "Hangi IP adresleri en uzun süre GET isteği yaptı?",
    "Which IP adress has Longest GET time ?",
]

In [26]:
# Run every test query through the pipeline and print the question together
# with the model's reply, separated by a dashed rule.
separator = "-" * 80
for query in queries:
    # Rule first, so it is visible while the model is still generating.
    print(separator, "\n", sep="\n")

    response = answer_question(query)
    print(f"Sorgu: {query}", f"Modelin Yanıtı: {response}", "\n", sep="\n")


--------------------------------------------------------------------------------


Sorgu: Son 24 saatte hangi URL'ler 500 hatası aldı?
Modelin Yanıtı: 2018-07-20 09:02:38+03:00 PUT /usr 500 2018-10-15 05:59:40+03:00 PUT /usr 500 2019-12-23 03:40:49+03:00 PUT /usr 500 2019-01-10 09:25:46+03:00 PUT /usr 500 2018-09-07 06:15:59+03:00 PUT /usr 500 2018-08-27 19:20:46+03:00 PUT /usr 500 2018-09-07 06:15:59+03:00 PUT /usr 500 2018-09-07 06:15:59+03:00 PUT /


--------------------------------------------------------------------------------


Sorgu: Son bir ayda hangi IP adresleri en fazla 403 hatası aldı?
Modelin Yanıtı: 2019-11-16 09:04:54+03:00 PUT /usr 403 2019-01-03 03:44:33+03:00 PUT /usr 403 2019-02-25 12:01:06+03:00 PUT /usr 403 2019-07-16 10:23:53+03:00 PUT /usr 403 2018-10-30 07:34:23+03:00 PUT /usr 403 2018-10-30 07:34:23+03:00 PUT /usr 403 2018-10-30 07:34:23+03:00 PUT /usr 403 2018-10-30 07:34:23+03:00 PUT /


-----------------------------------------------------------------------