In [1]:
import pandas as pd

# Load the Apache access-log dataset and take a first look at it
data = pd.read_csv('data/apache/data.csv')
print(data.head())
print(data.describe())
# DataFrame.info() writes its report to stdout itself and returns None,
# so it is called directly; wrapping it in print() would emit a stray "None".
data.info()


        IP Address                          Date Request  \
0     56.81.94.161  [24/Nov/2019:03:03:30 +0300]     GET   
1  117.144.149.150  [09/Mar/2018:03:07:53 +0300]    POST   
2   107.193.107.50  [14/Jun/2019:01:16:10 +0300]  DELETE   
3    149.25.240.57  [02/Jan/2019:03:14:05 +0300]     PUT   
4    38.186.148.29  [29/Apr/2019:12:24:51 +0300]  DELETE   

               Endpoint  Status Code  Response Size  \
0            /usr/admin          502           5014   
1            /usr/login          500           5012   
2  /usr/admin/developer          200           5002   
3  /usr/admin/developer          200           4905   
4         /usr/register          502           4953   

                                            Referrer  \
0  https://www.hill.com/main/posts/categoriescate...   
1                                                  -   
2  https://www.hill.com/main/posts/categoriescate...   
3  https://www.hill.com/main/posts/categoriescate...   
4  https://www.hill.com/main

In [2]:
# Convert the Date column from the bracketed log format
# (e.g. "[24/Nov/2019:03:03:30 +0300]") to timezone-aware datetimes
data['Date'] = pd.to_datetime(data['Date'], format='[%d/%b/%Y:%H:%M:%S %z]')
print(data['Date'].head())

# Missing-value check: absolute count and percentage per column
missing_values = data.isnull().sum()
print(missing_values)
missing_percent = (missing_values / len(data)) * 100
print(missing_percent)

# Remove unnecessary columns.
# errors='ignore' skips labels that are no longer present, so re-running this
# cell does not fail on the drop with a KeyError once the columns are gone.
data = data.drop(columns=['Referrer', 'User Agent'], errors='ignore')
print(data.head())


0   2019-11-24 03:03:30+03:00
1   2018-03-09 03:07:53+03:00
2   2019-06-14 01:16:10+03:00
3   2019-01-02 03:14:05+03:00
4   2019-04-29 12:24:51+03:00
Name: Date, dtype: datetime64[ns, UTC+03:00]
IP Address       0
Date             0
Request          0
Endpoint         0
Status Code      0
Response Size    0
Referrer         0
User Agent       0
Time Taken       0
dtype: int64
IP Address       0.0
Date             0.0
Request          0.0
Endpoint         0.0
Status Code      0.0
Response Size    0.0
Referrer         0.0
User Agent       0.0
Time Taken       0.0
dtype: float64
        IP Address                      Date Request              Endpoint  \
0     56.81.94.161 2019-11-24 03:03:30+03:00     GET            /usr/admin   
1  117.144.149.150 2018-03-09 03:07:53+03:00    POST            /usr/login   
2   107.193.107.50 2019-06-14 01:16:10+03:00  DELETE  /usr/admin/developer   
3    149.25.240.57 2019-01-02 03:14:05+03:00     PUT  /usr/admin/developer   
4    38.186.148.29 2019-04-

In [3]:
# Add new features derived from the timestamp: year, month, day, hour, minute.
# Maps each new column name to the `.dt` accessor attribute it is read from.
date_parts = {
    'Year': 'year',
    'Month': 'month',
    'Day': 'day',
    'Hour': 'hour',
    'Minute': 'minute',
}
for column_name, dt_attribute in date_parts.items():
    data[column_name] = getattr(data['Date'].dt, dt_attribute)
print(data.head())


        IP Address                      Date Request              Endpoint  \
0     56.81.94.161 2019-11-24 03:03:30+03:00     GET            /usr/admin   
1  117.144.149.150 2018-03-09 03:07:53+03:00    POST            /usr/login   
2   107.193.107.50 2019-06-14 01:16:10+03:00  DELETE  /usr/admin/developer   
3    149.25.240.57 2019-01-02 03:14:05+03:00     PUT  /usr/admin/developer   
4    38.186.148.29 2019-04-29 12:24:51+03:00  DELETE         /usr/register   

   Status Code  Response Size  Time Taken  Year  Month  Day  Hour  Minute  
0          502           5014         507  2019     11   24     3       3  
1          500           5012         151  2018      3    9     3       7  
2          200           5002        2085  2019      6   14     1      16  
3          200           4905        4896  2019      1    2     3      14  
4          502           4953        3236  2019      4   29    12      24  


In [4]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Create the TF-IDF vectorizer and transform the data.
# Each row's text is "<Endpoint> <Request>"; the sparse TF-IDF result is
# converted to a dense array.
vectorizer = TfidfVectorizer()
log_text = data['Endpoint'] + ' ' + data['Request']
X = vectorizer.fit_transform(log_text).toarray()


In [5]:
import faiss
import numpy as np

# Build the FAISS index: a flat L2 index whose dimensionality equals the
# number of TF-IDF features (columns of X)
vector_dim = X.shape[1]
index = faiss.IndexFlatL2(vector_dim)

# Add the vectors to the index, cast to float32 first
index.add(X.astype('float32'))


In [6]:
# Vectorize the user query with the TF-IDF model fitted on the log text
query = "24 Kasım 2019 tarihinde hangi sayfalara erişildi?"
query_vector = vectorizer.transform([query]).toarray().astype('float32')

# The vectorizer only knows tokens seen in "<Endpoint> <Request>" strings.
# A query that shares none of them becomes an all-zero vector, and the
# neighbours returned for it cannot reflect the query's content.
if not query_vector.any():
    print("Warning: the query shares no terms with the TF-IDF vocabulary; "
          "the nearest neighbours below are not meaningful.")

# Find the nearest neighbours
D, I = index.search(query_vector, 5)  # 5 nearest neighbours

# FAISS fills result slots it cannot populate (fewer indexed vectors than k)
# with the label -1; iloc would silently map -1 to the last row, so drop them.
neighbor_ids = I[0][I[0] >= 0]
relevant_logs = data.iloc[neighbor_ids]
print(relevant_logs)


        IP Address                      Date Request              Endpoint  \
2   107.193.107.50 2019-06-14 01:16:10+03:00  DELETE  /usr/admin/developer   
7       6.43.6.180 2019-07-14 07:00:34+03:00     PUT         /usr/register   
9    114.35.185.85 2019-10-02 05:04:21+03:00  DELETE            /usr/login   
12    2.131.48.139 2018-05-21 10:35:44+03:00     GET         /usr/register   
14   6.197.102.230 2019-02-24 06:43:27+03:00  DELETE            /usr/login   

    Status Code  Response Size  Time Taken  Year  Month  Day  Hour  Minute  
2           200           5002        2085  2019      6   14     1      16  
7           500           4904        3554  2019      7   14     7       0  
9           304           4954        1611  2019     10    2     5       4  
12          304           4988        1159  2018      5   21    10      35  
14          502           4973        4449  2019      2   24     6      43  


In [11]:
import torch
from transformers import GPT2LMHeadModel, GPT2Tokenizer

# Load the tokenizer and the model from the same pretrained checkpoint
model_name = "gpt2"
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
model = GPT2LMHeadModel.from_pretrained(model_name)

# Build the input text by joining the retrieved log records.
# Only the 'Request' column (the HTTP method) is used here; the user query and
# the other log fields are not part of the prompt.
input_text = " ".join(relevant_logs['Request'])
input_ids = tokenizer.encode(input_text, return_tensors='pt')

# Build the attention mask: all ones, same shape and dtype as input_ids
attention_mask = torch.ones_like(input_ids)

# Generate the response; EOS doubles as the padding token
output = model.generate(
    input_ids,
    attention_mask=attention_mask,
    max_length=150,
    num_return_sequences=1,
    pad_token_id=tokenizer.eos_token_id,
)
response = tokenizer.decode(output[0], skip_special_tokens=True)
print(response)


DELETE PUT DELETE GET DELETE PUT DELETE PUT DELETE PUT DELETE PUT DELETE PUT DELETE PUT DELETE PUT DELETE PUT DELETE PUT DELETE PUT DELETE PUT DELETE PUT DELETE PUT DELETE PUT DELETE PUT DELETE PUT DELETE PUT DELETE PUT DELETE PUT DELETE PUT DELETE PUT DELETE PUT DELETE PUT DELETE PUT DELETE PUT DELETE PUT DELETE PUT DELETE PUT DE
