# Importing Libraries

In [1]:
import pandas as pd

# Importing Data

In [2]:
data = pd.read_csv('data/apache/data.csv')
data.head()

Unnamed: 0,IP Address,Date,Request,Endpoint,Status Code,Response Size,Referrer,User Agent,Time Taken
0,56.81.94.161,[24/Nov/2019:03:03:30 +0300],GET,/usr/admin,502,5014,https://www.hill.com/main/posts/categoriescate...,Mozilla/5.0 (Windows NT 10.0; Win64; x64) Appl...,507
1,117.144.149.150,[09/Mar/2018:03:07:53 +0300],POST,/usr/login,500,5012,-,Mozilla/5.0 (Linux; Android 10; ONEPLUS A6000)...,151
2,107.193.107.50,[14/Jun/2019:01:16:10 +0300],DELETE,/usr/admin/developer,200,5002,https://www.hill.com/main/posts/categoriescate...,Mozilla/5.0 (Linux; Android 10; ONEPLUS A6000)...,2085
3,149.25.240.57,[02/Jan/2019:03:14:05 +0300],PUT,/usr/admin/developer,200,4905,https://www.hill.com/main/posts/categoriescate...,Mozilla/5.0 (Windows NT 10.0; Win64; x64) Appl...,4896
4,38.186.148.29,[29/Apr/2019:12:24:51 +0300],DELETE,/usr/register,502,4953,https://www.hill.com/main/posts/categoriescate...,Mozilla/5.0 (Linux; Android 10; ONEPLUS A6000)...,3236


In [3]:
data.describe()

Unnamed: 0,Status Code,Response Size,Time Taken
count,1000000.0,1000000.0,1000000.0
mean,373.747231,4999.478885,2500.162185
std,102.920869,49.98465,1443.310076
min,200.0,4763.0,1.0
25%,303.0,4966.0,1249.0
50%,403.0,4999.0,2501.0
75%,500.0,5033.0,3749.0
max,502.0,5230.0,5000.0


In [4]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000000 entries, 0 to 999999
Data columns (total 9 columns):
 #   Column         Non-Null Count    Dtype 
---  ------         --------------    ----- 
 0   IP Address     1000000 non-null  object
 1   Date           1000000 non-null  object
 2   Request        1000000 non-null  object
 3   Endpoint       1000000 non-null  object
 4   Status Code    1000000 non-null  int64 
 5   Response Size  1000000 non-null  int64 
 6   Referrer       1000000 non-null  object
 7   User Agent     1000000 non-null  object
 8   Time Taken     1000000 non-null  int64 
dtypes: int64(3), object(6)
memory usage: 68.7+ MB


# Data Preprocessing

In [5]:
# Date sütununu datetime formatına dönüştürme
data['Date'] = pd.to_datetime(data['Date'], format='[%d/%b/%Y:%H:%M:%S %z]')

# Dönüşümün başarılı olup olmadığını kontrol etmek için ilk birkaç satıra bakalım
data['Date'].head()

0   2019-11-24 03:03:30+03:00
1   2018-03-09 03:07:53+03:00
2   2019-06-14 01:16:10+03:00
3   2019-01-02 03:14:05+03:00
4   2019-04-29 12:24:51+03:00
Name: Date, dtype: datetime64[ns, UTC+03:00]

In [6]:
# Eksik verilerin kontrolü
missing_values = data.isnull().sum()
print(missing_values)

# Eksik veri oranlarını da görebilmek için
missing_percent = (missing_values / len(data)) * 100
print(missing_percent)


IP Address       0
Date             0
Request          0
Endpoint         0
Status Code      0
Response Size    0
Referrer         0
User Agent       0
Time Taken       0
dtype: int64
IP Address       0.0
Date             0.0
Request          0.0
Endpoint         0.0
Status Code      0.0
Response Size    0.0
Referrer         0.0
User Agent       0.0
Time Taken       0.0
dtype: float64


In [7]:
# Gereksiz sütunları kaldırma
data = data.drop(columns=['Referrer', 'User Agent'])

# Kaldırma işleminden sonra veri setinin ilk birkaç satırına bakalım
data.head()


Unnamed: 0,IP Address,Date,Request,Endpoint,Status Code,Response Size,Time Taken
0,56.81.94.161,2019-11-24 03:03:30+03:00,GET,/usr/admin,502,5014,507
1,117.144.149.150,2018-03-09 03:07:53+03:00,POST,/usr/login,500,5012,151
2,107.193.107.50,2019-06-14 01:16:10+03:00,DELETE,/usr/admin/developer,200,5002,2085
3,149.25.240.57,2019-01-02 03:14:05+03:00,PUT,/usr/admin/developer,200,4905,4896
4,38.186.148.29,2019-04-29 12:24:51+03:00,DELETE,/usr/register,502,4953,3236


In [8]:
# Yeni özellikler ekleme: Yıl, Ay, Gün, Saat, Dakika, vs.
data['Year'] = data['Date'].dt.year
data['Month'] = data['Date'].dt.month
data['Day'] = data['Date'].dt.day
data['Hour'] = data['Date'].dt.hour
data['Minute'] = data['Date'].dt.minute

# Yeni özellikleri kontrol edelim
data.head()

Unnamed: 0,IP Address,Date,Request,Endpoint,Status Code,Response Size,Time Taken,Year,Month,Day,Hour,Minute
0,56.81.94.161,2019-11-24 03:03:30+03:00,GET,/usr/admin,502,5014,507,2019,11,24,3,3
1,117.144.149.150,2018-03-09 03:07:53+03:00,POST,/usr/login,500,5012,151,2018,3,9,3,7
2,107.193.107.50,2019-06-14 01:16:10+03:00,DELETE,/usr/admin/developer,200,5002,2085,2019,6,14,1,16
3,149.25.240.57,2019-01-02 03:14:05+03:00,PUT,/usr/admin/developer,200,4905,4896,2019,1,2,3,14
4,38.186.148.29,2019-04-29 12:24:51+03:00,DELETE,/usr/register,502,4953,3236,2019,4,29,12,24


In [9]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF vektörizeri oluşturma ve veriyi dönüştürme
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(data['Endpoint'] + ' ' + data['Request']).toarray()


In [10]:
import faiss
import numpy as np

# FAISS indeksi oluşturma
index = faiss.IndexFlatL2(X.shape[1])  # Vektör boyutunu belirt
index.add(X.astype('float32'))  # Vektörleri indekse ekleme

In [26]:
# Kullanıcı sorgusunu vektörleştirme
query = "GET / HTTP/1.1"
query_vector = vectorizer.transform([query]).toarray().astype('float32')

In [27]:
# En yakın komşuları bulma
D, I = index.search(query_vector, 5)  # 5 en yakın komşu
relevant_logs = data.iloc[I[0]]
print(relevant_logs)


         IP Address                      Date Request Endpoint  Status Code  \
73   217.206.134.78 2018-02-11 06:07:58+03:00     GET     /usr          500   
78   126.246.88.114 2018-09-03 01:30:39+03:00     GET     /usr          403   
79     138.210.2.41 2019-11-07 11:37:03+03:00     GET     /usr          403   
128  167.168.88.166 2018-01-04 09:16:44+03:00     GET     /usr          304   
151  129.189.10.194 2019-06-07 06:58:47+03:00     GET     /usr          403   

     Response Size  Time Taken  Year  Month  Day  Hour  Minute  
73            5021        3014  2018      2   11     6       7  
78            5029        3638  2018      9    3     1      30  
79            5084        1343  2019     11    7    11      37  
128           4991        3942  2018      1    4     9      16  
151           5009        3296  2019      6    7     6      58  


In [28]:
import torch
from transformers import GPT2LMHeadModel, GPT2Tokenizer

# Model ve tokenizer yükleme
model_name = "gpt2"
model = GPT2LMHeadModel.from_pretrained(model_name)
tokenizer = GPT2Tokenizer.from_pretrained(model_name)

# Log kayıtlarını birleştirerek giriş metni oluşturma
input_text = " ".join(relevant_logs['Request'].tolist())
input_ids = tokenizer.encode(input_text, return_tensors='pt')

# attention_mask oluşturma
attention_mask = torch.ones(input_ids.shape, dtype=torch.long)

# Yanıt oluşturma
output = model.generate(input_ids, attention_mask=attention_mask, max_length=150, num_return_sequences=1, pad_token_id=tokenizer.eos_token_id)
response = tokenizer.decode(output[0], skip_special_tokens=True)
print(response)


GET GET GET GET GET GET GET GET GET GET GET GET GET GET GET GET GET GET GET GET GET GET GET GET GET GET GET GET GET GET GET GET GET GET GET GET GET GET GET GET GET GET GET GET GET GET GET GET GET GET GET GET GET GET GET GET GET GET GET GET GET GET GET GET GET GET GET GET GET GET GET GET GET GET GET GET GET GET GET GET GET GET GET GET GET GET GET GET GET GET GET GET GET GET GET GET GET GET GET GET GET GET GET GET GET GET GET GET GET GET GET GET GET GET GET GET GET GET GET GET GET GET GET GET GET GET GET GET GET GET GET GET GET GET GET GET GET GET GET GET GET GET GET GET GET GET GET GET GET GET


In [37]:
import pandas as pd

# Veriyi yükleme
data = pd.read_csv('data/apache/data.csv')

# Veri setinin ilk birkaç satırını kontrol etme
print(data.head())

# Veri setinin genel bilgilerini kontrol etme
print(data.info())

# Eksik verilerin kontrolü
missing_values = data.isnull().sum()
print(missing_values)


        IP Address                          Date Request  \
0     56.81.94.161  [24/Nov/2019:03:03:30 +0300]     GET   
1  117.144.149.150  [09/Mar/2018:03:07:53 +0300]    POST   
2   107.193.107.50  [14/Jun/2019:01:16:10 +0300]  DELETE   
3    149.25.240.57  [02/Jan/2019:03:14:05 +0300]     PUT   
4    38.186.148.29  [29/Apr/2019:12:24:51 +0300]  DELETE   

               Endpoint  Status Code  Response Size  \
0            /usr/admin          502           5014   
1            /usr/login          500           5012   
2  /usr/admin/developer          200           5002   
3  /usr/admin/developer          200           4905   
4         /usr/register          502           4953   

                                            Referrer  \
0  https://www.hill.com/main/posts/categoriescate...   
1                                                  -   
2  https://www.hill.com/main/posts/categoriescate...   
3  https://www.hill.com/main/posts/categoriescate...   
4  https://www.hill.com/main

In [38]:
import torch
from transformers import GPT2LMHeadModel, GPT2Tokenizer

# Model ve tokenizer yükleme
model_name = "gpt2"
model = GPT2LMHeadModel.from_pretrained(model_name)
tokenizer = GPT2Tokenizer.from_pretrained(model_name)

# Log kayıtlarını birleştirerek giriş metni oluşturma
input_text = " ".join(relevant_logs['Request'].tolist())
input_text = input_text[:1000]  # Giriş metnini 1000 karakterle sınırlayarak daha anlamlı hale getirme
input_ids = tokenizer.encode(input_text, return_tensors='pt')

# attention_mask oluşturma
attention_mask = torch.ones(input_ids.shape, dtype=torch.long)

# Yanıt oluşturma
output = model.generate(input_ids, attention_mask=attention_mask, max_length=150, num_return_sequences=1, pad_token_id=tokenizer.eos_token_id)
response = tokenizer.decode(output[0], skip_special_tokens=True)
print(response)


GET GET GET GET GET GET GET GET GET GET GET GET GET GET GET GET GET GET GET GET GET GET GET GET GET GET GET GET GET GET GET GET GET GET GET GET GET GET GET GET GET GET GET GET GET GET GET GET GET GET GET GET GET GET GET GET GET GET GET GET GET GET GET GET GET GET GET GET GET GET GET GET GET GET GET GET GET GET GET GET GET GET GET GET GET GET GET GET GET GET GET GET GET GET GET GET GET GET GET GET GET GET GET GET GET GET GET GET GET GET GET GET GET GET GET GET GET GET GET GET GET GET GET GET GET GET GET GET GET GET GET GET GET GET GET GET GET GET GET GET GET GET GET GET GET GET GET GET GET GET


In [39]:
# Log kayıtlarını birleştirerek giriş metni oluşturma
input_text = " ".join(relevant_logs.apply(lambda row: f"{row['Date']} {row['Request']} {row['Endpoint']} {row['Status Code']}", axis=1).tolist())
input_text = input_text[:1000]  # Giriş metnini 1000 karakterle sınırlayarak daha anlamlı hale getirme
input_ids = tokenizer.encode(input_text, return_tensors='pt')


In [42]:

def answer_question(query):
    # Kullanıcı sorgusunu vektörleştirme
    query_vector = vectorizer.transform([query]).toarray().astype('float32')

    # En yakın komşuları bulma
    D, I = index.search(query_vector, 5)
    relevant_logs = data.iloc[I[0]]

    # Log kayıtlarını birleştirerek giriş metni oluşturma
    input_text = " ".join(relevant_logs.apply(lambda row: f"{row['Date']} {row['Request']} {row['Endpoint']} {row['Status Code']}", axis=1).tolist())
    input_text = input_text[:1000]  # Giriş metnini 1000 karakterle sınırlayarak daha anlamlı hale getirme
    input_ids = tokenizer.encode(input_text, return_tensors='pt')

    # attention_mask oluşturma
    attention_mask = torch.ones(input_ids.shape, dtype=torch.long)

    # Yanıt oluşturma
    output = model.generate(input_ids, attention_mask=attention_mask, max_length=150, num_return_sequences=1, pad_token_id=tokenizer.eos_token_id)
    response = tokenizer.decode(output[0], skip_special_tokens=True)
    return response

# Test
query = "[31/Mar/2019:01:15:15 +0300] hangi işlemler yapıldı?"
response = answer_question(query)
print(response)



[14/Jun/2019:01:16:10 +0300] DELETE /usr/admin/developer 200 [14/Jul/2019:07:00:34 +0300] PUT /usr/register 500 [02/Oct/2019:05:04:21 +0300] DELETE /usr/login 304 [21/May/2018:10:35:44 +0300] GET /usr/register 304 [24/Feb/2019:06:43:27 +0300] DELETE /usr/login 502 [21/Jan/2018:10:35:44 +0300] GET /usr/register 502 [21/Jan/2018:10
