# Importing Libraries

In [89]:
import pandas as pd

# Importing Data

In [105]:
# Load the Apache access-log dataset and preview its first rows.
csv_path = 'data/apache/data.csv'
data = pd.read_csv(csv_path)
data.head()

Unnamed: 0,IP Address,Date,Request,Endpoint,Status Code,Response Size,Referrer,User Agent,Time Taken
0,56.81.94.161,[24/Nov/2019:03:03:30 +0300],GET,/usr/admin,502,5014,https://www.hill.com/main/posts/categoriescategory.html,"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.141 Safari/537.36",507
1,117.144.149.150,[09/Mar/2018:03:07:53 +0300],POST,/usr/login,500,5012,-,"Mozilla/5.0 (Linux; Android 10; ONEPLUS A6000) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.116 Mobile Safari/537.36 EdgA/45.12.4.5121",151
2,107.193.107.50,[14/Jun/2019:01:16:10 +0300],DELETE,/usr/admin/developer,200,5002,https://www.hill.com/main/posts/categoriescategory.html,"Mozilla/5.0 (Linux; Android 10; ONEPLUS A6000) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Mobile Safari/537.36 OPR/61.2.3076.56749",2085
3,149.25.240.57,[02/Jan/2019:03:14:05 +0300],PUT,/usr/admin/developer,200,4905,https://www.hill.com/main/posts/categoriescategory.html,"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4380.0 Safari/537.36 Edg/89.0.759.0",4896
4,38.186.148.29,[29/Apr/2019:12:24:51 +0300],DELETE,/usr/register,502,4953,https://www.hill.com/main/posts/categoriescategory.html,"Mozilla/5.0 (Linux; Android 10; ONEPLUS A6000) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.141 Mobile Safari/537.36",3236


In [106]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
data.describe()

Unnamed: 0,Status Code,Response Size,Time Taken
count,1000000.0,1000000.0,1000000.0
mean,373.747231,4999.478885,2500.162185
std,102.920869,49.98465,1443.310076
min,200.0,4763.0,1.0
25%,303.0,4966.0,1249.0
50%,403.0,4999.0,2501.0
75%,500.0,5033.0,3749.0
max,502.0,5230.0,5000.0


In [107]:
# Print the index range, per-column non-null counts and dtypes, and memory usage.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000000 entries, 0 to 999999
Data columns (total 9 columns):
 #   Column         Non-Null Count    Dtype 
---  ------         --------------    ----- 
 0   IP Address     1000000 non-null  object
 1   Date           1000000 non-null  object
 2   Request        1000000 non-null  object
 3   Endpoint       1000000 non-null  object
 4   Status Code    1000000 non-null  int64 
 5   Response Size  1000000 non-null  int64 
 6   Referrer       1000000 non-null  object
 7   User Agent     1000000 non-null  object
 8   Time Taken     1000000 non-null  int64 
dtypes: int64(3), object(6)
memory usage: 68.7+ MB


# Data Preprocessing

In [108]:
# Convert the Date column from raw Apache-style strings
# (e.g. "[24/Nov/2019:03:03:30 +0300]") into timezone-aware datetimes.
apache_date_format = '[%d/%b/%Y:%H:%M:%S %z]'
data['Date'] = pd.to_datetime(data['Date'], format=apache_date_format)

# Look at the first few rows to check that the conversion worked.
data['Date'].head()

0   2019-11-24 03:03:30+03:00
1   2018-03-09 03:07:53+03:00
2   2019-06-14 01:16:10+03:00
3   2019-01-02 03:14:05+03:00
4   2019-04-29 12:24:51+03:00
Name: Date, dtype: datetime64[ns, UTC+03:00]

In [109]:
# Count the missing entries in every column.
# NOTE(review): only real NaN/None values are counted; placeholder strings
# (the raw preview shows '-' in Referrer) are not treated as missing here.
missing_values = data.isna().sum()
print(missing_values)

# Express the same counts as a percentage of all rows.
row_count = len(data)
missing_percent = missing_values.div(row_count).mul(100)
print(missing_percent)


IP Address       0
Date             0
Request          0
Endpoint         0
Status Code      0
Response Size    0
Referrer         0
User Agent       0
Time Taken       0
dtype: int64
IP Address       0.0
Date             0.0
Request          0.0
Endpoint         0.0
Status Code      0.0
Response Size    0.0
Referrer         0.0
User Agent       0.0
Time Taken       0.0
dtype: float64


In [110]:
# Remove the columns that are not needed.
# errors='ignore' keeps this cell re-runnable: `data` is rebound to the reduced
# frame, so on a second execution the columns are already gone and a plain
# drop() would raise KeyError.
data = data.drop(columns=['Referrer', 'User Agent'], errors='ignore')

# Look at the first few rows of the frame after the removal.
data.head()


Unnamed: 0,IP Address,Date,Request,Endpoint,Status Code,Response Size,Time Taken
0,56.81.94.161,2019-11-24 03:03:30+03:00,GET,/usr/admin,502,5014,507
1,117.144.149.150,2018-03-09 03:07:53+03:00,POST,/usr/login,500,5012,151
2,107.193.107.50,2019-06-14 01:16:10+03:00,DELETE,/usr/admin/developer,200,5002,2085
3,149.25.240.57,2019-01-02 03:14:05+03:00,PUT,/usr/admin/developer,200,4905,4896
4,38.186.148.29,2019-04-29 12:24:51+03:00,DELETE,/usr/register,502,4953,3236


In [111]:
# Derive calendar/time features from the parsed Date column.
# Maps each new column name to the `.dt` accessor attribute it is read from;
# dict order determines the order in which the columns are appended.
date_parts = {
    'Year': 'year',
    'Month': 'month',
    'Day': 'day',
    'Hour': 'hour',
    'Minute': 'minute',
}
for column_name, dt_attribute in date_parts.items():
    data[column_name] = getattr(data['Date'].dt, dt_attribute)

# Check the newly added feature columns.
data.head()

Unnamed: 0,IP Address,Date,Request,Endpoint,Status Code,Response Size,Time Taken,Year,Month,Day,Hour,Minute
0,56.81.94.161,2019-11-24 03:03:30+03:00,GET,/usr/admin,502,5014,507,2019,11,24,3,3
1,117.144.149.150,2018-03-09 03:07:53+03:00,POST,/usr/login,500,5012,151,2018,3,9,3,7
2,107.193.107.50,2019-06-14 01:16:10+03:00,DELETE,/usr/admin/developer,200,5002,2085,2019,6,14,1,16
3,149.25.240.57,2019-01-02 03:14:05+03:00,PUT,/usr/admin/developer,200,4905,4896,2019,1,2,3,14
4,38.186.148.29,2019-04-29 12:24:51+03:00,DELETE,/usr/register,502,4953,3236,2019,4,29,12,24


In [112]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Treat every request as a short text document: "<Endpoint> <Request>".
texts = data['Endpoint'].str.cat(data['Request'], sep=' ')

# Fit a TF-IDF vectorizer on those documents and convert the resulting
# matrix straight into a dense NumPy array.
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(texts).toarray()
X

array([[0.59466222, 0.        , 0.        , ..., 0.        , 0.        ,
        0.31052168],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.27205541],
       [0.46266636, 0.57545841, 0.62961805, ..., 0.        , 0.        ,
        0.24159587],
       ...,
       [0.        , 0.64855267, 0.        , ..., 0.        , 0.        ,
        0.27228318],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.3862327 ],
       [0.46236156, 0.        , 0.62920328, ..., 0.        , 0.        ,
        0.24143671]])

# Faiss

In [117]:
import numpy as np

# Convert X to the float32 dtype first, so the checks below describe the array
# that will actually be used (the section targets Faiss, which works on float32
# matrices). np.ascontiguousarray also guarantees a C-contiguous (row-major)
# layout and, unlike np.array(..., dtype=...), does not copy when X already is
# float32 and contiguous, so re-running this cell is cheap.
X = np.ascontiguousarray(X, dtype=np.float32)

# Verify the type and dtype of X after the conversion.
print(type(X))  # expected: numpy.ndarray
print(X.dtype)  # expected: float32
assert isinstance(X, np.ndarray) and X.dtype == np.float32, "X must be a float32 ndarray"

# Inspect the first few rows of X.
print(X[:5])


<class 'numpy.ndarray'>
float32
[[0.59466225 0.         0.         0.74158823 0.         0.
  0.         0.         0.3105217 ]
 [0.         0.         0.         0.         0.71020865 0.64929926
  0.         0.         0.27205542]
 [0.46266636 0.5754584  0.62961805 0.         0.         0.
  0.         0.         0.24159586]
 [0.46224666 0.         0.6290469  0.         0.         0.
  0.5765113  0.         0.24137671]
 [0.         0.6483666  0.         0.         0.         0.
  0.         0.71100295 0.27220505]]
