In [1]:
!pip install kaggle



In [2]:
from google.colab import files
files.upload()

Saving kaggle.json to kaggle.json


{'kaggle.json': b'{"username":"mssamiksha30","key":"5949774ac36aa01564a35c612ce659fc"}'}

In [3]:
import os
os.makedirs('/root/.kaggle', exist_ok=True)
os.rename('kaggle.json', '/root/.kaggle/kaggle.json')
os.chmod('/root/.kaggle/kaggle.json', 600)

In [4]:
!kaggle datasets download -d eliasdabbas/web-server-access-logs

Dataset URL: https://www.kaggle.com/datasets/eliasdabbas/web-server-access-logs
License(s): CC0-1.0
Downloading web-server-access-logs.zip to /content
 98% 261M/267M [00:00<00:00, 483MB/s]
100% 267M/267M [00:00<00:00, 526MB/s]


In [5]:
!unzip web-server-access-logs.zip

Archive:  web-server-access-logs.zip
  inflating: access.log              
  inflating: client_hostname.csv     


In [6]:
import re
import pandas as pd
from tqdm import tqdm

In [7]:
log_pattern = re.compile(
    r'(?P<ip>\S+) - - \[(?P<time>.*?)\] '
    r'"(?P<request>.*?)" '
    r'(?P<status>\d{3}) '
    r'(?P<size>\d+) '
    r'"(?P<referrer>.*?)" '
    r'"(?P<agent>.*?)"'
)

In [8]:
def parse_logs(file_path, max_lines=200000):
    rows = []

    with open(file_path, 'r', encoding='utf-8', errors='ignore') as file:
        for i, line in enumerate(tqdm(file)):
            if i >= max_lines:
                break

            match = log_pattern.search(line)
            if match:
                rows.append(match.groupdict())

    return pd.DataFrame(rows)

In [9]:
df = parse_logs("access.log", max_lines=200000)
df.head()

200000it [00:01, 113824.87it/s]


Unnamed: 0,ip,time,request,status,size,referrer,agent
0,54.36.149.41,22/Jan/2019:03:56:14 +0330,GET /filter/27|13%20%D9%85%DA%AF%D8%A7%D9%BE%D...,200,30577,-,Mozilla/5.0 (compatible; AhrefsBot/6.1; +http:...
1,31.56.96.51,22/Jan/2019:03:56:16 +0330,GET /image/60844/productModel/200x200 HTTP/1.1,200,5667,https://www.zanbil.ir/m/filter/b113,Mozilla/5.0 (Linux; Android 6.0; ALE-L21 Build...
2,31.56.96.51,22/Jan/2019:03:56:16 +0330,GET /image/61474/productModel/200x200 HTTP/1.1,200,5379,https://www.zanbil.ir/m/filter/b113,Mozilla/5.0 (Linux; Android 6.0; ALE-L21 Build...
3,40.77.167.129,22/Jan/2019:03:56:17 +0330,GET /image/14925/productModel/100x100 HTTP/1.1,200,1696,-,Mozilla/5.0 (compatible; bingbot/2.0; +http://...
4,91.99.72.15,22/Jan/2019:03:56:17 +0330,GET /product/31893/62100/%D8%B3%D8%B4%D9%88%D8...,200,41483,-,Mozilla/5.0 (Windows NT 6.2; Win64; x64; rv:16...


In [10]:
df['time'] = pd.to_datetime(
    df['time'],
    format='%d/%b/%Y:%H:%M:%S %z',
    errors='coerce'
)

df['hour'] = df['time'].dt.hour
df['minute'] = df['time'].dt.minute

In [11]:
df[['method', 'url', 'protocol']] = df['request'].str.extract(
    r'(\S+)\s(\S+)\s(\S+)'
)

In [12]:
df['status'] = df['status'].astype(int)
df['size'] = df['size'].astype(int)

In [13]:
df = df.dropna(subset=['ip', 'time', 'url'])
df = df.reset_index(drop=True)

In [14]:
df['is_bot'] = df['agent'].str.contains(
    'bot|crawl|spider|slurp',
    case=False,
    na=False
).astype(int)

In [15]:
cleaned_df = df[
    ['ip', 'time', 'hour', 'method', 'url', 'status', 'size', 'is_bot']
]

In [16]:
cleaned_df.head()

Unnamed: 0,ip,time,hour,method,url,status,size,is_bot
0,54.36.149.41,2019-01-22 03:56:14+03:30,3,GET,/filter/27|13%20%D9%85%DA%AF%D8%A7%D9%BE%DB%8C...,200,30577,1
1,31.56.96.51,2019-01-22 03:56:16+03:30,3,GET,/image/60844/productModel/200x200,200,5667,0
2,31.56.96.51,2019-01-22 03:56:16+03:30,3,GET,/image/61474/productModel/200x200,200,5379,0
3,40.77.167.129,2019-01-22 03:56:17+03:30,3,GET,/image/14925/productModel/100x100,200,1696,1
4,91.99.72.15,2019-01-22 03:56:17+03:30,3,GET,/product/31893/62100/%D8%B3%D8%B4%D9%88%D8%A7%...,200,41483,0


In [17]:
from google.colab import drive
drive.mount('/content/drive')
cleaned_df.to_csv('/content/drive/MyDrive/cleaned_logs.csv', index=False)

Mounted at /content/drive
