In [12]:
import re
import json
import requests
import pytz
import datetime
from time import sleep

def read_log_file(log_file_path):
    """Read a log file and return its lines as a list of strings.

    Args:
        log_file_path: Path of the text file to read.

    Returns:
        One entry per line of the file, in file order, each with leading
        and trailing whitespace (including the newline) stripped. Blank
        lines are kept as empty strings.
    """
    with open(log_file_path, 'r') as handle:
        return [raw_line.strip() for raw_line in handle]

# Part shared by both entry shapes: virtual host, client IPv4 address and the
# bracketed timestamp (only the HH:MM:SS part is captured).
_LOG_PREFIX=(
    r'^(?:(?:[\w-])+\.)+(?:[\w-])+ ([0-9]+\.[0-9]+\.[0-9]+\.[0-9]+) - - '
    r'\[\d{2}/[A-Za-z]{2,3}/\d{4}:(\d{2}:\d{2}:\d{2}) \+\d+\] '
)
# Well-formed request: method, "<target> HTTP/<version>", status, size,
# optional quoted fields and finally the quoted user agent.
_LOG_ENTRY_RE=re.compile(
    _LOG_PREFIX
    + r'"([A-Z]+) (.+ HTTP/(?:[0-9]\.)+[0-9])" ([0-9]+) ([0-9]+) '
    + r'(?:"-" )?(?:".+" )?"(.+)"'
)
# Unparseable request line followed by "-" "-". `(.+)?` captures the same text
# as the former `(.+)*` without its exponential backtracking.
_LOG_RAW_REQUEST_RE=re.compile(
    _LOG_PREFIX + r'"(.+)?" ([0-9]+) ([0-9]+) "-" "-"'
)

def log_segmentation(log):
    """Split one access-log line into its fields.

    Args:
        log: A single log line (string).

    Returns:
        A 7-tuple ``(ip, time, instruction, url, response, response_weight,
        browser)`` of strings, where ``time`` is ``HH:MM:SS`` and ``url`` is
        the request target followed by `` HTTP/<version>``.
        For entries whose request line is not ``<METHOD> <target> HTTP/x.y``
        and that end in ``"-" "-"``, ``instruction`` and ``browser`` are
        ``None`` and ``url`` holds the raw request text (``None`` if empty).

    Raises:
        ValueError: If the line matches neither supported layout.
    """
    match=_LOG_ENTRY_RE.search(log)
    if match is not None:
        return match.groups()

    match=_LOG_RAW_REQUEST_RE.search(log)
    if match is None:
        # Previously this fell through to `None.group(...)` (AttributeError).
        raise ValueError("Unrecognised log line: %r" % (log,))
    return match.group(1), match.group(2), None, match.group(3), match.group(4), match.group(5), None

def geolocalizate_ip(ip):
    """Look up country, coordinates and time zone of an IP on ip-api.com.

    Args:
        ip: IP address as a string; it is appended to the API URL.

    Returns:
        ``(country, (lat, lon), time_zone)`` where ``lat``/``lon`` are floats
        and ``country``/``time_zone`` are taken verbatim from the JSON reply.

    Raises:
        requests.RequestException: On connection problems, timeout or a
            non-success HTTP status.
        ValueError: If the service answers but reports that the lookup failed
            (for example for private or reserved addresses).
    """
    api_url="http://ip-api.com/json/"
    parametros=["country","lat","lon","timezone"]
    # The field filter must travel in the query string (a GET body is ignored).
    # status/message are requested too so a failed lookup can be reported.
    query={"fields":",".join(["status","message"]+parametros)}

    # Bounded wait so one stalled request cannot hang the whole run.
    res=requests.get(api_url+ip, params=query, timeout=10)

    # ip-api reports the remaining quota in X-Rl and the seconds until the
    # window resets in X-Ttl; wait out the window once the quota is used up so
    # the next call is not rejected.
    if res.headers.get("X-Rl")=="0":
        sleep(int(res.headers.get("X-Ttl","0")))

    res.raise_for_status()
    api_json_res=res.json()

    if api_json_res.get("status")!="success":
        raise ValueError("Geolocation failed for %s: %s" % (ip, api_json_res.get("message")))

    country=api_json_res[parametros[0]]
    coords=(float(api_json_res[parametros[1]]),float(api_json_res[parametros[2]]))
    time_zone=api_json_res[parametros[3]]

    return country, coords, time_zone

def calculate_local_time(time, time_zone, date=None):
    """Convert a Europe/Madrid wall-clock time to the given time zone.

    The conversion is done on a real datetime, so zones with :30/:45 offsets
    shift the minutes as well and the result no longer depends on the minute
    at which the function happens to be called.

    Args:
        time: Time of day as an ``"HH:MM:SS"`` string, read as Europe/Madrid
            local time.
        time_zone: Target time zone name accepted by ``pytz.timezone``.
        date: Optional ``datetime.date`` the time belongs to; it decides which
            daylight-saving offsets apply. Defaults to the current date in
            Europe/Madrid, which is what the function always used before.

    Returns:
        ``(hour, minute, second)`` as integers in the target time zone.
    """
    hour, minute, second=(int(part) for part in time.split(":")[:3])

    local=pytz.timezone("Europe/Madrid")
    origin=pytz.timezone(time_zone)

    if date is None:
        date=datetime.datetime.now(local).date()

    # pytz zones must be attached with localize(); passing them as tzinfo
    # would pick the zone's historical LMT offset.
    naive=datetime.datetime.combine(date, datetime.time(hour, minute, second))
    time_origin=local.localize(naive).astimezone(origin)

    return (time_origin.hour, time_origin.minute, time_origin.second)

def extract_features(log):
    """Extract the features of a single log line.

    Parses the line with ``log_segmentation``, geolocates its IP with
    ``geolocalizate_ip`` and converts the logged time with
    ``calculate_local_time``.

    Returns:
        ``(ip, country, coords, log_time, instruction, url, response,
        response_weight, browser)``.
    """
    segments=log_segmentation(log)
    ip, logged_time=segments[0], segments[1]
    country, coords, time_zone=geolocalizate_ip(ip)
    local_time=calculate_local_time(logged_time, time_zone)
    # segments[2:] is (instruction, url, response, response_weight, browser)
    return (ip, country, coords, local_time)+tuple(segments[2:])

def list_of_ips(logs):
    """Return the distinct IP addresses found in ``logs``.

    Each line is parsed with ``log_segmentation``; an IP equal to ``"0"`` is
    skipped. Duplicates are removed through a set, so the order of the
    returned list is unrelated to the order of the log lines.
    """
    unique_ips=set()
    for entry in logs:
        ip=log_segmentation(entry)[0]
        if ip=="0":
            continue
        unique_ips.add(ip)
    return list(unique_ips)

# Forward slashes work on Windows and POSIX alike; the former ".\Logs\s..."
# literal relied on invalid escape sequences (\L, \s) being left untouched.
logs=read_log_file("./Logs/sitges_access.20240125.log")

In [26]:
# Distinct client IPs present in the loaded log, and how many there are.
ip_list = list_of_ips(logs)
print(len(ip_list))

4154


In [14]:
def log_to_dict(logs, ip_list):
    """Group parsed log entries by IP address.

    Every line is parsed with ``log_segmentation``. The first time an IP is
    seen it is geolocated with ``geolocalizate_ip`` and the result is reused
    for all later lines of that IP.

    Args:
        logs: Iterable of raw log lines.
        ip_list: Not used by this function.

    Returns:
        Dict mapping each IP to the list of its entries, in log order. Each
        entry is a dict with the keys ``country``, ``coords``, ``log_time``,
        ``instruction``, ``url``, ``response``, ``response_weight`` and
        ``browser``.
    """
    geo_by_ip={}       # ip -> (country, coords, time_zone)
    entries_by_ip={}
    for raw_line in logs:
        ip, time, instruction, url, response, response_weight, browser=log_segmentation(raw_line)
        if ip not in geo_by_ip:
            # One lookup per distinct IP.
            geo_by_ip[ip]=geolocalizate_ip(ip)
            entries_by_ip[ip]=[]
        country, coords, time_zone=geo_by_ip[ip]
        entry={
            "country":country,
            "coords":coords,
            "log_time":calculate_local_time(time, time_zone),
            "instruction":instruction,
            "url":url,
            "response":response,
            "response_weight":response_weight,
            "browser":browser,
        }
        entries_by_ip[ip].append(entry)
    return entries_by_ip


In [None]:
# Build the per-IP feature dictionary; this geolocates every distinct IP.
log_dict = log_to_dict(logs, ip_list)

In [25]:
# Persist the per-IP features as JSON.
with open("features_sitges_access.20240125.json", "w") as features_file:
    json.dump(log_dict, features_file)