# IP Mapping

In [None]:
#####################
# Google Colab ONLY #
#####################

# Install Packages
!pip install -r https://raw.githubusercontent.com/Flenz42/HSD-DAW-Pandas-SS23/master/requirements.txt
# Download custom scripts
!wget !wget https://eu2.contabostorage.com/327e15c0c21f41889d32882e7beb3e92:hsd-daw-ss23-share/parse_functions.py
!wget !wget https://eu2.contabostorage.com/327e15c0c21f41889d32882e7beb3e92:hsd-daw-ss23-share/helpers.py
# Get Testdata
!wget https://eu2.contabostorage.com/327e15c0c21f41889d32882e7beb3e92:hsd-daw-ss23-share/data/acces_log.csv -P data
!wget https://eu2.contabostorage.com/327e15c0c21f41889d32882e7beb3e92:hsd-daw-ss23-share/data/ip_mapping.csv -P data
!wget https://eu2.contabostorage.com/327e15c0c21f41889d32882e7beb3e92:hsd-daw-ss23-share/geo-city/GeoLite2-City-Blocks-IPv4.csv -P data/geo-city
!wget https://eu2.contabostorage.com/327e15c0c21f41889d32882e7beb3e92:hsd-daw-ss23-share/geo-city/GeoLite2-City-Locations-de.csv -P data/geo-city

In [None]:
import pandas as pd
import ipcalc
import parse_functions as pf
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import re
import user_agents as ua
from helpers import filter_other, autopct_format
import calendar
import janitor
import pytz

## Daten

### 1. Access Log

In [None]:
# Read the access log from its CSV file into a DataFrame
filename = "data/acces_log.csv"
access_log = pd.read_csv(filepath_or_buffer=filename)

### 2. IP-Listen und Ortsinformationen

In [None]:
# GeoLite2 IPv4 block list (its 'network' column is processed further down)
city_ip4 = pd.read_csv('data/geo-city/GeoLite2-City-Blocks-IPv4.csv')
# GeoLite2 location table (joined later via 'geoname_id')
locations = pd.read_csv('data/geo-city/GeoLite2-City-Locations-de.csv')

# Only the last expression of a cell is rendered automatically, so a bare
# `city_ip4.head()` in the middle of the cell was silently discarded.
# display() shows both previews explicitly.
display(city_ip4.head(), locations.head())

## Daten vorbereiten

### IP-Adressen in `int` konvertieren

In [None]:
# Convert the 'ip' column via parse_functions.ip_to_int and store the
# result as the new column 'ip_n'
ip_numeric = pf.ip_to_int(access_log['ip'])
access_log['ip_n'] = ip_numeric
access_log.head()

### IP-Liste aufbereiten

- Netzwerke aus dem Format `1.0.0.0/24` berechnen
- "Rand-Adressen" merken

In [None]:
# Turn every entry of the 'network' column into an ipcalc.Network object
city_ip4['ip_range'] = city_ip4['network'].map(ipcalc.Network)

# Keep the two boundary addresses of each network as integers:
# element 0 is stored as 'ip_first', element -1 as 'ip_last'.
# NOTE(review): an IPv4 address as an unsigned integer can exceed the int32
# maximum (2**31 - 1); the int32 cast presumably mirrors the dtype of
# access_log['ip_n'] used in the later range join - confirm before widening.
network_ranges = city_ip4['ip_range']
city_ip4['ip_first'] = network_ranges.map(lambda net: int(net[0])).astype('int32')
city_ip4['ip_last'] = network_ranges.map(lambda net: int(net[-1])).astype('int32')

city_ip4.head()

## IP-Mapping

- Liste der IP-Adressen im Log auslesen
- Jede IP-Adresse mit einem `Network` joinen

In [None]:
# Caution: this takes roughly 8 minutes
# Collect the distinct numeric IPs from the log ...
ips = access_log['ip_n'].unique()
# ... and attach every network row whose range satisfies
# ip_first <= ip_n <= ip_last (pyjanitor's conditional_join).
ip_map = pd.DataFrame({'ip_n': ips}).conditional_join(
    city_ip4,
    ('ip_n', 'ip_first', '>='),
    ('ip_n', 'ip_last', '<='),
)

## Zeit sparen

In [None]:
# Persist the intermediate result as CSV (without the index column)
filename = "data/ip_mapping.csv"
ip_map.to_csv(path_or_buf=filename, index=False)

In [None]:
# Load the stored mapping from CSV
# (per the original note, this takes less than 6 minutes)
filename = "data/ip_mapping.csv"
ip_map = pd.read_csv(filepath_or_buffer=filename)
ip_map.head()

## IP-Netze auf Geo-Daten mappen

In [None]:
# Give the join key the same nullable integer dtype ('Int64') on both sides
# (presumably because 'geoname_id' can be missing for some rows - confirm)
ip_map = ip_map.astype({'geoname_id': 'Int64'})
locations = locations.astype({'geoname_id': 'Int64'})

# Left join: every row of ip_map is kept; columns of `locations` whose names
# clash with ip_map columns receive the suffix '_i'
ip_locations = pd.merge(
    ip_map,
    locations,
    how='left',
    on='geoname_id',
    suffixes=(None, '_i'),
)
ip_locations.head()

## Logs und IP-Daten verbinden

In [None]:
# Align the dtype of the join key on both sides before merging.
# int64 is used instead of int32: an IPv4 address as an unsigned integer can be
# as large as 2**32 - 1, which does not fit into int32 (max 2**31 - 1), and a
# narrowing astype may wrap around silently instead of raising.
access_log['ip_n'] = access_log['ip_n'].astype('int64')
ip_locations['ip_n'] = ip_locations['ip_n'].astype('int64')

# Left join: every log row is kept; columns of ip_locations whose names clash
# with access_log columns receive the suffix '_i'
log_with_locations = access_log.merge(ip_locations, how='left', on='ip_n', suffixes=(None, '_i'))
log_with_locations.head()

## Analyse: Länder

In [None]:
# Count the log rows per country name
country_counts = log_with_locations['country_name'].value_counts()

# Reduce the counts with helpers.filter_other and a threshold of 0.01
# (presumably small shares are collapsed into an "other" bucket - confirm in helpers.py)
country_counts_filtered = filter_other(country_counts, 0.01)

# Pie chart of the filtered counts; the slice labels come from helpers.autopct_format
country_counts_filtered.plot.pie(
    title='Countries',
    ylabel='Country',
    autopct=autopct_format(country_counts_filtered),
)