In [1]:
from pyspark.sql import SparkSession

import os
from pyspark.sql.functions import udf

# Obtain the Spark session for this notebook under the application name
# "ipanon"; getOrCreate() hands back an already-running session if there is one.
spark = (
    SparkSession.builder
    .appName("ipanon")
    .getOrCreate()
)

# Handle on the underlying SparkContext; the oniony helpers below take it as
# their first argument.
sc = spark.sparkContext

## `hostname` anonymization

In [2]:
from faker import Faker
from pyspark.sql import Row

# Row factory with a single `content` field: each generated record is one value.
R = Row('content')

faker = Faker()
# Seed the shared Faker random generator so that "Restart Kernel -> Run All"
# produces the same synthetic sample on every run. Without a seed each
# execution yields different data and the notebook cannot be reproduced.
# The same `faker` instance is reused further down for the IPv4 sample.
Faker.seed(0)

# 100 synthetic hostnames, one per row, used as input to the anonymizer.
hostnames = [R(faker.hostname()) for _ in range(100)]
df = spark.createDataFrame(hostnames)

In [3]:
# Preview the generated hostnames: first 20 rows, long values truncated
# (these are the defaults of DataFrame.show, spelled out explicitly).
df.show(n=20, truncate=True)

+--------------------+
|             content|
+--------------------+
|laptop-03.stephen...|
|  srv-66.flowers.com|
|   srv-84.evans.info|
|srv-55.cannon-mit...|
|     lt-22.colon.com|
|desktop-83.king-h...|
| srv-43.mendoza.info|
|   web-80.brooks.com|
|db-05.jacobs-hans...|
|    web-06.tyler.net|
| lt-60.rodriguez.com|
|    web-33.lucas.biz|
|   email-43.king.biz|
|desktop-38.christ...|
|desktop-00.wilkin...|
| email-41.parker.com|
|web-24.hernandez.org|
| laptop-35.munoz.com|
| web-11.sherman.info|
|web-22.paul-clark...|
+--------------------+
only showing top 20 rows



In [4]:
from oniony.hostname import Hostname

# Build the hostname anonymizer over the single-column `df` created above.
# NOTE(review): the meaning of the positional values 'foo', 'bar' and 10 is
# defined by oniony.hostname.Hostname and is not visible here — confirm
# against the library before changing them.
hostname = Hostname(
    sc,
    df,
    'foo',
    'bar',
    10,
)

In [5]:
# Preview the anonymized result, which is displayed with `domain` and `host`
# columns (first 20 rows, default truncation — presumably `anonymized` is a
# Spark DataFrame; confirm).
# NOTE(review): the cells rendered below look like JVM byte-array identity
# strings (`[B@<hash>`) rather than readable values — presumably raw bytes are
# reaching the display unencoded; confirm in oniony and consider hex/base64
# encoding these columns.
hostname.anonymized.show(n=20, truncate=True)

+-----------+-----------+
|     domain|       host|
+-----------+-----------+
|[B@28bbf702| [B@5bc5a9c|
|[B@12d8e36e| [B@a3064d7|
|[B@7d448174|[B@1acb5cfa|
|[B@4f5d1f5e|[B@46c2ccbe|
|[B@1fd4aa33|[B@5f3809bf|
|[B@78b85ae9|[B@284fde7c|
|[B@12707c74| [B@d1afa74|
|[B@374f862c|[B@61250287|
|[B@7027510b|[B@2a34cc21|
|[B@610141ad|[B@28f95630|
|[B@621cf52d|[B@6e45688b|
| [B@5a240c7|[B@6708ce95|
|[B@29f676e1|[B@17e07c9d|
|[B@2486c60b|[B@2aac3b3c|
|[B@307701f6|[B@246787f1|
|[B@3643bbe8|[B@7d7d13c7|
|[B@750dd52f|[B@6dcd8c70|
|[B@467af6da|[B@530c26f9|
|[B@48b3251a|[B@499c8e70|
|[B@58417dbf|[B@6a48b3a0|
+-----------+-----------+
only showing top 20 rows



## IPv4 anonymization

In [6]:
# Same single-field row factory as in the hostname section; redefined so this
# cell does not depend on that earlier assignment.
R = Row('content')

# 100 synthetic IPv4 addresses wrapped into rows. Note that `df` is rebound
# here and from now on refers to the IP sample, not the hostname sample.
ips = list(map(R, (faker.ipv4() for _ in range(100))))
df = spark.createDataFrame(ips)

In [7]:
# Preview the generated IPv4 addresses: first 20 rows, default truncation
# (the defaults of DataFrame.show, spelled out explicitly).
df.show(n=20, truncate=True)

+--------------+
|       content|
+--------------+
|  198.26.98.89|
|   1.8.124.108|
|192.66.135.222|
|   203.0.112.3|
|21.148.156.156|
|    192.37.7.5|
|192.31.197.224|
|  192.52.92.20|
| 115.15.83.241|
|198.51.101.127|
|  198.51.123.8|
|  26.10.70.193|
| 192.51.238.98|
|192.52.198.157|
|198.33.128.134|
| 203.0.123.235|
| 192.88.98.229|
| 198.51.126.69|
|169.204.225.98|
| 203.0.126.229|
+--------------+
only showing top 20 rows



In [8]:
from oniony.hosts import Hosts
import os

# Two independent 32-byte random secrets from os.urandom. They are regenerated
# on every execution of this cell, so the anonymized addresses differ between
# runs unless the keys are stored somewhere.
aes_key, padding_key = os.urandom(32), os.urandom(32)

# Build the IP anonymizer over the IPv4 `df` created above.
# NOTE(review): the meaning of 'foo', 'bar' and 10 is defined by
# oniony.hosts.Hosts and is not visible here — confirm against the library.
hosts = Hosts(
    sc,
    df,
    'foo',
    'bar',
    aes_key,
    padding_key,
    10,
)

In [9]:
# Preview the anonymized addresses (first 20 rows, default truncation —
# presumably `anonymized` is a Spark DataFrame; confirm).
# In the sample output below, inputs that share a leading prefix (e.g. the
# 198.51.x.x rows) also share a leading prefix after anonymization, which
# suggests the scheme is prefix-preserving — verify against oniony.
hosts.anonymized.show(n=20, truncate=True)

+---------------+
|        content|
+---------------+
|217.139.109.165|
| 34.136.155.147|
| 223.196.119.80|
|  208.135.238.0|
|  58.111.90.220|
|  223.165.71.34|
| 223.145.202.28|
|  223.179.36.20|
|  124.142.175.6|
|  217.189.5.126|
|  217.189.28.43|
| 52.107.198.193|
|223.181.254.100|
|223.179.198.157|
|  217.161.191.1|
|208.135.228.106|
|223.215.146.229|
| 217.189.25.162|
|  150.44.30.100|
|208.135.225.101|
+---------------+
only showing top 20 rows

