# Theory

- data lake
  - raw data
  - easy access
  - great for data science
  - no defined purpose (yet)
  - processing/conversion only done once data is needed
- data warehouse
  - processed clean/high-quality data
  - harder to make changes
- lazy evaluation for saving resources

# Timing

In [None]:
import datetime
import time


# Measure elapsed time with a monotonic, high-resolution clock.
# datetime.datetime.now() is naive local wall-clock time, so a DST switch or a
# system clock adjustment during the operation would distort the measured
# duration (it could even become negative); perf_counter() never goes backwards.
start = time.perf_counter()

# operation

# difference of two perf_counter() readings = elapsed time in (fractional) seconds
seconds_taken = time.perf_counter() - start
print(f"{seconds_taken}s")


# key-value-store
- key can be any string
- value can be any string
- in-place update not directly possible (requires: read, modify, write back)
- great for caching

### Redis

In [None]:
import redis


# connect to the redis server (database index 0)
r = redis.Redis(host="127.0.0.1", port=6379, db=0)


# store a value under a key
r.set("key", "value")

# read the stored value back, then decode it into a str
# (common python encodings: "ascii", "utf-8", "latin1")
raw_value = r.get("key")
value = raw_value.decode("utf-8")


### publish subscribe in redis

In [None]:
import redis


# setup publish
pub = redis.Redis(host="localhost", port=6379, db=0)


# setup subscribe (the subscriber uses its own client/connection)
sub_redis = redis.Redis(host="localhost", port=6379, db=0)
sub = sub_redis.pubsub()
sub.subscribe("sub_list")

# get subscribe message (confirmation of the subscription)
# get_message() is non-blocking by default and returns None if nothing has
# arrived yet, so wait up to one second. Reading the confirmation before
# publishing also ensures the subscription is active, otherwise the published
# message could be sent before anyone is listening and would be lost.
print(sub.get_message(timeout=1.0))


# publish content
pub.publish("sub_list", "message")

# get content message (again waiting up to one second for delivery)
print(sub.get_message(timeout=1.0))


# document-database
- key can be any string
- value has to be a document
- searching and other operations possible on documents
- great for semi-structured data

### MongoDB

In [None]:
import pymongo
import re


# connect to the mongodb server
m = pymongo.MongoClient(host="127.0.0.1", port=27017)

# pick the database by name (select/create)
db = m["database"]

# pick the collection inside that database (select/create)
c = db["collection"]


# store a document
document = {"key1": "value1", "key2": "value2"}
c.insert_one(document)

# filter shared by both lookups: key1 = value1
key1_filter = {"key1": "value1"}

# fetch a single document matching the filter
c.find_one(key1_filter)

# fetch all documents matching the filter (returns an iterator)
c.find(key1_filter)


# aggregations/pipelines: stages are applied in list order
# 1. keep only documents whose "key" matches the regex
match_stage = {"$match": {"key": re.compile("regex")}}
# 2. group by the value of "key" and count the documents per group
group_stage = {"$group": {"_id": "$key", "count": {"$sum": 1}}}
# 3. order the groups by their count, smallest first
sort_stage = {"$sort": {"count": pymongo.ASCENDING}}

pipeline = [match_stage, group_stage, sort_stage]

# run the pipeline on the collection (returns an iterator)
c.aggregate(pipeline)

# additional aggregations:
# { "$match": { "key": { "$gt": 100 } } }
# { "$count": "key" }
# { "$sortByCount": "$key" }
# { "$out": { "db": "database", "coll": "collection" } }


# get the distinct values stored under key1
c.distinct("key1")

# make update: on one document with key1 = value1, set key2 to value2
update_filter = {"key1": "value1"}
update_change = {"$set": {"key2": "value2"}}
c.update_one(update_filter, update_change)

# Spark

https://spark.apache.org/docs/latest/api/python/getting_started/quickstart_df.html

In [None]:
from pyspark.sql import SparkSession

from pyspark.sql.functions import split, explode


# setup spark
spark = SparkSession.builder.getOrCreate()


# read text file as dataframe (column="value", each line in a separate row)
df = spark.read.text("text_file.txt")

# read csv file as dataframe, inferring types & reading first line as column-headers (for tsv: sep="\t")
# kept in its own variable: assigning it to df would replace the text dataframe,
# and the word-splitting steps below rely on that dataframe's "value" column
csv_df = spark.read.csv("comma_seperated_values.csv", sep=",", header=True, inferSchema=True)


# split into words using java regex (every string of uninterrupted letters is considered a word)
df = df.select(split(df.value, "[^A-Za-z]+").alias("value"))

# flatten the arrays: one output row per array element
# (the split can produce empty strings, e.g. for a line starting with a non-letter; these become empty rows)
df = df.select(explode(df.value).alias("value"))

# remove empty rows
df = df.filter(df.value != "")


# print results (force evaluation of previous calls)
df.show()

# conversion to pandas
df.toPandas()
