# Spark and Friends

## Reading data

In [3]:
from pyspark.sql import SparkSession

# Obtain a Spark session through the builder's getOrCreate().
spark = SparkSession.builder.getOrCreate()

# Read the local sample stocks CSV with the "header" option enabled.
df_sp = (
    spark.read
    .option("header", True)
    .csv("../data/sample_stocks.csv")
)

In [2]:
from optimus import Optimus
# Create the Optimus handle; `op` is what the loading calls go through.
op = Optimus()
# Load the sample stocks CSV from its raw GitHub URL in the Koalas repository.
df_op = op.load.csv("https://raw.githubusercontent.com/databricks/koalas/master/data/sample_stocks.csv")

In [None]:
import databricks.koalas as ks

# Try to read the same remote CSV with Koalas. The author marks this call as
# raising an error.
# NOTE(review): presumably Koalas cannot read from an HTTP(S) URL here —
# confirm the actual cause.
df = ks.read_csv("https://raw.githubusercontent.com/databricks/koalas/master/data/sample_stocks.csv") # error

In [None]:
import pandas as pd

# Read the sample stocks CSV from its raw GitHub URL with pandas.
df_pd = pd.read_csv(
    filepath_or_buffer="https://raw.githubusercontent.com/databricks/koalas/master/data/sample_stocks.csv",
)

In [None]:
import databricks.koalas as ks

# Read the local copy of the sample stocks CSV with Koalas; `df_ks` is the
# frame used by the Koalas cells in the rest of the notebook.
df_ks = ks.read_csv("../data/sample_stocks.csv")

In [None]:
from optimus import Optimus
# Create an Optimus handle (rebinds `op` if an earlier cell already set it).
op = Optimus()
# Load the local copy of the sample CSV. The Optimus cells visible in this
# notebook keep using `df_op`, not `df_op_local`.
df_op_local = op.load.csv("../data/sample_stocks.csv")

In [None]:
# Print the concrete type of each loaded frame, in the order
# Spark, Optimus, pandas, Koalas.
for frame in (df_sp, df_op, df_pd, df_ks):
    print(type(frame))

In [None]:
# Show only the first row of the Spark frame.
df_sp.show(n=1)

In [None]:
# Show the first row of the Optimus-loaded frame, using the same `show(1)`
# call as on the Spark frame.
df_op.show(1)

In [None]:
# First row of the pandas frame (rendered as the cell's output).
df_pd.head(n=1)

In [None]:
# NOTE(review): the other Koalas cells in this notebook use pandas-style
# calls (`head`, `iloc`, `value_counts`); confirm that Koalas frames actually
# provide `show` — this call may raise AttributeError.
df_ks.show()

In [None]:
# First row of the Koalas frame, using the same `head(1)` call as the pandas cell.
df_ks.head(1)

In [None]:
# Display the first row through Optimus' `table` method.
# NOTE(review): presumably renders a formatted table limited to 1 row — confirm.
df_op.table(1)

## Selecting data

### With Spark

In [None]:
%%time
df_sp["Date","Open","High","Volume"].show(1)

In [None]:
%%time
df_sp.select("Date","Open","High","Volume").show(1)

### With Optimus

In [None]:
%%time
# Same bracket-style column selection as the Spark cell, run on the
# Optimus-loaded frame and displayed with `table`.
df_op["Date","Open","High","Volume"].table(1)

In [None]:
%%time
# Same `select` call as the Spark cell, run on the Optimus-loaded frame and
# displayed with `table`.
df_op.select("Date","Open","High","Volume").table(1)

In [None]:
# Select columns by position through Optimus' `cols.select`.
# NOTE(review): the Koalas `iloc` cell uses positions [0,1,2,4] while this
# cell uses [0,1,2,5] — confirm which index actually holds "Volume".
df_op.cols.select([0,1,2,5]).table(1)

### With Pandas

In [None]:
%%time
df_pd[["Date","Open","High","Volume"]].head(1)

In [None]:
%%time
df_pd.iloc[:, [0,1,2,4]].head(1)

### With Koalas

In [None]:
%%time
# Same list-of-names selection syntax as the pandas cell, on the Koalas frame.
df_ks[["Date","Open","High","Volume"]].head(1)

In [None]:
# Positional selection with `iloc`; the author marks this call as failing on Koalas.
# NOTE(review): these positions ([0,1,2,4]) differ from the Optimus positional
# cell ([0,1,2,5]) — confirm which index actually holds "Volume".
df_ks.iloc[:, [0,1,2,4]].head(1) # will fail

In [None]:
# Spark-style `select` called on the Koalas frame; the author marks it as failing.
df_ks.select("Date","Open","High","Volume") # Will fail

## More advanced stuff

In [None]:
%%time
# Pandas
df_pd["Symbol"].value_counts()

In [None]:
%%time
# Koalas: same `value_counts` call on the "Symbol" column as the pandas cell.
df_ks["Symbol"].value_counts()

In [None]:
%%time
# Spark
df_sp.groupBy('Symbol').count().show()

In [None]:
%%time
# Optimus: the identical `groupBy(...).count().show()` chain as the Spark
# cell, run on the Optimus-loaded frame.
df_op.groupBy('Symbol').count().show()

In [None]:
%%time
# Optimus: value frequencies for "Symbol" via `cols.frequency`; the result is
# indexed by column name to pull out the entry for "Symbol".
df_op.cols.frequency("Symbol")["Symbol"]

### One-Hot-Encoding

In [None]:
%%time
pd.get_dummies(data=df_pd, columns=["Symbol"]).head(1) # This is crazy easy

In [None]:
%%time
# Koalas: one-hot encode "Symbol"; the call has the same shape as the
# `pd.get_dummies` cell.
ks.get_dummies(data=df_ks, columns=["Symbol"]).head(1) # This is crazy easy too

In [None]:
%%time
# I hate this
from pyspark.ml.feature import StringIndexer,OneHotEncoderEstimator

indexer = StringIndexer(inputCol="Symbol", outputCol="SymbolIndex")
df_sp_indexed = indexer.fit(df_sp).transform(df_sp)

encoder = OneHotEncoderEstimator(inputCols=["SymbolIndex"],
                                 outputCols=["SymbolVec"])

model = encoder.fit(df_sp_indexed)
df_sp_encoded = model.transform(df_sp_indexed)
df_sp_encoded.show(1)

In [None]:
%%time
# One-hot encoding with Optimus' ML helpers, applied to the Spark frame `df_sp`.
# Rebinds `df_sp_indexed` / `df_sp_encoded`, replacing the values set by the
# plain-Spark one-hot cell.
from optimus.ml.feature import string_to_index, one_hot_encoder

df_sp_indexed = string_to_index(df_sp, input_cols="Symbol")
# NOTE(review): the column name below presumably matches what `string_to_index`
# generates for "Symbol" — confirm against the Optimus version in use.
df_sp_encoded = one_hot_encoder(df_sp_indexed, "Symbol***INDEX_TO_STRING")
df_sp_encoded.show(1)

## Plots

### Pandas

In [None]:
# Pandas scatter plot: "Open" on the x axis against "Volume" on the y axis.
df_pd.plot.scatter(x="Open", y="Volume")

In [None]:
# Pandas box plot of the "High" column.
df_pd.boxplot(column="High")

In [None]:
# Pandas histogram of the "Low" column.
df_pd.hist(column="Low")

### Koalas

In [None]:
# Koalas histogram, written with the same call as the pandas cell.
# NOTE(review): confirm that Koalas' `hist` takes the column name as its first
# positional argument the way pandas does.
df_ks.hist("Low")

### Spark

In [None]:
# ... (placeholder: no plain-Spark plotting example is given here)

### Optimus

In [None]:
# Optimus box plot of the "High" column via the frame's `plot` accessor.
df_op.plot.box("High")

In [None]:
# Optimus histogram of the "Low" column via the frame's `plot` accessor.
df_op.plot.hist("Low")

In [None]:
# Optimus scatter plot of "Open" and "Volume"; unlike the pandas cell, the two
# column names are passed together as one list.
df_op.plot.scatter(["Open","Volume"])