#### Koalas
Koalas implements the pandas API on top of Spark/PySpark.
Instead of running into an impedance mismatch between the similar-looking but actually different pandas and Spark APIs, you can use the pandas API/syntax in both environments (i.e. single-node pandas vs. multi-node-capable Spark). This is obviously helpful for people who are familiar with pandas and want to employ Spark for bigger data.
https://koalas.readthedocs.io/en/latest/getting_started/10min.html

Run the following on the Linux command line:<br>
`pip install --user --upgrade pandas`<br>
`pip install --user koalas`

In [1]:
# run pip install koalas on linux command line
import pandas as pd
import numpy as np
from pyspark.sql import SparkSession

In [2]:
import databricks.koalas as ks



In [3]:
# Build a Koalas Series from a plain Python list; np.nan marks a missing value.
s = ks.Series(data=[1, 3, 5, np.nan, 6, 8])

In [4]:
# Display the Series: values are shown as float64, with NaN for the missing entry.
s

0    1.0
1    3.0
2    5.0
3    NaN
4    6.0
5    8.0
Name: 0, dtype: float64

In [4]:
# Build a small Koalas DataFrame from a dict of equal-length columns,
# using an explicit integer index (10..60) instead of the default one.
kdf = ks.DataFrame(
    data={
        'a': [1, 2, 3, 4, 5, 6],
        'b': [100, 200, 300, 400, 500, 600],
        'c': ["one", "two", "three", "four", "five", "six"],
    },
    index=[10, 20, 30, 40, 50, 60],
)

In [5]:
# Display the frame: columns a, b, c with the custom index 10..60.
kdf

Unnamed: 0,a,b,c
10,1,100,one
20,2,200,two
30,3,300,three
40,4,400,four
50,5,500,five
60,6,600,six


In [6]:
# Six consecutive daily timestamps starting 2013-01-01; used as the row index
# of the pandas DataFrame built in the next cell.
dates = pd.date_range(start='20130101', periods=6, freq='D')
dates

DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03', '2013-01-04',
               '2013-01-05', '2013-01-06'],
              dtype='datetime64[ns]', freq='D')

In [7]:
# Plain pandas DataFrame of 6x4 standard-normal values, indexed by `dates`,
# with columns A-D. The values come from a locally seeded generator so that
# Restart & Run All yields the same frame every time (the previous unseeded
# np.random.randn call produced different numbers on every execution).
# Re-run the notebook to refresh the stored outputs after this change.
rng = np.random.RandomState(42)
pdf = pd.DataFrame(rng.randn(6, 4), index=dates, columns=list('ABCD'))
pdf

Unnamed: 0,A,B,C,D
2013-01-01,0.641246,0.403853,0.393165,-0.189162
2013-01-02,0.316661,-0.799478,0.385739,-0.679841
2013-01-03,-0.072453,-0.000507,-0.27289,-0.90891
2013-01-04,-2.295349,-0.662693,0.191108,-0.239709
2013-01-05,0.259339,0.324247,0.388398,1.5566
2013-01-06,0.211928,0.447103,1.217767,0.502284


In [8]:
# Convert the pandas DataFrame into a Koalas DataFrame. Note that this rebinds
# `kdf`, replacing the frame built from the dict literal in the earlier cell.
kdf = ks.from_pandas(pdf)
# Check the resulting type (output: databricks.koalas.frame.DataFrame).
type(kdf)

databricks.koalas.frame.DataFrame

In [9]:
# Obtain the SparkSession used below to build a Spark DataFrame
# (getOrCreate reuses an already-running session when there is one).
spark = SparkSession.builder.getOrCreate()

In [10]:
# Build a Spark DataFrame from the pandas frame and print it.
# NOTE: the pandas DatetimeIndex is not carried over -- the printed table has
# only columns A-D -- so the dates are lost from this point on.
sdf = spark.createDataFrame(pdf)
sdf.show()

+--------------------+--------------------+-------------------+-------------------+
|                   A|                   B|                  C|                  D|
+--------------------+--------------------+-------------------+-------------------+
|  0.6412460064358668| 0.40385266339926373| 0.3931650005305172|-0.1891615198225296|
| 0.31666098305680546| -0.7994783831598772| 0.3857389187905075|-0.6798406130017511|
|-0.07245303355092421|-5.07239010118462...| -0.272890133906403|-0.9089104922159664|
| -2.2953490300059434| -0.6626929451419697| 0.1911081902069158|-0.2397087265115115|
|  0.2593387907761005| 0.32424678637644333|0.38839768020322146| 1.5566001790927049|
| 0.21192775749369225| 0.44710323692141163| 1.2177673433835616| 0.5022843176466537|
+--------------------+--------------------+-------------------+-------------------+



In [11]:
# Convert the Spark DataFrame to a Koalas DataFrame. `kdf` is rebound once more;
# as the output shows, the result has a fresh 0..5 integer index rather than
# the original dates.
kdf = sdf.to_koalas()
kdf

Unnamed: 0,A,B,C,D
0,0.641246,0.403853,0.393165,-0.189162
1,0.316661,-0.799478,0.385739,-0.679841
2,-0.072453,-0.000507,-0.27289,-0.90891
3,-2.295349,-0.662693,0.191108,-0.239709
4,0.259339,0.324247,0.388398,1.5566
5,0.211928,0.447103,1.217767,0.502284


In [12]:
# Per-column dtypes: all four columns are float64.
kdf.dtypes

A    float64
B    float64
C    float64
D    float64
dtype: object

In [13]:
# Preview up to the first 10 rows; the frame holds only 6, so all are shown.
kdf.head(n=10)

Unnamed: 0,A,B,C,D
0,0.641246,0.403853,0.393165,-0.189162
1,0.316661,-0.799478,0.385739,-0.679841
2,-0.072453,-0.000507,-0.27289,-0.90891
3,-2.295349,-0.662693,0.191108,-0.239709
4,0.259339,0.324247,0.388398,1.5566
5,0.211928,0.447103,1.217767,0.502284


In [14]:
# Row index of the frame (here the 0..5 integer index).
kdf.index

Int64Index([0, 1, 2, 3, 4, 5], dtype='int64')

In [15]:
# Column labels of the frame.
kdf.columns

Index(['A', 'B', 'C', 'D'], dtype='object')

In [16]:
# Materialise the frame as a NumPy array (6 rows x 4 columns here).
# NOTE(review): per the Koalas docs this loads all data into the driver's
# memory, so only call it when the result is known to be small.
kdf.to_numpy()

array([[ 6.41246006e-01,  4.03852663e-01,  3.93165001e-01,
        -1.89161520e-01],
       [ 3.16660983e-01, -7.99478383e-01,  3.85738919e-01,
        -6.79840613e-01],
       [-7.24530336e-02, -5.07239010e-04, -2.72890134e-01,
        -9.08910492e-01],
       [-2.29534903e+00, -6.62692945e-01,  1.91108190e-01,
        -2.39708727e-01],
       [ 2.59338791e-01,  3.24246786e-01,  3.88397680e-01,
         1.55660018e+00],
       [ 2.11927757e-01,  4.47103237e-01,  1.21776734e+00,
         5.02284318e-01]])

In [17]:
# Summary statistics per column: count, mean, std, min, quartiles and max.
kdf.describe()

Unnamed: 0,A,B,C,D
count,6.0,6.0,6.0,6.0
mean,-0.156438,-0.047913,0.383881,0.006877
std,1.072444,0.553658,0.482491,0.900728
min,-2.295349,-0.799478,-0.27289,-0.90891
25%,-0.072453,-0.662693,0.191108,-0.679841
50%,0.211928,-0.000507,0.385739,-0.239709
75%,0.316661,0.403853,0.393165,0.502284
max,0.641246,0.447103,1.217767,1.5566


In [18]:
# Transpose: the columns A-D become rows and the row labels 0..5 become columns.
# NOTE(review): the Koalas docs describe transpose as an expensive operation
# guarded by the `compute.max_rows` option -- fine for this 6-row frame; confirm
# before using it on large data.
kdf.T

Unnamed: 0,0,1,2,3,4,5
A,0.641246,0.316661,-0.072453,-2.295349,0.259339,0.211928
B,0.403853,-0.799478,-0.000507,-0.662693,0.324247,0.447103
C,0.393165,0.385739,-0.27289,0.191108,0.388398,1.217767
D,-0.189162,-0.679841,-0.90891,-0.239709,1.5566,0.502284
