In [1]:
from optimus import Optimus
from pyspark.sql.functions import *
import pandas as pd
import numpy as np
op = Optimus()

# Create sample data

In [3]:
raw_data = {'regiment': ['Nighthawks', 'Nighthawks', 'Nighthawks', 'Nighthawks', 'Dragoons', 'Dragoons', 'Dragoons', 'Dragoons', 'Scouts', 'Scouts', 'Scouts', 'Scouts'], 
        'company': ['1st', '1st', '2nd', '2nd', '1st', '1st', '2nd', '2nd','1st', '1st', '2nd', '2nd'], 
        'name': ['Miller', 'Jacobson', 'Ali', 'Milner', 'Cooze', 'Jacon', 'Ryaner', 'Sone', 'Sloan', 'Piger', 'Riani', 'Ali'], 
        'preTestScore': [4, 24, 31, 2, 3, 4, 24, 31, 2, 3, 2, 3],
        'postTestScore': [25, 94, 57, 62, 70, 25, 94, 57, 62, 70, 62, 70]}

# Create Spark DF

In [6]:
df_pd = pd.DataFrame(raw_data)

In [9]:
regiment = op.spark.createDataFrame(df_pd)

In [10]:
regiment.table()

regiment  1 (string)  nullable,company  2 (string)  nullable,name  3 (string)  nullable,preTestScore  4 (bigint)  nullable,postTestScore  5 (bigint)  nullable
Nighthawks,1st,Miller,4,25
Nighthawks,1st,Jacobson,24,94
Nighthawks,2nd,Ali,31,57
Nighthawks,2nd,Milner,2,62
Dragoons,1st,Cooze,3,70
Dragoons,1st,Jacon,4,25
Dragoons,2nd,Ryaner,24,94
Dragoons,2nd,Sone,31,57
Scouts,1st,Sloan,2,62
Scouts,1st,Piger,3,70


# What is the mean preTestScore from the regiment Nighthawks?

In [13]:
regiment.where(regiment.regiment == "Nighthawks").groupBy("regiment").mean().table()

regiment  1 (string)  nullable,avg(preTestScore)  2 (double)  nullable,avg(postTestScore)  3 (double)  nullable
Nighthawks,15.25,59.5


# Present general statistics by company

In [37]:
# Maybe not the cleanest solution but it works :)
regiment.groupby('company').agg(count("preTestScore"),
                                count("postTestScore"), 
                               mean("preTestScore"), 
                               mean("postTestScore"),
                               stddev("preTestScore"), 
                               stddev("postTestScore")).table()

company  1 (string)  nullable,count(preTestScore)  2 (bigint),count(postTestScore)  3 (bigint),avg(preTestScore)  4 (double)  nullable,avg(postTestScore)  5 (double)  nullable,stddev_samp(preTestScore)  6 (double)  nullable,stddev_samp(postTestScore)  7 (double)  nullable
2nd,6,6,15.5,67.0,14.652644812456211,14.057026712644465
1st,6,6,6.666666666666667,57.66666666666666,8.524474568362947,27.48575388572536


In [38]:
from pyspark.sql.functions import pandas_udf, PandasUDFType

@pandas_udf("double", PandasUDFType.GROUPED_AGG)
def mean_udf(v):
    return v.mean()

# What is the mean each company's preTestScore?

In [46]:
regiment.groupby('company').mean("preTestScore").table()

company  1 (string)  nullable,avg(preTestScore)  2 (double)  nullable
2nd,15.5
1st,6.666666666666667


# Present the mean preTestScores grouped by regiment and company

In [47]:
regiment.groupby(["regiment","company"]).mean("preTestScore").table()

regiment  1 (string)  nullable,company  2 (string)  nullable,avg(preTestScore)  3 (double)  nullable
Nighthawks,1st,14.0
Dragoons,1st,3.5
Nighthawks,2nd,16.5
Dragoons,2nd,27.5
Scouts,2nd,2.5
Scouts,1st,2.5


# Present the mean preTestScores grouped by regiment and company without heirarchical indexing

In [59]:
# Not great, not terrible
regiment.groupby(["regiment","company"]).pivot("company").mean("preTestScore").drop("company").table()

regiment  1 (string)  nullable,1st  2 (double)  nullable,2nd  3 (double)  nullable
Nighthawks,14.0,
Dragoons,3.5,
Nighthawks,,16.5
Dragoons,,27.5
Scouts,,2.5
Scouts,2.5,


# Group the entire dataframe by regiment and company

In [61]:
regiment.groupby(["regiment","company"]).mean().table()

regiment  1 (string)  nullable,company  2 (string)  nullable,avg(preTestScore)  3 (double)  nullable,avg(postTestScore)  4 (double)  nullable
Nighthawks,1st,14.0,59.5
Dragoons,1st,3.5,47.5
Nighthawks,2nd,16.5,59.5
Dragoons,2nd,27.5,75.5
Scouts,2nd,2.5,66.0
Scouts,1st,2.5,66.0


# What is the number of observations in each regiment and company

In [62]:
regiment.groupby(['company', 'regiment']).count().table()

company  1 (string)  nullable,regiment  2 (string)  nullable,count  3 (bigint)
1st,Dragoons,2
1st,Scouts,2
2nd,Nighthawks,2
2nd,Dragoons,2
1st,Nighthawks,2
2nd,Scouts,2


# Iterate over a group and print the name and the whole data from the regiment

In [68]:
# Get unique values in the grouping column
groups = [x[0] for x in regiment.select("regiment").distinct().collect()]

# Create a filtered DataFrame for each group in a list comprehension
groups_list = [regiment.filter(col('regiment')==x) for x in groups]

# show the results
[x.show() for x in groups_list]

+----------+-------+--------+------------+-------------+
|  regiment|company|    name|preTestScore|postTestScore|
+----------+-------+--------+------------+-------------+
|Nighthawks|    1st|  Miller|           4|           25|
|Nighthawks|    1st|Jacobson|          24|           94|
|Nighthawks|    2nd|     Ali|          31|           57|
|Nighthawks|    2nd|  Milner|           2|           62|
+----------+-------+--------+------------+-------------+

+--------+-------+------+------------+-------------+
|regiment|company|  name|preTestScore|postTestScore|
+--------+-------+------+------------+-------------+
|Dragoons|    1st| Cooze|           3|           70|
|Dragoons|    1st| Jacon|           4|           25|
|Dragoons|    2nd|Ryaner|          24|           94|
|Dragoons|    2nd|  Sone|          31|           57|
+--------+-------+------+------------+-------------+

+--------+-------+-----+------------+-------------+
|regiment|company| name|preTestScore|postTestScore|
+--------+----

[None, None, None]