In [2]:
import pyspark

# Reuse the active SparkSession if there is one, otherwise start a new one.
spark = (
    pyspark.sql.SparkSession
    .builder
    .getOrCreate()
)

# Bare expression so the notebook renders the session object.
spark

In [6]:
import pandas as pd
import numpy as np

# Draw the random `group` labels from a dedicated, seeded generator so the
# frame (and every aggregate computed from it) is identical on each
# Restart & Run All. Outputs recorded before seeding need a re-run to match.
rng = np.random.RandomState(42)

# 100 rows: `n` counts 0..99, `group` is one of 'a', 'b', 'c' per row.
pandas_dataframe = pd.DataFrame(dict(
    n=np.arange(100),
    group=rng.choice(list('abc'), 100),
))

In [14]:
# Hand the local pandas frame to Spark; no explicit schema is passed.
df = spark.createDataFrame(pandas_dataframe)

# Preview the first 20 rows (show's default row count, spelled out).
df.show(n=20)

+---+-----+
|  n|group|
+---+-----+
|  0|    a|
|  1|    a|
|  2|    b|
|  3|    c|
|  4|    a|
|  5|    c|
|  6|    a|
|  7|    c|
|  8|    b|
|  9|    c|
| 10|    b|
| 11|    c|
| 12|    a|
| 13|    c|
| 14|    c|
| 15|    a|
| 16|    c|
| 17|    b|
| 18|    a|
| 19|    b|
+---+-----+
only showing top 20 rows



In [11]:
# Register `df` under the name 'numbers' so spark.sql() queries can select from it.
df.createOrReplaceTempView(name='numbers')

In [26]:
from pyspark.sql.functions import expr, avg

# Per-group mean of `n`, with the aggregate written as a SQL expression string.
(
    df.groupBy('group')
    .agg(expr('avg(n)'))
    .show()
)

+-----+------------------+
|group|            avg(n)|
+-----+------------------+
|    c|46.285714285714285|
|    b| 50.72727272727273|
|    a|             51.75|
+-----+------------------+



In [28]:
# Same per-group mean, this time built from Column objects and the `avg` function.
(
    df.groupBy(df.group)
    .agg(avg(df.n))
    .show()
)

+-----+------------------+
|group|            avg(n)|
+-----+------------------+
|    c|46.285714285714285|
|    b| 50.72727272727273|
|    a|             51.75|
+-----+------------------+



In [29]:
# Per-group mean of `n`, computed with Spark SQL against the `numbers` temp view.
group_means = spark.sql('''
SELECT group, avg(n) as mean
FROM numbers
GROUP BY group
''')

# Print the Spark result, then collect it to the driver as a pandas DataFrame.
# `.show()` only prints and returns None, so it must not be the value that is
# assigned; `.toPandas()` is what yields the pandas frame the name promises.
group_means.show()
another_pandas_dataframe = group_means.toPandas()

+-----+------------------+
|group|              mean|
+-----+------------------+
|    c|46.285714285714285|
|    b| 50.72727272727273|
|    a|             51.75|
+-----+------------------+



In [33]:
# Keep `n` and add `incremented` = n + 1, expressed as a SQL string.
df2 = df.select(
    'n',
    expr('n + 1 as incremented'),
)

In [36]:
# Print the physical plan for: halve `incremented` into `x`, then sum `x`.
(
    df2.select(expr('incremented / 2 as x'))
    .agg(expr('sum(x)'))
    .explain()
)

== Physical Plan ==
*(2) HashAggregate(keys=[], functions=[sum(x#201)])
+- Exchange SinglePartition
   +- *(1) HashAggregate(keys=[], functions=[partial_sum(x#201)])
      +- *(1) Project [(cast((n#26L + 1) as double) / 2.0) AS x#201]
         +- Scan ExistingRDD[n#26L,group#27]


In [32]:
# Column-API version of the increment: arithmetic on `df.n`, named via alias().
(
    df.select(
        df.n,
        (df.n + 1).alias('incremented'),
    )
    .show()
)

+---+-----------+
|  n|incremented|
+---+-----------+
|  0|          1|
|  1|          2|
|  2|          3|
|  3|          4|
|  4|          5|
|  5|          6|
|  6|          7|
|  7|          8|
|  8|          9|
|  9|         10|
| 10|         11|
| 11|         12|
| 12|         13|
| 13|         14|
| 14|         15|
| 15|         16|
| 16|         17|
| 17|         18|
| 18|         19|
| 19|         20|
+---+-----------+
only showing top 20 rows



In [25]:
# Bare expression: the notebook renders whatever `another_pandas_dataframe`
# is currently bound to.
# NOTE(review): this cell's execution count (25) is lower than that of the cell
# that assigns the name (29) - confirm the output below reproduces on a fresh
# Restart & Run All.
another_pandas_dataframe

Unnamed: 0,group,mean
0,c,46.285714
1,b,50.727273
2,a,51.75


In [41]:
# Preview the first 20 rows of `df` again (20 is show's default row count).
df.show(n=20)

+---+-----+
|  n|group|
+---+-----+
|  0|    a|
|  1|    a|
|  2|    b|
|  3|    c|
|  4|    a|
|  5|    c|
|  6|    a|
|  7|    c|
|  8|    b|
|  9|    c|
| 10|    b|
| 11|    c|
| 12|    a|
| 13|    c|
| 14|    c|
| 15|    a|
| 16|    c|
| 17|    b|
| 18|    a|
| 19|    b|
+---+-----+
only showing top 20 rows



In [55]:
from pyspark.sql.functions import col, column, expr

In [66]:
# A Column expression is built independently of any DataFrame; it only
# refers to a column called 'n' by name.
my_column = (col('n') + 16) * 3

# Apply the expression to `df`, naming the result 'quantity'.
(
    df.select(my_column.alias('quantity'))
    .show()
)

+--------+
|quantity|
+--------+
|      48|
|      51|
|      54|
|      57|
|      60|
|      63|
|      66|
|      69|
|      72|
|      75|
|      78|
|      81|
|      84|
|      87|
|      90|
|      93|
|      96|
|      99|
|     102|
|     105|
+--------+
only showing top 20 rows



In [79]:
# The second select refers to the 'quantity' alias created by the first one.
# Its result is left unaliased, so the column header is the expression text.
(
    df.select(((df.n + 16) * 3).alias('quantity'))
    .select(col('quantity') + 1)
    .show()
)

+--------------+
|(quantity + 1)|
+--------------+
|            49|
|            52|
|            55|
|            58|
|            61|
|            64|
|            67|
|            70|
|            73|
|            76|
|            79|
|            82|
|            85|
|            88|
|            91|
|            94|
|            97|
|           100|
|           103|
|           106|
+--------------+
only showing top 20 rows



In [81]:
# The same 'quantity' computation written as a single SQL expression string.
(
    df.select(expr('(n + 16) * 3 AS quantity'))
    .show()
)

+--------+
|quantity|
+--------+
|      48|
|      51|
|      54|
|      57|
|      60|
|      63|
|      66|
|      69|
|      72|
|      75|
|      78|
|      81|
|      84|
|      87|
|      90|
|      93|
|      96|
|      99|
|     102|
|     105|
+--------+
only showing top 20 rows



In [83]:
# Cast `n` to string. Nothing is shown or collected here; the bare expression
# just renders the resulting DataFrame's schema summary.
(
    df.select(
        df.n.cast('string')
    )
)

DataFrame[n: string]

In [86]:
from pyspark.sql.functions import sum

In [89]:
# `sum` here is pyspark.sql.functions.sum (imported in an earlier cell), not
# the Python builtin. Column-API and SQL-string aggregates are mixed freely
# within one select.
(
    df.select(
        sum(df.n).alias('the_sum'),
        expr('avg(n) as mean'),
        expr('min(n) as min'),
    )
    .show()
)

+-------+----+---+
|the_sum|mean|min|
+-------+----+---+
|   4950|49.5|  0|
+-------+----+---+



In [85]:
# Total of `n` via a SQL expression string; with no alias the column header
# is the expression text itself.
(
    df.select(expr('sum(n)'))
    .show()
)

+------+
|sum(n)|
+------+
|  4950|
+------+

