In [21]:
from pyspark.sql import SparkSession, Row, Column
from datetime import datetime, date
import pandas as pd
from pyspark.sql.functions import upper

In [3]:
spark = SparkSession.builder.getOrCreate()

In [5]:
df = spark.createDataFrame([
    (1, 2., 'string1', date(2000, 1, 1), datetime(2000, 1, 1, 12, 0)),
    (2, 3., 'string2', date(2000, 2, 1), datetime(2000, 1, 2, 12, 0)),
    (3, 4., 'string3', date(2000, 3, 1), datetime(2000, 1, 3, 12, 0))
], schema='a long, b double, c string, d date, e timestamp')
df

DataFrame[a: bigint, b: double, c: string, d: date, e: timestamp]

In [None]:
df.show(2, vertical=True)

In [14]:
col = df.columns
print(col, "and it's type is", type(col))

['a', 'b', 'c', 'd', 'e'] and it's type is <class 'list'>


In [18]:
# DataFrame.collect() collects the distributed data to the driver side as the local data in Python. 
# Note that this can throw an out-of-memory error when the dataset is too large to fit in 
# the driver side because it collects all the data from executors to the driver side.

df.collect()

# In order to avoid throwing an out-of-memory exception, use DataFrame.take() or DataFrame.tail().

df.tail(1)
df.take(1)

[Row(a=1, b=2.0, c='string1', d=datetime.date(2000, 1, 1), e=datetime.datetime(2000, 1, 1, 12, 0))]

In [15]:
df.describe().show()

+-------+---+---+-------+
|summary|  a|  b|      c|
+-------+---+---+-------+
|  count|  3|  3|      3|
|   mean|2.0|3.0|   null|
| stddev|1.0|1.0|   null|
|    min|  1|2.0|string1|
|    max|  3|4.0|string3|
+-------+---+---+-------+



In [19]:
# PySpark DataFrame also provides the conversion back to a pandas DataFrame to leverage pandas APIs. 
# Note that toPandas also collects all data into the driver side that can easily cause an out-of-memory-error 
# when the data is too large to fit into the driver side.

df.toPandas()

Unnamed: 0,a,b,c,d,e
0,1,2.0,string1,2000-01-01,2000-01-01 12:00:00
1,2,3.0,string2,2000-02-01,2000-01-02 12:00:00
2,3,4.0,string3,2000-03-01,2000-01-03 12:00:00


In [23]:
type(df.c) == type(upper(df.c)) == type(df.c.isNull())

True

In [None]:
## Create new addition column
df.withColumn('upper', upper(df.c)).show()

In [26]:
# Filter the data
df.filter(df.a == 1).show()

+---+---+-------+----------+-------------------+
|  a|  b|      c|         d|                  e|
+---+---+-------+----------+-------------------+
|  1|2.0|string1|2000-01-01|2000-01-01 12:00:00|
+---+---+-------+----------+-------------------+



In [None]:
# PySpark supports various UDFs and APIs to allow users to execute Python native functions. See also the latest Pandas UDFs 
# and Pandas Function APIs. For instance, the example below allows users to directly use the APIs in a pandas Series within 
# Python native function.

from pyspark.sql.functions import pandas_udf

@pandas_udf('long')
def pandas_udf_one(series: pd.Series) -> pd.Series:
    return series + 1
# def pandas_udf_one(series):
#     return series + 1

df.select(pandas_udf_one(df.a)).show()

In [35]:
new_dataFrame = spark.createDataFrame([
    ['red', 'banana', 1, 10], ['blue', 'banana', 2, 20], ['red', 'carrot', 3, 30],
    ['blue', 'grape', 4, 40], ['red', 'carrot', 5, 50], ['black', 'carrot', 6, 60],
    ['red', 'banana', 7, 70], ['red', 'grape', 8, 80]], schema=['color', 'fruit', 'v1', 'v2'])
df.collect()

[Row(a=1, b=2.0, c='string1', d=datetime.date(2000, 1, 1), e=datetime.datetime(2000, 1, 1, 12, 0)),
 Row(a=2, b=3.0, c='string2', d=datetime.date(2000, 2, 1), e=datetime.datetime(2000, 1, 2, 12, 0)),
 Row(a=3, b=4.0, c='string3', d=datetime.date(2000, 3, 1), e=datetime.datetime(2000, 1, 3, 12, 0))]

In [36]:
## Group dataframe by color
new_dataFrame.groupBy('color').avg().show()

+-----+-------+-------+
|color|avg(v1)|avg(v2)|
+-----+-------+-------+
|  red|    4.8|   48.0|
|black|    6.0|   60.0|
| blue|    3.0|   30.0|
+-----+-------+-------+



In [40]:
data = spark.read.csv("resources-names/yob1880.txt")

In [43]:
data.describe().show()

+-------+-----+----+------------------+
|summary|  _c0| _c1|               _c2|
+-------+-----+----+------------------+
|  count| 2000|2000|              2000|
|   mean|  NaN|null|           100.742|
| stddev| null|null|466.10919893160707|
|    min|Aaron|   F|                10|
|    max| Zula|   M|               995|
+-------+-----+----+------------------+

