In [12]:
import pyspark
from pyspark import SparkContext
from pyspark.sql import Row, SQLContext, SparkSession
import os

In [2]:
sc = SparkContext()

Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
22/05/08 18:26:56 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [3]:
sqlContext = SQLContext(sc)



### 1. Create DataFrame

#### 1.1. Create DataFrame from RDD

In [4]:
lst_1 = [('Jisoo', 27), ('Jennie', 26), ('Rose', 24), ('Lisa', 24)]
RDD_1 = sc.parallelize(lst_1)
pair_RDD_1 = RDD_1.map(lambda x: Row(name = x[0], marks = x[1]))

In [6]:
pair_RDD_1.collect()

[Row(name='Jisoo', marks=27),
 Row(name='Jennie', marks=26),
 Row(name='Rose', marks=24),
 Row(name='Lisa', marks=24)]

In [7]:
schema_1 = sqlContext.createDataFrame(pair_RDD_1)
schema_1

DataFrame[name: string, marks: bigint]

In [8]:
type(schema_1)

pyspark.sql.dataframe.DataFrame

In [9]:
schema_1.show()

+------+-----+
|  name|marks|
+------+-----+
| Jisoo|   27|
|Jennie|   26|
|  Rose|   24|
|  Lisa|   24|
+------+-----+



#### 1.2. Create DataFrame from csv file

In [13]:
spark = SparkSession(sc)

In [17]:
csv_file_path = os.path.join('..', 'src-data', 'walmart_stock.csv')
schema_2 = spark.read.csv(csv_file_path)
schema_2.show(5)

+----------+------------------+---------+---------+------------------+--------+------------------+
|       _c0|               _c1|      _c2|      _c3|               _c4|     _c5|               _c6|
+----------+------------------+---------+---------+------------------+--------+------------------+
|      Date|              Open|     High|      Low|             Close|  Volume|         Adj Close|
|2012-01-03|         59.970001|61.060001|59.869999|         60.330002|12668800|52.619234999999996|
|2012-01-04|60.209998999999996|60.349998|59.470001|59.709998999999996| 9593300|         52.078475|
|2012-01-05|         59.349998|59.619999|58.369999|         59.419998|12768200|         51.825539|
|2012-01-06|         59.419998|59.450001|58.869999|              59.0| 8069400|          51.45922|
+----------+------------------+---------+---------+------------------+--------+------------------+
only showing top 5 rows



Read with option

In [19]:
schema_3 = spark.read.option('header', True).csv(csv_file_path)
schema_3.show(5)

+----------+------------------+---------+---------+------------------+--------+------------------+
|      Date|              Open|     High|      Low|             Close|  Volume|         Adj Close|
+----------+------------------+---------+---------+------------------+--------+------------------+
|2012-01-03|         59.970001|61.060001|59.869999|         60.330002|12668800|52.619234999999996|
|2012-01-04|60.209998999999996|60.349998|59.470001|59.709998999999996| 9593300|         52.078475|
|2012-01-05|         59.349998|59.619999|58.369999|         59.419998|12768200|         51.825539|
|2012-01-06|         59.419998|59.450001|58.869999|              59.0| 8069400|          51.45922|
|2012-01-09|         59.029999|59.549999|58.919998|             59.18| 6679300|51.616215000000004|
+----------+------------------+---------+---------+------------------+--------+------------------+
only showing top 5 rows



You can use SparkSession --> to `createDataFrame` method

#### 1.3. create DataFrame from json file

In [20]:
json_file_path = os.path.join('..', 'src-data', 'iris.json')
schema_4 = spark.read.json(json_file_path)
schema_4.show(5)

+---------------+-----------+----------+-----------+----------+-------+
|_corrupt_record|petalLength|petalWidth|sepalLength|sepalWidth|species|
+---------------+-----------+----------+-----------+----------+-------+
|              [|       null|      null|       null|      null|   null|
|           null|        1.4|       0.2|        5.1|       3.5| setosa|
|           null|        1.4|       0.2|        4.9|       3.0| setosa|
|           null|        1.3|       0.2|        4.7|       3.2| setosa|
|           null|        1.5|       0.2|        4.6|       3.1| setosa|
+---------------+-----------+----------+-----------+----------+-------+
only showing top 5 rows



### 2. DataFrame Manipulation

In [29]:
csv_file_path = os.path.join('..', 'src-data', 'walmart_stock.csv')
schema_5 = spark.read.option('header', True).csv(csv_file_path)

#### 2.1. Fundamental Manipulation

`printSchema()` method.

In [30]:
schema_5.printSchema()

root
 |-- Date: string (nullable = true)
 |-- Open: string (nullable = true)
 |-- High: string (nullable = true)
 |-- Low: string (nullable = true)
 |-- Close: string (nullable = true)
 |-- Volume: string (nullable = true)
 |-- Adj Close: string (nullable = true)



`show(n)` method: display n rows.

In [31]:
schema_5.show(5)

+----------+------------------+---------+---------+------------------+--------+------------------+
|      Date|              Open|     High|      Low|             Close|  Volume|         Adj Close|
+----------+------------------+---------+---------+------------------+--------+------------------+
|2012-01-03|         59.970001|61.060001|59.869999|         60.330002|12668800|52.619234999999996|
|2012-01-04|60.209998999999996|60.349998|59.470001|59.709998999999996| 9593300|         52.078475|
|2012-01-05|         59.349998|59.619999|58.369999|         59.419998|12768200|         51.825539|
|2012-01-06|         59.419998|59.450001|58.869999|              59.0| 8069400|          51.45922|
|2012-01-09|         59.029999|59.549999|58.919998|             59.18| 6679300|51.616215000000004|
+----------+------------------+---------+---------+------------------+--------+------------------+
only showing top 5 rows



`count()` method: Return number of DataFrame.

In [32]:
schema_5.count()

1258

`describe()` method: show statistic data.

Show all columns in DataFrame.

In [33]:
schema_5.describe().show()

+-------+----------+------------------+-----------------+-----------------+-----------------+-----------------+-----------------+
|summary|      Date|              Open|             High|              Low|            Close|           Volume|        Adj Close|
+-------+----------+------------------+-----------------+-----------------+-----------------+-----------------+-----------------+
|  count|      1258|              1258|             1258|             1258|             1258|             1258|             1258|
|   mean|      null| 72.35785375357709|72.83938807631165| 71.9186009594594|72.38844998012726|8222093.481717011|67.23883848728146|
| stddev|      null|  6.76809024470826|6.768186808159218|6.744075756255496|6.756859163732991|  4519780.8431556|6.722609449996857|
|    min|2012-01-03|56.389998999999996|        57.060001|        56.299999|        56.419998|         10010500|        50.363689|
|    max|2016-12-30|         90.800003|        90.970001|            89.25|        90.4700

Just get statistics with specific comlumns.

In [34]:
schema_5.describe(['Low', 'Volume']).show()

+-------+-----------------+-----------------+
|summary|              Low|           Volume|
+-------+-----------------+-----------------+
|  count|             1258|             1258|
|   mean| 71.9186009594594|8222093.481717011|
| stddev|6.744075756255496|  4519780.8431556|
|    min|        56.299999|         10010500|
|    max|            89.25|          9994400|
+-------+-----------------+-----------------+



#### 2.2. `crosstab` method

In [41]:
json_file_path = os.path.join('..', 'src-data', 'iris.json')
schema_6 = spark.read.json(json_file_path)
schema_6.show(5)

+---------------+-----------+----------+-----------+----------+-------+
|_corrupt_record|petalLength|petalWidth|sepalLength|sepalWidth|species|
+---------------+-----------+----------+-----------+----------+-------+
|              [|       null|      null|       null|      null|   null|
|           null|        1.4|       0.2|        5.1|       3.5| setosa|
|           null|        1.4|       0.2|        4.9|       3.0| setosa|
|           null|        1.3|       0.2|        4.7|       3.2| setosa|
|           null|        1.5|       0.2|        4.6|       3.1| setosa|
+---------------+-----------+----------+-----------+----------+-------+
only showing top 5 rows



In [43]:
schema_6.crosstab('petalLength', 'species').show(5)

+-------------------+----+------+----------+---------+
|petalLength_species|null|setosa|versicolor|virginica|
+-------------------+----+------+----------+---------+
|               null|   2|     0|         0|        0|
|                3.9|   0|     0|         3|        0|
|                5.0|   0|     0|         1|        3|
|                4.7|   0|     0|         5|        0|
|                3.8|   0|     0|         1|        0|
+-------------------+----+------+----------+---------+
only showing top 5 rows



#### 2.3. `groupby()` method

In [45]:
schema_6.groupby('species').mean().show()

+----------+------------------+------------------+-----------------+------------------+
|   species|  avg(petalLength)|   avg(petalWidth)| avg(sepalLength)|   avg(sepalWidth)|
+----------+------------------+------------------+-----------------+------------------+
| virginica|             5.552|             2.026|6.587999999999998|2.9739999999999998|
|      null|              null|              null|             null|              null|
|versicolor|              4.26|1.3259999999999998|            5.936|2.7700000000000005|
|    setosa|1.4620000000000002|0.2459999999999999|5.005999999999999| 3.428000000000001|
+----------+------------------+------------------+-----------------+------------------+



In [46]:
schema_6.groupby('species').agg({'petalLength' :  'mean', 'petalWidth' : 'max'}).show()

+----------+------------------+---------------+
|   species|  avg(petalLength)|max(petalWidth)|
+----------+------------------+---------------+
| virginica|             5.552|            2.5|
|      null|              null|           null|
|versicolor|              4.26|            1.8|
|    setosa|1.4620000000000002|            0.6|
+----------+------------------+---------------+



#### 2.4. `select` method

Create sub_DataFrame

In [48]:
schema_7 = schema_6.select(['petalLength', 'petalWidth', 'sepalLength', 'sepalWidth', 'species'])
schema_7.show(5)

+-----------+----------+-----------+----------+-------+
|petalLength|petalWidth|sepalLength|sepalWidth|species|
+-----------+----------+-----------+----------+-------+
|       null|      null|       null|      null|   null|
|        1.4|       0.2|        5.1|       3.5| setosa|
|        1.4|       0.2|        4.9|       3.0| setosa|
|        1.3|       0.2|        4.7|       3.2| setosa|
|        1.5|       0.2|        4.6|       3.1| setosa|
+-----------+----------+-----------+----------+-------+
only showing top 5 rows



#### 2.5. `distinct` method

Show unique data of one column. Use with `distinct` method

In [49]:
schema_7.select('species').distinct().show()

+----------+
|   species|
+----------+
| virginica|
|      null|
|versicolor|
|    setosa|
+----------+



#### 2.6. `orderby` method.
order ascending / descending in column.

In [50]:
schema_7.orderBy(schema_7.sepalLength.desc()).show(5)

+-----------+----------+-----------+----------+---------+
|petalLength|petalWidth|sepalLength|sepalWidth|  species|
+-----------+----------+-----------+----------+---------+
|        6.4|       2.0|        7.9|       3.8|virginica|
|        6.9|       2.3|        7.7|       2.6|virginica|
|        6.7|       2.2|        7.7|       3.8|virginica|
|        6.1|       2.3|        7.7|       3.0|virginica|
|        6.7|       2.0|        7.7|       2.8|virginica|
+-----------+----------+-----------+----------+---------+
only showing top 5 rows



In [51]:
schema_7.orderBy(schema_7.sepalLength.desc()).show(5)

+-----------+----------+-----------+----------+---------+
|petalLength|petalWidth|sepalLength|sepalWidth|  species|
+-----------+----------+-----------+----------+---------+
|        6.4|       2.0|        7.9|       3.8|virginica|
|        6.9|       2.3|        7.7|       2.6|virginica|
|        6.7|       2.2|        7.7|       3.8|virginica|
|        6.1|       2.3|        7.7|       3.0|virginica|
|        6.7|       2.0|        7.7|       2.8|virginica|
+-----------+----------+-----------+----------+---------+
only showing top 5 rows



#### 2.7. `withColumn` method

`withColumn('column_name', function)`: create new column.

In [52]:
schema_7.withColumn('total_width', schema_7.petalWidth + schema_7.sepalWidth).show(5)

+-----------+----------+-----------+----------+-------+------------------+
|petalLength|petalWidth|sepalLength|sepalWidth|species|       total_width|
+-----------+----------+-----------+----------+-------+------------------+
|       null|      null|       null|      null|   null|              null|
|        1.4|       0.2|        5.1|       3.5| setosa|               3.7|
|        1.4|       0.2|        4.9|       3.0| setosa|               3.2|
|        1.3|       0.2|        4.7|       3.2| setosa|3.4000000000000004|
|        1.5|       0.2|        4.6|       3.1| setosa|3.3000000000000003|
+-----------+----------+-----------+----------+-------+------------------+
only showing top 5 rows



*Note:* If not `replace in-placed`.

In [54]:
schema_7.show(5) # don't have new column.

+-----------+----------+-----------+----------+-------+
|petalLength|petalWidth|sepalLength|sepalWidth|species|
+-----------+----------+-----------+----------+-------+
|       null|      null|       null|      null|   null|
|        1.4|       0.2|        5.1|       3.5| setosa|
|        1.4|       0.2|        4.9|       3.0| setosa|
|        1.3|       0.2|        4.7|       3.2| setosa|
|        1.5|       0.2|        4.6|       3.1| setosa|
+-----------+----------+-----------+----------+-------+
only showing top 5 rows



#### 2.8. `withColumnRenamed()` method

In [55]:
schema_7.withColumnRenamed('species', 'types').show(5)

+-----------+----------+-----------+----------+------+
|petalLength|petalWidth|sepalLength|sepalWidth| types|
+-----------+----------+-----------+----------+------+
|       null|      null|       null|      null|  null|
|        1.4|       0.2|        5.1|       3.5|setosa|
|        1.4|       0.2|        4.9|       3.0|setosa|
|        1.3|       0.2|        4.7|       3.2|setosa|
|        1.5|       0.2|        4.6|       3.1|setosa|
+-----------+----------+-----------+----------+------+
only showing top 5 rows



#### 2.9. `drop` method

In [56]:
schema_7.drop('species').show(5)

+-----------+----------+-----------+----------+
|petalLength|petalWidth|sepalLength|sepalWidth|
+-----------+----------+-----------+----------+
|       null|      null|       null|      null|
|        1.4|       0.2|        5.1|       3.5|
|        1.4|       0.2|        4.9|       3.0|
|        1.3|       0.2|        4.7|       3.2|
|        1.5|       0.2|        4.6|       3.1|
+-----------+----------+-----------+----------+
only showing top 5 rows



#### 2.10. `dropDuplicates` method

In [57]:
schema_7.count()

152

In [63]:
schema_7.dropDuplicates().count()

150

mean 2 duplicates row in this DataFrame.

#### 2.11. `dropna` method

In [64]:
schema_7.dropna().count()

150

mean 2 rows have null value.

#### 2.12. `filter` or `where` condtion
filter data in DataFrame.

column value start with `v`

In [67]:
schema_7.where(schema_7.species.like('v%')).show(5)

+-----------+----------+-----------+----------+----------+
|petalLength|petalWidth|sepalLength|sepalWidth|   species|
+-----------+----------+-----------+----------+----------+
|        4.7|       1.4|        7.0|       3.2|versicolor|
|        4.5|       1.5|        6.4|       3.2|versicolor|
|        4.9|       1.5|        6.9|       3.1|versicolor|
|        4.0|       1.3|        5.5|       2.3|versicolor|
|        4.6|       1.5|        6.5|       2.8|versicolor|
+-----------+----------+-----------+----------+----------+
only showing top 5 rows



*Note:* like is sensitive case.

In [70]:
schema_7.where(schema_7.species.like('V%')).show(5)

+-----------+----------+-----------+----------+-------+
|petalLength|petalWidth|sepalLength|sepalWidth|species|
+-----------+----------+-----------+----------+-------+
+-----------+----------+-----------+----------+-------+



`contains` method: get value with string column contains with specific value.

In [72]:
schema_7.where(schema_7.species.contains('setosa')).show(5)

+-----------+----------+-----------+----------+-------+
|petalLength|petalWidth|sepalLength|sepalWidth|species|
+-----------+----------+-----------+----------+-------+
|        1.4|       0.2|        5.1|       3.5| setosa|
|        1.4|       0.2|        4.9|       3.0| setosa|
|        1.3|       0.2|        4.7|       3.2| setosa|
|        1.5|       0.2|        4.6|       3.1| setosa|
|        1.4|       0.2|        5.0|       3.6| setosa|
+-----------+----------+-----------+----------+-------+
only showing top 5 rows



`~`: inverse condition.

In [73]:
schema_7.where(~schema_7.species.contains('setosa')).show(5)

+-----------+----------+-----------+----------+----------+
|petalLength|petalWidth|sepalLength|sepalWidth|   species|
+-----------+----------+-----------+----------+----------+
|        4.7|       1.4|        7.0|       3.2|versicolor|
|        4.5|       1.5|        6.4|       3.2|versicolor|
|        4.9|       1.5|        6.9|       3.1|versicolor|
|        4.0|       1.3|        5.5|       2.3|versicolor|
|        4.6|       1.5|        6.5|       2.8|versicolor|
+-----------+----------+-----------+----------+----------+
only showing top 5 rows



In [78]:
schema_7.where(schema_7.sepalLength > 7.0).show(5)

+-----------+----------+-----------+----------+---------+
|petalLength|petalWidth|sepalLength|sepalWidth|  species|
+-----------+----------+-----------+----------+---------+
|        5.9|       2.1|        7.1|       3.0|virginica|
|        6.6|       2.1|        7.6|       3.0|virginica|
|        6.3|       1.8|        7.3|       2.9|virginica|
|        6.1|       2.5|        7.2|       3.6|virginica|
|        6.7|       2.2|        7.7|       3.8|virginica|
+-----------+----------+-----------+----------+---------+
only showing top 5 rows



In [79]:
schema_7.where((schema_7.sepalLength > 7.0) & (schema_7.petalWidth < 2.0)).show(5)

+-----------+----------+-----------+----------+---------+
|petalLength|petalWidth|sepalLength|sepalWidth|  species|
+-----------+----------+-----------+----------+---------+
|        6.3|       1.8|        7.3|       2.9|virginica|
|        6.0|       1.8|        7.2|       3.2|virginica|
|        5.8|       1.6|        7.2|       3.0|virginica|
|        6.1|       1.9|        7.4|       2.8|virginica|
+-----------+----------+-----------+----------+---------+



#### 2.13. Column string transformation

In [82]:
from pyspark.sql.functions import upper, lower

In [81]:
schema_7.select([upper('species'), 'petalLength', 'petalWidth']).show(5)

+--------------+-----------+----------+
|upper(species)|petalLength|petalWidth|
+--------------+-----------+----------+
|          null|       null|      null|
|        SETOSA|        1.4|       0.2|
|        SETOSA|        1.4|       0.2|
|        SETOSA|        1.3|       0.2|
|        SETOSA|        1.5|       0.2|
+--------------+-----------+----------+
only showing top 5 rows



In [86]:
schema_7.withColumn('upper', upper('species')).show(5)

+-----------+----------+-----------+----------+-------+------+
|petalLength|petalWidth|sepalLength|sepalWidth|species| upper|
+-----------+----------+-----------+----------+-------+------+
|       null|      null|       null|      null|   null|  null|
|        1.4|       0.2|        5.1|       3.5| setosa|SETOSA|
|        1.4|       0.2|        4.9|       3.0| setosa|SETOSA|
|        1.3|       0.2|        4.7|       3.2| setosa|SETOSA|
|        1.5|       0.2|        4.6|       3.1| setosa|SETOSA|
+-----------+----------+-----------+----------+-------+------+
only showing top 5 rows



#### 2.14. Convert data type

data types in `pyspark.sql.types`.

In [88]:
from pyspark.sql.types import IntegerType

In [90]:
schema_7.withColumn('interger_sepalWidth', schema_7.sepalWidth.cast(IntegerType())).show(5)

+-----------+----------+-----------+----------+-------+-------------------+
|petalLength|petalWidth|sepalLength|sepalWidth|species|interger_sepalWidth|
+-----------+----------+-----------+----------+-------+-------------------+
|       null|      null|       null|      null|   null|               null|
|        1.4|       0.2|        5.1|       3.5| setosa|                  3|
|        1.4|       0.2|        4.9|       3.0| setosa|                  3|
|        1.3|       0.2|        4.7|       3.2| setosa|                  3|
|        1.5|       0.2|        4.6|       3.1| setosa|                  3|
+-----------+----------+-----------+----------+-------+-------------------+
only showing top 5 rows



In [91]:
csv_file_path = os.path.join('..', 'src-data', 'walmart_stock.csv')
schema_8 = spark.read.option('header', True).csv(csv_file_path)
schema_8.show(5)

+----------+------------------+---------+---------+------------------+--------+------------------+
|      Date|              Open|     High|      Low|             Close|  Volume|         Adj Close|
+----------+------------------+---------+---------+------------------+--------+------------------+
|2012-01-03|         59.970001|61.060001|59.869999|         60.330002|12668800|52.619234999999996|
|2012-01-04|60.209998999999996|60.349998|59.470001|59.709998999999996| 9593300|         52.078475|
|2012-01-05|         59.349998|59.619999|58.369999|         59.419998|12768200|         51.825539|
|2012-01-06|         59.419998|59.450001|58.869999|              59.0| 8069400|          51.45922|
|2012-01-09|         59.029999|59.549999|58.919998|             59.18| 6679300|51.616215000000004|
+----------+------------------+---------+---------+------------------+--------+------------------+
only showing top 5 rows



use `substring` to process string. read [here](https://spark.apache.org/docs/latest/api/python/reference/api/pyspark.sql.functions.substring.html)

In [92]:
from pyspark.sql.functions import substring
schema_8.withColumn('year', substring('Date', pos = 0, len = 4).cast(IntegerType())).show(5)

+----------+------------------+---------+---------+------------------+--------+------------------+----+
|      Date|              Open|     High|      Low|             Close|  Volume|         Adj Close|year|
+----------+------------------+---------+---------+------------------+--------+------------------+----+
|2012-01-03|         59.970001|61.060001|59.869999|         60.330002|12668800|52.619234999999996|2012|
|2012-01-04|60.209998999999996|60.349998|59.470001|59.709998999999996| 9593300|         52.078475|2012|
|2012-01-05|         59.349998|59.619999|58.369999|         59.419998|12768200|         51.825539|2012|
|2012-01-06|         59.419998|59.450001|58.869999|              59.0| 8069400|          51.45922|2012|
|2012-01-09|         59.029999|59.549999|58.919998|             59.18| 6679300|51.616215000000004|2012|
+----------+------------------+---------+---------+------------------+--------+------------------+----+
only showing top 5 rows



### 3. Conditional clause

#### 3.1. `when(<if condition>, <then>)`

In [93]:
from pyspark.sql.functions import when

In [97]:
schema_7.select('sepalWidth', when(schema_7.sepalWidth <= 3.0, 'short')).show(5)

+----------+--------------------------------------------+
|sepalWidth|CASE WHEN (sepalWidth <= 3.0) THEN short END|
+----------+--------------------------------------------+
|      null|                                        null|
|       3.5|                                        null|
|       3.0|                                       short|
|       3.2|                                        null|
|       3.1|                                        null|
+----------+--------------------------------------------+
only showing top 5 rows



#### 3.2. Multiple `when()`

In [98]:
schema_7.select('sepalWidth', when(schema_7.sepalWidth <= 3.0, 'short')
                .when(schema_7.sepalWidth <= 3.3, 'medium')).show(5)

+----------+---------------------------------------------------------------------------------+
|sepalWidth|CASE WHEN (sepalWidth <= 3.0) THEN short WHEN (sepalWidth <= 3.3) THEN medium END|
+----------+---------------------------------------------------------------------------------+
|      null|                                                                             null|
|       3.5|                                                                             null|
|       3.0|                                                                            short|
|       3.2|                                                                           medium|
|       3.1|                                                                           medium|
+----------+---------------------------------------------------------------------------------+
only showing top 5 rows



#### 3.3. `otherwise()`

In [99]:
schema_7.select('sepalWidth', when(schema_7.sepalWidth <= 3.0, 'short')
                .when(schema_7.sepalWidth <= 3.3, 'medium')
                .otherwise('long')).show(5)

+----------+-------------------------------------------------------------------------------------------+
|sepalWidth|CASE WHEN (sepalWidth <= 3.0) THEN short WHEN (sepalWidth <= 3.3) THEN medium ELSE long END|
+----------+-------------------------------------------------------------------------------------------+
|      null|                                                                                       long|
|       3.5|                                                                                       long|
|       3.0|                                                                                      short|
|       3.2|                                                                                     medium|
|       3.1|                                                                                     medium|
+----------+-------------------------------------------------------------------------------------------+
only showing top 5 rows



#### 3.4. User Defined Function

In PySpark, the function of python is escaptulated in `pyspark.sql.function.udf`. Click [here](https://spark.apache.org/docs/3.1.1/api/python/reference/api/pyspark.sql.functions.udf.html)

In [102]:
from pyspark.sql.functions import udf
from pyspark.sql.types import StringType

In [101]:
def classify_width(width):
    width_type = None
    if width <= 3.0:
        width_type = 'short'
    elif width <= 3.3:
        width_type = 'medium'
    else:
        width_type = 'long'
    
    return width_type

In [103]:
udf_classify_width = udf(classify_width, StringType())

In [107]:
schema_9 = schema_7.dropna() # drop the row which null

In [108]:
schema_9.withColumn('classify_width', udf_classify_width(schema_7.sepalWidth)).show(5)

+-----------+----------+-----------+----------+-------+--------------+
|petalLength|petalWidth|sepalLength|sepalWidth|species|classify_width|
+-----------+----------+-----------+----------+-------+--------------+
|        1.4|       0.2|        5.1|       3.5| setosa|          long|
|        1.4|       0.2|        4.9|       3.0| setosa|         short|
|        1.3|       0.2|        4.7|       3.2| setosa|        medium|
|        1.5|       0.2|        4.6|       3.1| setosa|        medium|
|        1.4|       0.2|        5.0|       3.6| setosa|          long|
+-----------+----------+-----------+----------+-------+--------------+
only showing top 5 rows



### 4. Visualize data

- Convert pyspark DataFrame to `Pandas`.
- Apply common packages as `matplotlib` and `seaborn`.