In [1]:
import findspark

findspark.init()

In [2]:
from pyspark.sql import SparkSession

# Build (or reuse) the session for this notebook, running Spark locally.
spark = (
    SparkSession.builder
    .master("local[*]")
    .appName("10-Dataframe-ETL-Transformations-2")
    .getOrCreate()
)

In [3]:
sc = spark.sparkContext

In [4]:
spark

In [5]:
spark.conf.get("spark.sql.shuffle.partitions")

'200'

In [6]:
sc.defaultParallelism

8

In [7]:
# Align the shuffle partition count with sc.defaultParallelism (8 in this session)
# instead of the default of 200 reported above.
spark.conf.set("spark.sql.shuffle.partitions", sc.defaultParallelism)

In [8]:
spark.conf.get("spark.sql.codegen.wholeStage")

'true'

In [9]:
from pyspark.sql import functions as F

# 1. Window Functions:-
--------------------------
- For applying window functions on dataframes, we have the classes ***Window*** and ***WindowSpec***, in the ***pyspark.sql.window*** module.
- Both of the above classes have 4 main APIs/methods: ***partitionBy(), orderBy(), rangeBetween(), rowsBetween()***
- On applying any of the above 4 APIs(with suitable parameters) on the ***Window*** object, a ***WindowSpec*** object is returned.
- However, the window functions like ***row_number(), rank(), dense_rank()*** are in the ***pyspark.sql.functions*** module. Each of these functions returns a ***Column***, and the ***WindowSpec*** object is passed as the parameter of that column's ***over()*** method.

In [10]:
import pyspark

# List what the window module exposes; the CLASSES section shows Window and WindowSpec.
help( pyspark.sql.window )

Help on module pyspark.sql.window in pyspark.sql:

NAME
    pyspark.sql.window

DESCRIPTION
    # Licensed to the Apache Software Foundation (ASF) under one or more
    # contributor license agreements.  See the NOTICE file distributed with
    # this work for additional information regarding copyright ownership.
    # The ASF licenses this file to You under the Apache License, Version 2.0
    # (the "License"); you may not use this file except in compliance with
    # the License.  You may obtain a copy of the License at
    #
    #    http://www.apache.org/licenses/LICENSE-2.0
    #
    # Unless required by applicable law or agreed to in writing, software
    # distributed under the License is distributed on an "AS IS" BASIS,
    # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    # See the License for the specific language governing permissions and
    # limitations under the License.
    #

CLASSES
    builtins.object
        Window
        WindowSpec
   

## Ranking Window Functions:-
-------------------------------
- ***row_number()***
- ***rank()***
- ***dense_rank()***
- ***percent_rank()***
- ***ntile(n)***
- ***cume_dist()***

In [11]:
# Column names, in the same order as the fields of each record below.
emp_schema = ["empname", "dept", "state", "salary", "age"]

# One tuple per employee: (empname, dept, state, salary, age).
emp_data = [
    # Sales
    ("James",  "Sales",     "NY", 9000, 34),
    ("Alicia", "Sales",     "NY", 8600, 56),
    ("Robert", "Sales",     "CA", 8100, 30),
    ("John",   "Sales",     "AZ", 8600, 31),
    ("Ross",   "Sales",     "AZ", 8100, 33),
    ("Kathy",  "Sales",     "AZ", 1000, 39),
    # Finance
    ("Lisa",   "Finance",   "CA", 9000, 24),
    ("Deja",   "Finance",   "CA", 9900, 40),
    ("Sugie",  "Finance",   "NY", 8300, 36),
    ("Ram",    "Finance",   "NY", 7900, 53),
    ("Satya",  "Finance",   "AZ", 8200, 53),
    # Marketing
    ("Kyle",   "Marketing", "CA", 8000, 25),
    ("Reid",   "Marketing", "NY", 9100, 50),
]

In [12]:
# Only column names are supplied in emp_schema, so the column types are inferred from the data.
emp_df = spark.createDataFrame( data = emp_data, schema = emp_schema )

In [15]:
emp_df.show()

+-------+---------+-----+------+---+
|empname|     dept|state|salary|age|
+-------+---------+-----+------+---+
|  James|    Sales|   NY|  9000| 34|
| Alicia|    Sales|   NY|  8600| 56|
| Robert|    Sales|   CA|  8100| 30|
|   John|    Sales|   AZ|  8600| 31|
|   Ross|    Sales|   AZ|  8100| 33|
|  Kathy|    Sales|   AZ|  1000| 39|
|   Lisa|  Finance|   CA|  9000| 24|
|   Deja|  Finance|   CA|  9900| 40|
|  Sugie|  Finance|   NY|  8300| 36|
|    Ram|  Finance|   NY|  7900| 53|
|  Satya|  Finance|   AZ|  8200| 53|
|   Kyle|Marketing|   CA|  8000| 25|
|   Reid|Marketing|   NY|  9100| 50|
+-------+---------+-----+------+---+



### Apply ranking on the salaries of each department, with the highest salary being ranked at the top:
<hr>

As we are ranking salaries department-wise, partitioning will be based on the ***dept*** column, and as the ranking is based on salaries, ordering will be done on the ***salary*** column.

In [14]:
from pyspark.sql.window import Window, WindowSpec

In [21]:
# One window per department, highest salary first.
salary_spec_obj = Window.partitionBy("dept").orderBy(F.desc("salary"))
type(salary_spec_obj)

pyspark.sql.window.WindowSpec

In [23]:
# row_number() numbers the rows of each department 1, 2, 3, ... in window order;
# tied salaries still receive distinct numbers.
(
    emp_df
    .drop("state", "age")
    .withColumn("row_number_dept_wise_salary", F.row_number().over(salary_spec_obj))
    .show()
)

+-------+---------+------+---------------------------+
|empname|     dept|salary|row_number_dept_wise_salary|
+-------+---------+------+---------------------------+
|   Deja|  Finance|  9900|                          1|
|   Lisa|  Finance|  9000|                          2|
|  Sugie|  Finance|  8300|                          3|
|  Satya|  Finance|  8200|                          4|
|    Ram|  Finance|  7900|                          5|
|   Reid|Marketing|  9100|                          1|
|   Kyle|Marketing|  8000|                          2|
|  James|    Sales|  9000|                          1|
| Alicia|    Sales|  8600|                          2|
|   John|    Sales|  8600|                          3|
| Robert|    Sales|  8100|                          4|
|   Ross|    Sales|  8100|                          5|
|  Kathy|    Sales|  1000|                          6|
+-------+---------+------+---------------------------+



In [24]:
# rank() gives tied salaries the same rank and then skips ahead (1, 2, 2, 4, ...).
(
    emp_df
    .drop("state", "age")
    .withColumn("rank_salary", F.rank().over(salary_spec_obj))
    .show()
)

+-------+---------+------+-----------+
|empname|     dept|salary|rank_salary|
+-------+---------+------+-----------+
|   Deja|  Finance|  9900|          1|
|   Lisa|  Finance|  9000|          2|
|  Sugie|  Finance|  8300|          3|
|  Satya|  Finance|  8200|          4|
|    Ram|  Finance|  7900|          5|
|   Reid|Marketing|  9100|          1|
|   Kyle|Marketing|  8000|          2|
|  James|    Sales|  9000|          1|
| Alicia|    Sales|  8600|          2|
|   John|    Sales|  8600|          2|
| Robert|    Sales|  8100|          4|
|   Ross|    Sales|  8100|          4|
|  Kathy|    Sales|  1000|          6|
+-------+---------+------+-----------+



In [26]:
# dense_rank() gives tied salaries the same rank without leaving gaps (1, 2, 2, 3, ...).
(
    emp_df
    .drop("state", "age")
    .withColumn("dense_rank_salary_deptwise", F.dense_rank().over(salary_spec_obj))
    .show()
)

+-------+---------+------+--------------------------+
|empname|     dept|salary|dense_rank_salary_deptwise|
+-------+---------+------+--------------------------+
|   Deja|  Finance|  9900|                         1|
|   Lisa|  Finance|  9000|                         2|
|  Sugie|  Finance|  8300|                         3|
|  Satya|  Finance|  8200|                         4|
|    Ram|  Finance|  7900|                         5|
|   Reid|Marketing|  9100|                         1|
|   Kyle|Marketing|  8000|                         2|
|  James|    Sales|  9000|                         1|
| Alicia|    Sales|  8600|                         2|
|   John|    Sales|  8600|                         2|
| Robert|    Sales|  8100|                         3|
|   Ross|    Sales|  8100|                         3|
|  Kathy|    Sales|  1000|                         4|
+-------+---------+------+--------------------------+



In [27]:
# percent_rank() is the relative rank within the department, from 0.0 (first) to 1.0 (last).
(
    emp_df
    .drop("state", "age")
    .withColumn("percent_rank", F.percent_rank().over(salary_spec_obj))
    .show()
)

+-------+---------+------+------------+
|empname|     dept|salary|percent_rank|
+-------+---------+------+------------+
|   Deja|  Finance|  9900|         0.0|
|   Lisa|  Finance|  9000|        0.25|
|  Sugie|  Finance|  8300|         0.5|
|  Satya|  Finance|  8200|        0.75|
|    Ram|  Finance|  7900|         1.0|
|   Reid|Marketing|  9100|         0.0|
|   Kyle|Marketing|  8000|         1.0|
|  James|    Sales|  9000|         0.0|
| Alicia|    Sales|  8600|         0.2|
|   John|    Sales|  8600|         0.2|
| Robert|    Sales|  8100|         0.6|
|   Ross|    Sales|  8100|         0.6|
|  Kathy|    Sales|  1000|         1.0|
+-------+---------+------+------------+



In [32]:
# ntile(n) splits each ordered department into n buckets; here n = 2, and when the
# rows do not divide evenly the first bucket receives the extra row.
(
    emp_df
    .drop("state", "age")
    .withColumn("ntile", F.ntile(2).over(salary_spec_obj))
    .show()
)

+-------+---------+------+-----+
|empname|     dept|salary|ntile|
+-------+---------+------+-----+
|   Deja|  Finance|  9900|    1|
|   Lisa|  Finance|  9000|    1|
|  Sugie|  Finance|  8300|    1|
|  Satya|  Finance|  8200|    2|
|    Ram|  Finance|  7900|    2|
|   Reid|Marketing|  9100|    1|
|   Kyle|Marketing|  8000|    2|
|  James|    Sales|  9000|    1|
| Alicia|    Sales|  8600|    1|
|   John|    Sales|  8600|    1|
| Robert|    Sales|  8100|    2|
|   Ross|    Sales|  8100|    2|
|  Kathy|    Sales|  1000|    2|
+-------+---------+------+-----+



In [33]:
# cume_dist() is the fraction of the department's rows that sit at or before the
# current row in window order; tied salaries share the same value.
(
    emp_df
    .drop("state", "age")
    .withColumn("cume_dist", F.cume_dist().over(salary_spec_obj))
    .show()
)

+-------+---------+------+-------------------+
|empname|     dept|salary|          cume_dist|
+-------+---------+------+-------------------+
|   Deja|  Finance|  9900|                0.2|
|   Lisa|  Finance|  9000|                0.4|
|  Sugie|  Finance|  8300|                0.6|
|  Satya|  Finance|  8200|                0.8|
|    Ram|  Finance|  7900|                1.0|
|   Reid|Marketing|  9100|                0.5|
|   Kyle|Marketing|  8000|                1.0|
|  James|    Sales|  9000|0.16666666666666666|
| Alicia|    Sales|  8600|                0.5|
|   John|    Sales|  8600|                0.5|
| Robert|    Sales|  8100| 0.8333333333333334|
|   Ross|    Sales|  8100| 0.8333333333333334|
|  Kathy|    Sales|  1000|                1.0|
+-------+---------+------+-------------------+



## Analytical Window Functions:-
----------------------------------
- ***lead()***
- ***lag()***

In [16]:
emp_df.show()

+-------+---------+-----+------+---+
|empname|     dept|state|salary|age|
+-------+---------+-----+------+---+
|  James|    Sales|   NY|  9000| 34|
| Alicia|    Sales|   NY|  8600| 56|
| Robert|    Sales|   CA|  8100| 30|
|   John|    Sales|   AZ|  8600| 31|
|   Ross|    Sales|   AZ|  8100| 33|
|  Kathy|    Sales|   AZ|  1000| 39|
|   Lisa|  Finance|   CA|  9000| 24|
|   Deja|  Finance|   CA|  9900| 40|
|  Sugie|  Finance|   NY|  8300| 36|
|    Ram|  Finance|   NY|  7900| 53|
|  Satya|  Finance|   AZ|  8200| 53|
|   Kyle|Marketing|   CA|  8000| 25|
|   Reid|Marketing|   NY|  9100| 50|
+-------+---------+-----+------+---+



In [17]:
from pyspark.sql.window import Window, WindowSpec

In [19]:
# One window per department, ordered by ascending salary: lead() then reads the
# following (higher or equal) salary and lag() the preceding (lower or equal) one.
dept_sal_spec = Window.partitionBy( "dept" ).orderBy( "salary" )

In [18]:
# Keep only the columns needed for the salary examples (state and age are left out).
dept_sal_df = emp_df.select("empname", "dept", "salary")
dept_sal_df.show()

+-------+---------+------+
|empname|     dept|salary|
+-------+---------+------+
|  James|    Sales|  9000|
| Alicia|    Sales|  8600|
| Robert|    Sales|  8100|
|   John|    Sales|  8600|
|   Ross|    Sales|  8100|
|  Kathy|    Sales|  1000|
|   Lisa|  Finance|  9000|
|   Deja|  Finance|  9900|
|  Sugie|  Finance|  8300|
|    Ram|  Finance|  7900|
|  Satya|  Finance|  8200|
|   Kyle|Marketing|  8000|
|   Reid|Marketing|  9100|
+-------+---------+------+



### With default offset:

In [20]:
# Default offset of 1: lead() reads the next row's salary, lag() the previous row's.
# Rows with no such neighbour in their department get null.
(
    dept_sal_df
    .withColumn("lead_salary_deptwise", F.lead("salary").over(dept_sal_spec))
    .withColumn("lag_salary_deptwise", F.lag("salary").over(dept_sal_spec))
    .show()
)

+-------+---------+------+--------------------+-------------------+
|empname|     dept|salary|lead_salary_deptwise|lag_salary_deptwise|
+-------+---------+------+--------------------+-------------------+
|    Ram|  Finance|  7900|                8200|               null|
|  Satya|  Finance|  8200|                8300|               7900|
|  Sugie|  Finance|  8300|                9000|               8200|
|   Lisa|  Finance|  9000|                9900|               8300|
|   Deja|  Finance|  9900|                null|               9000|
|   Kyle|Marketing|  8000|                9100|               null|
|   Reid|Marketing|  9100|                null|               8000|
|  Kathy|    Sales|  1000|                8100|               null|
| Robert|    Sales|  8100|                8100|               1000|
|   Ross|    Sales|  8100|                8600|               8100|
| Alicia|    Sales|  8600|                8600|               8100|
|   John|    Sales|  8600|                9000| 

### With custom offset:

In [22]:
# Offset of 2: look two rows ahead / behind within the department.
(
    dept_sal_df
    .withColumn("lead_salary_deptwise", F.lead("salary", 2).over(dept_sal_spec))
    .withColumn("lag_salary_deptwise", F.lag("salary", 2).over(dept_sal_spec))
    .show()
)

+-------+---------+------+--------------------+-------------------+
|empname|     dept|salary|lead_salary_deptwise|lag_salary_deptwise|
+-------+---------+------+--------------------+-------------------+
|    Ram|  Finance|  7900|                8300|               null|
|  Satya|  Finance|  8200|                9000|               null|
|  Sugie|  Finance|  8300|                9900|               7900|
|   Lisa|  Finance|  9000|                null|               8200|
|   Deja|  Finance|  9900|                null|               8300|
|   Kyle|Marketing|  8000|                null|               null|
|   Reid|Marketing|  9100|                null|               null|
|  Kathy|    Sales|  1000|                8100|               null|
| Robert|    Sales|  8100|                8600|               null|
|   Ross|    Sales|  8100|                8600|               1000|
| Alicia|    Sales|  8600|                9000|               8100|
|   John|    Sales|  8600|                null| 

### With default value for *NULL* :
------------------------------------

In [23]:
# The third argument (-1) replaces the null that is otherwise returned when the
# offset points outside the department's rows.
(
    dept_sal_df
    .withColumn("lead_salary_deptwise", F.lead("salary", 2, -1).over(dept_sal_spec))
    .withColumn("lag_salary_deptwise", F.lag("salary", 2, -1).over(dept_sal_spec))
    .show()
)

+-------+---------+------+--------------------+-------------------+
|empname|     dept|salary|lead_salary_deptwise|lag_salary_deptwise|
+-------+---------+------+--------------------+-------------------+
|    Ram|  Finance|  7900|                8300|                 -1|
|  Satya|  Finance|  8200|                9000|                 -1|
|  Sugie|  Finance|  8300|                9900|               7900|
|   Lisa|  Finance|  9000|                  -1|               8200|
|   Deja|  Finance|  9900|                  -1|               8300|
|   Kyle|Marketing|  8000|                  -1|                 -1|
|   Reid|Marketing|  9100|                  -1|                 -1|
|  Kathy|    Sales|  1000|                8100|                 -1|
| Robert|    Sales|  8100|                8600|                 -1|
|   Ross|    Sales|  8100|                8600|               1000|
| Alicia|    Sales|  8600|                9000|               8100|
|   John|    Sales|  8600|                  -1| 

### Aggregate Window functions:-
----------------------------------
The same functions used with groupBy() can be used as window functions also. They must be applied with the ***WindowSpec*** object using the ***over()*** method.

The only difference will be that, instead of getting the output of each group as one record, for each record, aggregated value will be shown, which will be same over a group.

In [25]:
# Same definition as the dept_sal_spec used for the lead()/lag() examples:
# one window per department, ordered by ascending salary.
dept_sal_spec = Window.partitionBy( "dept" ).orderBy( "salary" )

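For ***first()*** and ***last()***, ***orderBy()*** is a must: without it, the order of rows within a partition is not defined, so "first" and "last" have no reliable meaning.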

In [42]:
# dept_sal_spec orders salaries ascending, so first() yields the department minimum.
# last() only sees the rows up to the current one (including ties), hence it returns
# the running maximum rather than the department maximum.
(
    dept_sal_df
    .withColumn("min_salary", F.first("salary").over(dept_sal_spec))
    .withColumn("max_salary_cumulative", F.last("salary").over(dept_sal_spec))
    .show()
)

+-------+---------+------+----------+---------------------+
|empname|     dept|salary|min_salary|max_salary_cumulative|
+-------+---------+------+----------+---------------------+
|    Ram|  Finance|  7900|      7900|                 7900|
|  Satya|  Finance|  8200|      7900|                 8200|
|  Sugie|  Finance|  8300|      7900|                 8300|
|   Lisa|  Finance|  9000|      7900|                 9000|
|   Deja|  Finance|  9900|      7900|                 9900|
|   Kyle|Marketing|  8000|      8000|                 8000|
|   Reid|Marketing|  9100|      8000|                 9100|
|  Kathy|    Sales|  1000|      1000|                 1000|
| Robert|    Sales|  8100|      1000|                 8100|
|   Ross|    Sales|  8100|      1000|                 8100|
| Alicia|    Sales|  8600|      1000|                 8600|
|   John|    Sales|  8600|      1000|                 8600|
|  James|    Sales|  9000|      1000|                 9000|
+-------+---------+------+----------+---

In [30]:
# With an ordered window, sum() becomes a running total per department;
# rows with equal salaries are counted together and show the same total.
(
    dept_sal_df
    .withColumn("Cumulative_sum_of_salary", F.sum("salary").over(dept_sal_spec))
    .show()
)

+-------+---------+------+------------------------+
|empname|     dept|salary|Cumulative_sum_of_salary|
+-------+---------+------+------------------------+
|    Ram|  Finance|  7900|                    7900|
|  Satya|  Finance|  8200|                   16100|
|  Sugie|  Finance|  8300|                   24400|
|   Lisa|  Finance|  9000|                   33400|
|   Deja|  Finance|  9900|                   43300|
|   Kyle|Marketing|  8000|                    8000|
|   Reid|Marketing|  9100|                   17100|
|  Kathy|    Sales|  1000|                    1000|
| Robert|    Sales|  8100|                   17200|
|   Ross|    Sales|  8100|                   17200|
| Alicia|    Sales|  8600|                   34400|
|   John|    Sales|  8600|                   34400|
|  James|    Sales|  9000|                   43400|
+-------+---------+------+------------------------+



## If we apply ***orderBy()*** on the window and then ***sum()***, it yields a cumulative (running) sum within each group, as shown above. Rows with equal salaries are treated as ties and receive the same running total.

To get same total for each record of a group, we **must not use *orderBy()***

In [29]:
# No orderBy(): aggregates over this spec cover the whole department for every row.
sum_spec = Window.partitionBy("dept")

In [31]:
# Every row of a department carries that department's full salary total.
(
    dept_sal_df
    .withColumn("dept_total_salary", F.sum("salary").over(sum_spec))
    .show()
)

+-------+---------+------+-----------------+
|empname|     dept|salary|dept_total_salary|
+-------+---------+------+-----------------+
|   Lisa|  Finance|  9000|            43300|
|   Deja|  Finance|  9900|            43300|
|  Sugie|  Finance|  8300|            43300|
|    Ram|  Finance|  7900|            43300|
|  Satya|  Finance|  8200|            43300|
|   Kyle|Marketing|  8000|            17100|
|   Reid|Marketing|  9100|            17100|
|  James|    Sales|  9000|            43400|
| Alicia|    Sales|  8600|            43400|
| Robert|    Sales|  8100|            43400|
|   John|    Sales|  8600|            43400|
|   Ross|    Sales|  8100|            43400|
|  Kathy|    Sales|  1000|            43400|
+-------+---------+------+-----------------+



In [38]:
# Apply each aggregate over the unordered per-department window; the output column
# takes the label paired with the function.
dept_aggregates = [
    aggregate("salary").over(sum_spec).alias(label)
    for label, aggregate in (
        ("min", F.min),
        ("max", F.max),
        ("avg", F.avg),
        ("count", F.count),
        ("sum", F.sum),
    )
]

dept_sal_df.drop("empname").select("*", *dept_aggregates).show()

+---------+------+----+----+-----------------+-----+-----+
|     dept|salary| min| max|              avg|count|  sum|
+---------+------+----+----+-----------------+-----+-----+
|  Finance|  9000|7900|9900|           8660.0|    5|43300|
|  Finance|  9900|7900|9900|           8660.0|    5|43300|
|  Finance|  8300|7900|9900|           8660.0|    5|43300|
|  Finance|  7900|7900|9900|           8660.0|    5|43300|
|  Finance|  8200|7900|9900|           8660.0|    5|43300|
|Marketing|  8000|8000|9100|           8550.0|    2|17100|
|Marketing|  9100|8000|9100|           8550.0|    2|17100|
|    Sales|  9000|1000|9000|7233.333333333333|    6|43400|
|    Sales|  8600|1000|9000|7233.333333333333|    6|43400|
|    Sales|  8100|1000|9000|7233.333333333333|    6|43400|
|    Sales|  8600|1000|9000|7233.333333333333|    6|43400|
|    Sales|  8100|1000|9000|7233.333333333333|    6|43400|
|    Sales|  1000|1000|9000|7233.333333333333|    6|43400|
+---------+------+----+----+-----------------+-----+----

### Other window functions(specific to pyspark Window class):-
---------------------------------------------------------------
- ***rangeBetween( start, end )*** 
- ***rowsBetween()***

These methods are applied while building the **WindowSpec** object that is passed as the parameter to the ***over()*** method. They are normally used after ***orderBy()*** has been applied, because the frame boundaries are defined relative to the ordering of the rows within each partition.

In [43]:
dept_sal_df.show()

+-------+---------+------+
|empname|     dept|salary|
+-------+---------+------+
|  James|    Sales|  9000|
| Alicia|    Sales|  8600|
| Robert|    Sales|  8100|
|   John|    Sales|  8600|
|   Ross|    Sales|  8100|
|  Kathy|    Sales|  1000|
|   Lisa|  Finance|  9000|
|   Deja|  Finance|  9900|
|  Sugie|  Finance|  8300|
|    Ram|  Finance|  7900|
|  Satya|  Finance|  8200|
|   Kyle|Marketing|  8000|
|   Reid|Marketing|  9100|
+-------+---------+------+



In [44]:
# Shared base: one window per department, ordered by ascending salary.
dept_by_salary_range = Window.partitionBy("dept").orderBy("salary")

# RANGE frames are bounded by the ordering value (salary).
# 1: whole partition, 2: partition start up to the current row, 3: current row to partition end.
spec_range1 = dept_by_salary_range.rangeBetween(Window.unboundedPreceding, Window.unboundedFollowing)
spec_range2 = dept_by_salary_range.rangeBetween(Window.unboundedPreceding, Window.currentRow)
spec_range3 = dept_by_salary_range.rangeBetween(Window.currentRow, Window.unboundedFollowing)

In [45]:
# Sum the salaries over each of the three range-based frames, one output column per frame.
range_totals = [
    F.sum("salary").over(frame_spec).alias(label)
    for label, frame_spec in (
        ("total_range1", spec_range1),
        ("total_range2", spec_range2),
        ("total_range3", spec_range3),
    )
]

dept_sal_df.select("*", *range_totals).show()

+-------+---------+------+------------+------------+------------+
|empname|     dept|salary|total_range1|total_range2|total_range3|
+-------+---------+------+------------+------------+------------+
|    Ram|  Finance|  7900|       43300|        7900|       43300|
|  Satya|  Finance|  8200|       43300|       16100|       35400|
|  Sugie|  Finance|  8300|       43300|       24400|       27200|
|   Lisa|  Finance|  9000|       43300|       33400|       18900|
|   Deja|  Finance|  9900|       43300|       43300|        9900|
|   Kyle|Marketing|  8000|       17100|        8000|       17100|
|   Reid|Marketing|  9100|       17100|       17100|        9100|
|  Kathy|    Sales|  1000|       43400|        1000|       43400|
| Robert|    Sales|  8100|       43400|       17200|       42400|
|   Ross|    Sales|  8100|       43400|       17200|       42400|
| Alicia|    Sales|  8600|       43400|       34400|       26200|
|   John|    Sales|  8600|       43400|       34400|       26200|
|  James| 

In [46]:
# Shared base: one window per department, ordered by ascending salary.
dept_by_salary_rows = Window.partitionBy("dept").orderBy("salary")

# ROWS frames are bounded by row position within the ordered partition.
# 1: whole partition, 2: partition start up to the current row, 3: current row to partition end.
spec_row1 = dept_by_salary_rows.rowsBetween(Window.unboundedPreceding, Window.unboundedFollowing)
spec_row2 = dept_by_salary_rows.rowsBetween(Window.unboundedPreceding, Window.currentRow)
spec_row3 = dept_by_salary_rows.rowsBetween(Window.currentRow, Window.unboundedFollowing)

In [47]:
# Sum the salaries over each of the three row-based frames, one output column per frame.
row_totals = [
    F.sum("salary").over(frame_spec).alias(label)
    for label, frame_spec in (
        ("total_row1", spec_row1),
        ("total_row2", spec_row2),
        ("total_row3", spec_row3),
    )
]

dept_sal_df.select("*", *row_totals).show()

+-------+---------+------+----------+----------+----------+
|empname|     dept|salary|total_row1|total_row2|total_row3|
+-------+---------+------+----------+----------+----------+
|    Ram|  Finance|  7900|     43300|      7900|     43300|
|  Satya|  Finance|  8200|     43300|     16100|     35400|
|  Sugie|  Finance|  8300|     43300|     24400|     27200|
|   Lisa|  Finance|  9000|     43300|     33400|     18900|
|   Deja|  Finance|  9900|     43300|     43300|      9900|
|   Kyle|Marketing|  8000|     17100|      8000|     17100|
|   Reid|Marketing|  9100|     17100|     17100|      9100|
|  Kathy|    Sales|  1000|     43400|      1000|     43400|
| Robert|    Sales|  8100|     43400|      9100|     42400|
|   Ross|    Sales|  8100|     43400|     17200|     34300|
| Alicia|    Sales|  8600|     43400|     25800|     26200|
|   John|    Sales|  8600|     43400|     34400|     17600|
|  James|    Sales|  9000|     43400|     43400|      9000|
+-------+---------+------+----------+---

# 2. Sampling:-
----------------
- ***sample(withReplacement=None, fraction=None, seed=None)*** - It is a transformation, i.e, it returns a Dataframe, and we need to apply ***show()*** to view the results.

For RDDs, there is another sampling function called ***takeSample()***, which is an action; it does not exist for DataFrames (calling it raises an `AttributeError`).

In [48]:
emp_df.show()

+-------+---------+-----+------+---+
|empname|     dept|state|salary|age|
+-------+---------+-----+------+---+
|  James|    Sales|   NY|  9000| 34|
| Alicia|    Sales|   NY|  8600| 56|
| Robert|    Sales|   CA|  8100| 30|
|   John|    Sales|   AZ|  8600| 31|
|   Ross|    Sales|   AZ|  8100| 33|
|  Kathy|    Sales|   AZ|  1000| 39|
|   Lisa|  Finance|   CA|  9000| 24|
|   Deja|  Finance|   CA|  9900| 40|
|  Sugie|  Finance|   NY|  8300| 36|
|    Ram|  Finance|   NY|  7900| 53|
|  Satya|  Finance|   AZ|  8200| 53|
|   Kyle|Marketing|   CA|  8000| 25|
|   Reid|Marketing|   NY|  9100| 50|
+-------+---------+-----+------+---+



In [51]:
# Sampling is random, so pass an explicit seed: re-running the cell then returns
# the same rows. With replacement, a source row can be drawn more than once.
# `fraction` is the expected share of rows, not an exact row count.
emp_df.sample(withReplacement=True, fraction=0.5, seed=42).show()

+-------+---------+-----+------+---+
|empname|     dept|state|salary|age|
+-------+---------+-----+------+---+
|  James|    Sales|   NY|  9000| 34|
|  James|    Sales|   NY|  9000| 34|
| Alicia|    Sales|   NY|  8600| 56|
| Alicia|    Sales|   NY|  8600| 56|
|  Kathy|    Sales|   AZ|  1000| 39|
|  Kathy|    Sales|   AZ|  1000| 39|
|   Kyle|Marketing|   CA|  8000| 25|
+-------+---------+-----+------+---+



#### When the 1st parameter (***withReplacement***) is True, duplicate rows may occur in the result set, even if the row is present only once in the main dataframe.

In [53]:
# Sampling is random, so pass an explicit seed: re-running the cell then returns
# the same rows. Without replacement, each source row appears at most once.
# `fraction` is the expected share of rows, not an exact row count.
emp_df.sample(withReplacement=False, fraction=0.5, seed=42).show()

+-------+---------+-----+------+---+
|empname|     dept|state|salary|age|
+-------+---------+-----+------+---+
|  James|    Sales|   NY|  9000| 34|
| Robert|    Sales|   CA|  8100| 30|
|   Ross|    Sales|   AZ|  8100| 33|
|  Kathy|    Sales|   AZ|  1000| 39|
|   Deja|  Finance|   CA|  9900| 40|
|  Sugie|  Finance|   NY|  8300| 36|
|  Satya|  Finance|   AZ|  8200| 53|
|   Kyle|Marketing|   CA|  8000| 25|
|   Reid|Marketing|   NY|  9100| 50|
+-------+---------+-----+------+---+



AttributeError: 'DataFrame' object has no attribute 'takeSample'