In [None]:
#  Last amended: 11th Nov, 2021
#  Myfolder: /home/ashok/Documents/spark
# Ref:
# Tutorials (slightly dated):
#      https://changhsinlee.com/pyspark-dataframe-basics/
#      https://www.analyticsvidhya.com/blog/2016/10/spark-dataframe-and-operations/
# Cheat Sheet
#      https://s3.amazonaws.com/assets.datacamp.com/blog_assets/PySpark_SQL_Cheat_Sheet_Python.pdf

#  Objectives:
#           Dataframe operations in spark cluster

### pyspark APIs<br>
> i)  [DataFrame APIs](https://spark.apache.org/docs/latest/api/python/reference/pyspark.sql.html#dataframe-apis)<br>
>> df.select(columnName).where(colObject > 30).orderBy(desc(columnName))<br>
>> df.select(columnName).where("colName > 30").orderBy(desc(columnName))<br>

> ii) [Column APIs](https://spark.apache.org/docs/latest/api/python/reference/pyspark.sql.html#column-apis)<br>
>> df.select(ColumnObject.API)<br>
>> df.select(df.age.isNull())<br>
>> df.select(df["age"].isNull())<br>
>> df.select(col("age").isNull())<br>

> iii)[Data Tyoes](https://spark.apache.org/docs/latest/api/python/reference/pyspark.sql.html#data-types)<br>
> iv) [Functions](https://spark.apache.org/docs/latest/api/python/reference/pyspark.sql.html#functions)<br>
>> df.select(sum("age"))<br>
>> df.select(sum(col(booleanColumn).cast("int")))<br>
>> <u>but you must import the functions</u>

> v)  [Grouping](https://spark.apache.org/docs/latest/api/python/reference/pyspark.sql.html#grouping)<br>
>> Using inbuilt functions
>>> df.groupby.agg({'age' : 'mean' , 'ht' : 'sum'})<br>  

>> Using pyspark.sql.functions<br>
>>> df.groupby.agg(sum(df.age), min(col("purchase")))

A. Initial operations:
1.0 Start hadoop in a terminal:

            ./allstart.sh

## Transfer files to hadoop

In [None]:
# 1.1 Transfer two data files, airports.csv and  weather.csv,
#         from local file system to hadoop. 
#         COPY AND PASTE in terminal all the four commands below.


! cd ~
! hdfs dfs -mkdir -p  hdfs://localhost:9000/user/ashok/data_files/nycflights
! hdfs dfs -put /cdata/nycflights13/airports.csv.gz hdfs://localhost:9000/user/ashok/data_files/nycflights
! hdfs dfs -put /cdata/nycflights13/weather.csv.gz hdfs://localhost:9000/user/ashok/data_files/nycflights
! hdfs dfs -ls -h hdfs://localhost:9000/user/ashok/data_files/nycflights



In [None]:
# 1.2 Start pyspark in terminal. Its starting creates a context 'sc' and session 'spark'
#     Jupyter notebook will also start then. pyspark is using juptyter notebook as interface
#     COPY AND PASTE FOLLOWING TWO COMMANDS
"""

cd /home/ashok/Documents/spark/1.basics/
pysparknb

"""

## Set jupyter notebook options
Start pyspark with jupyter notebook interface. There is no need to create SparkContext and Spark session. pyspark creates them when starting.

In [None]:
# 1.3 Display multiple outputs from a cell
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [None]:
# 1.4 Increase cell width to display wide columnar output
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

### SparkSession vs SparkContext
pyspark creates both SparkContext and SparkSession. SparkContext is now historical. Spark session is a unified entry point of a spark application from Spark 2.0. It provides a way to interact with various spark’s functionality with a lesser number of constructs. Instead of having a spark context, hive context, SQL context, now all of it is encapsulated in a Spark session.
<a href=https://medium.com/@achilleus/spark-session-10d0d66d1d24>See this link.</a><br>
SparkContext, SQLContext and HiveContext are available still. Only that these are accessible through SparkSession automatically without there being need to explicitly create them. <br>
A SparkConext is used by Driver program:<br>
    - The driver program use the SparkContext to connect and communicate with the cluster and it helps in executing and coordinating the Spark job with the resource managers like YARN or Mesos.
    Using SparkContext you can actually get access to other contexts like  SQLContext and HiveContext.
    Using SparkContext we can set configuration parameters to the Spark job.

In [None]:
# 1.5   To check if spark context is ,sc, and spark session is, spark: 
#       try the following two commands. Also right-click on Spark_UI hyperlink
#       to open another site. Spark UI is available at localhost:4040

sc
spark
spark.sparkContext    # Get sparkContext

### More memory to Spark

In [None]:
# To allocate more memory to spark, start 
# this notebook as (and not as pysparknb):
# # See :  https://stackoverflow.com/a/62737941
# 
#     pyspark --driver-memory 3g 

In [1]:
import pyspark

In [9]:
# sc = pyspark.SparkContext()
spark = pyspark.sql.SparkSession(spark)

### Spark DataFrame creation

In [10]:
# 1.6 Using 'spark' session, create a range of numbers 

myRange = spark.range(10000)

type(myRange)                 # pyspark.sql.dataframe.DataFrame

myRange.show(3)

+---+
| id|
+---+
|  0|
|  1|
|  2|
+---+
only showing top 3 rows



In [20]:
d = [{'name': 'Alice', 'age': 1}]
l=spark.createDataFrame(d)


In [11]:
# 1.7
help(spark.createDataFrame)

Help on method createDataFrame in module pyspark.sql.session:

createDataFrame(data: Union[pyspark.rdd.RDD[Any], Iterable[Any], ForwardRef('PandasDataFrameLike')], schema: Union[pyspark.sql.types.AtomicType, pyspark.sql.types.StructType, str, NoneType] = None, samplingRatio: Union[float, NoneType] = None, verifySchema: bool = True) -> pyspark.sql.dataframe.DataFrame method of pyspark.sql.session.SparkSession instance
    Creates a :class:`DataFrame` from an :class:`RDD`, a list or a :class:`pandas.DataFrame`.
    
    When ``schema`` is a list of column names, the type of each column
    will be inferred from ``data``.
    
    When ``schema`` is ``None``, it will try to infer the schema (column names and types)
    from ``data``, which should be an RDD of either :class:`Row`,
    :class:`namedtuple`, or :class:`dict`.
    
    When ``schema`` is :class:`pyspark.sql.types.DataType` or a datatype string, it must match
    the real data, or an exception will be thrown at runtime. If the 

## Dataframe creation

In [None]:
# 1. Create dataframe from a list of tuples/records
# 2. In addition, specify simple schema: column names
# 3. From a list of dictionaries
# 4. From pandas dataframe
# 5. From a csv file on hadoop
# Sample schema writing
#       schema = ("fname string, lname string, phone string, age double, travel date"),

### From simple records

In [23]:
# 2.0 Use createDataFrame() . This is preferred method.

#  2.1 From a list of tuples:
#      No schema. Column names are given on its own
#      Schema is inferred
df = spark.createDataFrame(
                            [
                                (11,12,31),
                                (4,5,6)
                            ]
                          )

# 2.1.1
# df.show()
df.dtypes


[('_1', 'bigint'), ('_2', 'bigint'), ('_3', 'bigint')]

In [None]:
# 2.2   Specify schema as just column names
#       the type of each column is inferred 
#       from data.

df = spark.createDataFrame(
                           [
                              (1,2,3.2),
                              (4,5,6.1)
                           ], 
                           schema = ["age","income","ht"] #   ("age int,income string, ht double"),
                           )

df.show()
df.dtypes

### From list of dict

In [None]:
# 2.3 From a list of dictionaries:

# 2.3.1   Just one row
#         Datatypes are array of 'bigint' and array of 'strings'
df = spark.createDataFrame(
                             [ 
                                 {
                                    'name':['Alice', 'kishan'],
                                    'age' : [22,44]
                                 }
                             ]
                          )

# 2.3.2
df.show()
df.dtypes

# 2.3.3  Two rows with list elements
#        Datatypes are array of 'bigint' and array of 'strings'
df = spark.createDataFrame(
                            [
                                {'name':['Alice'],
                                 'age' : [22]
                                },
                                
                                {'name':['kishan'],
                                 'age' : [44]
                                }
                            ]
                          )
# 2.3.4
df.show()
df.dtypes

# 2.3.5

df = spark.createDataFrame(
                            [
                               {
                                   'name':'Alice',
                                   'age' : 22
                               },
                               {
                                   'name':'kishan',
                                   'age' : 44
                               }
                             ]
                            )
# 2.3.6
df.show()
df.dtypes


### Using Row class

In [None]:
# 2.4 Using 'Row' object of dataset

from pyspark.sql import Row

row1 = Row(name='Alice',   age=5,  height=80) 
row2 = Row(name='Kailash', age=51, height=85) 
row3 = Row(name='Gamn',    age=10, height=None)
row4 = Row(name= 'Alice',   age=5, height=80)

# 2.7.1 Create dataframe now and show it
df = spark.createDataFrame([row1,row2,row3,row4])
df.show()

In [None]:
# 2.5 This mixture of DataTypes fails

"""
spark.createDataFrame(
                       [
                           {
                              'a' : 34.5,
                              'b' : 'elephant'
                           },
                           {
                              'a' : [37.8],
                              'b' : 'deer'
                           }
                      ]
                   ).show()
"""                   

### How to specify full schema?
Here is how you can specify full schema for a DataFrame.

In [None]:
# 2.6 How to specify full schema?
#     Ref: https://spark.apache.org/docs/2.4.4/api/python/pyspark.sql.html#pyspark.sql.types.StructType
#     Full schema includes:
#         i)  Name of columns
#         ii) Its datatype (that must be explicitly imported)
#        iii) Whether it can be null (True)
#             or not (False)
#     Schema is defined within StructType
#     Each StructType is a list of StructField
#     Each StructField is a tuple with above
#     three pieces of information
#
#     Sample. Do not execute
"""
from pyspark.sql.types import StructType, StructField
df = spark.createDataFrame(
                            [
                              (1,2,3)
                            ],
                            schema = StructType(
                                                 [
                                                  (StructField1),
                                                  (StructField2),
                                                  (StructField3),
                                                 ]
                                         
                                               )
                          )
df.show()
"""

In [None]:
# 2.7 Full example
#     Import necessary datatype classes
#     Ref: https://spark.apache.org/docs/2.4.4/api/python/pyspark.sql.html#pyspark.sql.types.StructType

from pyspark.sql.types import StructType, StructField, IntegerType, StringType, BooleanType

# 2.6.1 Create an instance of class IntegerType

inttype = IntegerType()

df = spark.createDataFrame(
                            [
                              (1,2,True)
                            ],
                            schema = StructType(
                                                 [
                                                  StructField('a',inttype,True),
                                                  StructField('b',StringType() , True),
                                                  StructField('c',BooleanType(),True),
                                                 ]
                                         
                                               )
                          )
# 2.6.2
df.show()

### Pandas DataFrame to Spark DataFrame

In [24]:
# 3.0 Using Pandas

import pandas as pd
import numpy as np
pdf = pd.DataFrame({'age'     : np.random.randint(20,50,10000),    
                    'married' : np.random.randint(0,2,10000)
                    }
                  )

pdf.head(3)

Unnamed: 0,age,married
0,32,0
1,27,1
2,45,0


In [26]:
pdf.iloc[0,0] = np.nan
pdf.iloc[1,1] = np.nan
pdf.head(3)


Unnamed: 0,age,married
0,,0.0
1,27.0,
2,45.0,0.0


In [27]:
#  3.1   Transform it toa spark dataframe
#        Note that unlike pandas, there is no concept
#        of index here.

big = spark.createDataFrame(pdf)
big.show(3)
big.count()   # How many rows
big.dtypes

Py4JJavaError: An error occurred while calling o192.showString.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 5.0 failed 1 times, most recent failure: Lost task 0.0 in stage 5.0 (TID 8) (DESKTOP-BG29R42 executor driver): org.apache.spark.SparkException: Python worker failed to connect back.
	at org.apache.spark.api.python.PythonWorkerFactory.createSimpleWorker(PythonWorkerFactory.scala:189)
	at org.apache.spark.api.python.PythonWorkerFactory.create(PythonWorkerFactory.scala:109)
	at org.apache.spark.SparkEnv.createPythonWorker(SparkEnv.scala:124)
	at org.apache.spark.api.python.BasePythonRunner.compute(PythonRunner.scala:164)
	at org.apache.spark.api.python.PythonRDD.compute(PythonRDD.scala:65)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:365)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:329)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:365)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:329)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:365)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:329)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:365)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:329)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:365)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:329)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:365)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:329)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:90)
	at org.apache.spark.scheduler.Task.run(Task.scala:136)
	at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$3(Executor.scala:548)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1504)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:551)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(Unknown Source)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(Unknown Source)
	at java.lang.Thread.run(Unknown Source)
Caused by: java.net.SocketTimeoutException: Accept timed out
	at java.net.DualStackPlainSocketImpl.waitForNewConnection(Native Method)
	at java.net.DualStackPlainSocketImpl.socketAccept(Unknown Source)
	at java.net.AbstractPlainSocketImpl.accept(Unknown Source)
	at java.net.PlainSocketImpl.accept(Unknown Source)
	at java.net.ServerSocket.implAccept(Unknown Source)
	at java.net.ServerSocket.accept(Unknown Source)
	at org.apache.spark.api.python.PythonWorkerFactory.createSimpleWorker(PythonWorkerFactory.scala:176)
	... 29 more

Driver stacktrace:
	at org.apache.spark.scheduler.DAGScheduler.failJobAndIndependentStages(DAGScheduler.scala:2672)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2(DAGScheduler.scala:2608)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2$adapted(DAGScheduler.scala:2607)
	at scala.collection.mutable.ResizableArray.foreach(ResizableArray.scala:62)
	at scala.collection.mutable.ResizableArray.foreach$(ResizableArray.scala:55)
	at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:2607)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1(DAGScheduler.scala:1182)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1$adapted(DAGScheduler.scala:1182)
	at scala.Option.foreach(Option.scala:407)
	at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:1182)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:2860)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2802)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2791)
	at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:952)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2228)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2249)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2268)
	at org.apache.spark.sql.execution.SparkPlan.executeTake(SparkPlan.scala:506)
	at org.apache.spark.sql.execution.SparkPlan.executeTake(SparkPlan.scala:459)
	at org.apache.spark.sql.execution.CollectLimitExec.executeCollect(limit.scala:48)
	at org.apache.spark.sql.Dataset.collectFromPlan(Dataset.scala:3868)
	at org.apache.spark.sql.Dataset.$anonfun$head$1(Dataset.scala:2863)
	at org.apache.spark.sql.Dataset.$anonfun$withAction$2(Dataset.scala:3858)
	at org.apache.spark.sql.execution.QueryExecution$.withInternalError(QueryExecution.scala:510)
	at org.apache.spark.sql.Dataset.$anonfun$withAction$1(Dataset.scala:3856)
	at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$6(SQLExecution.scala:109)
	at org.apache.spark.sql.execution.SQLExecution$.withSQLConfPropagated(SQLExecution.scala:169)
	at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$1(SQLExecution.scala:95)
	at org.apache.spark.sql.SparkSession.withActive(SparkSession.scala:779)
	at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:64)
	at org.apache.spark.sql.Dataset.withAction(Dataset.scala:3856)
	at org.apache.spark.sql.Dataset.head(Dataset.scala:2863)
	at org.apache.spark.sql.Dataset.take(Dataset.scala:3084)
	at org.apache.spark.sql.Dataset.getRows(Dataset.scala:288)
	at org.apache.spark.sql.Dataset.showString(Dataset.scala:327)
	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at sun.reflect.NativeMethodAccessorImpl.invoke(Unknown Source)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(Unknown Source)
	at java.lang.reflect.Method.invoke(Unknown Source)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.ClientServerConnection.waitForCommands(ClientServerConnection.java:182)
	at py4j.ClientServerConnection.run(ClientServerConnection.java:106)
	at java.lang.Thread.run(Unknown Source)
Caused by: org.apache.spark.SparkException: Python worker failed to connect back.
	at org.apache.spark.api.python.PythonWorkerFactory.createSimpleWorker(PythonWorkerFactory.scala:189)
	at org.apache.spark.api.python.PythonWorkerFactory.create(PythonWorkerFactory.scala:109)
	at org.apache.spark.SparkEnv.createPythonWorker(SparkEnv.scala:124)
	at org.apache.spark.api.python.BasePythonRunner.compute(PythonRunner.scala:164)
	at org.apache.spark.api.python.PythonRDD.compute(PythonRDD.scala:65)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:365)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:329)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:365)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:329)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:365)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:329)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:365)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:329)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:365)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:329)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:365)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:329)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:90)
	at org.apache.spark.scheduler.Task.run(Task.scala:136)
	at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$3(Executor.scala:548)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1504)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:551)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(Unknown Source)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(Unknown Source)
	... 1 more
Caused by: java.net.SocketTimeoutException: Accept timed out
	at java.net.DualStackPlainSocketImpl.waitForNewConnection(Native Method)
	at java.net.DualStackPlainSocketImpl.socketAccept(Unknown Source)
	at java.net.AbstractPlainSocketImpl.accept(Unknown Source)
	at java.net.PlainSocketImpl.accept(Unknown Source)
	at java.net.ServerSocket.implAccept(Unknown Source)
	at java.net.ServerSocket.accept(Unknown Source)
	at org.apache.spark.api.python.PythonWorkerFactory.createSimpleWorker(PythonWorkerFactory.scala:176)
	... 29 more


In [None]:
# 3.2 Carry out a few DataFrame operations
big.dropna().show(3)

# 3.2.1
big.dropDuplicates().count()


In [31]:
# 3.3
help(big.sampleBy)

Help on method sampleBy in module pyspark.sql.dataframe:

sampleBy(col: 'ColumnOrName', fractions: Dict[Any, float], seed: Union[int, NoneType] = None) -> 'DataFrame' method of pyspark.sql.dataframe.DataFrame instance
    Returns a stratified sample without replacement based on the
    fraction given on each stratum.
    
    .. versionadded:: 1.5.0
    
    Parameters
    ----------
    col : :class:`Column` or str
        column that defines strata
    
        .. versionchanged:: 3.0
           Added sampling by a column of :class:`Column`
    fractions : dict
        sampling fraction for each stratum. If a stratum is not
        specified, we treat its fraction as zero.
    seed : int, optional
        random seed
    
    Returns
    -------
    a new :class:`DataFrame` that represents the stratified sample
    
    Examples
    --------
    >>> from pyspark.sql.functions import col
    >>> dataset = sqlContext.range(0, 100).select((col("id") % 3).alias("key"))
    >>> sampled = 

## Stratified sampling

In [32]:
# 4.0   Take a stratified sample from DataFrame:
#              80% of 0s and 10% of 1s
#              We have to decide how stratification will be done.
# big.show(4)
sample = big.sampleBy(
                      "married",      # column that defines strata
                      fractions = {0 : 0.8, 1 : 0.1 })   # sampling fraction for each stratum
type(sample)

# 4.1
# big.count()
# sample.count()

# 4.2
sample.groupby('married')   # Same as value_counts() in pandas
big.groupby('married')

<pyspark.sql.group.GroupedData at 0x1f9bd9ffeb0>

In [33]:
# 5.0
help(spark.read.csv)

Help on method csv in module pyspark.sql.readwriter:

csv(path: Union[str, List[str]], schema: Union[pyspark.sql.types.StructType, str, NoneType] = None, sep: Union[str, NoneType] = None, encoding: Union[str, NoneType] = None, quote: Union[str, NoneType] = None, escape: Union[str, NoneType] = None, comment: Union[str, NoneType] = None, header: Union[bool, str, NoneType] = None, inferSchema: Union[bool, str, NoneType] = None, ignoreLeadingWhiteSpace: Union[bool, str, NoneType] = None, ignoreTrailingWhiteSpace: Union[bool, str, NoneType] = None, nullValue: Union[str, NoneType] = None, nanValue: Union[str, NoneType] = None, positiveInf: Union[str, NoneType] = None, negativeInf: Union[str, NoneType] = None, dateFormat: Union[str, NoneType] = None, timestampFormat: Union[str, NoneType] = None, maxColumns: Union[str, int, NoneType] = None, maxCharsPerColumn: Union[str, int, NoneType] = None, maxMalformedLogPerPartition: Union[str, int, NoneType] = None, mode: Union[str, NoneType] = None, col

### From a csv file

In [None]:
###### B. Read Dataset from hadoop
# 5.1 Read 'airports.csv file into spark from hadoop

# Where is my file? Path:
URL_of_file= "hdfs://localhost:9000/user/ashok/data_files/nycflights/"

# 5.2 Takes time. We use 'spark' session object to read file:
airports_df = spark.read.csv(
                             path = URL_of_file + "airports.csv.gz",
                             inferSchema = True,      # Default: False
                             header = True,           # Default: False
                             sep = ",",               # Default: ","
                             ignoreLeadingWhiteSpace = True,  # Default: False
                             ignoreTrailingWhiteSpace = True  # Default: False
    
                            )

In [None]:
# 5.3 Show top-5 rows
# 5.3.1
airports_df.head(3)         #    List of three 'Row' instances. 
airports_df.take(3)         #    Result same as above but prefer take() to head()

In [None]:
# 5.3.2 USe collect()
#       collect() takes no number and returns all rows
airports_df.collect()       

In [None]:
# 5.3.3
type(airports_df.head(4))               # List
# 5.3.4
type(airports_df.head(4)[0])            # Row


In [None]:
# 5.4 Better display in columnar format

airports_df.show(5)                     # Tabular format
airports_df.show(5, False)              # Tabular format with extended columns


In [None]:
# 6.0 Have a look at the dataframe schema,
#     i.e. the structure of the DataFrame

airports_df.printSchema()


In [None]:
# 6.1 Column names
airports_df.columns

In [None]:
# 6.2 How many rows:
airports_df.count()          # 1397

# 6.3 How many columns?
len(airports_df.columns)     # 7


In [None]:
# 6.4 Full data summary
airports_df.describe().show()
# 6.4.1 Describe a particular column
airports_df.describe('dst').show(5)

## Using verbs
select, <i>select(x).where()</i>, <i>select().distinct()</i>, filter, groupby

In [34]:
###### C. Using Verbs
#         Select, filter, groupby, distinct, count()
#         It is more like 'dplyr' syntax, only that instead
#         of %>%, we have dot (.) .


# 7  Selecting Single/Multiple Columns
#    This does not work:   airports_df['faa'].show(5)
#    Following also does not work. Limitations
#    come because data is divided into chunks in various
#    servers. You have to use select clause.

airports_df[:2, :].show() 

NameError: name 'airports_df' is not defined

### select syntax
> DataFrame.select(\*cols)<br>
> cols: column names (in string format) seperated by comma or expressions (Column).<br>
> If a list of cols is input preceded ‘*’, that list is unlisted 


In [35]:
# 7.1
airports_df.select('faa').show(3)

# 7.2 Both the following work:
airports_df.select('faa','dst').show(3)
airports_df.select(*['faa','dst']).show(3)        # unlist the list
airports_df.select(['faa','dst']).show(3)         # List of columns

NameError: name 'airports_df' is not defined

In [36]:
big.select("married")

DataFrame[married: double]

In [None]:
# 7.3 Spark SQL doesn't support and it is unlikely to ever support
#     row indexing so it is not possible to index across row dimension.
#     Column indexing is possible, as below:

airports_df.select(airports_df.columns[0:2]).show(3,False)

In [None]:
# 7.4 Selecting Distinct values in Multiple Columns

airports_df.select('dst','tz').show()                 # Many repeating records
airports_df.select('dst','tz').distinct().show()      # Unique records

airports_df.select('dst','tz').count()                # 1397
airports_df.select('dst','tz').distinct().count()     # 22


### filter syntax
>DataFrame.filter(condition)<br>
>condition: <i>columnObject > 34</i> or string format: <i>"age > 34"</i>
>>  df.age > 3 or col("age") > 3<br>
>>  "age > 3" <br>
>>Logical Operators<br>
>>> If string: AND OR NOT<br>
>>> If columnObject: &, |, ~ <br>

In [None]:
# 7.5 Like operator in where clause.
#        'where'  is an alias for 'filter' (see below)

airports_df.select('name'). \
            where("name like '%La%'").\
            show(3,False)

In [None]:
# 8. Filtering data
# http://spark.apache.org/docs/2.0.0/api/python/pyspark.sql.html#pyspark.sql.DataFrame.filter
#  We use the filter command to filter our DataFrame based
#  on the condition that tz must be equal to -5 and then
#  we are calculating how many records/rows are there in
#  the filtered output.
#  Syntax: filter(condition) 

# 8.1  df.tz
print("using filter")
airports_df.filter(airports_df.tz == -5) .show(3,False)

# 8.2 select as filter
print("select as filter")
airports_df.select(
                    airports_df.tz == -5
                  ).show(3)

In [None]:
# 8.3 use of alias
airports_df.select(
                    (airports_df.tz == -5).alias('tz')
                  ).show(3)


# 8.4 df["tz"]
airports_df.filter(
                    airports_df["tz"] == -5
                  ) .show(3)

In [None]:
# 8.5 String has condition
airports_df.filter("tz == -5") .show(3)    # String is condition

# 8.6 Use of col() function
from pyspark.sql.functions import col
airports_df.filter(col("tz") == -5) .show(3)



In [None]:
## Use of Column.API 
######################

# 8.7 Use of isin() function. 
#     It is difficult to use isin() within
#     a string-condition because of list-object
#     Syntax: Column.isin(*cols)

airports_df.select("name"). \
            where(airports_df.name.isin(['Lansdowne Airport', 'Randall Airport']))

# 8.8 Use of %like%
#     Syntax: Column.like(other)
#             other: SQL like expression         
airports_df.select(airports_df.columns[:2]). \
            where("name like '%La%'"). \
            show(3)

# 8.9 Note like() function
airports_df.select(airports_df.columns[:2]). \
            where(airports_df.name.like('%La%')). \
            show(3)

### rlike
> syntax: Column.rlike(other)<br>
> other : an extended regex expression<br>
> Example: rlike('ice$')

In [None]:
# 9.0 Filtering with regular expressions

# 9.1 Filter with regular expressions:
airports_df.select('name'). \
            where(" name rlike  'pal$' " ). \
            show(3,truncate = False) 

# 9.2  where is an alias for filter  
airports_df.select('name'). \
            filter(" name rlike  'pal$' " ). \
            show(3,truncate = False) 

# 9.3    
airports_df.filter(" name rlike  'pal$' " ). \
            show(3,truncate = False) 


In [None]:
# 10. Combining verbs: select, filter and distinct

airports_df.select('dst', 'tz'). \
            filter(airports_df.tz == -5). \
            show(3)

# 10.1
airports_df.select('dst', 'tz'). \
            filter(airports_df.tz == -5). \
            distinct(). \
            show(3)

### Multiple conditions
<u>Logical operators</u>:<br>
> Between ColObjects: &, |, ==, ~<br>
> Within Strings: AND, OR, NOT, ==

In [None]:
# 11  Filtering logical operatotrs: &, '|', ==
# 11.1 We can filter our data based on multiple conditions (AND or OR)
#     Logical Operators: & ==and,   | == or   ~ == not

airports_df.filter(                         \
                   (airports_df.tz == -5) &  \
                   (airports_df.dst=="A") | \
                   (airports_df.name.like('%Lans%')) \
                  ). show(3)   


In [None]:
# 11.2 Conditions within strings:

airports_df.filter(                         \
                   "(tz == -5) AND  \
                    (dst== 'A') OR \
                    (name  like '%Lans%' )" \
                  ). show(3)

## Aggregation with groupby
Use: <i>.agg({'colName1' : 'mean', 'colName2' : 'sum'})</i> <br>
>1. With <i>agg()</i> if you are using dictionary one can then use only builtin functions and not any other <i>pyspark.sql.function</i>.<br>
Some common functions are: <i>mean, avg, sum, count, first, last,stddev </i>. There is no need to import builtin function in advance.<br>
For a complete list of builtin functions see [here](https://sparkbyexamples.com/pyspark/pyspark-aggregate-functions/).<br>
2. You can also use a pyspark.sql.function, as: .agg(sum(df.age), min(col("purchase")) )<br> 


In [38]:
help(big.groupBy)

Help on method groupBy in module pyspark.sql.dataframe:

groupBy(*cols: 'ColumnOrName') -> 'GroupedData' method of pyspark.sql.dataframe.DataFrame instance
    Groups the :class:`DataFrame` using the specified columns,
    so we can run aggregation on them. See :class:`GroupedData`
    for all the available aggregate functions.
    
    :func:`groupby` is an alias for :func:`groupBy`.
    
    .. versionadded:: 1.3.0
    
    Parameters
    ----------
    cols : list, str or :class:`Column`
        columns to group by.
        Each element should be a column name (string) or an expression (:class:`Column`).
    
    Examples
    --------
    >>> df.groupBy().avg().collect()
    [Row(avg(age)=3.5)]
    >>> sorted(df.groupBy('name').agg({'age': 'mean'}).collect())
    [Row(name='Alice', avg(age)=2.0), Row(name='Bob', avg(age)=5.0)]
    >>> sorted(df.groupBy(df.name).avg().collect())
    [Row(name='Alice', avg(age)=2.0), Row(name='Bob', avg(age)=5.0)]
    >>> sorted(df.groupBy(['name', df

In [None]:
# 12. groupby. Can apply sum, min, max, count

airports_df.groupby('tz'). \
           count(). \
           show(3)

# 12.1
airports_df.groupby('tz'). \
            agg({'lat' : 'mean'}). \
            show(3)

# 12.2
airports_df.groupby(['tz','dst']). \
            agg({'lat' : 'mean'}). \
            show(3)


In [None]:
"""
Unpacking operator in python (*) : 
Ref: https://codeyarns.com/2012/04/26/unpack-operator-in-python/ 

print(*[3,4])
print([3,4])

"""

In [None]:
# 7.1 One can take the average of columns by passing
#       an unpacked list of column names.

grObject = airports_df.groupby('tz')

avg_cols = ['lat', 'lon']
grObject.avg(*avg_cols).show(3)

In [None]:
# 7.2 To call multiple aggregation functions at once, pass a dictionary.
#         The 'key' of dictionary becomes argument to 'value'.
#                             count(*)        avg(lat)      sum(lon)

grObject.agg({'*': 'count', 'lat': 'avg', 'lon':'sum'}).show(2)

## Column manipulation
> DataFrame.withColumn(newColName, colExpression)<br>
> <u>colExpression</u>:  df.age +2 ; df.age * df.ht

In [None]:
# 8. Create new columns in Spark using .withColumn() --mutate
#      New column: altInThousands . 
#      Product of two columns:  'alt' and  'lon' 

airports_df.withColumn('altInThousands',                         # New column
                       airports_df.alt*airports_df.lon           # Values
                      ).show(3)


In [None]:
# 9. Save the new file with additional column in parquet form

xyz = airports_df.withColumn('altInThousands', airports_df.alt*airports_df.lon)
xyz.write.parquet("hdfs://localhost:9000/user/ashok/data_files/airports_extra.parquet")

In [None]:
# 9.1 Delete xyz from spark
import gc
del xyz
gc.collect()    # Delete all cache also

In [None]:
# 9.2 Read the stored parquet file
df = spark.read.parquet("hdfs://localhost:9000/user/ashok/data_files/airports_extra.parquet")
df.show(3)

## Joining tables

In [None]:
# 9.3 Read 'weather.csv file into spark from hadoop

URL_of_file= "hdfs://localhost:9000/user/ashok/data_files/nycflights/"
weather_df = spark.read.csv(path = URL_of_file + "weather.csv.gz",
                            inferSchema = True,
                            header = True
                           )
weather_df.show(3)

In [None]:
# 10. Joins
# Refer: http://www.learnbymarketing.com/1100/pyspark-joins-by-example/
# For example, I can join the two titanic dataframes by the column PassengerId

# 10.1
airports_df.join(                                       # Left dataset
                  weather_df,                           # Right dataset
                  airports_df.faa==weather_df.origin    # Join on
                ).show(3)

# 10.2
airports_df.join(
                  weather_df,
                  airports_df.faa==weather_df.origin,
                  how = 'inner'
                ).count()

# 10.3
airports_df.join(
                   weather_df,
                   airports_df.faa==weather_df.origin,
                   how = 'left'
                ).count()   # Could also use 'left_outer', 'right', 'full'

# 10.4
weather_df.join(
                   airports_df,
                   airports_df.faa==weather_df.origin,
                   how = 'left'
                ).count()   # Could also use 'left_outer', 'right', 'full'


## SQL queries against DataFrame

In [None]:
# 11. Many of the operations can be accessed by writing SQL queries in spark.sql().
# To make an existing Spark dataframe usable for spark.sql(), one needs to
#   register said dataframe as a temporary table.

# 11.1 As an example, we can register the two dataframes as temp tables then
#      join them through spark.sql().

airports_df.createOrReplaceTempView('dfa_temp')   # airports_df is registered as dfs_temp table
weather_df.createOrReplaceTempView('dfw_temp')    # weather_df is registered as dfw_temp table

In [None]:
# 11.2 Simple SQL query. SQLContext is no longer needed. 'spark'
#            session object can be used.

dfj = spark.sql('select * from dfa_temp' )
dfj.show(3)

In [None]:
# 11.3 Now the SQL join

dfj = spark.sql('select * from dfa_temp a, dfw_temp b where a.faa = b.origin' )
dfj.show(3)


## Misc

In [None]:
# 12. Drop a columns

airports_df.drop('name').show(3)

# 12.1  Or drop multiple columns

columns_to_drop = ['name', 'lat']
xx =airports_df.drop(*columns_to_drop)
xx.show(3)

In [None]:
########### I am done ####################