In [None]:
# pip install pyspark
# pip install findspark 
# pip show pyspark

In [None]:
# Optional bootstrap, left disabled:
# import findspark
# findspark.init()
import pyspark
from pyspark.sql import SparkSession

# Single session shared by the rest of the notebook: master "local[*]", app name "lab".
# getOrCreate() hands back an existing session if one is already running.
spark = (
    SparkSession.builder
    .master("local[*]")
    .appName("lab")
    .getOrCreate()
)

In [None]:
# To Spark (in Python or R), there is no such thing as a Dataset. Everything is a DataFrame.
# spark.range(2) is the cell's last expression, so the notebook displays the object it
# returns rather than its rows.
spark.range(2)

In [None]:
# Same range, but collect() is called so the rows themselves come back to the notebook.
spark.range(2).collect()

In [None]:
# Load the 2015 flight summary (JSON) into `df`, the DataFrame the following cells work on.
df = (
    spark.read
    .format("json")
    .load("/Applications/MAMP/htdocs/Spark-The-Definitive-Guide/data/flight-data/json/2015-summary.json")
)

In [None]:
# Echo the DataFrame object that was just loaded.
df

In [None]:
# The schema as an object (value of the `schema` attribute).
df.schema

In [None]:
# The same schema, printed by printSchema().
df.printSchema()

In [None]:
# Bring the rows of df back into a local Python variable.
listOfRowObjects = df.collect()

In [None]:
# Container type returned by collect().
type(listOfRowObjects)

In [None]:
# Number of rows collected.
len(listOfRowObjects)

In [None]:
# Type of a single collected element.
type(listOfRowObjects[0])

In [None]:
df = spark.createDataFrame([(1, (2,2))], ["a", "b"])
df.show()
df.printSchema(1)
df.printSchema(2)

In [None]:
df

In [None]:
df.schema

In [None]:
from pyspark.sql.functions import col,column,expr

In [None]:
df.a

In [None]:
df['a']

In [None]:
df.select(df.a).show()

In [None]:
df.select(df['a']).show()

In [None]:
df.select(col('a')).show()

In [None]:
# An expression created via the `expr` function is just a DataFrame column reference.
# In the simplest case, expr("someCol") is equivalent to col("someCol").
expr('a')

In [None]:
# All three lines below describe the same expression, written three ways:
print(expr('a - 5'))    # whole expression parsed from one string
print(col('a') - 5)     # Column object combined with a Python operator
print(expr('a') -5)     # parsed column reference, then a Python operator

In [None]:
# Names of the columns of df.
df.columns

### Records and Rows

In [None]:
# First record of df.
df.first()

### Creating Rows

In [None]:
from pyspark.sql import Row 
# Build a Row by hand from positional values only (no field names are supplied here).
myRow = Row("Hello", None, 1, False)

In [None]:
myRow

In [None]:
# Register df under the name "dfTable" so it can be queried with SQL.
df.createOrReplaceTempView("dfTable")

In [None]:
# Query the temp view through spark.sql and show up to 10 rows.
spark.sql("select * from dfTable").show(10)

### Creating DataFrames

In [None]:
from pyspark.sql import Row
from pyspark.sql.types import StructType,StructField,StringType,LongType

# Schema declared by hand from (column name, Spark type) pairs; every column is nullable.
myManualSchema = StructType([
    StructField(fieldName, fieldType, True)
    for fieldName, fieldType in [
        ('first_name', StringType()),
        ('last_name', StringType()),
        ('age', LongType()),
    ]
])

# Two positional rows whose values follow the schema's field order.
alice = Row("Alice", "Henderson", 25)
bob = Row("Bob", "Sanders", 28)
spark.createDataFrame([alice, bob], schema=myManualSchema).show()

In [None]:
# Prerequisite (install once into the kernel's environment): pip install sparksql-magic
# Load the extension; the %%sparksql cell that follows relies on it being loaded.
%load_ext sparksql_magic

In [None]:
%%sparksql
select * from dfTable limit 250

### select and selectExpr

In [None]:
df.select('DEST_COUNTRY_NAME', 'ORIGIN_COUNTRY_NAME').show(2, False)

In [None]:
from pyspark.sql.functions import expr, col, column
df.select(
    expr('DEST_COUNTRY_NAME'),
    col('DEST_COUNTRY_NAME'),
    column('DEST_COUNTRY_NAME')).show(2)

In [None]:
df.select(col('DEST_COUNTRY_NAME'), 'DEST_COUNTRY_NAME')

In [None]:
df.select(expr('DEST_COUNTRY_NAME as destination')).show(2)

In [None]:
df.select(expr('DEST_COUNTRY_NAME as destination').alias('alias_destination')).show(2)

In [None]:
df.selectExpr('DEST_COUNTRY_NAME as destination', 'DEST_COUNTRY_NAME').show(2)

In [None]:
df.selectExpr('*', '(DEST_COUNTRY_NAME = ORIGIN_COUNTRY_NAME) as withInCountry').show(10)

In [None]:
# Aggregation over the entire DataFrame
df.selectExpr('avg(count)','count(distinct(DEST_COUNTRY_NAME))').show(2)

### Converting to Spark Types (Literals)

In [None]:
from pyspark.sql.functions import lit
df.select('*',lit(1).alias('one')).show(2)

In [None]:
df.select(expr('*'),lit(1).alias('one')).show(2)

### Adding Columns

In [None]:
df.withColumn('one', lit(1)).show(2)

In [None]:
# Another way to add a column!
df.withColumn('withInCountry', expr('DEST_COUNTRY_NAME = ORIGIN_COUNTRY_NAME')).show(2)

In [None]:
# Another way to rename a column!
df.withColumn('destination',expr('DEST_COUNTRY_NAME')).show(2)

In [None]:
df.withColumnRenamed('DEST_COUNTRY_NAME','destination').columns

In [None]:
dfWithLongColName = df.withColumn('Col Name With Space',expr('DEST_COUNTRY_NAME'))

In [None]:
dfWithLongColName.show(2)

In [None]:
dfWithLongColName.selectExpr('`Col Name With Space`', '`Col Name With Space` as `new col`').show(2)

In [None]:
dfWithLongColName.select(col('Col Name With Space')).show(2)

In [None]:
# We need to escape expressions that use reserved characters/keywords
dfWithLongColName.select(expr('`Col Name With Space`')).show(2)

In [None]:
df.printSchema

In [None]:
df.withColumn('count2', col('count').cast('int')).printSchema

In [None]:
# df.filter(col('count') < 2).show(2)
# df.filter(expr('count') < 2).show(2)
df.filter(expr('count < 2')).show(2)

In [None]:
df.where('count < 2').show(2)

In [None]:
df.where(col('count') < 2).show(2)

In [None]:
df.where(col('count') < 2)\
    .where("ORIGIN_COUNTRY_NAME != 'Croatia'")\
    .show(2)

In [None]:
df.where(col('count') < 2).where(col('ORIGIN_COUNTRY_NAME') != 'Croatia').show(2)

In [None]:
df.select('DEST_COUNTRY_NAME','ORIGIN_COUNTRY_NAME').distinct().count()

In [None]:
df.select('ORIGIN_COUNTRY_NAME').distinct().count()

### Random Samples

In [None]:
# Sampling without replacement: a subset of the rows is selected at random, and once a row
# has been selected it cannot be selected again.
# Sampling with replacement: a subset of the rows is selected at random, and a row may be
# selected more than once.
# The seed is pinned so that re-running the notebook over the same input draws the same
# sample; unseeded, the count displayed here differs from run to run.
df.sample(withReplacement=False, fraction=0.9, seed=42).count()

### Concatenating and Appending Rows (Union)

In [None]:
from pyspark.sql import Row

# One extra row, built positionally and given df's own schema so both frames line up.
lastRow = Row('New Country', 'New Country', 1)
newDF = spark.createDataFrame([lastRow], schema=df.schema)

# Append it, keep only rows whose destination equals their origin, show up to 300 of them.
(
    df
    .union(newDF)
    .where('DEST_COUNTRY_NAME = ORIGIN_COUNTRY_NAME')
    .show(n=300)
)

### Sorting Rows

In [None]:
df.sort('count').show(5)

In [None]:
df.orderBy('count','DEST_COUNTRY_NAME').show(5)

In [None]:
df.orderBy(col('count'), col('DEST_COUNTRY_NAME')).show(5)

In [None]:
df.orderBy(expr('count desc')).show(5)

In [None]:
df.orderBy(col('count').desc(), col('DEST_COUNTRY_NAME').asc()).show(5)

In [None]:
# For optimization purposes, it's sometimes advisable to sort within each partition before
# another set of transformations.
# The path glob picks up every *-summary.json file in the folder; the sorted result is the
# cell's value and is not assigned to a name.
(
    spark.read
    .format("json")
    .load("/Applications/MAMP/htdocs/Spark-The-Definitive-Guide/data/flight-data/json/*-summary.json")
    .sortWithinPartitions('count')
)

### Repartition & Coalesce
#### Further reading: https://mrpowers.medium.com/managing-spark-partitions-with-coalesce-and-repartition-4050c57ad5c4

In [None]:
df.rdd.getNumPartitions()

In [None]:
df.repartition(5)

In [None]:
rangeDF=spark.range(10)

In [None]:
rangeDF.rdd.getNumPartitions()

In [None]:
rangeDF.write.csv('/Users/deepakagrawal/Desktop/data', 'overwrite')

In [None]:
increasedPartitionedDF=rangeDF.repartition(6)

In [None]:
increasedPartitionedDF.write.csv('/Users/deepakagrawal/Desktop/data', 'overwrite')