In [1]:
from pyspark.sql import SparkSession
from pyspark.sql import types as T
from pyspark.sql.functions import  *
from datetime import date
from pyspark.sql.types import * 
from pyspark.sql import Window
from datetime import datetime

In [2]:
# Create (or reuse) the SparkSession used by every later cell, running on the
# local master, and keep a handle on its SparkContext.
# NOTE(review): "spark.some.config.option" looks like a placeholder setting —
# TODO confirm whether it is needed.
spark = SparkSession.builder.master("local").appName("Exploring Joins").config(
    "spark.some.config.option", "some-value"
).getOrCreate()
sc = spark.sparkContext

In [3]:
# Load the pets dataset; the first line of the file supplies the column names.
# NOTE(review): no schema and no inferSchema option are given, so column types
# are whatever the CSV reader defaults to — check printSchema() before relying
# on numeric comparisons or aggregations.
data = spark.read.option("header", True).csv("pets.csv")

### Looking at your Data

In [4]:
# Option 1: collect() — brings the rows back to the driver as a Python list.
data.collect()

In [5]:
# Option 2: toPandas() — converts the Spark DataFrame to a pandas DataFrame,
# which the notebook renders as a table.
data.toPandas()

In [6]:
# Option 3: head() — fetch only the first three rows.
data.head(3)

In [7]:
# Option 4: show() — print the first three rows as a text table.
data.show(3)

### Selecting a Subset of Columns

In [8]:
# Keep only the listed columns, in the order given.
data.select(["id", "breed_id", "nickname", "age", "weight"]).toPandas()

In [9]:
# The complementary approach: name the columns to discard and keep the rest.
(
    data
    .drop("birthday", "color")
    .toPandas()
)

In [10]:
# Derive two new columns from `nickname`: a plain copy and an upper-cased
# version. The names use underscores only, because a hyphen in a column name
# must be back-tick quoted whenever the column is referenced in a SQL
# expression (otherwise it parses as a subtraction).
(
    data
    .withColumn("nickname_copy", col("nickname"))
    .withColumn("nickname_capitalized", upper(col("nickname")))
    .toPandas()
)

In [11]:
# Rename `id` to `pet_id`; only that column is named in the call.
data.withColumnRenamed(existing="id", new="pet_id").toPandas()

### Constant Value and Column Expression

In [12]:
# What if we want to put a constant value into a new column?
#
# withColumn() needs a Column expression as its second argument, so handing it
# a plain Python value such as date.today() is rejected. The failure is the
# point of this cell; it is caught and printed so the notebook still runs from
# top to bottom (older PySpark raises AssertionError, newer versions TypeError).
try:
    data.withColumn("today-date", date.today()).toPandas()
except (AssertionError, TypeError) as err:
    print(f"{type(err).__name__}: {err}")

In [13]:
# Spark functions that take a column argument usually expect a Column
# expression. withColumn() worked earlier because it was given a column of the
# current DataFrame; that is not the case for a constant value, which has to be
# wrapped in lit() first.
data.withColumn(
    "today_date",
    lit(date.today()),
).show()

In [14]:
# Same idea with a numeric constant: every row gets height = 10.
data.withColumn("height", lit(10)).show()

In [15]:
# A column expression can be built from arithmetic on an existing column.
(
    data
    .withColumn("double_age", col("age") * 2)
    .show()
)

#### Casting_Column_to_Different_DataTypes

In [16]:
# Show the column names and types of the DataFrame as it was loaded.
data.printSchema()

In [17]:
# Cast `birthday` to a date in two ways: by passing the type name as a string
# and by passing a DateType() instance.
(
    data
    .select("birthday")
    .withColumn("birthday_date", col("birthday").cast("date"))
    .withColumn("birthday_date2", col("birthday").cast(DateType()))
    .show()
)

In [18]:
# Reusing the existing column name as the target, then printing the schema to
# inspect the resulting type of `birthday`.
(
    data
    .select("birthday")
    .withColumn("birthday", col("birthday").cast("date"))
    .printSchema()
)

### Filtering Data: where, filter, and isin

In [19]:
# Project two columns, then keep the rows whose age is greater than 3.
(
    data
    .select("age", "nickname")
    .where(col("age") > 3)
    .show()
)

In [20]:
# where(): keep the rows with breed_id equal to 2.
data.where(
    col("breed_id") == 2
).show()

In [21]:
# filter(): the same condition as the where() cell, written with filter().
data.filter(
    col("breed_id") == 2
).show()

In [22]:
# isin(): keep the rows whose breed_id is one of the listed values.
data.filter(col("breed_id").isin([2, 3])).show()

In [23]:
# isin() works for strings too: keep the pets nicknamed "King" or "Argus".
data.where(col("nickname").isin(["King", "Argus"])).show()

### Equality Statements in PySpark

In [24]:
# Half-open range 1 <= breed_id < 3, with the bounds written as explicit
# literals. Each comparison is parenthesised before being combined with `&`.
data.where(
    (col("breed_id") >= lit(1))
    & (col("breed_id") < lit(3))
).show()

In [25]:
# Closed range 1 <= breed_id <= 3, this time with plain Python numbers as bounds.
data.filter(
    (col("breed_id") >= 1)
    & (col("breed_id") <= 3)
).show()

In [26]:
# Membership test: breed_id must be 2 or 3.
(
    data
    .filter(col("breed_id").isin(2, 3))
    .show()
)

In [27]:
# Python evaluates `&` before `|`, so the condition reads as
# (breed is 1 or 2 AND nickname is present) OR colour is white.
# The extra parentheses make that grouping explicit.
data.filter(
    (col("breed_id").isin(1, 2) & col("nickname").isNotNull())
    | col("color").isin("white")
).show()

In [28]:
# Two versions of "colour is not white": the bare comparison, and the
# comparison combined with an explicit not-null check. Compare the two result
# columns on rows where `color` is missing.
(
    data
    .withColumn("result", col("color") != "white")
    .withColumn("result2", (col("color") != "white") & col("color").isNotNull())
    .show()
)

### Case Statements

In [29]:
# Bucket pets by age. The two conditions overlap at age 5; the `<= 5` branch is
# listed first in the chain, so it is the one intended to label that age "young".
(
    data
    .withColumn(
        "oldness_value",
        when(col("age") <= 5, "young")
        .when((col("age") >= 5) & (col("age") <= 13), "middle_age")
        .otherwise("old"),
    )
    .show()
)

In [30]:
# Map the colour to a label; anything that is neither brown nor white takes the
# otherwise() value.
# NOTE(review): brown is mapped to "black_dog" — confirm that label is intended.
(
    data
    .withColumn(
        "race",
        when(col("color") == "brown", "black_dog")
        .when(col("color") == "white", "white_dog")
        .otherwise("black_white_dog"),
    )
    .show()
)

In [31]:
# Classify pets by weight:
#   weight < 5         -> "under_weight"
#   5 <= weight <= 10  -> "normal_weight"
#   anything else      -> "over_weight"
# The lower bound of the middle band is inclusive so that a weight of exactly 5
# is not skipped by both conditions and mislabelled "over_weight".
# NOTE(review): rows with a missing weight match neither condition and also
# take the otherwise() label — confirm whether that is wanted.
(
    data
    .select("nickname", "weight")
    .withColumn(
        "weight_value",
        when(col("weight") < 5, "under_weight")
        .when((col("weight") >= 5) & (col("weight") <= 10), "normal_weight")
        .otherwise("over_weight"),
    )
    .show()
)

### Fill nan and null values

In [32]:
# Replace missing values with a single default.
# NOTE(review): a string default presumably only affects string-typed columns —
# verify against the schema.
data.na.fill("Ngawang").toPandas()

In [33]:
# Each column can be filled with a different value. This is more flexible,
# because columns usually have different types and a single default value is
# rarely sufficient.
data.fillna(
    {
        "nickname": "Ngawang",
        "color": "Anurag",
    }
).show()

In [34]:
# Another way to fill a column is coalesce(). It looks at its arguments from
# left to right and uses the first one that is not null. A "default" value can
# be supplied as the last argument; it must be a column expression, hence lit().
(
    data
    .withColumn(
        "xyz",
        coalesce(col("nickname"), col("color"), lit("default")),
    )
    .show()
)

### User_Defined_Function

In [35]:
from pyspark.sql.functions import udf


@udf('string')
def uppercase(word):
    """Return the first two characters of ``word`` in upper case.

    Falsy input (``None`` or an empty string) yields ``None``.
    """
    if not word:
        return None
    shouted = word.upper()
    return shouted[:2]


# Apply the UDF to every nickname and show the result next to the original data.
data.withColumn('uppercase', uppercase(col('nickname'))).show()



In [36]:
# Display `data` again; none of the cells above reassigned it.
data.toPandas()

### Aggregation Function

In [37]:
# Dictionary form of agg(): each key is a column ("*" for the whole row) and
# each value is the name of the aggregate to apply to it.
(
    data
    .groupBy("breed_id")
    .agg(
        {
            "*": "count",
            "age": "avg",
            "weight": "avg",
        }
    )
    .show()
)

In [38]:
# Function form of agg(): explicit aggregate expressions, each with its own
# alias. `sum` here is the Spark function brought in by the star import, not
# the Python builtin.
(
    data
    .groupBy("breed_id")
    .agg(
        count("age").alias("count_breed"),
        sum("age").alias("age_sum"),
        avg("age").alias("age_avg"),
    )
    .show()
)

In [39]:
# Same pattern for weight: rows per breed, total weight and mean weight.
(
    data
    .groupby("breed_id")
    .agg(
        count("*").alias("count_breed"),
        sum("weight").alias("weight_sum"),
        avg("weight").alias("average_weight"),
    )
    .show()
)

### Non Deterministic Ordering for GroupBys

In [40]:
# Check the current column types before `birthday` is cast in the next cell.
data.printSchema()

In [41]:
# Replace `birthday` with its date-typed version. From here on `data` refers to
# the converted DataFrame.
data = data.withColumn(
    "birthday",
    col("birthday").cast(DateType()),
)

In [42]:
# Show the DataFrame after the `birthday` column has been cast.
data.show()

In [43]:
# Sort by birthday, then take the first nickname in each breed group.
# NOTE(review): the sort happens before the groupBy, so the row that first()
# picks is presumably not guaranteed — this is the non-determinism this section
# is about.
(
    data
    .orderBy("birthday")
    .groupBy("breed_id")
    .agg(first("nickname").alias("first_breed"))
    .toPandas()
)

In [44]:
# The same sort-then-first() query as a cell of its own, so its output can be
# compared with another run of the identical query.
(
    data
    .orderBy("birthday")
    .groupBy("breed_id")
    .agg(first("nickname").alias("first_breed"))
    .toPandas()
)

In [45]:
# Window with an explicit ordering: rows are grouped per breed and ordered by
# birthday inside each group.
window = (
    Window
    .partitionBy("breed_id")
    .orderBy("birthday")
)

In [46]:
# Number the rows inside each breed partition, following the window's birthday
# ordering.
(
    data
    .withColumn("row_number", row_number().over(window))
    .show()
)

In [47]:
# Small hand-built DataFrame for the window examples: five pets in two breeds,
# with full timestamps as birthdays.
pets = spark.createDataFrame(
    [
        (1, 1, datetime(2018, 1, 1, 1, 1, 1), 'Bear', 5),
        (2, 1, datetime(2010, 1, 1, 1, 1, 1), 'Chewie', 15),
        (3, 1, datetime(2015, 1, 1, 1, 1, 1), 'Roger', 10),
        (4, 2, datetime(2015, 1, 2, 2, 3, 4), 'Roger', 10),
        (5, 2, datetime(2015, 5, 6, 7, 8, 9), 'Roger', 10),
    ],
    schema=['id', 'breed_id', 'birthday', 'nickname', 'age'],
)
pets.toPandas()

In [48]:
# Per-breed window ordered by age.
windows = (
    Window
    .partitionBy("breed_id")
    .orderBy("age")
)

In [49]:
# Number the pets within each breed from the lowest age upwards.
# NOTE(review): breed 2 has three pets with the same age, so their relative
# numbering presumably depends on how Spark breaks the tie.
(
    pets
    .withColumn("row_number", row_number().over(windows))
    .show()
)

In [50]:
# Per-breed window ordered by id (ids are distinct in `pets`).
windows2 = (
    Window
    .partitionBy("breed_id")
    .orderBy("id")
)

In [53]:
# Sum of age over the ordered window. Because the window specifies an ordering,
# Spark's default frame runs from the start of the partition to the current
# row, so `sum_age` is a running total per breed rather than the breed total.
(
    pets
    .withColumn("sum_age", sum("age").over(windows2))
    .show()
)

In [54]:
# Window with a partition but no ordering: sum("age") is taken over this window
# and stored in `sum_all`.
windows3 = Window.partitionBy("breed_id")
(
    pets
    .withColumn("sum_all", sum("age").over(windows3))
    .show()
)