In [1]:
from pyspark.sql import SparkSession

In [2]:
spark = SparkSession.builder \
.appName('Spark Operations').getOrCreate()

25/06/09 15:53:59 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.


In [13]:
#Sample data
data = [
    (1,"Alice",25),
    (2,"Bob",30),
    (3,"Chrlett",27) 
]

from pyspark.sql.types import *

In [4]:
schema = StructType([
    StructField("id",IntegerType(),False),
    StructField("name",StringType(),False),
    StructField("age",IntegerType(),False)
])

df = spark.createDataFrame(data,schema)

df.show()

                                                                                

+---+-----+---+
| id| name|age|
+---+-----+---+
|  1|Alice| 25|
|  2|Alice| 25|
|  3|Alice| 25|
+---+-----+---+



In [5]:
df.columns

['id', 'name', 'age']

In [6]:
df.describe().show()



+-------+---+-----+----+
|summary| id| name| age|
+-------+---+-----+----+
|  count|  3|    3|   3|
|   mean|2.0| NULL|25.0|
| stddev|1.0| NULL| 0.0|
|    min|  1|Alice|  25|
|    max|  3|Alice|  25|
+-------+---+-----+----+



                                                                                

In [7]:
df.select('name','age')

DataFrame[name: string, age: int]

In [8]:
df.select('name','age').show()

[Stage 5:>                                                          (0 + 1) / 1]

+-----+---+
| name|age|
+-----+---+
|Alice| 25|
|Alice| 25|
|Alice| 25|
+-----+---+



                                                                                

In [9]:
df.filter(df.age>25).show()

+---+----+---+
| id|name|age|
+---+----+---+
+---+----+---+



In [10]:
df.where(df.name == 'Alice').show()

+---+-----+---+
| id| name|age|
+---+-----+---+
|  1|Alice| 25|
|  2|Alice| 25|
|  3|Alice| 25|
+---+-----+---+



In [12]:
df.distinct().show()



+---+-----+---+
| id| name|age|
+---+-----+---+
|  1|Alice| 25|
|  3|Alice| 25|
|  2|Alice| 25|
+---+-----+---+



                                                                                

In [14]:
df_2 = spark.createDataFrame(data,schema)
df_2.show()

+---+-------+---+
| id|   name|age|
+---+-------+---+
|  1|  Alice| 25|
|  2|    Bob| 30|
|  3|Chrlett| 27|
+---+-------+---+



In [16]:
df_2.orderBy('age').show()

+---+-------+---+
| id|   name|age|
+---+-------+---+
|  1|  Alice| 25|
|  3|Chrlett| 27|
|  2|    Bob| 30|
+---+-------+---+



In [17]:
df_2.orderBy(df_2.age.desc()).show()

+---+-------+---+
| id|   name|age|
+---+-------+---+
|  2|    Bob| 30|
|  3|Chrlett| 27|
|  1|  Alice| 25|
+---+-------+---+



In [18]:
df = df_2

In [21]:
df = df.withColumn('new_age',df.age+5)

+---+-------+---+-------+
| id|   name|age|new_age|
+---+-------+---+-------+
|  1|  Alice| 25|     30|
|  2|    Bob| 30|     35|
|  3|Chrlett| 27|     32|
+---+-------+---+-------+



In [23]:
df = spark.createDataFrame(data,schema)

In [24]:
df = df.withColumn('new_age',df.age+5).show()

+---+-------+---+-------+
| id|   name|age|new_age|
+---+-------+---+-------+
|  1|  Alice| 25|     30|
|  2|    Bob| 30|     35|
|  3|Chrlett| 27|     32|
+---+-------+---+-------+



In [25]:
df.show()

AttributeError: 'NoneType' object has no attribute 'show'

In [26]:
df = spark.createDataFrame(data,schema)
df = df.withColumn('new_age',df.age+5)
df.show()

+---+-------+---+-------+
| id|   name|age|new_age|
+---+-------+---+-------+
|  1|  Alice| 25|     30|
|  2|    Bob| 30|     35|
|  3|Chrlett| 27|     32|
+---+-------+---+-------+



### Aggregation and grouping

In [27]:
df.groupBy('name').count().show()

+-------+-----+
|   name|count|
+-------+-----+
|  Alice|    1|
|    Bob|    1|
|Chrlett|    1|
+-------+-----+



In [28]:
df.agg({'age':'avg'}).show()

+------------------+
|          avg(age)|
+------------------+
|27.333333333333332|
+------------------+



### Joins

In [29]:
data_2 = [(1,'usa'),(2,'uk'),(3,'India')]
schema2 = StructType([
    StructField('id',IntegerType(),False),
    StructField('country',StringType(),False)
])

In [30]:
df2 = spark.createDataFrame(data_2,schema2)

In [31]:
df2.show()

+---+-------+
| id|country|
+---+-------+
|  1|    usa|
|  2|     uk|
|  3|  India|
+---+-------+



In [32]:
df.show()

+---+-------+---+-------+
| id|   name|age|new_age|
+---+-------+---+-------+
|  1|  Alice| 25|     30|
|  2|    Bob| 30|     35|
|  3|Chrlett| 27|     32|
+---+-------+---+-------+



In [33]:
df.join(df2,'id').show()

+---+-------+---+-------+-------+
| id|   name|age|new_age|country|
+---+-------+---+-------+-------+
|  1|  Alice| 25|     30|    usa|
|  2|    Bob| 30|     35|     uk|
|  3|Chrlett| 27|     32|  India|
+---+-------+---+-------+-------+



In [34]:
final_df = df.join(df_2,'id')
final_df.show()

+---+-------+---+-------+-------+---+
| id|   name|age|new_age|   name|age|
+---+-------+---+-------+-------+---+
|  1|  Alice| 25|     30|  Alice| 25|
|  2|    Bob| 30|     35|    Bob| 30|
|  3|Chrlett| 27|     32|Chrlett| 27|
+---+-------+---+-------+-------+---+



In [None]:
spark.stop()