In [1]:
pip install pyspark

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pyspark
  Downloading pyspark-3.2.1.tar.gz (281.4 MB)
[K     |████████████████████████████████| 281.4 MB 34 kB/s 
[?25hCollecting py4j==0.10.9.3
  Downloading py4j-0.10.9.3-py2.py3-none-any.whl (198 kB)
[K     |████████████████████████████████| 198 kB 66.1 MB/s 
[?25hBuilding wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.2.1-py2.py3-none-any.whl size=281853642 sha256=151afa57799c070c8902f64a140fbe16f0971c15bda1146d81521950354c965d
  Stored in directory: /root/.cache/pip/wheels/9f/f5/07/7cd8017084dce4e93e84e92efd1e1d5334db05f2e83bcef74f
Successfully built pyspark
Installing collected packages: py4j, pyspark
Successfully installed py4j-0.10.9.3 pyspark-3.2.1


In [2]:
from pyspark.sql import SparkSession
# Session handle shared by every cell below; built through the SparkSession
# builder with the application name used for this notebook.
spark = (
    SparkSession.builder
    .appName("Pyspark Column Class")
    .getOrCreate()
)

In [3]:
# Two-row sample frame. The first column name deliberately contains a dot so
# that the backtick-quoting form can be demonstrated.
data = [("Jack", 23), ("Ema", 40)]
df = spark.createDataFrame(data).toDF("name.first_name", "age")

# A column can be referenced as an attribute of the DataFrame or by subscript.
df.select(df.age).show()
df.select(df["age"]).show()
# A column name that contains a dot is wrapped in backticks inside the string.
df.select(df["`name.first_name`"]).show()

# The same lookups through the SQL col() function, which takes the column
# name as a string and is not tied to a particular DataFrame object.
from pyspark.sql.functions import col
df.select(col("age")).show()
# Dotted column name again, backtick-quoted.
df.select(col("`name.first_name`")).show()

+---+
|age|
+---+
| 23|
| 40|
+---+

+---+
|age|
+---+
| 23|
| 40|
+---+

+---------------+
|name.first_name|
+---------------+
|           Jack|
|            Ema|
+---------------+

+---+
|age|
+---+
| 23|
| 40|
+---+

+---------------+
|name.first_name|
+---------------+
|           Jack|
|            Ema|
+---------------+



In [5]:
# Build a DataFrame whose "prop" column is a struct by nesting Row objects.
from pyspark.sql import Row
data = [
    Row(name="James", prop=Row(hair="black", eye="blue")),
    Row(name="Ann", prop=Row(hair="grey", eye="black")),
]
df = spark.createDataFrame(data)

# Three ways to select the single field "eye" out of the struct column.
# In the recorded output the attribute form is headed "prop.eye", while the
# two string forms are headed "eye".
df.select(df.prop.eye).show()
df.select(df["prop.eye"]).show()
df.select(col("prop.eye")).show()

# "prop.*" selects every field of the struct as its own column.
df.select(col("prop.*")).show()


+--------+
|prop.eye|
+--------+
|    blue|
|   black|
+--------+

+-----+
|  eye|
+-----+
| blue|
|black|
+-----+

+-----+
|  eye|
+-----+
| blue|
|black|
+-----+

+-----+-----+
| hair|  eye|
+-----+-----+
|black| blue|
| grey|black|
+-----+-----+



In [7]:
# Three integer columns to run the Column operators against.
data = [(100, 2, 1), (200, 3, 4), (300, 4, 4)]
df = spark.createDataFrame(data).toDF("c1", "c2", "c3")

# Arithmetic operators: attribute and subscript column references can be
# mixed freely within one expression.
df.select(df.c1 + df.c2).show()
df.select(df["c1"] - df.c2).show()
df.select(df["c1"] * df["c2"]).show()
df.select(df.c1 / df.c2).show()  # result column is fractional (see 66.66... in the output)
df.select(df.c1 % df.c2).show()

# Comparison operators: each yields a boolean column.
df.select(df.c2 > df.c3).show()
df.select(df.c2 < df.c3).show()
df.select(df.c2 == df.c3).show()


+---------+
|(c1 + c2)|
+---------+
|      102|
|      203|
|      304|
+---------+

+---------+
|(c1 - c2)|
+---------+
|       98|
|      197|
|      296|
+---------+

+---------+
|(c1 * c2)|
+---------+
|      200|
|      600|
|     1200|
+---------+

+-----------------+
|        (c1 / c2)|
+-----------------+
|             50.0|
|66.66666666666667|
|             75.0|
+-----------------+

+---------+
|(c1 % c2)|
+---------+
|        0|
|        2|
|        0|
+---------+

+---------+
|(c2 > c3)|
+---------+
|     true|
|    false|
|    false|
+---------+

+---------+
|(c2 < c3)|
+---------+
|    false|
|     true|
|    false|
+---------+

+---------+
|(c2 = c3)|
+---------+
|    false|
|    false|
|     true|
+---------+



In [8]:
# Sample data for the Column-function examples in the cells that follow.
# Every field is a string (including id). l_name and gender each contain one
# None, and gender also contains one empty string, so the null-handling
# functions have something to act on.
data = [
    ("J", "B", "100", None),
    ("A", "V", "200", "F"),
    ("T", "XXX", "400", ""),
    ("W", None, "400", "M"),
]
# Deliberately NOT named `col`: that name is the col() function imported from
# pyspark.sql.functions earlier in this notebook, and rebinding it to a list
# would make every later (or re-run) col("...") call fail with
# "'list' object is not callable".
column_names = ["f_name", "l_name", "id", "gender"]
df = spark.createDataFrame(data, column_names)

In [10]:
# asc() / desc(): build sort expressions on f_name, then show the frame
# ordered ascending and descending respectively.
ascending_by_first_name = df.f_name.asc()
descending_by_first_name = df.f_name.desc()
df.sort(ascending_by_first_name).show()
df.sort(descending_by_first_name).show()

+------+------+---+------+
|f_name|l_name| id|gender|
+------+------+---+------+
|     A|     V|200|     F|
|     J|     B|100|  null|
|     T|   XXX|400|      |
|     W|  null|400|     M|
+------+------+---+------+

+------+------+---+------+
|f_name|l_name| id|gender|
+------+------+---+------+
|     W|  null|400|     M|
|     T|   XXX|400|      |
|     J|     B|100|  null|
|     A|     V|200|     F|
+------+------+---+------+



In [12]:
# cast() & astype() – Convert a column to another data type (astype() is an alias of cast()).

In [11]:
# Cast the string column "id" to an integer column and print the schema of
# the one-column selection to confirm the new type.
id_as_int = df.id.cast("int")
df.select(id_as_int).printSchema()

root
 |-- id: integer (nullable = true)



In [15]:
# between() – Returns a boolean expression that is true when the column value
# lies between the lower and upper bound (both bounds inclusive).
# id is stored as a string in this frame, so it is cast to int explicitly:
# the comparison against the integer bounds is then numeric by construction
# rather than dependent on Spark's implicit string-to-number coercion.
# filter() only drops rows, so the displayed columns are unchanged.
df.filter(df.id.cast("int").between(100, 150)).show()

+------+------+---+------+
|f_name|l_name| id|gender|
+------+------+---+------+
|     J|     B|100|  null|
+------+------+---+------+



In [20]:
# contains() – True for rows whose column value includes the given substring.
# Here: keep only the rows whose f_name contains "J".
first_name_has_j = df.f_name.contains("J")
df.filter(first_name_has_j).show()

+------+------+---+------+
|f_name|l_name| id|gender|
+------+------+---+------+
|     J|     B|100|  null|
+------+------+---+------+



In [21]:
# startswith() / endswith(): match a literal prefix / suffix of the column value.
# NOTE: "None" below is the four-character string, not Python's None. No
# gender value in the sample data starts with that text, which is why the
# first result is empty; rows with a null gender are found with isNull().
gender_starts_with_none_text = df.gender.startswith("None")
df.filter(gender_starts_with_none_text).show()

gender_ends_with_m = df.gender.endswith("M")
df.filter(gender_ends_with_m).show()

+------+------+---+------+
|f_name|l_name| id|gender|
+------+------+---+------+
+------+------+---+------+

+------+------+---+------+
|f_name|l_name| id|gender|
+------+------+---+------+
|     W|  null|400|     M|
+------+------+---+------+



In [22]:
# isNull() / isNotNull(): split the rows on whether gender is null.
# The empty-string gender is not null, so that row appears in the second result.
gender_missing = df.gender.isNull()
gender_present = df.gender.isNotNull()
df.filter(gender_missing).show()
df.filter(gender_present).show()


+------+------+---+------+
|f_name|l_name| id|gender|
+------+------+---+------+
|     J|     B|100|  null|
+------+------+---+------+

+------+------+---+------+
|f_name|l_name| id|gender|
+------+------+---+------+
|     A|     V|200|     F|
|     T|   XXX|400|      |
|     W|  null|400|     M|
+------+------+---+------+

