In [1]:
import os
import sys

# Tell PySpark which Python interpreter to launch. Using the interpreter that is
# running this kernel (sys.executable) instead of a hard-coded, user-specific
# path keeps the notebook runnable on any machine or virtual environment.
os.environ["PYSPARK_PYTHON"] = sys.executable
os.environ["PYSPARK_DRIVER_PYTHON"] = sys.executable

In [2]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, IntegerType, StringType

In [3]:
# Create the session used by every cell below, or reuse one that already exists.
spark = (
    SparkSession.builder
    .appName("DataFrame_Operations")
    .getOrCreate()
)

In [4]:
# Three sample people as (id, name, age) rows.
data = [(1, "Alice", 25), (2, "Bob", 30), (3, "Charlie", 35)]

# Explicit schema for the rows above; every column is declared non-nullable.
schema = (
    StructType()
    .add("id", IntegerType(), nullable=False)
    .add("name", StringType(), nullable=False)
    .add("age", IntegerType(), nullable=False)
)

In [5]:
# Build the DataFrame from the in-memory rows using the explicit schema.
df = spark.createDataFrame(data, schema=schema)

In [6]:
# Print the DataFrame's rows as a text table.
df.show()

+---+-------+---+
| id|   name|age|
+---+-------+---+
|  1|  Alice| 25|
|  2|    Bob| 30|
|  3|Charlie| 35|
+---+-------+---+



In [7]:
# Print each column's name, data type and nullability.
df.printSchema()

root
 |-- id: integer (nullable = false)
 |-- name: string (nullable = false)
 |-- age: integer (nullable = false)



In [8]:
# Column names as a plain Python list.
df.columns

['id', 'name', 'age']

In [9]:
# Summary statistics (count, mean, stddev, min, max) per column;
# mean and stddev come back as NULL for the string column.
df.describe().show()

+-------+---+-------+----+
|summary| id|   name| age|
+-------+---+-------+----+
|  count|  3|      3|   3|
|   mean|2.0|   NULL|30.0|
| stddev|1.0|   NULL| 5.0|
|    min|  1|  Alice|  25|
|    max|  3|Charlie|  35|
+-------+---+-------+----+



In [10]:
# select() is a transformation: it only returns a new DataFrame,
# so the cell output is the DataFrame's schema rather than its rows.
df.select('name', 'age')

DataFrame[name: string, age: int]

In [11]:
# show() is an action: it runs the query and prints the selected rows.
df.select('name', 'age').show()

+-------+---+
|   name|age|
+-------+---+
|  Alice| 25|
|    Bob| 30|
|Charlie| 35|
+-------+---+



### Filter Operation

In [12]:
# Keep only the rows whose age is greater than 25.
df.filter(df["age"] > 25).show()

+---+-------+---+
| id|   name|age|
+---+-------+---+
|  2|    Bob| 30|
|  3|Charlie| 35|
+---+-------+---+



In [13]:
# Keep only the rows whose name begins with 'A'.
df.filter(df["name"].startswith('A')).show()

+---+-----+---+
| id| name|age|
+---+-----+---+
|  1|Alice| 25|
+---+-----+---+



### `where()` is an alias of `filter()`

In [14]:
# Same row selection as filter(): keep the row whose id equals 3.
df.where(df["id"] == 3).show()

+---+-------+---+
| id|   name|age|
+---+-------+---+
|  3|Charlie| 35|
+---+-------+---+



### Distinct Function

In [15]:
# Drop duplicate rows. Every row here is already unique, so all three remain.
df.distinct().show()

+---+-------+---+
| id|   name|age|
+---+-------+---+
|  1|  Alice| 25|
|  2|    Bob| 30|
|  3|Charlie| 35|
+---+-------+---+



### Sorting and Ordering

In [16]:
# Sort by age, smallest first (ascending is also the default).
df.orderBy('age', ascending=True).show()

+---+-------+---+
| id|   name|age|
+---+-------+---+
|  1|  Alice| 25|
|  2|    Bob| 30|
|  3|Charlie| 35|
+---+-------+---+



In [17]:
# Descending order: largest age first.
df.orderBy(df.age.desc()).show()

+---+-------+---+
| id|   name|age|
+---+-------+---+
|  3|Charlie| 35|
|  2|    Bob| 30|
|  1|  Alice| 25|
+---+-------+---+



### Adding and dropping columns

In [18]:
# Retirement age assumed by this example.
RETIREMENT_AGE = 60

# Add a derived column with the years left until RETIREMENT_AGE, and rebind df to the result.
df = df.withColumn('age_to_retirement', RETIREMENT_AGE - df.age)

In [19]:
# Show the DataFrame with the newly added age_to_retirement column.
df.show()

+---+-------+---+-----------------+
| id|   name|age|age_to_retirement|
+---+-------+---+-----------------+
|  1|  Alice| 25|               35|
|  2|    Bob| 30|               30|
|  3|Charlie| 35|               25|
+---+-------+---+-----------------+



### Drop Function

In [20]:
# Rebind df to the result of dropping the 'age' column.
# NOTE(review): earlier cells reference df.age, so re-running them after this
# cell presumably fails until df is rebuilt from the top — confirm this is intended.
df = df.drop('age')

In [21]:
# Show the DataFrame now that 'age' has been removed.
df.show()

+---+-------+-----------------+
| id|   name|age_to_retirement|
+---+-------+-----------------+
|  1|  Alice|               35|
|  2|    Bob|               30|
|  3|Charlie|               25|
+---+-------+-----------------+



### Grouping and Aggregation

In [22]:
# Count the rows in each group of identical names.
df.groupBy('name').count().show()

+-------+-----+
|   name|count|
+-------+-----+
|  Alice|    1|
|    Bob|    1|
|Charlie|    1|
+-------+-----+



In [23]:
# Aggregate over the whole DataFrame (no grouping): average of age_to_retirement.
df.agg({'age_to_retirement': 'avg'}).show()

+----------------------+
|avg(age_to_retirement)|
+----------------------+
|                  30.0|
+----------------------+



### Joins

In [24]:
# Country lookup rows as (id, Country); the ids match the ids used in `data`.
data2 = [(1, 'India'), (2, 'Italy'), (3, 'France')]

# Schema for the lookup rows; both columns are declared nullable.
schema2 = StructType([
    StructField('id', IntegerType(), nullable=True),
    StructField('Country', StringType(), nullable=True),
])

In [25]:
# Build the country lookup DataFrame from data2 using its explicit schema.
df2 = spark.createDataFrame(data2, schema=schema2)

In [26]:
# Print the country lookup rows.
df2.show()

+---+-------+
| id|Country|
+---+-------+
|  1|  India|
|  2|  Italy|
|  3| France|
+---+-------+



In [27]:
# Inner join on the shared 'id' column. The key appears once in the result,
# followed by df2's remaining columns and then df's.
df2.join(df, on='id', how='inner').show()

+---+-------+-------+-----------------+
| id|Country|   name|age_to_retirement|
+---+-------+-------+-----------------+
|  1|  India|  Alice|               35|
|  2|  Italy|    Bob|               30|
|  3| France|Charlie|               25|
+---+-------+-------+-----------------+



In [28]:
# Stop the Spark session created at the top of the notebook.
spark.stop()