In [1]:
!pip install pyspark -q

In [2]:
import warnings

from google.colab import files

# Open Colab's upload dialog; pick students.csv. The returned mapping is kept
# in `uploaded` (keyed by file name).
uploaded = files.upload()  # Upload students.csv

# The CSV is loaded later by the literal name "students.csv", so flag an
# upload that arrived under any other name: nothing selected, a different
# file picked, or -- presumably -- a re-upload that Colab stored under a
# de-duplicated name such as "students (1).csv" (TODO confirm).
# This is only a warning, so the notebook keeps running as it did before.
if "students.csv" not in uploaded:
    warnings.warn(
        'Expected an upload named "students.csv" but received '
        f"{sorted(uploaded)}; the CSV-loading cell may fail or read a stale file."
    )

Saving students.csv to students.csv


In [3]:
from pyspark.sql import SparkSession
# NOTE: `round` and `max` imported below take over the names of the Python
# builtins for every cell that runs after this one.
from pyspark.sql.functions import col, avg, round, max

# Obtain the Spark session for this notebook (existing one if available,
# otherwise a new one) under a descriptive application name.
spark = (
    SparkSession.builder
    .appName("StudentsDataFrameExample")
    .getOrCreate()
)

In [4]:
# Load the uploaded CSV into a DataFrame: the first line provides the column
# names, and Spark infers each column's type from the data.
df = (
    spark.read
    .option("header", True)
    .option("inferSchema", True)
    .csv("students.csv")
)

In [5]:
# Preview: heading followed by the first ten records as a text table.
print("=== First 10 rows ===")
df.show(n=10)

=== First 10 rows ===
+---+-------+---+------+----+-------+-------+
| id|   name|age|gender|math|science|english|
+---+-------+---+------+----+-------+-------+
|  1|  Alice| 20|     F|  66|     92|     44|
|  2|    Bob| 20|     M|  82|     52|     77|
|  3|Charlie| 22|     F|  43|     57|     76|
|  4|  David| 19|     M|  95|     69|     46|
|  5|    Eva| 19|     F|  62|     44|     96|
|  6|  Frank| 22|     F|  70|     78|     94|
|  7|  Grace| 24|     F|  67|     66|     93|
|  8|  Henry| 21|     F|  53|     82|     60|
|  9|    Ivy| 19|     M|  64|     52|     46|
| 10|   Jack| 19|     F|  44|     59|     60|
+---+-------+---+------+----+-------+-------+
only showing top 10 rows



In [6]:
# Print a heading, then the DataFrame's schema as a tree: one line per column
# with its type and nullability.
print("=== Schema ===")
df.printSchema()

=== Schema ===
root
 |-- id: integer (nullable = true)
 |-- name: string (nullable = true)
 |-- age: integer (nullable = true)
 |-- gender: string (nullable = true)
 |-- math: integer (nullable = true)
 |-- science: integer (nullable = true)
 |-- english: integer (nullable = true)



In [7]:
# Heading, then the list of (column name, type name) pairs on the next line.
print("=== Datatypes ===", df.dtypes, sep="\n")

=== Datatypes ===
[('id', 'int'), ('name', 'string'), ('age', 'int'), ('gender', 'string'), ('math', 'int'), ('science', 'int'), ('english', 'int')]


In [8]:
# Per-column count / mean / stddev / min / max, rendered as a text table.
print("=== Summary statistics ===")
summary_stats = df.describe()
summary_stats.show()

=== Summary statistics ===
+-------+------------------+-----+------------------+------+------------------+------------------+-----------------+
|summary|                id| name|               age|gender|              math|           science|          english|
+-------+------------------+-----+------------------+------+------------------+------------------+-----------------+
|  count|                50|   50|                50|    50|                50|                50|               50|
|   mean|              25.5| NULL|              21.5|  NULL|             68.94|             70.16|            69.36|
| stddev|14.577379737113251| NULL|2.2337851101588404|  NULL|17.609610085034216|14.636214521186957|18.74507826560544|
|    min|                 1|Aaron|                18|     F|                40|                44|               42|
|    max|                50| Zoey|                25|     M|               100|                99|              100|
+-------+------------------+-----+---

In [9]:
# Report the size of the dataset: number of rows, then the column names.
print(f"Total rows: {df.count()}")
print(f"Columns: {df.columns}")

Total rows: 50
Columns: ['id', 'name', 'age', 'gender', 'math', 'science', 'english']


In [10]:
# Project the frame down to three columns and preview the first ten rows.
print("\n=== Select name, age, and math columns ===")
df.select(["name", "age", "math"]).show(n=10)


=== Select name, age, and math columns ===
+-------+---+----+
|   name|age|math|
+-------+---+----+
|  Alice| 20|  66|
|    Bob| 20|  82|
|Charlie| 22|  43|
|  David| 19|  95|
|    Eva| 19|  62|
|  Frank| 22|  70|
|  Grace| 24|  67|
|  Henry| 21|  53|
|    Ivy| 19|  64|
|   Jack| 19|  44|
+-------+---+----+
only showing top 10 rows

