In [1]:
!apt-get update # Update apt-get repository.
!apt-get install openjdk-8-jdk-headless -qq > /dev/null # Install Java.
!wget -q http://archive.apache.org/dist/spark/spark-3.1.1/spark-3.1.1-bin-hadoop3.2.tgz # Download Apache Sparks.
!tar xf spark-3.1.1-bin-hadoop3.2.tgz # Unzip the tgz file.
!pip install -q findspark # Install findspark. Adds PySpark to the System path during runtime.

0% [Working]            Hit:1 http://archive.ubuntu.com/ubuntu jammy InRelease
0% [Waiting for headers] [Waiting for headers] [Waiting for headers] [Connecting to ppa.launchpadcon                                                                                                    Get:2 http://archive.ubuntu.com/ubuntu jammy-updates InRelease [119 kB]
                                                                                                    Get:3 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ InRelease [3,626 B]
Hit:4 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  InRelease
Get:5 http://security.ubuntu.com/ubuntu jammy-security InRelease [110 kB]
Hit:6 http://archive.ubuntu.com/ubuntu jammy-backports InRelease
Hit:7 https://ppa.launchpadcontent.net/c2d4u.team/c2d4u4.0+/ubuntu jammy InRelease
Hit:8 https://ppa.launchpadcontent.net/deadsnakes/ppa/ubuntu jammy InRelease
Hit:9 https://ppa.launchpadcontent.net/graphics-drivers/ppa/ubunt

In [16]:
import os

import findspark

# Record where Java and Spark were installed before findspark goes looking for them.
os.environ.update(
    {
        "JAVA_HOME": "/usr/lib/jvm/java-8-openjdk-amd64",
        "SPARK_HOME": "/content/spark-3.1.1-bin-hadoop3.2",
    }
)

# Must run before the pyspark import below: findspark puts PySpark on sys.path.
findspark.init()

from pyspark.sql import SparkSession  # noqa: E402

# Reuse the active session if one exists, otherwise start one under this application name.
session_builder = SparkSession.builder.appName("Read and Write Data Using PySpark")
spark = session_builder.getOrCreate()
spark

In [4]:
# Add a zero-based positional index column to a PySpark DataFrame.
from pyspark.sql.functions import monotonically_increasing_id, row_number
from pyspark.sql.window import Window

rows = [("Alice", 1), ("Bob", 2), ("Charlie", 3)]
df = spark.createDataFrame(rows, ["Name", "Value"])

# Order the window by Spark's generated increasing ids.
w = Window.orderBy(monotonically_increasing_id())

# Subtract 1 from the row number so the first row gets index 0.
zero_based_index = row_number().over(w) - 1
df = df.withColumn("index", zero_based_index)
df.show()

+-------+-----+-----+
|   Name|Value|index|
+-------+-----+-----+
|  Alice|    1|    0|
|    Bob|    2|    1|
|Charlie|    3|    2|
+-------+-----+-----+



Creating DataFrames; reading and writing CSV files

In [6]:
# Build a small DataFrame from in-memory (name, age) tuples.
data = [
    ("Alice", 34),
    ("Bob", 45),
    ("Cathy", 29),
]
columns = ["Name", "Age"]
df = spark.createDataFrame(data, schema=columns)
df.show()

+-----+---+
| Name|Age|
+-----+---+
|Alice| 34|
|  Bob| 45|
|Cathy| 29|
+-----+---+



In [7]:
# Read the sample CSV: take column names from the header row and let Spark infer column types.
csv_file = "/content/sample_data/california_housing_train.csv"
csv_reader = spark.read.option("header", True).option("inferSchema", True)
df_csv = csv_reader.csv(csv_file)

# Write the frame back out with a header row, replacing any previous output at this path.
# NOTE(review): Spark normally creates a directory of part files at this path rather than
# a single CSV file — confirm that is the intended layout.
csv_writer = df_csv.write.mode("overwrite").option("header", True)
csv_writer.csv("chtw.csv")

Reading and writing JSON files

In [10]:
json_file = "/content/sample_data/anscombe.json"

# Read the sample JSON file.
# By default spark.read.json expects JSON Lines (one complete object per line).
# This sample file appears to be a single pretty-printed JSON array, so without
# multiLine=True the bracket lines are treated as bad records and a
# `_corrupt_record` column shows up in the result — TODO confirm against the file contents.
df_json = spark.read.json(json_file, multiLine=True)

# Write the frame back out as JSON, replacing any previous output at this path.
df_json.write.json("a.json", mode="overwrite")

Creating a SQL temporary view in PySpark

In [11]:
# Sample people records, one dict per person; the dict keys become the DataFrame columns.
fields = ("name", "age", "city")
people = (
    ("Alice", 30, "New York"),
    ("Bob", 25, "San Francisco"),
    ("Charlie", 35, "Los Angeles"),
)
data = [dict(zip(fields, person)) for person in people]
df_sql = spark.createDataFrame(data)

In [13]:
# Select every column for people aged 25 or older.
qry = "select * from people where age >= 25"

# Expose df_sql to Spark SQL under the name "people" so the query above can refer to it.
df_sql.createOrReplaceTempView("people")

odf = spark.sql(qry)
odf.show()

+---+-------------+-------+
|age|         city|   name|
+---+-------------+-------+
| 30|     New York|  Alice|
| 25|San Francisco|    Bob|
| 35|  Los Angeles|Charlie|
+---+-------------+-------+



Converting a pandas DataFrame to a PySpark DataFrame

In [17]:
import pandas as pd

# Records to load into pandas, one dict per person.
data = [
    {"name": person, "age": years, "city": home}
    for person, years, home in (
        ("Alice", 30, "New York"),
        ("Bob", 25, "San Francisco"),
        ("Charlie", 35, "Los Angeles"),
    )
]

# Pin the column order explicitly instead of relying on dict key order.
column_order = ['name', 'age', 'city']
pandasDF = pd.DataFrame(data, columns=column_order)
print(pandasDF)

      name  age           city
0    Alice   30       New York
1      Bob   25  San Francisco
2  Charlie   35    Los Angeles


In [19]:
# Spark 3.1.1 converts a pandas frame by calling DataFrame.iteritems(), which pandas
# deprecated (the source of the warning this cell prints) and removed in pandas 2.0.
# Alias it to DataFrame.items() when it is missing so the conversion keeps working on
# newer pandas; on older pandas the attribute exists and this is a no-op.
if not hasattr(pd.DataFrame, "iteritems"):
    pd.DataFrame.iteritems = pd.DataFrame.items

# Convert the pandas frame to a Spark DataFrame, then show its inferred schema and rows.
sparkDF = spark.createDataFrame(pandasDF)
sparkDF.printSchema()
sparkDF.show()

  for column, series in pdf.iteritems():


root
 |-- name: string (nullable = true)
 |-- age: long (nullable = true)
 |-- city: string (nullable = true)

+-------+---+-------------+
|   name|age|         city|
+-------+---+-------------+
|  Alice| 30|     New York|
|    Bob| 25|San Francisco|
|Charlie| 35|  Los Angeles|
+-------+---+-------------+

