# PySpark Environment
Set up the PySpark environment, including importing necessary libraries and initializing a Spark session.

In [1]:
import os

# Configure PySpark through environment variables in a single update.
# NOTE: SPARK_HOME is a machine-specific path to the pyspark package inside
# a local conda environment; adjust it when running on another machine.
os.environ.update({
    'SPARK_HOME': 'C:/Users/saulr/anaconda3/envs/pyspark-env/Lib/site-packages/pyspark',
    'PYSPARK_DRIVER_PYTHON': 'jupyter',
    'PYSPARK_DRIVER_PYTHON_OPTS': 'notebook',
    'PYSPARK_PYTHON': 'python',
})

In [2]:
# Bring in the SparkSession entry point
from pyspark.sql import SparkSession

# Obtain a session named 'SparkDataFrame'; getOrCreate() reuses an existing
# session if one is already active, otherwise it starts a new one
spark = (
    SparkSession.builder
    .appName('SparkDataFrame')
    .getOrCreate()
)

# Read CSV file into DataFrame

## Read CSV with header

In [None]:
# Load products.csv: the first row is treated as the header and the column
# types are inferred from the data
df = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv('../data/products.csv')
)

# Display the schema of the resulting DataFrame
df.printSchema()

# Preview the DataFrame contents
df.show()

## Read CSV with an explicit schema definition

In [None]:
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, DoubleType


# Build the schema one column at a time; every field is declared nullable
schema = (
    StructType()
    .add("id", IntegerType(), True)
    .add("name", StringType(), True)
    .add("category", StringType(), True)
    .add("quantity", IntegerType(), True)
    .add("price", DoubleType(), True)
)

# Load the CSV with the declared schema applied instead of inferring one
df = (
    spark.read
    .schema(schema)
    .option('header', True)
    .csv('../data/products.csv')
)

# Display the schema of the resulting DataFrame
df.printSchema()

# Preview the DataFrame contents
df.show()

# Read JSON file into DataFrame

## Single-line JSON

In [None]:
# Load the single-line JSON file through the generic format/load reader API
df = spark.read.format('json').load('../data/products_singleline.json')

# Display the schema of the resulting DataFrame
df.printSchema()

# Preview the DataFrame contents
df.show()

## Multi-line JSON

In [None]:
# Load the multi-line JSON file; the multiLine option is enabled on the
# reader before the file is parsed
df = (
    spark.read
    .option('multiLine', True)
    .json('../data/products_multiline.json')
)

# Display the schema of the resulting DataFrame
df.printSchema()

# Preview the DataFrame contents
df.show()