# PySpark Environment
Set up the PySpark environment, including importing necessary libraries and initializing a Spark session.

In [1]:
import os

# Configure the Spark-related environment variables for this session in one step.
os.environ.update({
    # pyspark package installed inside the 'pyspark-env' conda environment
    'SPARK_HOME': 'C:/Users/saulr/anaconda3/envs/pyspark-env/Lib/site-packages/pyspark',
    # Driver front end: jupyter, started with the 'notebook' option
    'PYSPARK_DRIVER_PYTHON': 'jupyter',
    'PYSPARK_DRIVER_PYTHON_OPTS': 'notebook',
    # Python executable name used by PySpark
    'PYSPARK_PYTHON': 'python',
})

In [2]:
# Import PySpark and initialize SparkSession
from pyspark.sql import SparkSession

# Obtain the session for this notebook: getOrCreate() returns an already
# running session if there is one, otherwise it starts a new one named
# 'SparkDataFrame'.
spark = (
    SparkSession.builder
    .appName('SparkDataFrame')
    .getOrCreate()
)

# Read CSV file into DataFrame

## Read CSV with header

In [None]:
# Load the products CSV. header=True takes the column names from the first
# row; inferSchema=True asks Spark to derive the column types from the data.
df = spark.read.csv(
    '../data/products.csv',
    header=True,
    inferSchema=True,
)

# Inspect the column names and types Spark came up with
df.printSchema()

# Preview the loaded rows
df.show()

## Read CSV with an explicit schema definition

In [None]:
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, DoubleType


# Describe the expected columns as (name, Spark type) pairs and turn each
# pair into a nullable StructField.
schema = StructType([
    StructField(column_name, column_type, True)
    for column_name, column_type in (
        ("id", IntegerType()),
        ("name", StringType()),
        ("category", StringType()),
        ("quantity", IntegerType()),
        ("price", DoubleType()),
    )
])

# Load the products CSV using the schema above instead of inferring types;
# the first row is still treated as a header.
df = spark.read.csv(
    '../data/products.csv',
    header=True,
    schema=schema,
)

# Confirm the DataFrame carries the declared column types
df.printSchema()

# Preview the loaded rows
df.show()

# Read JSON File into DataFrame

## Single line JSON

In [None]:
# Load the single-line JSON products file with the reader's default settings
df = spark.read.json(
    '../data/products_singleline.json',
)

# Inspect the schema derived from the JSON documents
df.printSchema()

# Preview the loaded rows
df.show()

## Multi-line JSON

In [None]:
# Load the multi-line JSON products file; multiLine=True lets a single
# record span several lines of the file.
df = spark.read.json(
    '../data/products_multiline.json',
    multiLine=True,
)

# Inspect the schema derived from the JSON documents
df.printSchema()

# Preview the loaded rows
df.show()

## Creating a bar chart

In [None]:
import matplotlib.pyplot as plt

# Reload the single-line JSON products file so this cell does not depend on
# whichever DataFrame an earlier cell left in `df`.
df = spark.read.json(
    '../data/products_singleline.json',
)

# matplotlib works on local data, so collect the Spark DataFrame into pandas
pandas_df = df.toPandas()

# One bar per product: product name on the x axis, price as the bar height
plt.figure(figsize=(10, 6))
plt.bar(
    pandas_df['name'],
    pandas_df['price'],
    color='skyblue',
)

# Title and axis labels
plt.title('Product Prices')
plt.xlabel('Product Name')
plt.ylabel('Price')

# Rotate the tick labels to vertical, then let matplotlib adjust the layout
plt.xticks(rotation=90)
plt.tight_layout()

# Render the figure
plt.show()

# DataFrame Operations

## Loading the synthetic data into a DataFrame

In [None]:
# Parse stocks.txt with the CSV reader: the first row supplies the column
# names and Spark infers the column types from the data.
df = spark.read.csv(
    '../data/stocks.txt',
    header=True,
    inferSchema=True,
)

# Inspect the inferred column names and types
df.printSchema()

# Preview the loaded rows
df.show()

## Select: Choose specific columns

In [None]:
# Project the DataFrame down to the name, category and price columns and
# display the result
df.select(['name', 'category', 'price']).show()

## Filter: Apply conditions to filter rows

In [None]:
# Keep only the rows whose price is strictly greater than 100 and display them
df.filter(df.price > 100).show()

## GroupBy: Group data based on specific columns

In [None]:
# Number of rows in each category
(
    df.groupBy('category')
    .count()
    .show()
)

# Total price per category; the dict maps a column name to the aggregate
# function to apply to it (other aggregates such as avg, max or min work the
# same way)
(
    df.groupBy('category')
    .agg({'price': 'sum'})
    .show()
)

## Join: Combine multiple DataFrames based on specified columns

In [None]:
# Second DataFrame for the join: the rows of df priced above 100
df2 = df.filter(df.price > 100)

# Join df with df2 on their shared 'category' column and display the result
df.join(df2, on='category').show()

## WithColumn: Add new calculated columns

In [None]:
# Derive a 'price_after_tax' column equal to price * 1.1 and display the
# result; df itself is not reassigned
df.withColumn(
    'price_after_tax',
    df['price'] * 1.1,
).show()
