[Reference](https://medium.com/@nomannayeem/pyspark-made-simple-from-basics-to-big-data-mastery-cb1d702968be)

# Creating Your First PySpark Application

In [1]:
from pyspark.sql import SparkSession

# Build (or reuse) the session that backs this first example.
spark = SparkSession.builder.appName("SimpleApp").getOrCreate()

# Three (name, age) records plus the labels for their two columns.
data = [
    ("Alice", 25),
    ("Bob", 30),
    ("Charlie", 35),
]
column_names = ["Name", "Age"]

# Turn the in-memory records into a DataFrame and print it as a table.
df = spark.createDataFrame(data, column_names)
df.show()

# The example is finished, so shut the session down.
spark.stop()

# Setting up PySpark: Your First Step to Big Data Processing

## Step 1: Install Java (JDK)

```
java version "17.0.1" // or the latest version
```

## Step 2: Install Apache Spark
Go to the [Apache Spark Downloads page](https://spark.apache.org/downloads.html), download a pre-built release, and extract the archive.

## Step 3: Install Python
```
python --version
```

## Step 4: Set Up PySpark Environment

### 1. Create a Virtual Environment:
```
python -m venv pyspark-env
```

### 2. Activate the Environment:

```
# macOS / Linux
source pyspark-env/bin/activate

# Windows
pyspark-env\Scripts\activate
```

### 3. Install PySpark:
```
pip install pyspark
```

## Step 5: Running PySpark with JupyterLab


### Install JupyterLab:
```
pip install jupyterlab
```

### Launch JupyterLab:
```
jupyter-lab
```

### Running Your First PySpark Code

In [2]:
from pyspark.sql import SparkSession

# Obtain a session purely to confirm that the installation works.
spark = (
    SparkSession.builder
    .appName("TestSparkSetup")
    .getOrCreate()
)

# Report which Spark release the session is running on.
spark_version = spark.version
print(spark_version)

# Shut the session down again; the examples that follow start their own.
spark.stop()

## Working with Resilient Distributed Datasets (RDDs)


### Creating an RDD

In [3]:
from pyspark import SparkConf, SparkContext

# Reuse the active SparkContext if there is one, otherwise start a local one
# named "RDD Example". getOrCreate() keeps this cell re-runnable: constructing
# SparkContext(...) directly fails while another context is still active.
rdd_conf = SparkConf().setMaster("local").setAppName("RDD Example")
sc = SparkContext.getOrCreate(rdd_conf)

# Create an RDD from a Python list
numbers_rdd = sc.parallelize([1, 2, 3, 4, 5])

# Perform an action to collect the results back into a Python list
collected_numbers = numbers_rdd.collect()

# Print the result
print(collected_numbers)

# NOTE: the context is intentionally left running. The "RDD Transformations
# and Actions" cells below reuse `sc` and `numbers_rdd`; stopping the context
# here would make them fail. Call sc.stop() once the RDD examples are done.

### RDD Transformations and Actions

In [4]:
# Distribute the integers 1 through 5 as an RDD.
numbers_rdd = sc.parallelize(list(range(1, 6)))

# Transformation: every element is replaced by twice its value.
doubled_rdd = numbers_rdd.map(lambda value: value * 2)

# Action: pull the doubled elements back to the driver and display them.
doubled_values = doubled_rdd.collect()
print(doubled_values)

In [5]:
# Ask Spark how many elements the RDD holds, then print that number.
element_count = numbers_rdd.count()
print(element_count)

## Working with DataFrames and DataFrame Operations

### Creating a DataFrame

In [6]:
from pyspark.sql import SparkSession

# Initialize (or reuse) a SparkSession
spark = SparkSession.builder \
    .appName("DataFrame Example") \
    .getOrCreate()

# Sample data: list of (name, age) tuples
data = [("Alice", 25), ("Bob", 30), ("Charlie", 35)]

# Define column names
columns = ["Name", "Age"]

# Create a DataFrame from the data
df = spark.createDataFrame(data, columns)

# Show the DataFrame content
df.show()

# NOTE: the session is intentionally left running. The "DataFrame Operations"
# cell below keeps querying `df`, which belongs to this session; stopping the
# session here would make every one of those calls fail.

### DataFrame Operations

In [7]:
# Projection: first a single column, then both columns.
df.select("Name").show()
df.select("Name", "Age").show()

# Row filter: keep only the people older than 30.
older_than_30 = df["Age"] > 30
df.filter(older_than_30).show()

# Aggregation: how many rows share each age value.
rows_per_age = df.groupBy("Age").count()
rows_per_age.show()

# Ordering: ascending by age, then descending by age.
df.orderBy("Age").show()
age_descending = df["Age"].desc()
df.orderBy(age_descending).show()

## Spark SQL and Querying DataFrames


In [8]:
from pyspark.sql import SparkSession

# Initialize (or reuse) a SparkSession
spark = SparkSession.builder \
    .appName("Spark SQL Example") \
    .getOrCreate()

# Sample data: list of (name, age) tuples
data = [("Alice", 25), ("Bob", 30), ("Charlie", 35)]

# Create a DataFrame
df = spark.createDataFrame(data, ["Name", "Age"])

# Register the DataFrame as a temporary view so SQL can refer to it as "people"
df.createOrReplaceTempView("people")

# Run a SQL query
result = spark.sql("SELECT Name, Age FROM people WHERE Age > 30")

# Show the result of the query
result.show()

# NOTE: the session is intentionally left running. The cells below keep using
# `spark`, the "people" view and `df`; stopping the session here would break
# all of them.

### Common SQL Queries in Spark SQL

In [9]:
# Projection: only the Name column of the "people" view.
names_query = "SELECT Name FROM people"
result = spark.sql(names_query)
result.show()

# Filter: everyone younger than 30.
under_30_query = "SELECT * FROM people WHERE Age < 30"
result = spark.sql(under_30_query)
result.show()

# Aggregation: number of rows per age.
age_counts_query = "SELECT Age, COUNT(*) as count FROM people GROUP BY Age"
result = spark.sql(age_counts_query)
result.show()

# A second data set of (name, job) pairs, exposed to SQL as the "jobs" view.
data_jobs = [
    ("Alice", "Engineer"),
    ("Bob", "Doctor"),
]
df_jobs = spark.createDataFrame(data_jobs, ["Name", "Job"])
df_jobs.createOrReplaceTempView("jobs")

# Join the two views on the person's name.
join_query = """
    SELECT p.Name, p.Age, j.Job
    FROM people p
    JOIN jobs j ON p.Name = j.Name
"""
result = spark.sql(join_query)
result.show()

## Advanced DataFrame Operations

### Handling Missing Data

#### Dropping Rows with Missing Data

In [10]:
# Discard every row in which at least one column is null ("any" is the default mode).
df_cleaned = df.dropna(how="any")
df_cleaned.show()

In [11]:
# Only nulls in the listed columns cause a row to be discarded.
required_columns = ["Age"]
df_cleaned = df.dropna(subset=required_columns)
df_cleaned.show()

#### Filling Missing Data

In [12]:
# A bare numeric fill value is applied to numeric columns only; columns of
# another type (such as the string column "Name") are left untouched.
default_number = 0
df_filled = df.fillna(default_number)
df_filled.show()

# A dict gives each listed column its own replacement value.
per_column_defaults = {"Age": 0, "Name": "Unknown"}
df_filled = df.fillna(per_column_defaults)
df_filled.show()

### Window Functions in PySpark

In [13]:
from pyspark.sql.functions import rank
from pyspark.sql.window import Window

# Window ordered by age. No partitionBy() is given, so all rows are ranked
# together in a single window.
window_spec = Window.orderBy("Age")

# Attach each row's rank within that ordering as a new "rank" column.
rank_by_age = rank().over(window_spec)
df_with_rank = df.withColumn("rank", rank_by_age)
df_with_rank.show()

### Optimizing DataFrame Queries


#### Caching DataFrames

In [14]:
# Cache the DataFrame
df.cache()

# Perform operations on the cached DataFrame
df.show()

#### Partitioning

In [15]:
# Repartition the DataFrame
df_repartitioned = df.repartition(4)
df_repartitioned.show()

## Advanced Topics in PySpark

### Streaming Data in PySpark

In [16]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import explode, split

# Session for the streaming example (an already-active session is reused).
spark = (
    SparkSession.builder
    .appName("StructuredNetworkWordCount")
    .getOrCreate()
)

# Streaming source: text read from a socket at localhost:9999.
socket_reader = (
    spark.readStream
    .format("socket")
    .option("host", "localhost")
    .option("port", 9999)
)
lines = socket_reader.load()

# Split each line on single spaces and emit one row per resulting word.
tokens = split(lines["value"], " ")
words = lines.select(explode(tokens).alias("word"))

# Running count of how often each word has been seen.
word_counts = words.groupBy("word").count()

# Start the query, writing the running counts to the console in "complete" output mode.
console_writer = (
    word_counts.writeStream
    .outputMode("complete")
    .format("console")
)
query = console_writer.start()

# Blocks this cell until the query terminates; cells below will not run
# until the query is stopped or the cell is interrupted.
query.awaitTermination()

### Building a Machine Learning Pipeline with PySpark

In [17]:
from pyspark.ml import Pipeline
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.feature import VectorAssembler

# Toy training set: each tuple is (label, feature1, feature2).
data = [
    (0, 1.0, 0.5),
    (1, 2.0, 1.5),
    (0, 0.5, 0.3),
    (1, 2.5, 1.7),
]
toy_columns = ["label", "feature1", "feature2"]
df = spark.createDataFrame(data, toy_columns)

# Stage 1: pack the two feature columns into a single vector column.
toy_feature_columns = ["feature1", "feature2"]
assembler = VectorAssembler(inputCols=toy_feature_columns, outputCol="features")

# Stage 2: logistic regression reading that vector column and the label column.
lr = LogisticRegression(featuresCol="features", labelCol="label")

# Chain both stages so they are fitted and applied as one unit.
pipeline_stages = [assembler, lr]
pipeline = Pipeline(stages=pipeline_stages)

# Fit on the toy data, then score the same rows.
model = pipeline.fit(df)
predictions = model.transform(df)

# Display each row's label, assembled features and predicted class.
predictions.select("label", "features", "prediction").show()

## Real-World Project: End-to-End Data Processing with PySpark

### Step 1: Load the Data

In [18]:
from pyspark.sql import SparkSession

# Session for the churn project (an already-active session is reused).
spark = SparkSession.builder.appName("Customer Churn Prediction").getOrCreate()

# Read the churn CSV: the first line is the header, and column types are inferred.
churn_csv_path = "customer_churn.csv"
data = spark.read.csv(churn_csv_path, header=True, inferSchema=True)

# Inspect the inferred schema, then preview the first five rows.
data.printSchema()
data.show(5)

### Step 2: Data Cleaning and Preprocessing

In [19]:
# Drop rows with missing values
data_cleaned = data.dropna()

# Convert categorical columns to numerical ones using StringIndexer
from pyspark.ml.feature import StringIndexer

indexer = StringIndexer(inputCol="PaymentMethod", outputCol="PaymentMethodIndex")
data_indexed = indexer.fit(data_cleaned).transform(data_cleaned)

# Show the processed data
data_indexed.show(5)

### Step 3: Feature Engineering

In [20]:
from pyspark.ml.feature import VectorAssembler

# Columns that are combined into the model's single "features" vector.
churn_feature_columns = [
    "Tenure",
    "MonthlyCharges",
    "TotalCharges",
    "PaymentMethodIndex",
]
assembler = VectorAssembler(inputCols=churn_feature_columns, outputCol="features")

# Append the assembled vector column to the indexed data.
data_prepared = assembler.transform(data_indexed)

# Preview the feature vector next to the "Churn" column.
data_prepared.select("features", "Churn").show(5)

### Step 4: Build and Train the Model

In [21]:
from pyspark.ml.classification import LogisticRegression

# Logistic regression that reads "features" and learns to predict "Churn".
lr = LogisticRegression(featuresCol="features", labelCol="Churn")

# Fit the model, then score the same DataFrame it was trained on.
model = lr.fit(data_prepared)
predictions = model.transform(data_prepared)

# Preview the label alongside the predicted class and class probabilities.
preview_columns = ["Churn", "prediction", "probability"]
predictions.select(*preview_columns).show(5)

### Step 5: Evaluate the Model

In [22]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator

# Evaluator that compares the "Churn" label with the predictions using area under the ROC curve.
evaluator = BinaryClassificationEvaluator(
    labelCol="Churn",
    metricName="areaUnderROC",
)

# Compute the metric on the predictions and report it.
roc_auc = evaluator.evaluate(predictions)
auc_report = f"Area under ROC curve: {roc_auc}"
print(auc_report)