[Reference](https://python.plainenglish.io/big-data-meet-apache-spark-061897b8358d)

# Setting Up PySpark

In [1]:
!pip install pyspark



In [2]:
# Check the version from inside the kernel: a `!python` subprocess runs
# whichever interpreter is first on PATH, which may not be the kernel's.
import pyspark

print(pyspark.__version__)

3.5.3


In [3]:
from pyspark.sql import SparkSession

# Create the Spark session used by the following cells (or pick up the one
# that is already running in this kernel).
spark = (
    SparkSession.builder
    .appName("PySpark Example")
    .getOrCreate()
)

# Loading and Inspecting Data
## Loading Data

In [4]:
# Column names and in-memory rows for a tiny demonstration table.
columns = ["Name", "Age"]
data = [
    ("Alice", 29),
    ("Bob", 35),
    ("Cathy", 45),
]

# Turn the rows into a Spark DataFrame and print it as a text table.
df = spark.createDataFrame(data, schema=columns)
df.show()

+-----+---+
| Name|Age|
+-----+---+
|Alice| 29|
|  Bob| 35|
|Cathy| 45|
+-----+---+



# Python Spark Code:

In [11]:
from io import StringIO
import pandas as pd

In [16]:
# Inline CSV text: one header row followed by one row per city.
cities_csv_text = """
City,Population,Area,Population Density
New York,8419600,789.4,10661
Los Angeles,3980400,1213.9,3283
Chicago,2716000,606.1,4484
Houston,2328000,1625.2,1432
Phoenix,1690000,1340.6,1255
San Antonio,1547253,1194.3,1296
San Diego,1423851,964.5,1475
Dallas,1341000,880.1,1520
Austin,978908,437.2,2244
Miami,467963,143.1,3267
Mexico City,8918653,1485.0,6013
Lagos,9000000,1171.3,7674
Bangkok,8300000,1569.0,5297
Jakarta,10400000,662.3,15796
Manila,1780148,38.55,46190
Singapore,5612300,728.6,7700
"""

# Wrap the text in a file-like buffer so it can be parsed like a file on disk.
df = StringIO(cities_csv_text)

In [17]:
# Parse the in-memory CSV buffer into a pandas DataFrame (comma is the default
# separator). This rebinds `df` from the buffer to the parsed table, so the
# cell that builds the buffer must be re-run before running this one again.
df = pd.read_csv(df)

In [18]:
# Write the table to disk without the pandas index column; the Spark example
# further down reads this file back.
df.to_csv(path_or_buf="cities_population_density.csv", index=False)

In [21]:
# Echo `df` as the cell output.
# NOTE(review): the saved output below is a Spark DataFrame repr, while the
# preceding cells leave `df` bound to a pandas DataFrame -- the output looks
# stale from out-of-order execution; re-run from a fresh kernel to confirm.
df

DataFrame[City: string, Population: int, Area: double, Population Density: int]

In [23]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col

# Initialize the Spark session (reuses the running one if it exists)
spark = SparkSession.builder \
    .appName("City with Highest Population Density") \
    .getOrCreate()

try:
    # Load the CSV file into a Spark DataFrame; the first line supplies the
    # column names and the column types are inferred from the data.
    df = spark.read.csv("cities_population_density.csv", header=True, inferSchema=True)

    # Sort by density, highest first, and take the top row.
    # first() returns None when the DataFrame has no rows.
    city_with_highest_density = df.orderBy(col("Population Density").desc()).first()

    # Show the result
    if city_with_highest_density is None:
        print("No rows found in cities_population_density.csv.")
    else:
        print(f"The city with the highest population density is {city_with_highest_density['City']} with a population density of {city_with_highest_density['Population Density']} people/km².")
finally:
    # Stop the Spark session even if loading or querying failed
    spark.stop()

The city with the highest population density is Manila with a population density of 46190 people/km².


In [29]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col

# Start a Spark session again for the cells that follow (the previous example
# stopped its session when it finished).
spark = (
    SparkSession.builder
    .appName("City with Highest Population Density")
    .getOrCreate()
)

In [30]:
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import LinearRegression

# Toy regression data: four points where Label is exactly twice Feature.
columns = ["Feature", "Label"]
data = [(float(k), 2.0 * k) for k in range(1, 5)]

df = spark.createDataFrame(data, schema=columns)

# Pack the single input column into the vector column "Features", which the
# regression cell below uses as its featuresCol.
vector_assembler = VectorAssembler(
    inputCols=["Feature"],
    outputCol="Features",
)
transformed_data = vector_assembler.transform(df)

In [31]:
# Configure a linear regression that predicts "Label" from the assembled
# "Features" vector, then fit it on the prepared data.
lr = LinearRegression(
    labelCol="Label",
    featuresCol="Features",
)
model = lr.fit(transformed_data)

# Report the fitted line's parameters.
print(f"Intercept: {model.intercept}")
print(f"Coefficients: {model.coefficients}")

Intercept: 0.0
Coefficients: [2.0]
