## Academic exercise for study


### Environment installation and configuration

In [1]:
# Python interface to Spark
!pip install pyspark --quiet
# Installation and update of the PyDrive library, for interacting with Google Drive using Python.
!pip install -U -q PyDrive --quiet
# Install OpenJDK 8
!apt install openjdk-8-jdk-headless &> /dev/null
# Download the ngrok zip file to access the local server over the internet
!wget https://bin.equinox.io/c/4VmDzA7iaHb/ngrok-stable-linux-amd64.zip &> /dev/null
# Unzip the ngrok zip file
!unzip ngrok-stable-linux-amd64.zip &> /dev/null
# Starts ngrok, allowing HTTP traffic on port 4050
get_ipython().system_raw('./ngrok http 4050 &')
# Import the Python os module
import os
# Sets the JAVA_HOME environment variable to the location of Java
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m310.8/310.8 MB[0m [31m1.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone


### Setting up and starting a Spark session using the PySpark library

In [2]:
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession

conf = SparkConf().set('spark.ui.port', '4050').setAppName("housing").setMaster("local[2]")
sc = SparkSession.builder.config(conf=conf).getOrCreate()

### Load data

In [3]:
import urllib.request
# Path
url = 'https://raw.githubusercontent.com/E-man85/Big-Data/main/03-data/boston.csv'
# Local path to save the file
local_file_path = '/content/boston.csv'
# Download file from remote URL
urllib.request.urlretrieve(url, local_file_path)
df_spark = sc.read.csv("/content/boston.csv", inferSchema=True, header=True)
# View schema
df_spark.printSchema()

root
 |-- CRIM: double (nullable = true)
 |-- ZN: double (nullable = true)
 |-- INDUS: double (nullable = true)
 |-- CHAS: integer (nullable = true)
 |-- NOX: double (nullable = true)
 |-- RM: double (nullable = true)
 |-- AGE: double (nullable = true)
 |-- DIS: double (nullable = true)
 |-- RAD: integer (nullable = true)
 |-- TAX: double (nullable = true)
 |-- PTRATIO: double (nullable = true)
 |-- B: double (nullable = true)
 |-- LSTAT: double (nullable = true)
 |-- MEDV: double (nullable = true)



### Spark SQL operations

In [4]:
# Register the DataFrame as a temporary view
df_spark.createOrReplaceTempView("data_view")
# Perform Spark SQL operations
result = sc.sql("SELECT CRIM, INDUS, MEDV FROM data_view WHERE CRIM > 5.0")
# Show the result
result.show()

+-------+-----+----+
|   CRIM|INDUS|MEDV|
+-------+-----+----+
|8.98296| 18.1|17.8|
|5.20177| 18.1|22.7|
|13.5222| 18.1|23.1|
|5.66998| 18.1|50.0|
|6.53876| 18.1|50.0|
| 9.2323| 18.1|50.0|
|8.26725| 18.1|50.0|
|11.1081| 18.1|13.8|
|18.4982| 18.1|13.8|
|19.6091| 18.1|15.0|
| 15.288| 18.1|13.9|
|9.82349| 18.1|13.3|
|23.6482| 18.1|13.1|
|17.8667| 18.1|10.2|
|88.9762| 18.1|10.4|
|15.8744| 18.1|10.9|
|9.18702| 18.1|11.3|
|7.99248| 18.1|12.3|
|20.0849| 18.1| 8.8|
|16.8118| 18.1| 7.2|
+-------+-----+----+
only showing top 20 rows



### RDD operations

In [5]:
# Perform RDD operations
rdd_crim = df_spark.rdd
result_rdd = rdd_crim.filter(lambda row: row[0] > 5.0) \
    .map(lambda row: (row[0], row[2], row[13]))
# Show the result
for row in result_rdd.collect():
    print(row)

(8.98296, 18.1, 17.8)
(5.20177, 18.1, 22.7)
(13.5222, 18.1, 23.1)
(5.66998, 18.1, 50.0)
(6.53876, 18.1, 50.0)
(9.2323, 18.1, 50.0)
(8.26725, 18.1, 50.0)
(11.1081, 18.1, 13.8)
(18.4982, 18.1, 13.8)
(19.6091, 18.1, 15.0)
(15.288, 18.1, 13.9)
(9.82349, 18.1, 13.3)
(23.6482, 18.1, 13.1)
(17.8667, 18.1, 10.2)
(88.9762, 18.1, 10.4)
(15.8744, 18.1, 10.9)
(9.18702, 18.1, 11.3)
(7.99248, 18.1, 12.3)
(20.0849, 18.1, 8.8)
(16.8118, 18.1, 7.2)
(24.3938, 18.1, 10.5)
(22.5971, 18.1, 7.4)
(14.3337, 18.1, 10.2)
(8.15174, 18.1, 11.5)
(6.96215, 18.1, 15.1)
(5.29305, 18.1, 23.2)
(11.5779, 18.1, 9.7)
(8.64476, 18.1, 13.8)
(13.3598, 18.1, 12.7)
(8.71675, 18.1, 13.1)
(5.87205, 18.1, 12.5)
(7.67202, 18.1, 8.5)
(38.3518, 18.1, 5.0)
(9.91655, 18.1, 6.3)
(25.0461, 18.1, 5.6)
(14.2362, 18.1, 7.2)
(9.59571, 18.1, 12.1)
(24.8017, 18.1, 8.3)
(41.5292, 18.1, 8.5)
(67.9208, 18.1, 5.0)
(20.7162, 18.1, 11.9)
(11.9511, 18.1, 27.9)
(7.40389, 18.1, 17.2)
(14.4383, 18.1, 27.5)
(51.1358, 18.1, 15.0)
(14.0507, 18.1, 17.2)
(1

### Calculate descriptive statistics

In [7]:
#from pyspark.mllib.stat import Statistics
#rdd_stats = df_spark.rdd
#summary = Statistics.colStats(rdd_stats)
## Print the result
#print("Count:", summary.count())
#print("Mean:", summary.mean())
#print("Variance:", summary.variance())
#print("Non-zero values:", summary.numNonzeros())
#print("Maximum:", summary.max())
#print("Minimum:", summary.min())

In [8]:
# Calculate descriptive statistics
statistics_df = df_spark.describe()
# Show the result
statistics_df.show()

+-------+------------------+------------------+------------------+------------------+-------------------+------------------+------------------+-----------------+-----------------+------------------+------------------+------------------+------------------+------------------+
|summary|              CRIM|                ZN|             INDUS|              CHAS|                NOX|                RM|               AGE|              DIS|              RAD|               TAX|           PTRATIO|                 B|             LSTAT|              MEDV|
+-------+------------------+------------------+------------------+------------------+-------------------+------------------+------------------+-----------------+-----------------+------------------+------------------+------------------+------------------+------------------+
|  count|               506|               506|               506|               506|                506|               506|               506|              506|              

### Prepare the data for training

In [10]:
from pyspark.ml.feature import VectorAssembler
# Creating a VectorAssembler object
assembler = VectorAssembler(
    inputCols=["CRIM", "ZN", "INDUS", "CHAS", "NOX", "RM", "AGE", "DIS", "RAD", "TAX", "PTRATIO", "B", "LSTAT"],
    outputCol="features"
)
data_features = assembler.transform(df_spark).select("features", "MEDV")
# Split the data into training and testing sets
train_data, test_data = data_features.randomSplit([0.8, 0.2], seed=42)


### Linear Regression model

In [11]:
# Create a Linear Regression model
from pyspark.ml.regression import LinearRegression
lr = LinearRegression(labelCol="MEDV")
# Train the model
model = lr.fit(train_data)
# Make predictions on the test data
predictions = model.transform(test_data)
# Display predicted and actual prices
predictions.select("prediction", "MEDV").show()
# Evaluate the model
from pyspark.ml.evaluation import RegressionEvaluator
evaluator = RegressionEvaluator(labelCol="MEDV", metricName="rmse")
rmse = evaluator.evaluate(predictions)
print("Root Mean Squared Error (RMSE):", rmse)

+------------------+----+
|        prediction|MEDV|
+------------------+----+
| 27.48227401818597|22.0|
| 40.59821928572499|50.0|
|31.560171030407147|29.1|
| 30.50410754091404|32.9|
| 36.71084264945591|42.3|
|30.375442767948094|34.7|
| 19.59899004373794|18.5|
|34.091019647336616|34.9|
|30.499298725303746|23.5|
|31.920010397360237|27.9|
|26.093385173968073|24.8|
| 21.68551219244037|20.7|
|27.359893315366023|23.2|
| 28.33678346582046|28.0|
| 30.92596058671456|24.8|
|27.029279789450797|22.6|
|22.283609778223155|22.5|
|23.315067240745766|22.4|
| 30.18759562782573|30.5|
| 22.68581977785566|20.3|
+------------------+----+
only showing top 20 rows

Root Mean Squared Error (RMSE): 4.671806485171285


### KMeans model

In [12]:
from pyspark.ml.clustering import KMeans
# Create a KMeans model
kmeans = KMeans(k=5, seed=42)
# Train the model
model = kmeans.fit(data_features)
# Make predictions on the data
predictions = model.transform(data_features)
# Display the cluster assignment for each data point
predictions.select("features", "prediction").show()
# Evaluate the clustering model (if you have ground truth labels)
from pyspark.ml.evaluation import ClusteringEvaluator
evaluator = ClusteringEvaluator()
silhouette_score = evaluator.evaluate(predictions)
print("Silhouette Score:", silhouette_score)

+--------------------+----------+
|            features|prediction|
+--------------------+----------+
|[0.00632,18.0,2.3...|         4|
|[0.02731,0.0,7.07...|         2|
|[0.02729,0.0,7.07...|         2|
|[0.03237,0.0,2.18...|         2|
|[0.06905,0.0,2.18...|         2|
|[0.02985,0.0,2.18...|         2|
|[0.08829,12.5,7.8...|         4|
|[0.14455,12.5,7.8...|         4|
|[0.21124,12.5,7.8...|         4|
|[0.17004,12.5,7.8...|         4|
|[0.22489,12.5,7.8...|         4|
|[0.11747,12.5,7.8...|         4|
|[0.09378,12.5,7.8...|         4|
|[0.62976,0.0,8.14...|         4|
|[0.63796,0.0,8.14...|         4|
|[0.62739,0.0,8.14...|         4|
|[1.05393,0.0,8.14...|         4|
|[0.7842,0.0,8.14,...|         4|
|[0.80271,0.0,8.14...|         4|
|[0.7258,0.0,8.14,...|         4|
+--------------------+----------+
only showing top 20 rows

Silhouette Score: 0.6354883093822121
