# Auto reload modules

In [1]:
%load_ext autoreload
%autoreload 2

# Directories

In [2]:
import fs
import os

In [3]:
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = "../../spark-3.5.3-bin-hadoop3"

In [4]:
import findspark
findspark.init()

In [18]:
import pyspark
from pyspark.sql import functions as F
from pyspark.sql import SparkSession

from pyspark.ml.feature import (VectorAssembler,
                                OneHotEncoder,
                                StringIndexer,
                                RFormula)

from pyspark.ml.regression import LinearRegression
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml import Pipeline

from pyspark.sql.functions import regexp_replace, col

In [6]:
spark = (SparkSession.builder
  .master('local[*]')
  .appName('hello_world_spark')
  .getOrCreate())

cores = spark._jsc.sc().getExecutorMemoryStatus().keySet().size()
print("You are working with", cores, "core(s)")
spark

24/10/27 21:50:28 WARN Utils: Your hostname, ThinkPad-X1-Nano resolves to a loopback address: 127.0.1.1; using 192.168.68.130 instead (on interface wlp0s20f3)
24/10/27 21:50:28 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/10/27 21:50:30 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
24/10/27 21:50:31 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


You are working with 1 core(s)


In [7]:
from IPython.core.display import HTML
display(HTML("<style>pre { white-space: pre !important; }</style>"))

In [8]:
RAW_DIR = fs.open_fs("../../data/raw")
CSV_DIR = RAW_DIR.getsyspath("sf-airbnb.csv")

In [9]:
filePath = CSV_DIR

rawDF = spark.read.csv(filePath,
                       header="true",
                       inferSchema="true",
                       multiLine="true",
                       escape='"')

rawDF.show(3, truncate=False)

24/10/27 21:50:53 WARN SparkStringUtils: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.
                                                                                

+----+---------------------------------+--------------+------------+------------------------------------------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [11]:
rawDF.printSchema()

root
 |-- id: integer (nullable = true)
 |-- listing_url: string (nullable = true)
 |-- scrape_id: long (nullable = true)
 |-- last_scraped: date (nullable = true)
 |-- name: string (nullable = true)
 |-- summary: string (nullable = true)
 |-- space: string (nullable = true)
 |-- description: string (nullable = true)
 |-- experiences_offered: string (nullable = true)
 |-- neighborhood_overview: string (nullable = true)
 |-- notes: string (nullable = true)
 |-- transit: string (nullable = true)
 |-- access: string (nullable = true)
 |-- interaction: string (nullable = true)
 |-- house_rules: string (nullable = true)
 |-- thumbnail_url: string (nullable = true)
 |-- medium_url: string (nullable = true)
 |-- picture_url: string (nullable = true)
 |-- xl_picture_url: string (nullable = true)
 |-- host_id: integer (nullable = true)
 |-- host_url: string (nullable = true)
 |-- host_name: string (nullable = true)
 |-- host_since: date (nullable = true)
 |-- host_location: string (nullable = t

In [20]:
rawDF.select("price").show(5)

+-------+
|  price|
+-------+
|$170.00|
|$235.00|
| $65.00|
| $65.00|
|$785.00|
+-------+
only showing top 5 rows



In [37]:
rawDF.select("price").printSchema()

root
 |-- price: integer (nullable = true)



In [25]:
rawDF = rawDF.withColumn("price", regexp_replace(col("price"), r"[$\s]", "").cast("int"))

In [39]:
df_clean = rawDF.select("price", "bedrooms").dropna()

In [41]:
trainDF, testDF = df_clean.randomSplit([.8, .2], seed=42)

print(f"Tenemos {trainDF.cache().count()} muestras en el training set \
y {testDF.cache().count()} en el test set.")

                                                                                

Tenemos 5709 muestras en el training set y 1348 en el test set.


                                                                                

In [42]:
(trainDF
  .select("price", "bedrooms")
  .summary()
  .show())

+-------+------------------+-----------------+
|summary|             price|         bedrooms|
+-------+------------------+-----------------+
|  count|              5709|             5709|
|   mean|190.79471010684884|1.329129444736381|
| stddev|141.73267040414058|0.911901162647597|
|    min|                 0|                0|
|    25%|                99|                1|
|    50%|               150|                1|
|    75%|               230|                2|
|    max|               999|               14|
+-------+------------------+-----------------+



In [43]:
vecAssembler = VectorAssembler(inputCols=["bedrooms"],
                               outputCol="features")

lr = LinearRegression(featuresCol="features",
                      labelCol="price")

In [44]:
pipeline = Pipeline(stages=[vecAssembler, lr])

pipelineModel = pipeline.fit(trainDF)

24/10/27 22:18:01 WARN Instrumentation: [affbc6f3] regParam is zero, which might cause numerical instability and overfitting.


In [45]:
predDF = pipelineModel.transform(testDF)

(predDF
    .select("bedrooms",
            "features",
            "price",
            F.round("prediction",2))
    .show(10))

+--------+--------+-----+--------------------+
|bedrooms|features|price|round(prediction, 2)|
+--------+--------+-----+--------------------+
|       1|   [1.0]|   10|              161.37|
|       1|   [1.0]|   27|              161.37|
|       1|   [1.0]|   28|              161.37|
|       1|   [1.0]|   30|              161.37|
|       1|   [1.0]|   30|              161.37|
|       1|   [1.0]|   30|              161.37|
|       1|   [1.0]|   32|              161.37|
|       1|   [1.0]|   33|              161.37|
|       1|   [1.0]|   35|              161.37|
|       1|   [1.0]|   35|              161.37|
+--------+--------+-----+--------------------+
only showing top 10 rows

