# Imports

In [1]:
from google.colab import drive
drive.mount("/content/gdrive")  

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [2]:
import tensorflow as tf
device_name = tf.test.gpu_device_name()
if device_name != '/device:GPU:0':
  raise SystemError('GPU no encontrada')
print('Encontrada GPU: {}'.format(device_name))

Encontrada GPU: /device:GPU:0


# Acesso a los datos

In [3]:
%cd "/content/gdrive/Shareddrives/Cortesanas de IA/Reto/Modelado_v2"
!ls

/content/gdrive/Shareddrives/Cortesanas de IA/Reto/Modelado_v2
antenas.csv		   new_train.csv
datasets		   obtain_data_Chile.ipynb
final.csv		   spark-3.2.2-bin-hadoop3.2
Get_%_of_homeoffice.ipynb  spark-3.2.2-bin-hadoop3.2.tgz
Getting_model.ipynb	   spark-3.2.2-bin-hadoop3.2.tgz.1
HomeOffice.csv		   SparkML.ipynb
merge_data_Chile.ipynb


# PySpark

## Configuracion

In [4]:
#Bibliotecas para poder trabajar con Spark
!sudo apt update
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
#!wget -q https://downloads.apache.org/spark/spark-3.2.2/spark-3.2.2-bin-hadoop3.2.tgz
!tar xf spark-3.2.2-bin-hadoop3.2.tgz  
#Configuración de Spark con Python
!pip install -q findspark
!pip install pyspark

#Estableciendo variable de entorno
import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = "spark-3.2.2-bin-hadoop3.2"

#Buscando e inicializando la instalación de Spark
import findspark
findspark.init()
findspark.find()

[33m0% [Working][0m            Hit:1 https://cloud.r-project.org/bin/linux/ubuntu bionic-cran40/ InRelease
[33m0% [Connecting to archive.ubuntu.com (185.125.190.36)] [Connecting to security.[0m[33m0% [1 InRelease gpgv 3,626 B] [Connecting to archive.ubuntu.com (185.125.190.36[0m                                                                               Ign:2 https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64  InRelease
[33m0% [1 InRelease gpgv 3,626 B] [Connecting to archive.ubuntu.com (185.125.190.36[0m                                                                               Hit:3 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64  InRelease
Hit:4 https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64  Release
Hit:5 http://ppa.launchpad.net/c2d4u.team/c2d4u4.0+/ubuntu bionic InRelease
Hit:6 http://security.ubuntu.com/ubuntu bionic-security InRelease
Hit:7 http://archiv

'spark-3.2.2-bin-hadoop3.2'

Now, we can import SparkSession from pyspark.sql and create a SparkSession, which is the entry point to Spark.

You can give a name to the session using appName() and add some configurations with config() if you wish.

In [5]:
from pyspark.sql import SparkSession

spark = SparkSession.builder\
        .master("local")\
        .appName("SparkML")\
        .getOrCreate()

In [6]:
spark

## Loading data into Spark

In [7]:
df = spark.read.csv("final.csv", header=True, inferSchema=True)

## Exploring Data

In [8]:
# Show column detail
df.printSchema()

root
 |-- Unnamed: 0: string (nullable = true)
 |-- %_of_homeoffice: double (nullable = true)
 |-- Cantidad de Conexiones de internet fijas: double (nullable = true)
 |-- Numero de empresas sin ventas: double (nullable = true)
 |-- Cantidad de trabajadores en empresas sin ventas: double (nullable = true)
 |-- Numero de empresas Micro 1: double (nullable = true)
 |-- Cantidad de trabajadores en empresas Micro 1: double (nullable = true)
 |-- Numero de empresas Pequeña 1: double (nullable = true)
 |-- Cantidad de trabajadores en empresas Pequeña 1: double (nullable = true)
 |-- Numero de empresas Mediana 1: double (nullable = true)
 |-- Cantidad de trabajadores en empresas Mediana 1: double (nullable = true)
 |-- Numero de empresas Grande 1: double (nullable = true)
 |-- Cantidad de trabajadores en empresas Grande 1: double (nullable = true)
 |-- Consumo de Electricidad en Servicios Comunitarios en M$: string (nullable = true)
 |-- Consumo de Electricidad en M$: string (nullable = true)


In [9]:
# Display Rows
df.show(5)

+----------------+------------------+----------------------------------------+-----------------------------+-----------------------------------------------+--------------------------+--------------------------------------------+----------------------------+----------------------------------------------+----------------------------+----------------------------------------------+---------------------------+---------------------------------------------+-------------------------------------------------------+-----------------------------+------------------------------------------------------+---------------------+------------------------+-------------------------------------+----------------------------------------+
|      Unnamed: 0|   %_of_homeoffice|Cantidad de Conexiones de internet fijas|Numero de empresas sin ventas|Cantidad de trabajadores en empresas sin ventas|Numero de empresas Micro 1|Cantidad de trabajadores en empresas Micro 1|Numero de empresas Pequeña 1|Cantidad de trabajado

In [10]:
# Describing the columns
df.describe().show()

+-------+----------+------------------+----------------------------------------+-----------------------------+-----------------------------------------------+--------------------------+--------------------------------------------+----------------------------+----------------------------------------------+----------------------------+----------------------------------------------+---------------------------+---------------------------------------------+-------------------------------------------------------+-----------------------------+------------------------------------------------------+---------------------+------------------------+-------------------------------------+----------------------------------------+
|summary|Unnamed: 0|   %_of_homeoffice|Cantidad de Conexiones de internet fijas|Numero de empresas sin ventas|Cantidad de trabajadores en empresas sin ventas|Numero de empresas Micro 1|Cantidad de trabajadores en empresas Micro 1|Numero de empresas Pequeña 1|Cantidad de traba

## Preprocessing Data

In [11]:
from pyspark.sql.functions import col
df = df.withColumn("Consumo de Electricidad en Servicios Comunitarios en M$",col("Consumo de Electricidad en Servicios Comunitarios en M$").cast('double'))
df = df.withColumn("Consumo de Electricidad en M$",col("Consumo de Electricidad en M$").cast('double'))
df = df.withColumn("Consumo de Electricidad Dependencias Municipales en M$",col("Consumo de Electricidad Dependencias Municipales en M$").cast('double'))

In [12]:
# Show column detail
df.printSchema()

root
 |-- Unnamed: 0: string (nullable = true)
 |-- %_of_homeoffice: double (nullable = true)
 |-- Cantidad de Conexiones de internet fijas: double (nullable = true)
 |-- Numero de empresas sin ventas: double (nullable = true)
 |-- Cantidad de trabajadores en empresas sin ventas: double (nullable = true)
 |-- Numero de empresas Micro 1: double (nullable = true)
 |-- Cantidad de trabajadores en empresas Micro 1: double (nullable = true)
 |-- Numero de empresas Pequeña 1: double (nullable = true)
 |-- Cantidad de trabajadores en empresas Pequeña 1: double (nullable = true)
 |-- Numero de empresas Mediana 1: double (nullable = true)
 |-- Cantidad de trabajadores en empresas Mediana 1: double (nullable = true)
 |-- Numero de empresas Grande 1: double (nullable = true)
 |-- Cantidad de trabajadores en empresas Grande 1: double (nullable = true)
 |-- Consumo de Electricidad en Servicios Comunitarios en M$: double (nullable = true)
 |-- Consumo de Electricidad en M$: double (nullable = true)


In [13]:
df = df.na.fill(0)

In [14]:
# Describing the columns
df.describe().show()

+-------+----------+------------------+----------------------------------------+-----------------------------+-----------------------------------------------+--------------------------+--------------------------------------------+----------------------------+----------------------------------------------+----------------------------+----------------------------------------------+---------------------------+---------------------------------------------+-------------------------------------------------------+-----------------------------+------------------------------------------------------+---------------------+------------------------+-------------------------------------+----------------------------------------+
|summary|Unnamed: 0|   %_of_homeoffice|Cantidad de Conexiones de internet fijas|Numero de empresas sin ventas|Cantidad de trabajadores en empresas sin ventas|Numero de empresas Micro 1|Cantidad de trabajadores en empresas Micro 1|Numero de empresas Pequeña 1|Cantidad de traba

## Models

In [15]:
modelsResults = [['Model','RMSE','MSE','MAE','R2']]

In [16]:
# Imports
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.feature import VectorAssembler

In [17]:
# Using VectorAssembler
assembler = VectorAssembler(inputCols = ['Cantidad de Conexiones de internet fijas', 'Numero de empresas sin ventas', 'Cantidad de trabajadores en empresas sin ventas', 'Numero de empresas Micro 1', 'Cantidad de trabajadores en empresas Micro 1', 'Numero de empresas Pequeña 1','Cantidad de trabajadores en empresas Pequeña 1','Numero de empresas Mediana 1','Cantidad de trabajadores en empresas Mediana 1','Numero de empresas Grande 1','Cantidad de trabajadores en empresas Grande 1','Consumo de Electricidad en Servicios Comunitarios en M$','Consumo de Electricidad en M$','Consumo de Electricidad Dependencias Municipales en M$','Empresas informaticas','Empresas no informaticas','Trabajadores en empresas informaticas','Trabajadores en empresas no informaticas'], outputCol='features')
output = assembler.transform(df)

In [18]:
output.show()

+-----------------+------------------+----------------------------------------+-----------------------------+-----------------------------------------------+--------------------------+--------------------------------------------+----------------------------+----------------------------------------------+----------------------------+----------------------------------------------+---------------------------+---------------------------------------------+-------------------------------------------------------+-----------------------------+------------------------------------------------------+---------------------+------------------------+-------------------------------------+----------------------------------------+--------------------+
|       Unnamed: 0|   %_of_homeoffice|Cantidad de Conexiones de internet fijas|Numero de empresas sin ventas|Cantidad de trabajadores en empresas sin ventas|Numero de empresas Micro 1|Cantidad de trabajadores en empresas Micro 1|Numero de empresas Pequeña 

In [19]:
# Selecting the input and output columns for modeling
finalised_data = output.select('%_of_homeoffice','features')
finalised_data.show()

+------------------+--------------------+
|   %_of_homeoffice|            features|
+------------------+--------------------+
| 61.26060706653665|[123359.0,13507.0...|
|58.401903053464885|[171687.0,15587.0...|
| 59.68975311339305|[5635.0,306.0,804...|
| 59.54130366327281|[52799.0,1410.0,1...|
| 60.14623850444746|[39371.0,2102.0,2...|
| 59.12806539509537|[41276.0,1288.0,1...|
|  61.9877800407332|[22192.0,981.0,19...|
|  59.7819850831899|[61302.0,1827.0,2...|
| 62.19272369714847|[28820.0,1509.0,2...|
|62.821141057052856|[29817.0,976.0,33...|
| 61.35646687697161|[27202.0,2248.0,2...|
| 62.48118269193654|[158938.0,4554.0,...|
| 61.83663283979371|[21439.0,1152.0,9...|
| 61.85176414870945|[51909.0,1788.0,3...|
|53.756994404476416|[873.0,229.0,1565...|
|   60.703081232493|[115763.0,3939.0,...|
|  58.6189683860233|[17607.0,585.0,11...|
| 61.22916666666666|[34864.0,3078.0,4...|
| 63.52806995311114|[13601.0,495.0,46...|
| 62.53512401265571|[153696.0,4301.0,...|
+------------------+--------------

### Linear regression

In [20]:
#Imports
from pyspark.ml.regression import LinearRegression

In [21]:
# Splitting the data
train, test = finalised_data.randomSplit([0.8, 0.2])

In [22]:
# Create Model
lr = LinearRegression(labelCol="%_of_homeoffice", featuresCol="features")

In [23]:
# Fit the model
lrModel = lr.fit(train)

In [24]:
# Predict
lr_predictions = lrModel.evaluate(test)

In [25]:
lr_predictions.predictions.show()



+-----------------+--------------------+------------------+
|  %_of_homeoffice|            features|        prediction|
+-----------------+--------------------+------------------+
|46.66666666666666|[52.0,76.0,379.0,...|61.740804130184536|
| 58.6189683860233|[17607.0,585.0,11...| 60.72071127977189|
|58.63636363636363|[6607.0,922.0,800...| 60.15586008094017|
|59.34154175588865|[23731.0,886.0,66...|60.459300963105996|
|59.54130366327281|[52799.0,1410.0,1...| 60.00811110693684|
|59.68975311339305|[5635.0,306.0,804...| 60.84052604945496|
|60.14623850444746|[39371.0,2102.0,2...| 58.25051443914035|
|61.83663283979371|[21439.0,1152.0,9...| 60.91789553865009|
| 63.0410198416778|[20606.0,1290.0,2...| 60.21687223118968|
+-----------------+--------------------+------------------+



In [26]:
# Evaluate
eval = RegressionEvaluator(labelCol="%_of_homeoffice", predictionCol="prediction", metricName="rmse")

# RMSE
rmse = eval.evaluate(lr_predictions.predictions)
print("RMSE: %.3f" % rmse)

#  MSE
mse = eval.evaluate(lr_predictions.predictions, {eval.metricName: "mse"})
print("MSE: %.3f" % mse)

# MAE
mae = eval.evaluate(lr_predictions.predictions, {eval.metricName: "mae"})
print("MAE: %.3f" % mae)

# R2
r2 = eval.evaluate(lr_predictions.predictions, {eval.metricName: "r2"})
print("r2: %.3f" %r2)

RMSE: 5.262
MSE: 27.685
MAE: 3.008
r2: -0.403


In [27]:
# Save model results
result_lr= ['Linear regression', rmse, mse, mae, r2]
modelsResults.append(result_lr)

### Gradient-Boosted Trees (GBTs)

#### Modelo inicial

In [28]:
# Imports
from pyspark.ml.regression import GBTRegressor

In [29]:
# Splitting the data
train, test = finalised_data.randomSplit([0.8, 0.2])

In [30]:
# Create Model
gb = GBTRegressor(labelCol="%_of_homeoffice", featuresCol="features")

In [31]:
# Fit the model
gbModel = gb.fit(train)

In [32]:
# Predict
gb_predictions = gbModel.transform(test)

In [33]:
gb_predictions.show()

+------------------+--------------------+------------------+
|   %_of_homeoffice|            features|        prediction|
+------------------+--------------------+------------------+
|58.401903053464885|[171687.0,15587.0...| 64.72363513973845|
| 60.14623850444746|[39371.0,2102.0,2...|59.521715941200895|
|  60.8017608766918|[73198.0,11876.0,...| 64.72363513973845|
| 61.05184985147178|[55568.0,1918.0,3...| 59.64077250093862|
| 61.05769230769231|[13844.0,921.0,45...| 60.70904204525256|
| 61.26060706653665|[123359.0,13507.0...| 64.72363513973845|
| 62.00632684342693|[69448.0,2798.0,4...| 62.12949580089425|
| 62.53512401265571|[153696.0,4301.0,...| 60.67469979957507|
+------------------+--------------------+------------------+



In [34]:
# Evaluate
eval = RegressionEvaluator(labelCol="%_of_homeoffice", predictionCol="prediction", metricName="rmse")

# RMSE
rmse = eval.evaluate(gb_predictions)
print("RMSE: %.3f" % rmse)

#  MSE
mse = eval.evaluate(gb_predictions, {eval.metricName: "mse"})
print("MSE: %.3f" % mse)

# MAE
mae = eval.evaluate(gb_predictions, {eval.metricName: "mae"})
print("MAE: %.3f" % mae)

# R2
r2 = eval.evaluate(gb_predictions, {eval.metricName: "r2"})
print("r2: %.3f" %r2)

RMSE: 3.027
MSE: 9.165
MAE: 2.259
r2: -5.730


In [35]:
# Save model results
result_gb= ['GBT 1.0', rmse, mse, mae, r2]
modelsResults.append(result_gb)

#### Modelo  Final

In [36]:
# Splitting the data
train, test = finalised_data.randomSplit([0.8, 0.2])

In [37]:
# Create Model
gb2 = GBTRegressor(labelCol="%_of_homeoffice", featuresCol="features", maxDepth = 21, maxIter = 40, maxBins = 18)

In [38]:
# Fit the model
gbModel2 = gb2.fit(train)

In [39]:
# Predict
gb_predictions2 = gbModel2.transform(test)

In [40]:
gb_predictions2.show()

+------------------+--------------------+------------------+
|   %_of_homeoffice|            features|        prediction|
+------------------+--------------------+------------------+
| 55.53004209720628|[3636.0,394.0,149...| 59.92915190664722|
| 59.34154175588865|[23731.0,886.0,66...| 60.40983606557377|
| 60.14623850444746|[39371.0,2102.0,2...| 61.85176414870943|
|   60.703081232493|[115763.0,3939.0,...| 61.22916666666666|
|  60.8017608766918|[73198.0,11876.0,...| 61.26060706653664|
| 62.48118269193654|[158938.0,4554.0,...|61.229166666666636|
| 62.64820592823713|[98545.0,4110.0,4...| 61.26060706653664|
|62.821141057052856|[29817.0,976.0,33...| 62.30218093063537|
| 62.85784494409633|[23271.0,804.0,11...| 60.40983606557377|
| 70.12269938650307|[198.0,137.0,69.0...| 73.49726775956285|
+------------------+--------------------+------------------+



In [41]:
# Evaluate
eval = RegressionEvaluator(labelCol="%_of_homeoffice", predictionCol="prediction", metricName="rmse")

# RMSE
rmse = eval.evaluate(gb_predictions2)
print("RMSE: %.3f" % rmse)

#  MSE
mse = eval.evaluate(gb_predictions2, {eval.metricName: "mse"})
print("MSE: %.3f" % mse)

# MAE
mae = eval.evaluate(gb_predictions2, {eval.metricName: "mae"})
print("MAE: %.3f" % mae)

# R2
r2 = eval.evaluate(gb_predictions2, {eval.metricName: "r2"})
print("r2: %.3f" %r2)

RMSE: 2.122
MSE: 4.503
MAE: 1.714
r2: 0.634


In [42]:
# Save model results
result_gb2= ['GBT 2.0', rmse, mse, mae, r2]
modelsResults.append(result_gb2)

In [43]:
modelsResults

[['Model', 'RMSE', 'MSE', 'MAE', 'R2'],
 ['Linear regression',
  5.261611960192971,
  27.684560419645717,
  3.007702818413904,
  -0.4027966207797127],
 ['GBT 1.0',
  3.027313811635314,
  9.164628914117934,
  2.259309721161179,
  -5.730423781702128],
 ['GBT 2.0',
  2.122081449802578,
  4.5032296795962115,
  1.713901365237674,
  0.6335075638087074]]

In [44]:
def Mostrar():
    print("Resultados de lo modelos:")
    for fila in modelsResults:
        for valor in fila:
            print("\t", valor, end=" ")
        print()

In [46]:
Mostrar()

Resultados de lo modelos:
	 Model 	 RMSE 	 MSE 	 MAE 	 R2 
	 Linear regression 	 5.261611960192971 	 27.684560419645717 	 3.007702818413904 	 -0.4027966207797127 
	 GBT 1.0 	 3.027313811635314 	 9.164628914117934 	 2.259309721161179 	 -5.730423781702128 
	 GBT 2.0 	 2.122081449802578 	 4.5032296795962115 	 1.713901365237674 	 0.6335075638087074 
