In [2]:
#Realizamos las importaciones necesarias para empezar
import pyspark
from pyspark.sql import SparkSession
from pyspark.ml.regression import LinearRegression

In [3]:
# Create (or reuse) the Spark session for this notebook under the app name 'Tarea'.
spark = (
    SparkSession.builder
    .appName('Tarea')
    .getOrCreate()
)

21/10/31 21:15:09 WARN Utils: Your hostname, black19hunter resolves to a loopback address: 127.0.1.1; using 10.0.2.15 instead (on interface enp0s3)
21/10/31 21:15:09 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
21/10/31 21:15:11 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [4]:
# Load the CSV file that serves as our dataset: the first row is treated as the
# header and column types are inferred from the data.
df = spark.read.csv(
    "hyundai cruise ship info.csv",
    header=True,
    inferSchema=True,
)

                                                                                

In [5]:
# printSchema prints the DataFrame's columns with their data type and nullability
df.printSchema()

root
 |-- Ship_name: string (nullable = true)
 |-- Cruise_line: string (nullable = true)
 |-- Age: integer (nullable = true)
 |-- Tonnage: double (nullable = true)
 |-- passengers: double (nullable = true)
 |-- length: double (nullable = true)
 |-- cabins: double (nullable = true)
 |-- passenger_density: double (nullable = true)
 |-- crew: double (nullable = true)



In [13]:
# Show summary statistics (count, mean, stddev, min, ...) for the DataFrame's columns
df.describe().show()

                                                                                

+-------+---------+-----------+------------------+------------------+-----------------+-----------------+------------------+-----------------+-----------------+
|summary|Ship_name|Cruise_line|               Age|           Tonnage|       passengers|           length|            cabins|passenger_density|             crew|
+-------+---------+-----------+------------------+------------------+-----------------+-----------------+------------------+-----------------+-----------------+
|  count|      158|        158|               158|               158|              158|              158|               158|              158|              158|
|   mean| Infinity|       null|15.689873417721518| 71.28467088607599|18.45740506329114|8.130632911392404| 8.830000000000005|39.90094936708861|7.794177215189873|
| stddev|     null|       null| 7.615691058751413|37.229540025907866|9.677094775143416|1.793473548054825|4.4714172221480615| 8.63921711391542|3.503486564627034|
|    min|Adventure|    Azamara|   

In [10]:
#Importamos StringIndexer
#StringIndexer asigna una columna de cadenas de etiquetas a una columna ML de índices de etiquetas. 
#Si la columna de entrada es numérica, la convertimos en una cadena e indexamos los valores de la cadena.
from pyspark.ml.feature import StringIndexer

In [11]:
# Fit a StringIndexer on 'Cruise_line' and append the resulting numeric label
# index to the data as a new column named 'cruise_category'.
indexer = StringIndexer(inputCol='Cruise_line', outputCol='cruise_category')
indexer_model = indexer.fit(df)
indexed = indexer_model.transform(df)

# Peek at the first row to confirm the new column is present.
indexed.head(1)

                                                                                

[Row(Ship_name='Journey', Cruise_line='Azamara', Age=6, Tonnage=30.276999999999997, passengers=6.94, length=5.94, cabins=3.55, passenger_density=42.64, crew=3.55, cruise_category=16.0)]

In [12]:
# List the column names of the indexed DataFrame
indexed.columns

['Ship_name',
 'Cruise_line',
 'Age',
 'Tonnage',
 'passengers',
 'length',
 'cabins',
 'passenger_density',
 'crew',
 'cruise_category']

In [13]:
#importamos mas librerias necesarias
#VectorAssembler:Un transformador de características que fusiona varias columnas en una columna vectorial.
#Vectors:Métodos de fábrica para trabajar con vectores.

from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler

In [14]:
# Columns to merge into a single vector column. 'crew' is left out because it
# is the value the regression will predict.
# NOTE(review): 'cruise_category' is a StringIndexer label index (nominal), yet
# it is fed to the model as a plain number -- consider one-hot encoding; confirm
# this is intended.
feature_columns = [
    'Age',
    'Tonnage',
    'passengers',
    'length',
    'cabins',
    'passenger_density',
    'cruise_category',
]
assembler = VectorAssembler(inputCols=feature_columns, outputCol='features')

# transform returns the indexed data with the assembled 'features' column appended.
output = assembler.transform(indexed)

In [15]:
# List the existing columns;
# note that the 'features' column created in the previous step now appears
output.columns

['Ship_name',
 'Cruise_line',
 'Age',
 'Tonnage',
 'passengers',
 'length',
 'cabins',
 'passenger_density',
 'crew',
 'cruise_category',
 'features']

In [16]:
# Keep only the two columns the regression needs: the assembled feature vector
# and the 'crew' label, then display the first rows of that selection.
final_data = output.select('features', 'crew')
final_data.show()

+--------------------+----+
|            features|crew|
+--------------------+----+
|[6.0,30.276999999...|3.55|
|[6.0,30.276999999...|3.55|
|[26.0,47.262,14.8...| 6.7|
|[11.0,110.0,29.74...|19.1|
|[17.0,101.353,26....|10.0|
|[22.0,70.367,20.5...| 9.2|
|[15.0,70.367,20.5...| 9.2|
|[23.0,70.367,20.5...| 9.2|
|[19.0,70.367,20.5...| 9.2|
|[6.0,110.23899999...|11.5|
|[10.0,110.0,29.74...|11.6|
|[28.0,46.052,14.5...| 6.6|
|[18.0,70.367,20.5...| 9.2|
|[17.0,70.367,20.5...| 9.2|
|[11.0,86.0,21.24,...| 9.3|
|[8.0,110.0,29.74,...|11.6|
|[9.0,88.5,21.24,9...|10.3|
|[15.0,70.367,20.5...| 9.2|
|[12.0,88.5,21.24,...| 9.3|
|[20.0,70.367,20.5...| 9.2|
+--------------------+----+
only showing top 20 rows



In [17]:
# Import LinearRegression.
# LinearRegression: the learning objective is to minimize the specified loss function,
# with regularization. It supports two kinds of loss:
# 1. squaredError (also known as squared loss)
# 2. huber (a hybrid of squared error for relatively small errors and absolute error for
#    relatively large errors; the scale parameter is estimated from the training data)

from pyspark.ml.regression import LinearRegression

In [18]:
# Configure the linear regression estimator: 'crew' is the label to predict and
# 'features' (the estimator's default) is the input vector column.
model = LinearRegression(
    featuresCol='features',
    labelCol='crew',
)

In [20]:
# Split the data into training and test sets with 0.7 / 0.3 weights and a fixed
# seed, then summarize the training portion. No model is fitted or evaluated here.
splits = final_data.randomSplit([0.7, 0.3], seed=1234)
train_data, test_data = splits
train_data.describe().show()

+-------+------------------+
|summary|              crew|
+-------+------------------+
|  count|               104|
|   mean|7.8663461538461625|
| stddev| 3.340824285732549|
|    min|              0.88|
|    max|              19.1|
+-------+------------------+



In [21]:
# Fit the linear regression on the training split
trained_model = model.fit(train_data)

21/10/31 21:27:58 WARN Instrumentation: [19fe2429] regParam is zero, which might cause numerical instability and overfitting.
21/10/31 21:27:59 WARN InstanceBuilder$NativeBLAS: Failed to load implementation from:dev.ludovic.netlib.blas.JNIBLAS
21/10/31 21:27:59 WARN InstanceBuilder$NativeBLAS: Failed to load implementation from:dev.ludovic.netlib.blas.ForeignLinkerBLAS
21/10/31 21:27:59 WARN InstanceBuilder$NativeLAPACK: Failed to load implementation from:dev.ludovic.netlib.lapack.JNILAPACK


In [22]:
# Evaluate the trained model on the held-out test split and keep the result
results = trained_model.evaluate(test_data)

In [23]:
# Show the residuals from the test-set evaluation
results.residuals.show()



+--------------------+
|           residuals|
+--------------------+
|  0.3907617507368393|
| -1.3231760234284078|
|-0.00470018105311...|
| 0.42353817560602103|
|  -0.590615509738539|
| -0.5270445767004297|
| -0.4381410108920214|
|  1.0988796006491341|
|-0.09517679473458429|
|-0.38095805561306406|
|  -0.318811648310362|
| -0.3169629832857588|
|  0.9235881855682031|
|   0.714761719631607|
|   0.079251944729961|
|-0.49423686557798874|
|  0.8046967352123495|
|   0.796610479216028|
| 0.22168740717856217|
| -0.9818072806326583|
+--------------------+
only showing top 20 rows



In [27]:
# Report the test-set metrics: root mean squared error, mean absolute error
# and the R2 coefficient of determination.
metrics = (
    ('RMSE: {}', results.rootMeanSquaredError),
    ('MAE: {}', results.meanAbsoluteError),
    ('R2: {}', results.r2),
)
for template, value in metrics:
    print(template.format(value))

RMSE: 0.5772151741728508
MAE: 0.4648120224953151
R2: 0.9768121314673073
