In [None]:
# instalar la biblioteca PySpark.
!pip install pyspark



In [None]:
# Create a PySpark session.
from pyspark.sql import SparkSession

# Build a new session:
#   master("local")                  -> run Spark locally
#   appName("Colab")                 -> set the application name
#   config('spark.ui.port', '4050')  -> port used by the Spark web UI
#   getOrCreate()                    -> create the session or return an existing one
spark = (
    SparkSession.builder
    .master("local")
    .appName("Colab")
    .config('spark.ui.port', '4050')
    .getOrCreate()
)

# Display the session as the cell output.
spark

In [None]:
# Mount Google Drive at /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Read the housing CSV file: the first line is used as the header and the
# column types are inferred from the data.
df = spark.read.csv(
    "/content/drive/MyDrive/housing.csv",
    header=True,
    inferSchema=True,
)

# Print the structure (column names and types) of the data.
df.printSchema()

root
 |-- longitude: double (nullable = true)
 |-- latitude: double (nullable = true)
 |-- housing_median_age: double (nullable = true)
 |-- total_rooms: double (nullable = true)
 |-- total_bedrooms: double (nullable = true)
 |-- population: double (nullable = true)
 |-- households: double (nullable = true)
 |-- median_income: double (nullable = true)
 |-- median_house_value: double (nullable = true)
 |-- ocean_proximity: string (nullable = true)



In [None]:
# Show the first 5 rows of the DataFrame
df.show(5)

+---------+--------+------------------+-----------+--------------+----------+----------+-------------+------------------+---------------+
|longitude|latitude|housing_median_age|total_rooms|total_bedrooms|population|households|median_income|median_house_value|ocean_proximity|
+---------+--------+------------------+-----------+--------------+----------+----------+-------------+------------------+---------------+
|  -122.23|   37.88|              41.0|      880.0|         129.0|     322.0|     126.0|       8.3252|          452600.0|       NEAR BAY|
|  -122.22|   37.86|              21.0|     7099.0|        1106.0|    2401.0|    1138.0|       8.3014|          358500.0|       NEAR BAY|
|  -122.24|   37.85|              52.0|     1467.0|         190.0|     496.0|     177.0|       7.2574|          352100.0|       NEAR BAY|
|  -122.25|   37.85|              52.0|     1274.0|         235.0|     558.0|     219.0|       5.6431|          341300.0|       NEAR BAY|
|  -122.25|   37.85|              

In [None]:
# Function that generates a unique 64-bit id for every row.
# The generated ids are unique and increasing, but not guaranteed to be consecutive.
from pyspark.sql.functions import monotonically_increasing_id

# Add a new column called id (replaces it if the column already exists).
df = df.withColumn('id', monotonically_increasing_id())

# Move 'id' to the front and keep every other column in its current order.
# Selecting by name (instead of slicing off the last column) keeps this cell
# safe to re-run: once 'id' is already the first column, slicing would drop
# 'ocean_proximity' and select 'id' twice.
df = df.select('id', *[c for c in df.columns if c != 'id'])

# Show the first 3 rows of the DataFrame
df.show(3)

+---+---------+--------+------------------+-----------+--------------+----------+----------+-------------+------------------+---------------+
| id|longitude|latitude|housing_median_age|total_rooms|total_bedrooms|population|households|median_income|median_house_value|ocean_proximity|
+---+---------+--------+------------------+-----------+--------------+----------+----------+-------------+------------------+---------------+
|  0|  -122.23|   37.88|              41.0|      880.0|         129.0|     322.0|     126.0|       8.3252|          452600.0|       NEAR BAY|
|  1|  -122.22|   37.86|              21.0|     7099.0|        1106.0|    2401.0|    1138.0|       8.3014|          358500.0|       NEAR BAY|
|  2|  -122.24|   37.85|              52.0|     1467.0|         190.0|     496.0|     177.0|       7.2574|          352100.0|       NEAR BAY|
+---+---------+--------+------------------+-----------+--------------+----------+----------+-------------+------------------+---------------+
only s

In [None]:
# Total number of rows
df.count()

20640

In [None]:
# Print the structure (column names and types) of the data
df.printSchema()

root
 |-- id: long (nullable = false)
 |-- longitude: double (nullable = true)
 |-- latitude: double (nullable = true)
 |-- housing_median_age: double (nullable = true)
 |-- total_rooms: double (nullable = true)
 |-- total_bedrooms: double (nullable = true)
 |-- population: double (nullable = true)
 |-- households: double (nullable = true)
 |-- median_income: double (nullable = true)
 |-- median_house_value: double (nullable = true)
 |-- ocean_proximity: string (nullable = true)



In [None]:
# Start from all DataFrame columns and leave out the ones that are not part of
# the numerical analysis: the target variable (median_house_value), the row id,
# the categorical column and the coordinates.
numerical_features_lst = [
    column
    for column in df.columns
    if column not in ('median_house_value', 'id', 'ocean_proximity', 'longitude', 'latitude')
]

# Columns used for the analysis
numerical_features_lst

['housing_median_age',
 'total_rooms',
 'total_bedrooms',
 'population',
 'households',
 'median_income']

In [None]:
# Import the Spark aggregate functions.
# `sum` and `max` are deliberately NOT imported unqualified: that would shadow
# Python's built-in sum() and max() in every later cell. Use the `F` alias
# (F.sum, F.max) when the Spark versions are needed.
# `count` is imported by name because a later cell calls it directly.
from pyspark.sql import functions as F
from pyspark.sql.functions import mean, avg, count

# Compute the average of every column in numerical_features_lst
df.select(*[F.mean(c) for c in numerical_features_lst]).show()

+-----------------------+------------------+-------------------+------------------+-----------------+------------------+
|avg(housing_median_age)|  avg(total_rooms)|avg(total_bedrooms)|   avg(population)|  avg(households)|avg(median_income)|
+-----------------------+------------------+-------------------+------------------+-----------------+------------------+
|     28.639486434108527|2635.7630813953488|  537.8705525375618|1425.4767441860465|499.5396802325581|3.8706710029070246|
+-----------------------+------------------+-------------------+------------------+-----------------+------------------+



In [None]:
# Sum of every numerical column, grouped by ocean_proximity
(
    df.groupBy("ocean_proximity")
    .agg(dict.fromkeys(numerical_features_lst, 'sum'))
    .show(truncate=False)
)

+---------------+---------------+---------------+-------------------+------------------+----------------+-----------------------+
|ocean_proximity|sum(households)|sum(population)|sum(total_bedrooms)|sum(median_income)|sum(total_rooms)|sum(housing_median_age)|
+---------------+---------------+---------------+-------------------+------------------+----------------+-----------------------+
|ISLAND         |1383.0         |3340.0         |2102.0             |13.722100000000001|7873.0          |212.0                  |
|NEAR OCEAN     |1332308.0      |3598955.0      |1415482.0          |10647.376000000002|6867477.0       |78005.0                |
|NEAR BAY       |1118931.0      |2817427.0      |1167195.0          |9555.906100000018 |5710320.0       |86402.0                |
|<1H OCEAN      |4730118.0      |1.3889374E7    |4937435.0          |38651.509999999995|2.4012547E7     |267495.0               |
|INLAND         |3127759.0      |9112744.0      |3468095.0          |21022.135299999973|1.

In [None]:
# Average of every numerical column, grouped by ocean_proximity
(
    df.groupBy('ocean_proximity')
    .agg(dict.fromkeys(numerical_features_lst, 'avg'))
    .show()
)

+---------------+------------------+------------------+-------------------+------------------+------------------+-----------------------+
|ocean_proximity|   avg(households)|   avg(population)|avg(total_bedrooms)|avg(median_income)|  avg(total_rooms)|avg(housing_median_age)|
+---------------+------------------+------------------+-------------------+------------------+------------------+-----------------------+
|         ISLAND|             276.6|             668.0|              420.4|2.7444200000000003|            1574.6|                   42.4|
|     NEAR OCEAN|501.24454477050415|1354.0086531226486|  538.6156773211568| 4.005784800601957| 2583.700902934537|     29.347253574115875|
|       NEAR BAY| 488.6161572052402|1230.3174672489083|  514.1828193832599| 4.172884759825336| 2493.589519650655|      37.73013100436681|
|      <1H OCEAN| 517.7449649737302|1520.2904991243433|  546.5391852999778|4.2306819176882655|2628.3435858143607|     29.279225043782837|
|         INLAND|477.4475652572126

In [None]:
# Maximum value of every numerical column, grouped by ocean_proximity
(
    df.groupBy('ocean_proximity')
    .agg(dict.fromkeys(numerical_features_lst, 'max'))
    .show()
)

+---------------+---------------+---------------+-------------------+------------------+----------------+-----------------------+
|ocean_proximity|max(households)|max(population)|max(total_bedrooms)|max(median_income)|max(total_rooms)|max(housing_median_age)|
+---------------+---------------+---------------+-------------------+------------------+----------------+-----------------------+
|         ISLAND|          431.0|         1100.0|              591.0|            3.3906|          2359.0|                   52.0|
|     NEAR OCEAN|         4176.0|        12873.0|             4585.0|           15.0001|         30405.0|                   52.0|
|       NEAR BAY|         3589.0|         8276.0|             3226.0|           15.0001|         18634.0|                   52.0|
|      <1H OCEAN|         6082.0|        35682.0|             6445.0|           15.0001|         37937.0|                   52.0|
|         INLAND|         5358.0|        16305.0|             6210.0|           15.0001|  

In [None]:
# Number of rows in each ocean_proximity group (result column is named "count")
df.groupBy('ocean_proximity').count().show()

+---------------+-----+
|ocean_proximity|count|
+---------------+-----+
|         ISLAND|    5|
|     NEAR OCEAN| 2658|
|       NEAR BAY| 2290|
|      <1H OCEAN| 9136|
|         INLAND| 6551|
+---------------+-----+



In [None]:
# Split the data into a training set (70%) and a test set (30%).
# A fixed seed is passed so that re-running the notebook on the same input
# produces the same split (and therefore comparable metrics).
train, test = df.randomSplit([0.7, 0.3], seed=42)

train, test

(DataFrame[id: bigint, longitude: double, latitude: double, housing_median_age: double, total_rooms: double, total_bedrooms: double, population: double, households: double, median_income: double, median_house_value: double, ocean_proximity: string],
 DataFrame[id: bigint, longitude: double, latitude: double, housing_median_age: double, total_rooms: double, total_bedrooms: double, population: double, households: double, median_income: double, median_house_value: double, ocean_proximity: string])

In [None]:
# Start from all columns of the training data and leave out the target
# variable, the row id and the categorical column.
numerical_features_lst_train = [
    column
    for column in train.columns
    if column not in ('median_house_value', 'id', 'ocean_proximity')
]

# Final list of columns
numerical_features_lst_train

['longitude',
 'latitude',
 'housing_median_age',
 'total_rooms',
 'total_bedrooms',
 'population',
 'households',
 'median_income']

In [None]:
# Import the PySpark Imputer
from pyspark.ml.feature import Imputer

# Create the Imputer and fit it on the training set, which learns the values
# used to replace missing entries.
#   inputCols  -> columns to process
#   outputCols -> output columns (same names as the inputs)
imputer = Imputer(
    inputCols=numerical_features_lst_train,
    outputCols=numerical_features_lst_train,
).fit(train)

# Apply the fitted imputer to the training and test sets
train, test = [imputer.transform(split) for split in (train, test)]

# Show the first 3 rows of the training DataFrame
train.show(3)

+---+---------+--------+------------------+-----------+--------------+----------+----------+-------------+------------------+---------------+
| id|longitude|latitude|housing_median_age|total_rooms|total_bedrooms|population|households|median_income|median_house_value|ocean_proximity|
+---+---------+--------+------------------+-----------+--------------+----------+----------+-------------+------------------+---------------+
|  1|  -122.22|   37.86|              21.0|     7099.0|        1106.0|    2401.0|    1138.0|       8.3014|          358500.0|       NEAR BAY|
|  3|  -122.25|   37.85|              52.0|     1274.0|         235.0|     558.0|     219.0|       5.6431|          341300.0|       NEAR BAY|
|  6|  -122.25|   37.84|              52.0|     2535.0|         489.0|    1094.0|     514.0|       3.6591|          299200.0|       NEAR BAY|
+---+---------+--------+------------------+-----------+--------------+----------+----------+-------------+------------------+---------------+
only s

In [None]:
# Import the PySpark VectorAssembler
from pyspark.ml.feature import VectorAssembler

# Create the VectorAssembler.
#   inputCols -> columns to process
#   outputCol -> new column holding the vector of numerical features
numerical_vector_assembler = VectorAssembler(
    inputCols=numerical_features_lst_train,
    outputCol='numerical_feature_vector',
)

# Apply the assembler to the training and test sets
train, test = [numerical_vector_assembler.transform(split) for split in (train, test)]

train.show(2)

+---+---------+--------+------------------+-----------+--------------+----------+----------+-------------+------------------+---------------+------------------------+
| id|longitude|latitude|housing_median_age|total_rooms|total_bedrooms|population|households|median_income|median_house_value|ocean_proximity|numerical_feature_vector|
+---+---------+--------+------------------+-----------+--------------+----------+----------+-------------+------------------+---------------+------------------------+
|  1|  -122.22|   37.86|              21.0|     7099.0|        1106.0|    2401.0|    1138.0|       8.3014|          358500.0|       NEAR BAY|    [-122.22,37.86,21...|
|  3|  -122.25|   37.85|              52.0|     1274.0|         235.0|     558.0|     219.0|       5.6431|          341300.0|       NEAR BAY|    [-122.25,37.85,52...|
+---+---------+--------+------------------+-----------+--------------+----------+----------+-------------+------------------+---------------+------------------------

In [None]:
# Return the first 2 rows of the numerical_feature_vector column as a list (train)
train.select('numerical_feature_vector').take(2)

[Row(numerical_feature_vector=DenseVector([-122.22, 37.86, 21.0, 7099.0, 1106.0, 2401.0, 1138.0, 8.3014])),
 Row(numerical_feature_vector=DenseVector([-122.25, 37.85, 52.0, 1274.0, 235.0, 558.0, 219.0, 5.6431]))]

In [None]:
# Import the PySpark StandardScaler
from pyspark.ml.feature import StandardScaler

# Create the StandardScaler and fit it on the training set, which computes the
# mean and standard deviation of every feature in the input column.
#   inputCol  -> column to process
#   outputCol -> new column holding the standardized feature vectors
scaler = StandardScaler(
    inputCol='numerical_feature_vector',
    outputCol='scaled_numerical_feature_vector',
    withStd=True,
    withMean=True,
).fit(train)

# Apply the fitted scaler to the training and test sets
train, test = [scaler.transform(split) for split in (train, test)]

# Show the first 3 rows of the training DataFrame
train.show(3)

+---+---------+--------+------------------+-----------+--------------+----------+----------+-------------+------------------+---------------+------------------------+-------------------------------+
| id|longitude|latitude|housing_median_age|total_rooms|total_bedrooms|population|households|median_income|median_house_value|ocean_proximity|numerical_feature_vector|scaled_numerical_feature_vector|
+---+---------+--------+------------------+-----------+--------------+----------+----------+-------------+------------------+---------------+------------------------+-------------------------------+
|  1|  -122.22|   37.86|              21.0|     7099.0|        1106.0|    2401.0|    1138.0|       8.3014|          358500.0|       NEAR BAY|    [-122.22,37.86,21...|           [-1.3273303543486...|
|  3|  -122.25|   37.85|              52.0|     1274.0|         235.0|     558.0|     219.0|       5.6431|          341300.0|       NEAR BAY|    [-122.25,37.85,52...|           [-1.3423140725279...|
|  6|

In [None]:
# Return the first 3 rows of the scaled_numerical_feature_vector column as a list (train)
train.select('scaled_numerical_feature_vector').take(3)

[Row(scaled_numerical_feature_vector=DenseVector([-1.3273, 1.0479, -0.601, 2.0658, 1.3462, 0.8511, 1.6582, 2.3439])),
 Row(scaled_numerical_feature_vector=DenseVector([-1.3423, 1.0433, 1.8683, -0.6296, -0.7234, -0.7669, -0.7344, 0.9386])),
 Row(scaled_numerical_feature_vector=DenseVector([-1.3423, 1.0386, 1.8683, -0.0461, -0.1199, -0.2963, 0.0337, -0.1103]))]

In [None]:
# Import the PySpark StringIndexer
from pyspark.ml.feature import StringIndexer

# Create the StringIndexer and fit it on the training set, which determines the
# numeric index assigned to each ocean_proximity value.
#   inputCol  -> column to process
#   outputCol -> new column holding the numeric index of ocean_proximity
indexer = StringIndexer(
    inputCol='ocean_proximity',
    outputCol='ocean_category_index',
).fit(train)

# Apply the fitted indexer to the training and test sets
train, test = [indexer.transform(split) for split in (train, test)]

# Show the first 3 rows of the training DataFrame
train.show(3)

+---+---------+--------+------------------+-----------+--------------+----------+----------+-------------+------------------+---------------+------------------------+-------------------------------+--------------------+
| id|longitude|latitude|housing_median_age|total_rooms|total_bedrooms|population|households|median_income|median_house_value|ocean_proximity|numerical_feature_vector|scaled_numerical_feature_vector|ocean_category_index|
+---+---------+--------+------------------+-----------+--------------+----------+----------+-------------+------------------+---------------+------------------------+-------------------------------+--------------------+
|  1|  -122.22|   37.86|              21.0|     7099.0|        1106.0|    2401.0|    1138.0|       8.3014|          358500.0|       NEAR BAY|    [-122.22,37.86,21...|           [-1.3273303543486...|                 3.0|
|  3|  -122.25|   37.85|              52.0|     1274.0|         235.0|     558.0|     219.0|       5.6431|          3413

In [None]:
# Get the unique values of ocean_category_index.
# distinct() removes duplicates inside Spark, so only the unique rows are
# collected to the driver instead of the whole column.
set(train.select('ocean_category_index').distinct().collect())

{Row(ocean_category_index=0.0),
 Row(ocean_category_index=1.0),
 Row(ocean_category_index=2.0),
 Row(ocean_category_index=3.0),
 Row(ocean_category_index=4.0)}

In [None]:
# Import the PySpark OneHotEncoder
from pyspark.ml.feature import OneHotEncoder

# Create the OneHotEncoder and fit it on the training set, which identifies the
# number of categories in the 'ocean_category_index' column.
#   inputCol  -> column to process
#   outputCol -> new column that stores the categorical vector
one_hot_encoder = OneHotEncoder(
    inputCol='ocean_category_index',
    outputCol='ocean_category_one_hot',
).fit(train)

# Apply the fitted encoder to the training and test sets
train, test = [one_hot_encoder.transform(split) for split in (train, test)]

# Show the first 3 rows of the training DataFrame
train.show(3)

+---+---------+--------+------------------+-----------+--------------+----------+----------+-------------+------------------+---------------+------------------------+-------------------------------+--------------------+----------------------+
| id|longitude|latitude|housing_median_age|total_rooms|total_bedrooms|population|households|median_income|median_house_value|ocean_proximity|numerical_feature_vector|scaled_numerical_feature_vector|ocean_category_index|ocean_category_one_hot|
+---+---------+--------+------------------+-----------+--------------+----------+----------+-------------+------------------+---------------+------------------------+-------------------------------+--------------------+----------------------+
|  1|  -122.22|   37.86|              21.0|     7099.0|        1106.0|    2401.0|    1138.0|       8.3014|          358500.0|       NEAR BAY|    [-122.22,37.86,21...|           [-1.3273303543486...|                 3.0|         (4,[3],[1.0])|
|  3|  -122.25|   37.85|    

In [None]:
# Create the VectorAssembler that joins the scaled numerical features with the
# one-hot encoded category.
#   inputCols -> columns to process
#   outputCol -> new column that stores the combined feature vector
assembler = VectorAssembler(
    inputCols=['scaled_numerical_feature_vector', 'ocean_category_one_hot'],
    outputCol='final_feature_vector',
)

# Apply the assembler to the training and test sets
train, test = [assembler.transform(split) for split in (train, test)]

In [None]:
# Show the first 2 rows of the training DataFrame
train.show(2)

+---+---------+--------+------------------+-----------+--------------+----------+----------+-------------+------------------+---------------+------------------------+-------------------------------+--------------------+----------------------+--------------------+
| id|longitude|latitude|housing_median_age|total_rooms|total_bedrooms|population|households|median_income|median_house_value|ocean_proximity|numerical_feature_vector|scaled_numerical_feature_vector|ocean_category_index|ocean_category_one_hot|final_feature_vector|
+---+---------+--------+------------------+-----------+--------------+----------+----------+-------------+------------------+---------------+------------------------+-------------------------------+--------------------+----------------------+--------------------+
|  1|  -122.22|   37.86|              21.0|     7099.0|        1106.0|    2401.0|    1138.0|       8.3014|          358500.0|       NEAR BAY|    [-122.22,37.86,21...|           [-1.3273303543486...|          

In [None]:
# Return the first 2 rows of the final_feature_vector column as a list (train)
train.select('final_feature_vector').take(2)

[Row(final_feature_vector=DenseVector([-1.3273, 1.0479, -0.601, 2.0658, 1.3462, 0.8511, 1.6582, 2.3439, 0.0, 0.0, 0.0, 1.0])),
 Row(final_feature_vector=DenseVector([-1.3423, 1.0433, 1.8683, -0.6296, -0.7234, -0.7669, -0.7344, 0.9386, 0.0, 0.0, 0.0, 1.0]))]

In [None]:
# Import the PySpark LinearRegression estimator
from pyspark.ml.regression import LinearRegression

# Create the LinearRegression instance.
#   featuresCol -> column that contains the features
#   labelCol    -> the target variable
lr = LinearRegression(
    featuresCol='final_feature_vector',
    labelCol='median_house_value',
)

lr

LinearRegression_282f6fa59545

In [None]:
# Train the regression model on the training set.
# NOTE(review): `lr` is rebound here from the estimator to the fitted model, so
# re-running this cell on its own would call fit() on the model instead of the
# estimator — re-run the cell that creates the estimator first (TODO: confirm,
# or store the fitted model under a separate name).
lr = lr.fit(train)

lr

LinearRegressionModel: uid=LinearRegression_282f6fa59545, numFeatures=12

In [None]:
# Apply the trained linear regression model to the training set, then rename
# the prediction column.
pred_train_df = (
    lr.transform(train)
    .withColumnRenamed('prediction', 'predicted_median_house_value')
)

# Show the first 5 rows of pred_train_df
pred_train_df.show(5)

+---+---------+--------+------------------+-----------+--------------+----------+----------+-------------+------------------+---------------+------------------------+-------------------------------+--------------------+----------------------+--------------------+----------------------------+
| id|longitude|latitude|housing_median_age|total_rooms|total_bedrooms|population|households|median_income|median_house_value|ocean_proximity|numerical_feature_vector|scaled_numerical_feature_vector|ocean_category_index|ocean_category_one_hot|final_feature_vector|predicted_median_house_value|
+---+---------+--------+------------------+-----------+--------------+----------+----------+-------------+------------------+---------------+------------------------+-------------------------------+--------------------+----------------------+--------------------+----------------------------+
|  1|  -122.22|   37.86|              21.0|     7099.0|        1106.0|    2401.0|    1138.0|       8.3014|          35850

In [None]:
# Apply the trained linear regression model to the test set, then rename the
# prediction column.
pred_test_df = (
    lr.transform(test)
    .withColumnRenamed('prediction', 'predicted_median_house_value')
)

# Show the first 5 rows of pred_test_df
pred_test_df.show(5)

In [None]:
# Convert the PySpark DataFrame to a pandas DataFrame
pred_test_pd_df = pred_test_df.toPandas()

# Show the first 2 rows of pred_test_pd_df
pred_test_pd_df.head(2)

In [None]:
# New DataFrame that only contains predicted_median_house_value and median_house_value
predictions_and_actuals = pred_test_df.select(
    'predicted_median_house_value',
    'median_house_value',
)

# Convert the PySpark DataFrame into an RDD
predictions_and_actuals_rdd = predictions_and_actuals.rdd

# Return the first 2 elements of predictions_and_actuals_rdd
predictions_and_actuals_rdd.take(2)

[Row(predicted_median_house_value=406873.1620247062, median_house_value=452600.0),
 Row(predicted_median_house_value=377761.12949848734, median_house_value=352100.0)]

In [None]:
# Convert every row to a tuple: (predicted value, actual value)
predictions_and_actuals_rdd = predictions_and_actuals_rdd.map(tuple)

# Return the first 2 elements of predictions_and_actuals_rdd
predictions_and_actuals_rdd.take(2)

[(406873.1620247062, 452600.0), (377761.12949848734, 352100.0)]

In [None]:
# Import RegressionMetrics from PySpark
from pyspark.mllib.evaluation import RegressionMetrics

# Create the RegressionMetrics instance from the (prediction, actual) pairs
metrics = RegressionMetrics(predictions_and_actuals_rdd)

# Report MSE (mean squared error), RMSE (root mean squared error),
# MAE (mean absolute error) and R^2 (coefficient of determination)
s = f'''
Mean Squared Error:      {metrics.meanSquaredError}
Root Mean Squared Error: {metrics.rootMeanSquaredError}
Mean Absolute Error:     {metrics.meanAbsoluteError}
R**2:                    {metrics.r2}
'''

print(s)




Mean Squared Error:      4772429552.793096
Root Mean Squared Error: 69082.7731984834
Mean Absolute Error:     50406.46319911228
R**2:                    0.6512778286155831

