In [176]:
# Clear the interactive namespace; -f skips the confirmation prompt.
%reset -f

In [190]:
!pip install pyspark
!pip install mlflow

Collecting mlflow
  Downloading mlflow-1.25.1-py3-none-any.whl (16.8 MB)
[K     |████████████████████████████████| 16.8 MB 603 kB/s 
[?25hCollecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 51.1 MB/s 
[?25hCollecting docker>=4.0.0
  Downloading docker-5.0.3-py2.py3-none-any.whl (146 kB)
[K     |████████████████████████████████| 146 kB 49.3 MB/s 
Collecting databricks-cli>=0.8.7
  Downloading databricks-cli-0.16.6.tar.gz (62 kB)
[K     |████████████████████████████████| 62 kB 665 kB/s 
Collecting prometheus-flask-exporter
  Downloading prometheus_flask_exporter-0.20.1-py3-none-any.whl (18 kB)
Collecting alembic
  Downloading alembic-1.7.7-py3-none-any.whl (210 kB)
[K     |████████████████████████████████| 210 kB 62.3 MB/s 
Collecting querystring-parser
  Downloading querystring_parser-1.2.4-py2.py3-none-any.whl (7.9 kB)
Collecting 

In [178]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import count

# Obtain the Spark session for this notebook (an already-active session is
# reused by getOrCreate; otherwise one named "ML-Regression" is started).
spark = SparkSession.builder.appName("ML-Regression").getOrCreate()

In [179]:
# Load the housing CSV, using its first line as the column names. No schema is
# supplied or inferred here; the numeric columns are cast later in the notebook.
data = spark.read.load(
    "/content/drive/MyDrive/RTA/Pyspark/data/Housing.csv",
    format="csv",
    header=True,
)

# data = data.drop("ocean_proximity","longitude","latitude")

In [180]:
# Discard every row that contains a null in any column.
data = data.na.drop()

In [182]:
# Expose the DataFrame to Spark SQL under the table name "data".
data.createOrReplaceTempView(name="data")

# One Hot Encoding

In [183]:
# Build the modelling frame from the "data" temp view: the seven numeric
# columns are cast to float and ocean_proximity is passed through as-is.
# Columns not listed in the query are left out of feature_data.
feature_data = spark.sql("""
select 
cast(housing_median_age as float) housing_median_age,
cast(total_rooms as float) total_rooms,
cast(total_bedrooms as float) total_bedrooms,
cast(population as float) population,
cast(households as float) households,
cast(median_income as float) median_income,
cast(median_house_value as float) median_house_value,
ocean_proximity
from data 
""")

In [184]:
# Display the column names of the projected frame as a sanity check.
feature_data.columns

['housing_median_age',
 'total_rooms',
 'total_bedrooms',
 'population',
 'households',
 'median_income',
 'median_house_value',
 'ocean_proximity']

In [185]:
from pyspark.ml.feature import OneHotEncoder, StringIndexer

# Step 1: replace each ocean_proximity string with a numeric category index.
indexer = StringIndexer(
    inputCol="ocean_proximity",
    outputCol="ocean_proximity_index",
)
encoded_data = (
    indexer
    .fit(feature_data)
    .transform(feature_data)
)

# Step 2: expand the category index into a one-hot vector column.
ohe = OneHotEncoder(
    inputCol="ocean_proximity_index",
    outputCol="OHEVector",
)
ohe_data = (
    ohe
    .fit(encoded_data)
    .transform(encoded_data)
)


In [186]:
# Preview the first ten rows, including the index and one-hot columns.
ohe_data.show(n=10)

+------------------+-----------+--------------+----------+----------+-------------+------------------+---------------+---------------------+-------------+
|housing_median_age|total_rooms|total_bedrooms|population|households|median_income|median_house_value|ocean_proximity|ocean_proximity_index|    OHEVector|
+------------------+-----------+--------------+----------+----------+-------------+------------------+---------------+---------------------+-------------+
|              41.0|      880.0|         129.0|     322.0|     126.0|       8.3252|          452600.0|       NEAR BAY|                  3.0|(4,[3],[1.0])|
|              21.0|     7099.0|        1106.0|    2401.0|    1138.0|       8.3014|          358500.0|       NEAR BAY|                  3.0|(4,[3],[1.0])|
|              52.0|     1467.0|         190.0|     496.0|     177.0|       7.2574|          352100.0|       NEAR BAY|                  3.0|(4,[3],[1.0])|
|              52.0|     1274.0|         235.0|     558.0|     219.0| 

In [187]:
from pyspark.ml.feature import VectorAssembler

# Pack the six numeric predictors and the one-hot vector into a single
# "features" vector column, which is what the regression stage consumes.
vecAssembler = VectorAssembler(
    outputCol="features",
    inputCols=[
        "housing_median_age",
        "total_rooms",
        "total_bedrooms",
        "population",
        "households",
        "median_income",
        "OHEVector",
    ],
)
vecAssDF = vecAssembler.transform(ohe_data)

In [188]:
# Preview the assembled feature vector next to the label for ten rows.
(
    vecAssDF
    .select("features", "median_house_value")
    .show(n=10)
)

+--------------------+------------------+
|            features|median_house_value|
+--------------------+------------------+
|[41.0,880.0,129.0...|          452600.0|
|[21.0,7099.0,1106...|          358500.0|
|[52.0,1467.0,190....|          352100.0|
|[52.0,1274.0,235....|          341300.0|
|[52.0,1627.0,280....|          342200.0|
|[52.0,919.0,213.0...|          269700.0|
|[52.0,2535.0,489....|          299200.0|
|[52.0,3104.0,687....|          241400.0|
|[42.0,2555.0,665....|          226700.0|
|[52.0,3549.0,707....|          261100.0|
+--------------------+------------------+
only showing top 10 rows



In [189]:
from pyspark.ml.regression import LinearRegression

# Linear regression of median_house_value on the assembled "features" vector.
reg = LinearRegression(featuresCol="features", labelCol="median_house_value")

# Fitted on the whole of vecAssDF; no rows are held out in this cell.
dtModel = reg.fit(vecAssDF)

# Only the FIRST coefficient is reported. It belongs to the first entry of the
# assembler's inputCols (housing_median_age); the model has further
# coefficients for the remaining features, so the printed line is a
# one-feature slice of the model rather than the full fitted equation.
m = round(dtModel.coefficients[0], 2)
b = round(dtModel.intercept, 2)
# The prediction is y and the feature is x (the original message had them swapped).
print(f"""The formula for the linear regression line is y = {m}*x + {b}""")

The formula for the linear regression line is x = 1185.09*y + 200262.43


# Pipeline

In [None]:
from pyspark.ml import Pipeline

# Hold out 20% of the rows for evaluation; a fixed seed is passed so the split
# can be repeated.
trainDF, testDF = feature_data.randomSplit([0.8, 0.2], seed=42)

# The label column in this dataset is median_house_value.
lr = LinearRegression(labelCol="median_house_value", featuresCol="features")

# Chain the (unfitted) stages defined in the earlier cells so that indexing,
# one-hot encoding, vector assembly and regression are all fitted on the
# training split only.
# NOTE(review): `indexer` uses StringIndexer's default handling of unseen
# labels; if an ocean_proximity value occurs only in testDF, transform may
# fail — confirm, or set handleInvalid on the indexer.
pipeline = Pipeline(stages=[indexer, ohe, vecAssembler, lr])
# Alternative: an RFormula stage could replace the first three stages, but no
# `rFormula` is defined in this notebook.
# pipeline = Pipeline(stages=[rFormula, lr])
pipelineModel = pipeline.fit(trainDF)
predDF = pipelineModel.transform(testDF)
predDF.select("features", "median_house_value", "prediction").show(5)

# Evaluation

## RMSE

In [None]:
from pyspark.ml.evaluation import RegressionEvaluator

# Compare the "prediction" column against the true label. The label column in
# this dataset is median_house_value (there is no "price" column).
regressionEvaluator = RegressionEvaluator(
    predictionCol="prediction",
    labelCol="median_house_value",
    metricName="rmse",
)
rmse = regressionEvaluator.evaluate(predDF)
print(f"RMSE is {rmse:.1f}")

## R-squared

In [None]:
# Switch the existing evaluator to the r2 metric, then score the same predictions.
# Note that this changes the metric on regressionEvaluator itself.
regressionEvaluator.setMetricName("r2")
r2 = regressionEvaluator.evaluate(predDF)
print(f"R2 is {r2}")

# Saving and Loading Models

In [None]:
from pyspark.ml import PipelineModel

pipelinePath = "/tmp/lr-pipeline-model"

# Persist the fitted pipeline, replacing anything already stored at that path.
(
    pipelineModel
    .write()
    .overwrite()
    .save(pipelinePath)
)

# Read the fitted pipeline back from the same location.
savedPipelineModel = PipelineModel.load(pipelinePath)