<a href="https://colab.research.google.com/github/Alexis2411/Hadoop/blob/main/Tema3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!wget -q https://downloads.apache.org/spark/spark-2.4.8/spark-2.4.8-bin-hadoop2.7.tgz
!tar xf spark-2.4.8-bin-hadoop2.7.tgz
!pip install -q findspark

In [2]:
import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = "/content/spark-2.4.8-bin-hadoop2.7"

In [3]:
import findspark
findspark.init()
from pyspark.sql import SparkSession
spark = SparkSession.builder.master("local[*]").getOrCreate()


In [4]:
path_csv="/content/BostonHousing.csv"
dataset = spark.read.csv(path_csv,inferSchema=True, header =True)
dataset.printSchema()


root
 |-- crim: double (nullable = true)
 |-- zn: double (nullable = true)
 |-- indus: double (nullable = true)
 |-- chas: integer (nullable = true)
 |-- nox: double (nullable = true)
 |-- rm: double (nullable = true)
 |-- age: double (nullable = true)
 |-- dis: double (nullable = true)
 |-- rad: integer (nullable = true)
 |-- tax: integer (nullable = true)
 |-- ptratio: double (nullable = true)
 |-- black: double (nullable = true)
 |-- lstat: double (nullable = true)
 |-- medv: double (nullable = true)



In [5]:
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import LinearRegression
#Input all the features in one vector column
assembler = VectorAssembler(inputCols=['crim', 'zn', 'indus', 'chas', 'nox', 'rm', 'age', 'dis', 'rad', 'tax',
'ptratio', 'black', 'lstat'], outputCol = 'Attributes')
output = assembler.transform(dataset)
#Input vs Output
finalized_data = output.select("Attributes","medv")
finalized_data.show(10)

+--------------------+----+
|          Attributes|medv|
+--------------------+----+
|[0.00632,18.0,2.3...|24.0|
|[0.02731,0.0,7.07...|21.6|
|[0.02729,0.0,7.07...|34.7|
|[0.03237,0.0,2.18...|33.4|
|[0.06905,0.0,2.18...|36.2|
|[0.02985,0.0,2.18...|28.7|
|[0.08829,12.5,7.8...|22.9|
|[0.14455,12.5,7.8...|27.1|
|[0.21124,12.5,7.8...|16.5|
|[0.17004,12.5,7.8...|18.9|
+--------------------+----+
only showing top 10 rows



In [6]:
#Split training and testing data
train_data,test_data = finalized_data.randomSplit([0.8,0.2])

In [7]:
regressor = LinearRegression(featuresCol = 'Attributes', labelCol = 'medv')

In [8]:
#Learn to fit the model from training set
regressor = regressor.fit(train_data)

In [9]:
print ("The coefficient of the model is : %a", regressor.coefficients)
print ("The Intercept of the model is : %f", regressor.intercept)

The coefficient of the model is : %a [-0.08711994619844514,0.04861971218500262,-0.05759433901827429,2.5713706382847583,-17.02314754770964,3.910918394645101,-0.0075735672376323815,-1.5280484698696348,0.25586567281056327,-0.011408489505473018,-0.9235376699849972,0.009222277961547823,-0.4337458539920746]
The Intercept of the model is : %f 35.375518766402166


In [10]:
#To predict the prices on testing set
Pred_lr = regressor.evaluate(test_data)

In [11]:
#Predict the model
Pred_lr.predictions.show(10)

+--------------------+----+------------------+
|          Attributes|medv|        prediction|
+--------------------+----+------------------+
|[0.00632,18.0,2.3...|24.0|30.178494501481232|
|[0.01311,90.0,1.2...|35.4|31.449861561136597|
|[0.01432,100.0,1....|31.6| 33.11656978216796|
|[0.01501,90.0,1.2...|50.0| 44.94503216186548|
|[0.01538,90.0,3.7...|44.0|37.551617745464156|
|[0.0187,85.0,4.15...|23.1|25.544256187537364|
|[0.01965,80.0,1.7...|20.1|21.085852437845507|
|[0.02498,0.0,1.89...|16.5|22.597288186347527|
|[0.02543,55.0,3.7...|23.9| 27.94186235253342|
|[0.02729,0.0,7.07...|34.7|30.216025258562112|
+--------------------+----+------------------+
only showing top 10 rows



In [12]:
#Carry out random forrest regression
from pyspark.ml.regression import RandomForestRegressor
rfr = RandomForestRegressor(featuresCol = 'Attributes', labelCol = 'medv')

#Learn to fit the model from training set
rfr = rfr.fit(train_data)

#To predict the prices on testing set
pred_rfr = rfr.transform(test_data)

In [13]:
# Select example rows to display.
pred_rfr.select('Attributes', 'medv', 'prediction').show(10)

+--------------------+----+------------------+
|          Attributes|medv|        prediction|
+--------------------+----+------------------+
|[0.00632,18.0,2.3...|24.0|29.790627902089863|
|[0.01311,90.0,1.2...|35.4| 35.37574369167835|
|[0.01432,100.0,1....|31.6|31.279768340021604|
|[0.01501,90.0,1.2...|50.0| 45.13964285714285|
|[0.01538,90.0,3.7...|44.0| 42.32456197796723|
|[0.0187,85.0,4.15...|23.1| 25.36739552769233|
|[0.01965,80.0,1.7...|20.1|22.531344766119464|
|[0.02498,0.0,1.89...|16.5| 24.38529087227399|
|[0.02543,55.0,3.7...|23.9|25.654666190055877|
|[0.02729,0.0,7.07...|34.7|36.035065858546574|
+--------------------+----+------------------+
only showing top 10 rows



In [14]:
#Carry out Gradient-boosted tree regression
from pyspark.ml.regression import GBTRegressor
gbt = GBTRegressor(featuresCol = 'Attributes', labelCol = 'medv')

#Learn to fit the model from training set
gbt = gbt.fit(train_data)

#To predict the prices on testing set
pred_gbt = gbt.transform(test_data)

# Select example rows to display.
pred_gbt.select('Attributes', 'medv', 'prediction').show(10)

+--------------------+----+------------------+
|          Attributes|medv|        prediction|
+--------------------+----+------------------+
|[0.00632,18.0,2.3...|24.0|  33.4022640112858|
|[0.01311,90.0,1.2...|35.4| 34.16448146365419|
|[0.01432,100.0,1....|31.6| 31.01975861320538|
|[0.01501,90.0,1.2...|50.0|46.597097765081344|
|[0.01538,90.0,3.7...|44.0|47.040137156620396|
|[0.0187,85.0,4.15...|23.1| 23.37727593023562|
|[0.01965,80.0,1.7...|20.1| 21.76711687554088|
|[0.02498,0.0,1.89...|16.5|27.195856671683668|
|[0.02543,55.0,3.7...|23.9|24.011048810888507|
|[0.02729,0.0,7.07...|34.7|29.332608284506346|
+--------------------+----+------------------+
only showing top 10 rows



In [15]:
import numpy as np
print ("Note: the last rows are the information for Intercept")
print ("##","-------------------------------------------------")
print ("##"," Estimate | Std.Error | t Values | P-value")
coef = np.append(list(regressor.coefficients),regressor.intercept)
Summary=regressor.summary

for i in range(len(Summary.pValues)):
  print ("##",'{:10.6f}'.format(coef[i]),\
  '{:10.6f}'.format(Summary.coefficientStandardErrors[i]),\
  '{:8.3f}'.format(Summary.tValues[i]),\
  '{:10.6f}'.format(Summary.pValues[i]))

print ("##",'---')
print ("##","Mean squared error: % .6f" \
  % Summary.meanSquaredError, ", \
  RMSE: % .6f" \
  % Summary.rootMeanSquaredError )
print ("##","Multiple R-squared: %f" % Summary.r2, ", \
  Total iterations: %i"% Summary.totalIterations)


Note: the last rows are the information for Intercept
## -------------------------------------------------
##  Estimate | Std.Error | t Values | P-value
##  -0.087120   0.037097   -2.348   0.019386
##   0.048620   0.015775    3.082   0.002212
##  -0.057594   0.065561   -0.878   0.380260
##   2.571371   0.962102    2.673   0.007864
## -17.023148   4.254187   -4.002   0.000076
##   3.910918   0.454857    8.598   0.000000
##  -0.007574   0.014085   -0.538   0.591104
##  -1.528048   0.214091   -7.137   0.000000
##   0.255866   0.070569    3.626   0.000329
##  -0.011408   0.004040   -2.824   0.005006
##  -0.923538   0.143775   -6.423   0.000000
##   0.009222   0.002864    3.220   0.001397
##  -0.433746   0.055517   -7.813   0.000000
##  35.375519   5.575200    6.345   0.000000
## ---
## Mean squared error:  19.846603 ,   RMSE:  4.454953
## Multiple R-squared: 0.754471 ,   Total iterations: 1


In [16]:
from pyspark.ml.evaluation import RegressionEvaluator
eval_lr = RegressionEvaluator(labelCol="medv", predictionCol="prediction", metricName="rmse")

print("Linear regression model\n")

# Root Mean Square Error
rmse = eval_lr.evaluate(Pred_lr.predictions)
print("RMSE: %.3f" % rmse)

# Mean Square Error
mse = eval_lr.evaluate(Pred_lr.predictions, {eval_lr.metricName: "mse"})
print("MSE: %.3f" % mse)

# Mean Absolute Error
mae = eval_lr.evaluate(Pred_lr.predictions, {eval_lr.metricName: "mae"})
print("MAE: %.3f" % mae)

# r2 - coefficient of determination
r2 = eval_lr.evaluate(Pred_lr.predictions, {eval_lr.metricName: "r2"})
print("r2: %.3f" %r2)

Linear regression model

RMSE: 5.410
MSE: 29.270
MAE: 3.382
r2: 0.692


In [17]:
eval_rfr = RegressionEvaluator(labelCol="medv", predictionCol="prediction", metricName="rmse")
print("Regression Forrest model\n")

# Root Mean Square Error
rmse = eval_rfr.evaluate(pred_rfr)
print("RMSE: %.3f" % rmse)

# Mean Square Error
mse = eval_rfr.evaluate(pred_rfr, {eval_rfr.metricName: "mse"})
print("MSE: %.3f" % mse)

# Mean Absolute Error
mae = eval_rfr.evaluate(pred_rfr, {eval_rfr.metricName: "mae"})
print("MAE: %.3f" % mae)

# r2 - coefficient of determination
r2 = eval_rfr.evaluate(pred_rfr, {eval_rfr.metricName: "r2"})
print("r2: %.3f" %r2)

Regression Forrest model

RMSE: 4.708
MSE: 22.166
MAE: 2.836
r2: 0.767


In [18]:
eval_gbt = RegressionEvaluator(labelCol="medv", predictionCol="prediction",
metricName="rmse")
print("Gradient Boot Tree model\n")

# Root Mean Square Error
rmse = eval_gbt.evaluate(pred_gbt)
print("RMSE: %.3f" % rmse)

# Mean Square Error
mse = eval_gbt.evaluate(pred_gbt, {eval_gbt.metricName: "mse"})
print("MSE: %.3f" % mse)

# Mean Absolute Error
mae = eval_gbt.evaluate(pred_gbt, {eval_gbt.metricName: "mae"})
print("MAE: %.3f" % mae)

# r2 - coefficient of determination
r2 = eval_gbt.evaluate(pred_gbt, {eval_gbt.metricName: "r2"})
print("r2: %.3f" %r2)

Gradient Boot Tree model

RMSE: 4.882
MSE: 23.832
MAE: 3.051
r2: 0.749
