# Regression

In [1]:
!pip install pyspark

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [2]:
import pandas as pd
import pyspark

In [3]:
from pyspark.sql import SparkSession
spark = SparkSession.builder.master('local[*]').appName('Live').getOrCreate()

In [4]:
spark

In [5]:
df = pd.read_csv('https://github.com/YBIFoundation/Dataset/raw/main/Boston.csv')
df.to_csv('house')

In [6]:
house = spark.read.csv('/content/house', header=True, inferSchema=True)

In [7]:
house.show()

+---+------------------+----+-----+----+-----+-----+-----+------+---+-----+-------+------+-----+----+
|_c0|              CRIM|  ZN|INDUS|CHAS|   NX|   RM|  AGE|   DIS|RAD|  TAX|PTRATIO|     B|LSTAT|MEDV|
+---+------------------+----+-----+----+-----+-----+-----+------+---+-----+-------+------+-----+----+
|  0|           0.00632|18.0| 2.31|   0|0.538|6.575| 65.2|  4.09|  1|296.0|   15.3| 396.9| 4.98|24.0|
|  1|           0.02731| 0.0| 7.07|   0|0.469|6.421| 78.9|4.9671|  2|242.0|   17.8| 396.9| 9.14|21.6|
|  2|           0.02729| 0.0| 7.07|   0|0.469|7.185| 61.1|4.9671|  2|242.0|   17.8|392.83| 4.03|34.7|
|  3|0.0323699999999999| 0.0| 2.18|   0|0.458|6.998| 45.8|6.0622|  3|222.0|   18.7|394.63| 2.94|33.4|
|  4|           0.06905| 0.0| 2.18|   0|0.458|7.147| 54.2|6.0622|  3|222.0|   18.7| 396.9| 5.33|36.2|
|  5|           0.02985| 0.0| 2.18|   0|0.458| 6.43| 58.7|6.0622|  3|222.0|   18.7|394.12| 5.21|28.7|
|  6|           0.08829|12.5| 7.87|   0|0.524|6.012| 66.6|5.5605|  5|311.0|   15.2

In [8]:
house1 = spark.createDataFrame(df)

In [9]:
house1.show()

+------------------+----+-----+----+-----+-----+-----+------+---+-----+-------+------+-----+----+
|              CRIM|  ZN|INDUS|CHAS|   NX|   RM|  AGE|   DIS|RAD|  TAX|PTRATIO|     B|LSTAT|MEDV|
+------------------+----+-----+----+-----+-----+-----+------+---+-----+-------+------+-----+----+
|           0.00632|18.0| 2.31|   0|0.538|6.575| 65.2|  4.09|  1|296.0|   15.3| 396.9| 4.98|24.0|
|           0.02731| 0.0| 7.07|   0|0.469|6.421| 78.9|4.9671|  2|242.0|   17.8| 396.9| 9.14|21.6|
|           0.02729| 0.0| 7.07|   0|0.469|7.185| 61.1|4.9671|  2|242.0|   17.8|392.83| 4.03|34.7|
|0.0323699999999999| 0.0| 2.18|   0|0.458|6.998| 45.8|6.0622|  3|222.0|   18.7|394.63| 2.94|33.4|
|           0.06905| 0.0| 2.18|   0|0.458|7.147| 54.2|6.0622|  3|222.0|   18.7| 396.9| 5.33|36.2|
|           0.02985| 0.0| 2.18|   0|0.458| 6.43| 58.7|6.0622|  3|222.0|   18.7|394.12| 5.21|28.7|
|           0.08829|12.5| 7.87|   0|0.524|6.012| 66.6|5.5605|  5|311.0|   15.2| 395.6|12.43|22.9|
|           0.14455|

In [10]:
house1.columns

['CRIM',
 'ZN',
 'INDUS',
 'CHAS',
 'NX',
 'RM',
 'AGE',
 'DIS',
 'RAD',
 'TAX',
 'PTRATIO',
 'B',
 'LSTAT',
 'MEDV']

In [11]:
from pyspark.ml.feature import VectorAssembler

In [12]:
featureassembler = VectorAssembler(inputCols=['CRIM',
 'ZN',
 'INDUS',
 'CHAS',
 'NX',
 'RM',
 'AGE',
 'DIS',
 'RAD',
 'TAX',
 'PTRATIO',
 'B',
 'LSTAT'], outputCol='Features')

In [13]:
output = featureassembler.transform(house1)

In [14]:
output.show()

+------------------+----+-----+----+-----+-----+-----+------+---+-----+-------+------+-----+----+--------------------+
|              CRIM|  ZN|INDUS|CHAS|   NX|   RM|  AGE|   DIS|RAD|  TAX|PTRATIO|     B|LSTAT|MEDV|            Features|
+------------------+----+-----+----+-----+-----+-----+------+---+-----+-------+------+-----+----+--------------------+
|           0.00632|18.0| 2.31|   0|0.538|6.575| 65.2|  4.09|  1|296.0|   15.3| 396.9| 4.98|24.0|[0.00632,18.0,2.3...|
|           0.02731| 0.0| 7.07|   0|0.469|6.421| 78.9|4.9671|  2|242.0|   17.8| 396.9| 9.14|21.6|[0.02731,0.0,7.07...|
|           0.02729| 0.0| 7.07|   0|0.469|7.185| 61.1|4.9671|  2|242.0|   17.8|392.83| 4.03|34.7|[0.02729,0.0,7.07...|
|0.0323699999999999| 0.0| 2.18|   0|0.458|6.998| 45.8|6.0622|  3|222.0|   18.7|394.63| 2.94|33.4|[0.03236999999999...|
|           0.06905| 0.0| 2.18|   0|0.458|7.147| 54.2|6.0622|  3|222.0|   18.7| 396.9| 5.33|36.2|[0.06905,0.0,2.18...|
|           0.02985| 0.0| 2.18|   0|0.458| 6.43|

In [15]:
modeldata = output.select('Features', 'MEDV')

In [16]:
modeldata.show()

+--------------------+----+
|            Features|MEDV|
+--------------------+----+
|[0.00632,18.0,2.3...|24.0|
|[0.02731,0.0,7.07...|21.6|
|[0.02729,0.0,7.07...|34.7|
|[0.03236999999999...|33.4|
|[0.06905,0.0,2.18...|36.2|
|[0.02985,0.0,2.18...|28.7|
|[0.08829,12.5,7.8...|22.9|
|[0.14455,12.5,7.8...|27.1|
|[0.21124,12.5,7.8...|16.5|
|[0.17004,12.5,7.8...|18.9|
|[0.22489,12.5,7.8...|15.0|
|[0.11747,12.5,7.8...|18.9|
|[0.09378,12.5,7.8...|21.7|
|[0.62976,0.0,8.14...|20.4|
|[0.63796000000000...|18.2|
|[0.62739,0.0,8.14...|19.9|
|[1.05393,0.0,8.14...|23.1|
|[0.7842,0.0,8.14,...|17.5|
|[0.80271,0.0,8.14...|20.2|
|[0.7258,0.0,8.14,...|18.2|
+--------------------+----+
only showing top 20 rows



In [17]:
train_data, test_data = modeldata.randomSplit([0.8,0.2])

In [18]:
train_data.show()

+--------------------+----+
|            Features|MEDV|
+--------------------+----+
|[0.00632,18.0,2.3...|24.0|
|[0.0136,75.0,4.0,...|18.9|
|[0.01381,80.0,0.4...|50.0|
|[0.01432,100.0,1....|31.6|
|[0.01439,60.0,2.9...|29.1|
|[0.01778,95.0,1.4...|32.9|
|[0.01951,17.5,1.3...|33.0|
|[0.02055,85.0,0.7...|24.7|
|[0.02729,0.0,7.07...|34.7|
|[0.02731,0.0,7.07...|21.6|
|[0.02875,28.0,15....|25.0|
|[0.0315,95.0,1.47...|34.9|
|[0.03236999999999...|33.4|
|[0.03358999999999...|34.9|
|[0.03445,82.5,2.0...|24.1|
|[0.0351,95.0,2.68...|48.5|
|[0.03551,25.0,4.8...|22.9|
|[0.03584,80.0,3.3...|23.5|
|[0.03659,25.0,4.8...|24.8|
|[0.03768,80.0,1.5...|34.6|
+--------------------+----+
only showing top 20 rows



In [19]:
from pyspark.ml.regression import LinearRegression

In [22]:
reg = LinearRegression(featuresCol='Features', labelCol='MEDV')

In [23]:
reg = reg.fit(train_data)

In [24]:
reg.coefficients

DenseVector([-0.115, 0.0448, 0.0124, 1.4287, -20.0047, 3.6533, 0.009, -1.5517, 0.2973, -0.0111, -1.0154, 0.0084, -0.5342])

In [25]:
reg.intercept

39.76002475634623

In [26]:
reg.transform(test_data).show()

+--------------------+----+------------------+
|            Features|MEDV|        prediction|
+--------------------+----+------------------+
|[0.01311,90.0,1.2...|35.4|30.469900361721304|
|[0.02009,95.0,2.6...|50.0| 42.92028265896906|
|[0.02176999999999...|42.3|36.834151393644554|
|[0.02187,60.0,2.9...|31.1|31.854225620268096|
|[0.02763,75.0,2.9...|30.8| 31.00994197903757|
|[0.02985,0.0,2.18...|28.7|25.181963022688628|
|[0.04113,25.0,4.8...|28.0|28.175506603542487|
|[0.0456,0.0,13.89...|23.3|25.370267321966168|
|[0.05646,0.0,12.8...|21.2|21.350140608801382|
|[0.0566,0.0,3.41,...|23.6| 30.97798597634793|
|[0.0578,0.0,2.46,...|37.2| 32.95657813794401|
|[0.07013,0.0,13.8...|28.7|28.516109301723148|
|[0.07022,0.0,4.05...|23.2|25.727566438830863|
|[0.08014,0.0,5.96...|21.0| 23.00493910530949|
|[0.08221,22.0,5.8...|29.6|24.203022946270437|
|[0.08387,0.0,12.8...|20.3| 22.49088147552491|
|[0.09252,30.0,4.9...|23.3| 28.36510211319819|
|[0.10153,0.0,12.8...|20.0|23.287286305737528|
|[0.11747,12.

In [27]:
y_pred = reg.evaluate(test_data)

In [28]:
y_pred.predictions.show()

+--------------------+----+------------------+
|            Features|MEDV|        prediction|
+--------------------+----+------------------+
|[0.01311,90.0,1.2...|35.4|30.469900361721304|
|[0.02009,95.0,2.6...|50.0| 42.92028265896906|
|[0.02176999999999...|42.3|36.834151393644554|
|[0.02187,60.0,2.9...|31.1|31.854225620268096|
|[0.02763,75.0,2.9...|30.8| 31.00994197903757|
|[0.02985,0.0,2.18...|28.7|25.181963022688628|
|[0.04113,25.0,4.8...|28.0|28.175506603542487|
|[0.0456,0.0,13.89...|23.3|25.370267321966168|
|[0.05646,0.0,12.8...|21.2|21.350140608801382|
|[0.0566,0.0,3.41,...|23.6| 30.97798597634793|
|[0.0578,0.0,2.46,...|37.2| 32.95657813794401|
|[0.07013,0.0,13.8...|28.7|28.516109301723148|
|[0.07022,0.0,4.05...|23.2|25.727566438830863|
|[0.08014,0.0,5.96...|21.0| 23.00493910530949|
|[0.08221,22.0,5.8...|29.6|24.203022946270437|
|[0.08387,0.0,12.8...|20.3| 22.49088147552491|
|[0.09252,30.0,4.9...|23.3| 28.36510211319819|
|[0.10153,0.0,12.8...|20.0|23.287286305737528|
|[0.11747,12.

In [29]:
y_pred.meanAbsoluteError

3.1439378923322128

In [30]:
y_pred.rootMeanSquaredError

4.8782415935863686

In [31]:
y_pred.meanSquaredError

23.797241045396074

In [32]:
y_pred.r2

0.7160148929313794

In [33]:
y_pred.r2adj

0.6698673130327286

In [34]:
spark.stop()

# Classification

In [35]:
import pandas as pd
import pyspark

In [36]:
from pyspark.sql import SparkSession
spark = SparkSession.builder.master('local[*]').appName('New').getOrCreate()

In [37]:
spark

In [38]:
df = pd.read_csv('https://github.com/YBIFoundation/Dataset/raw/main/Diabetes.csv')

In [39]:
diabetes = spark.createDataFrame(df)

In [40]:
diabetes.show()

+-----------+-------+---------+-------+-------+----+------------------+---+--------+
|pregnancies|glucose|diastolic|triceps|insulin| bmi|               dpf|age|diabetes|
+-----------+-------+---------+-------+-------+----+------------------+---+--------+
|          6|    148|       72|     35|      0|33.6|             0.627| 50|       1|
|          1|     85|       66|     29|      0|26.6|             0.351| 31|       0|
|          8|    183|       64|      0|      0|23.3|             0.672| 32|       1|
|          1|     89|       66|     23|     94|28.1|0.1669999999999999| 21|       0|
|          0|    137|       40|     35|    168|43.1|2.2880000000000003| 33|       1|
|          5|    116|       74|      0|      0|25.6|             0.201| 30|       0|
|          3|     78|       50|     32|     88|31.0|             0.248| 26|       1|
|         10|    115|        0|      0|      0|35.3|             0.134| 29|       0|
|          2|    197|       70|     45|    543|30.5|             

In [41]:
from pyspark.ml.feature import VectorAssembler

In [42]:
diabetes.columns

['pregnancies',
 'glucose',
 'diastolic',
 'triceps',
 'insulin',
 'bmi',
 'dpf',
 'age',
 'diabetes']

In [43]:
featureassembler = VectorAssembler(inputCols=['pregnancies',
 'glucose',
 'diastolic',
 'triceps',
 'insulin',
 'bmi',
 'dpf',
 'age'], outputCol='IV')

In [None]:
#output = featureassembeler.transform(diabetes)
#output.show()

In [None]:
# modeldata = output.select('IV', 'diabetes')

In [45]:
modeldata = featureassembler.transform(diabetes).select('IV', 'diabetes')

In [46]:
modeldata.show()

+--------------------+--------+
|                  IV|diabetes|
+--------------------+--------+
|[6.0,148.0,72.0,3...|       1|
|[1.0,85.0,66.0,29...|       0|
|[8.0,183.0,64.0,0...|       1|
|[1.0,89.0,66.0,23...|       0|
|[0.0,137.0,40.0,3...|       1|
|[5.0,116.0,74.0,0...|       0|
|[3.0,78.0,50.0,32...|       1|
|[10.0,115.0,0.0,0...|       0|
|[2.0,197.0,70.0,4...|       1|
|[8.0,125.0,96.0,0...|       1|
|[4.0,110.0,92.0,0...|       0|
|[10.0,168.0,74.0,...|       1|
|[10.0,139.0,80.0,...|       0|
|[1.0,189.0,60.0,2...|       1|
|[5.0,166.0,72.0,1...|       1|
|[7.0,100.0,0.0,0....|       1|
|[0.0,118.0,84.0,4...|       1|
|[7.0,107.0,74.0,0...|       1|
|[1.0,103.0,30.0,3...|       0|
|[1.0,115.0,70.0,3...|       1|
+--------------------+--------+
only showing top 20 rows



In [47]:
train_data, test_data = modeldata.randomSplit([0.8,0.2])

In [48]:
train_data.show()

+--------------------+--------+
|                  IV|diabetes|
+--------------------+--------+
|(8,[0,1,6,7],[2.0...|       0|
|(8,[0,1,6,7],[2.0...|       0|
|(8,[0,1,6,7],[7.0...|       0|
|(8,[1,5,6,7],[131...|       1|
|(8,[1,5,6,7],[138...|       1|
|(8,[1,5,6,7],[167...|       1|
|[0.0,78.0,88.0,29...|       0|
|[0.0,84.0,64.0,22...|       0|
|[0.0,93.0,60.0,25...|       0|
|[0.0,93.0,100.0,3...|       0|
|[0.0,95.0,85.0,25...|       1|
|[0.0,98.0,82.0,15...|       0|
|[0.0,100.0,70.0,2...|       0|
|[0.0,100.0,88.0,6...|       0|
|[0.0,101.0,65.0,2...|       0|
|[0.0,101.0,76.0,0...|       0|
|[0.0,102.0,52.0,0...|       0|
|[0.0,102.0,75.0,2...|       0|
|[0.0,104.0,76.0,0...|       0|
|[0.0,105.0,64.0,4...|       0|
+--------------------+--------+
only showing top 20 rows



In [50]:
from pyspark.ml.classification import LogisticRegression

In [51]:
lr = LogisticRegression(featuresCol='IV', labelCol='diabetes')

In [53]:
lr = lr.fit(train_data)

In [54]:
lr.coefficients

DenseVector([0.1262, 0.0342, -0.0154, 0.0048, -0.0006, 0.0749, 1.0403, 0.0165])

In [55]:
lr.intercept

-7.896858062023409

In [56]:
y_pred = lr.transform(test_data)

In [57]:
y_pred.show()

+--------------------+--------+--------------------+--------------------+----------+
|                  IV|diabetes|       rawPrediction|         probability|prediction|
+--------------------+--------+--------------------+--------------------+----------+
|(8,[1,5,6,7],[117...|       0|[-0.3299791199246...|[0.41824570361080...|       1.0|
|[0.0,86.0,68.0,32...|       0|[2.51298748949373...|[0.92504729071394...|       0.0|
|[0.0,101.0,64.0,1...|       0|[3.16978293434668...|[0.95968118644907...|       0.0|
|[0.0,104.0,64.0,2...|       0|[2.36102988256342...|[0.91380695754787...|       0.0|
|[0.0,106.0,70.0,3...|       0|[1.33014466175568...|[0.79086456253074...|       0.0|
|[0.0,113.0,76.0,0...|       1|[2.04455665761291...|[0.88539644268641...|       0.0|
|[0.0,118.0,64.0,2...|       0|[2.65136762666463...|[0.93409523342312...|       0.0|
|[0.0,124.0,70.0,2...|       1|[1.73262489848362...|[0.84974786568679...|       0.0|
|[0.0,128.0,68.0,1...|       1|[0.45273628638545...|[0.6112896132

In [58]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [63]:
evaluator = MulticlassClassificationEvaluator(labelCol='diabetes', predictionCol='prediction', metricName='weightedPrecision')

In [64]:
evaluator.evaluate(y_pred)

0.7396918913467356