In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

In [2]:
pip install pyspark 

Collecting pyspark
[?25l  Downloading https://files.pythonhosted.org/packages/45/b0/9d6860891ab14a39d4bddf80ba26ce51c2f9dc4805e5c6978ac0472c120a/pyspark-3.1.1.tar.gz (212.3MB)
[K     |████████████████████████████████| 212.3MB 71kB/s 
[?25hCollecting py4j==0.10.9
[?25l  Downloading https://files.pythonhosted.org/packages/9e/b6/6a4fb90cd235dc8e265a6a2067f2a2c99f0d91787f06aca4bcf7c23f3f80/py4j-0.10.9-py2.py3-none-any.whl (198kB)
[K     |████████████████████████████████| 204kB 19.1MB/s 
[?25hBuilding wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.1.1-py2.py3-none-any.whl size=212767604 sha256=cf8ec49f03730d20be631f33f7ff989c3be620580058c2f1b0adb90b20b11266
  Stored in directory: /root/.cache/pip/wheels/0b/90/c0/01de724414ef122bd05f056541fb6a0ecf47c7ca655f8b3c0f
Successfully built pyspark
Installing collected packages: py4j, pyspark
Successfully installed py4j-0.10.9 pyspark-3.1.1


In [2]:
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName('PCA').getOrCreate()
data=spark.read.csv('ConcreteStrengthData.csv', header=True, inferSchema=True)
data.printSchema()

root
 |-- CementComponent : double (nullable = true)
 |-- BlastFurnaceSlag: double (nullable = true)
 |-- FlyAshComponent: double (nullable = true)
 |-- WaterComponent: double (nullable = true)
 |-- SuperplasticizerComponent: double (nullable = true)
 |-- CoarseAggregateComponent: double (nullable = true)
 |-- FineAggregateComponent: double (nullable = true)
 |-- AgeInDays: double (nullable = true)
 |-- Strength: double (nullable = true)



In [3]:
data.show()

+----------------+----------------+---------------+--------------+-------------------------+------------------------+----------------------+---------+--------+
|CementComponent |BlastFurnaceSlag|FlyAshComponent|WaterComponent|SuperplasticizerComponent|CoarseAggregateComponent|FineAggregateComponent|AgeInDays|Strength|
+----------------+----------------+---------------+--------------+-------------------------+------------------------+----------------------+---------+--------+
|           540.0|             0.0|            0.0|         162.0|                      2.5|                  1040.0|                 676.0|     28.0|   79.99|
|           540.0|             0.0|            0.0|         162.0|                      2.5|                  1055.0|                 676.0|     28.0|   61.89|
|           332.5|           142.5|            0.0|         228.0|                      0.0|                   932.0|                 594.0|    270.0|   40.27|
|           332.5|           142.5|     

In [6]:
from pyspark.ml.feature import VectorAssembler
assemble=VectorAssembler(inputCols=['CementComponent ','BlastFurnaceSlag','FlyAshComponent','WaterComponent','SuperplasticizerComponent','CoarseAggregateComponent','FineAggregateComponent','AgeInDays','Strength'], outputCol='features')
assembled_data=assemble.transform(data)
assembled_data.show(2)

+----------------+----------------+---------------+--------------+-------------------------+------------------------+----------------------+---------+--------+--------------------+
|CementComponent |BlastFurnaceSlag|FlyAshComponent|WaterComponent|SuperplasticizerComponent|CoarseAggregateComponent|FineAggregateComponent|AgeInDays|Strength|            features|
+----------------+----------------+---------------+--------------+-------------------------+------------------------+----------------------+---------+--------+--------------------+
|           540.0|             0.0|            0.0|         162.0|                      2.5|                  1040.0|                 676.0|     28.0|   79.99|[540.0,0.0,0.0,16...|
|           540.0|             0.0|            0.0|         162.0|                      2.5|                  1055.0|                 676.0|     28.0|   61.89|[540.0,0.0,0.0,16...|
+----------------+----------------+---------------+--------------+-------------------------+---

In [7]:
from pyspark.ml.feature import MinMaxScaler

scale=MinMaxScaler(inputCol='features',outputCol='MinMaxScaler')
data_scale=scale.fit(assembled_data)
data_scale_output=data_scale.transform(assembled_data)
data_scale_output.show(2)


+----------------+----------------+---------------+--------------+-------------------------+------------------------+----------------------+---------+--------+--------------------+--------------------+
|CementComponent |BlastFurnaceSlag|FlyAshComponent|WaterComponent|SuperplasticizerComponent|CoarseAggregateComponent|FineAggregateComponent|AgeInDays|Strength|            features|        MinMaxScaler|
+----------------+----------------+---------------+--------------+-------------------------+------------------------+----------------------+---------+--------+--------------------+--------------------+
|           540.0|             0.0|            0.0|         162.0|                      2.5|                  1040.0|                 676.0|     28.0|   79.99|[540.0,0.0,0.0,16...|[1.0,0.0,0.0,0.32...|
|           540.0|             0.0|            0.0|         162.0|                      2.5|                  1055.0|                 676.0|     28.0|   61.89|[540.0,0.0,0.0,16...|[1.0,0.0,0.0

In [8]:
from pyspark.ml.feature import PCA
pca = PCA(k=3, inputCol="features", outputCol="pcaFeatures")
model = pca.fit(data_scale_output)
result = model.transform(data_scale_output).select("pcaFeatures")
result.show(truncate=False)

+-----------------------------------------------------------+
|pcaFeatures                                                |
+-----------------------------------------------------------+
|[-340.644902306898,559.1228085263207,302.41108647246415]   |
|[-339.2361615387661,563.9896344872608,314.1744042985384]   |
|[-157.4385271692405,364.7297083972154,324.0803691269247]   |
|[-167.02060727966656,358.04801162360474,335.2479045997004] |
|[1.7233439399420871,488.21231800992183,247.58127159926562] |
|[-70.81756657684984,435.0073133845725,268.3379001345956]   |
|[-222.25647812792545,394.31491888493764,331.36613454685875]|
|[-187.96398363709562,418.19704415525047,291.8412800183024] |
|[-64.51890111917302,439.3948919270922,261.0631595523822]   |
|[-298.2703776866586,490.82945788525467,284.1274047423096]  |
|[29.22636661750501,507.3632548726947,215.9229817203692]    |
|[36.121224607087505,512.1063516953332,208.827317170414]    |
|[-267.7381997697404,437.3661302250789,316.3685127464478]   |
|[17.590

In [10]:
result.show()

+--------------------+
|         pcaFeatures|
+--------------------+
|[-340.64490230689...|
|[-339.23616153876...|
|[-157.43852716924...|
|[-167.02060727966...|
|[1.72334393994208...|
|[-70.817566576849...|
|[-222.25647812792...|
|[-187.96398363709...|
|[-64.518901119173...|
|[-298.27037768665...|
|[29.2263666175050...|
|[36.1212246070875...|
|[-267.73819976974...|
|[17.5907454460550...|
|[-108.69684347927...|
|[-203.35139365716...|
|[99.0419636980834...|
|[-187.10813157845...|
|[-194.45958755232...|
|[-313.74164643283...|
+--------------------+
only showing top 20 rows



In [11]:
from pyspark.ml.feature import PolynomialExpansion

In [12]:
px = PolynomialExpansion(degree=2, inputCol="features", outputCol="polyFeatures")
polyDF = px.transform(data_scale_output)
for expanded in polyDF.select("polyFeatures").take(3):
  print(expanded)

Row(polyFeatures=DenseVector([540.0, 291600.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 162.0, 87480.0, 0.0, 0.0, 26244.0, 2.5, 1350.0, 0.0, 0.0, 405.0, 6.25, 1040.0, 561600.0, 0.0, 0.0, 168480.0, 2600.0, 1081600.0, 676.0, 365040.0, 0.0, 0.0, 109512.0, 1690.0, 703040.0, 456976.0, 28.0, 15120.0, 0.0, 0.0, 4536.0, 70.0, 29120.0, 18928.0, 784.0, 79.99, 43194.6, 0.0, 0.0, 12958.38, 199.975, 83189.6, 54073.24, 2239.72, 6398.4001]))
Row(polyFeatures=DenseVector([540.0, 291600.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 162.0, 87480.0, 0.0, 0.0, 26244.0, 2.5, 1350.0, 0.0, 0.0, 405.0, 6.25, 1055.0, 569700.0, 0.0, 0.0, 170910.0, 2637.5, 1113025.0, 676.0, 365040.0, 0.0, 0.0, 109512.0, 1690.0, 713180.0, 456976.0, 28.0, 15120.0, 0.0, 0.0, 4536.0, 70.0, 29540.0, 18928.0, 784.0, 61.89, 33420.6, 0.0, 0.0, 10026.18, 154.725, 65293.95, 41837.64, 1732.92, 3830.3721]))
Row(polyFeatures=DenseVector([332.5, 110556.25, 142.5, 47381.25, 20306.25, 0.0, 0.0, 0.0, 0.0, 228.0, 75810.0, 32490.0, 0.0, 51984.0, 0.0, 0.0, 0.0, 

In [14]:
polyDF.select("polyFeatures").show(truncate=False)

+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|polyFeatures                                                                                                                                                                                                                                                                                                                                                                                                                                                          |
+-------------------------------------------------------------------------------------