In [1]:
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName('Practice').getOrCreate()

In [2]:
df_pyspark = spark.read.csv('cep1_dataset.csv', header=True, inferSchema=True)

In [3]:
df_pyspark.show()

+---+---+---+--------+----+---+-------+-------+-----+-------+-----+---+----+------+
|age|sex| cp|trestbps|chol|fbs|restecg|thalach|exang|oldpeak|slope| ca|thal|target|
+---+---+---+--------+----+---+-------+-------+-----+-------+-----+---+----+------+
| 63|  1|  3|     145| 233|  1|      0|    150|    0|    2.3|    0|  0|   1|     1|
| 37|  1|  2|     130| 250|  0|      1|    187|    0|    3.5|    0|  0|   2|     1|
| 41|  0|  1|     130| 204|  0|      0|    172|    0|    1.4|    2|  0|   2|     1|
| 56|  1|  1|     120| 236|  0|      1|    178|    0|    0.8|    2|  0|   2|     1|
| 57|  0|  0|     120| 354|  0|      1|    163|    1|    0.6|    2|  0|   2|     1|
| 57|  1|  0|     140| 192|  0|      1|    148|    0|    0.4|    1|  0|   1|     1|
| 56|  0|  1|     140| 294|  0|      0|    153|    0|    1.3|    1|  0|   2|     1|
| 44|  1|  1|     120| 263|  0|      1|    173|    0|    0.0|    2|  0|   3|     1|
| 52|  1|  2|     172| 199|  1|      1|    162|    0|    0.5|    2|  0|   3|

In [4]:
df_pyspark = df_pyspark.dropna()

In [5]:
from pyspark.ml.feature import VectorAssembler
feature_assembler = VectorAssembler(inputCols=['age', 'trestbps', 'chol', 'thalach', 'sex'],
                                   outputCol='Independent Features')

df_pyspark = feature_assembler.transform(df_pyspark)
df_pyspark.show()

+---+---+---+--------+----+---+-------+-------+-----+-------+-----+---+----+------+--------------------+
|age|sex| cp|trestbps|chol|fbs|restecg|thalach|exang|oldpeak|slope| ca|thal|target|Independent Features|
+---+---+---+--------+----+---+-------+-------+-----+-------+-----+---+----+------+--------------------+
| 63|  1|  3|     145| 233|  1|      0|    150|    0|    2.3|    0|  0|   1|     1|[63.0,145.0,233.0...|
| 37|  1|  2|     130| 250|  0|      1|    187|    0|    3.5|    0|  0|   2|     1|[37.0,130.0,250.0...|
| 41|  0|  1|     130| 204|  0|      0|    172|    0|    1.4|    2|  0|   2|     1|[41.0,130.0,204.0...|
| 56|  1|  1|     120| 236|  0|      1|    178|    0|    0.8|    2|  0|   2|     1|[56.0,120.0,236.0...|
| 57|  0|  0|     120| 354|  0|      1|    163|    1|    0.6|    2|  0|   2|     1|[57.0,120.0,354.0...|
| 57|  1|  0|     140| 192|  0|      1|    148|    0|    0.4|    1|  0|   1|     1|[57.0,140.0,192.0...|
| 56|  0|  1|     140| 294|  0|      0|    153|    0|  

In [6]:
finalized_data = df_pyspark.select('Independent Features', 'target')
finalized_data.show()

+--------------------+------+
|Independent Features|target|
+--------------------+------+
|[63.0,145.0,233.0...|     1|
|[37.0,130.0,250.0...|     1|
|[41.0,130.0,204.0...|     1|
|[56.0,120.0,236.0...|     1|
|[57.0,120.0,354.0...|     1|
|[57.0,140.0,192.0...|     1|
|[56.0,140.0,294.0...|     1|
|[44.0,120.0,263.0...|     1|
|[52.0,172.0,199.0...|     1|
|[57.0,150.0,168.0...|     1|
|[54.0,140.0,239.0...|     1|
|[48.0,130.0,275.0...|     1|
|[49.0,130.0,266.0...|     1|
|[64.0,110.0,211.0...|     1|
|[58.0,150.0,283.0...|     1|
|[50.0,120.0,219.0...|     1|
|[58.0,120.0,340.0...|     1|
|[66.0,150.0,226.0...|     1|
|[43.0,150.0,247.0...|     1|
|[69.0,140.0,239.0...|     1|
+--------------------+------+
only showing top 20 rows



In [9]:
from pyspark.ml.regression import LinearRegression
train_data, test_data = finalized_data.randomSplit([0.75, 0.35])
regressor = LinearRegression(featuresCol='Independent Features', labelCol='target')
regressor = regressor.fit(train_data)

In [10]:
regressor.coefficients

DenseVector([-0.0025, -0.0016, -0.0016, 0.0071, -0.3112])

In [11]:
regressor.intercept

0.42695680253546936

In [12]:
pred_results = regressor.evaluate(test_data)
pred_results.predictions.show()

+--------------------+------+-------------------+
|Independent Features|target|         prediction|
+--------------------+------+-------------------+
|[29.0,130.0,204.0...|     1| 0.9543874779625223|
|[35.0,122.0,192.0...|     1| 0.7716799010591808|
|[37.0,120.0,215.0...|     1| 1.0167594559115924|
|[37.0,130.0,250.0...|     1| 0.7561173071155023|
|[38.0,120.0,231.0...|     0| 0.7633283258663719|
|[40.0,110.0,167.0...|     0| 0.3909346747707127|
|[40.0,140.0,199.0...|     1| 0.7482472413595007|
|[41.0,110.0,235.0...|     1| 0.5594022770476497|
|[41.0,112.0,250.0...|     1| 0.7174321798705117|
|[41.0,112.0,268.0...|     1| 0.9509391404182781|
|[41.0,126.0,306.0...|     1| 0.8060320360393634|
|[41.0,135.0,203.0...|     1|0.42092774100914804|
|[42.0,120.0,209.0...|     1|  1.034680655013279|
|[42.0,120.0,295.0...|     1| 0.5117485768085082|
|[42.0,136.0,315.0...|     0|0.19300964990865782|
|[42.0,148.0,244.0...|     1| 0.6606524993844374|
|[43.0,115.0,303.0...|     1| 0.6394339652680081|


In [13]:
pred_results.meanAbsoluteError, pred_results.meanSquaredError

(0.35690636462110925, 0.1578018372800372)