In [1]:
from pyspark.sql import SparkSession

In [2]:
# Start (or reuse) the Spark session that every later cell reads from.
spark = (
    SparkSession.builder
    .appName("ML")
    .getOrCreate()
)

In [7]:
# Load the CSV; the first line is treated as the header and column types are
# inferred from the data.
training = spark.read.csv(
    "test1_4.csv",
    header=True,
    inferSchema=True,
)

In [8]:
# Print the loaded rows as a text table for a quick visual check.
training.show()

+----+---+----------+------+
|Name|age|Experience|Salary|
+----+---+----------+------+
|   A| 20|         2| 30000|
|   B| 32|         1| 20000|
|   C| 25|         3| 10000|
|   D| 40|         5| 32000|
|   E| 30|         1| 14000|
|   F| 14|         3| 30000|
+----+---+----------+------+



In [9]:
# Print the column names, types and nullability of the loaded frame.
training.printSchema()

root
 |-- Name: string (nullable = true)
 |-- age: integer (nullable = true)
 |-- Experience: integer (nullable = true)
 |-- Salary: integer (nullable = true)



In [11]:
# List the column names of the loaded frame.
training.columns

['Name', 'age', 'Experience', 'Salary']

In [17]:
from pyspark.ml.feature import VectorAssembler
# Declare the independent variables: 'age' and 'Experience' are packed into a
# single vector column named 'Independent feature'.
feature_assembler = VectorAssembler(
    inputCols=['age', 'Experience'],
    outputCol='Independent feature',
)

In [15]:
# Append the assembled feature-vector column to the training frame.
output = feature_assembler.transform(training)

In [19]:
# The two input columns are now bundled together into one vector column.
output.show()

+----+---+----------+------+-------------------+
|Name|age|Experience|Salary|Independent feature|
+----+---+----------+------+-------------------+
|   A| 20|         2| 30000|         [20.0,2.0]|
|   B| 32|         1| 20000|         [32.0,1.0]|
|   C| 25|         3| 10000|         [25.0,3.0]|
|   D| 40|         5| 32000|         [40.0,5.0]|
|   E| 30|         1| 14000|         [30.0,1.0]|
|   F| 14|         3| 30000|         [14.0,3.0]|
+----+---+----------+------+-------------------+



In [18]:
# List the columns after assembly; the new vector column appears last.
output.columns

['Name', 'age', 'Experience', 'Salary', 'Independent feature']

In [20]:
# Keep only the feature vector and the label column for modelling.
final_data = output.select(
    "Independent feature",
    "Salary",
)

In [21]:
# Print the two-column modelling frame (feature vector and Salary label).
final_data.show()

+-------------------+------+
|Independent feature|Salary|
+-------------------+------+
|         [20.0,2.0]| 30000|
|         [32.0,1.0]| 20000|
|         [25.0,3.0]| 10000|
|         [40.0,5.0]| 32000|
|         [30.0,1.0]| 14000|
|         [14.0,3.0]| 30000|
+-------------------+------+



In [29]:
from pyspark.ml.regression import LinearRegression
# Hold out roughly a quarter of the rows for evaluation. The weights are
# relative (Spark normalizes them so they sum to 1), and the fixed seed makes
# the split repeatable, so the fitted coefficients and the metrics computed
# later do not change on every kernel restart.
# NOTE(review): with only a handful of rows either side of the split can end
# up very small or even empty -- confirm the split sizes before trusting the
# evaluation.
train_data, test_data = final_data.randomSplit([7.5, 2.5], seed=42)

In [30]:
# Configure the linear regression on the assembled features and fit it on the
# training split in one step; `regressor` is the fitted model that the
# following cells inspect.
regressor = LinearRegression(
    featuresCol="Independent feature",
    labelCol='Salary',
).fit(train_data)

In [31]:
# Fitted weights, one per entry of the feature vector (age, Experience).
regressor.coefficients

DenseVector([-103.4483, 3482.7586])

In [32]:
# Fitted intercept term of the linear model.
regressor.intercept

14896.551724137966

In [34]:
# Prediction: evaluate the fitted model on the held-out test split. The
# returned summary object carries the per-row predictions and the error
# metrics read in the following cells.
pred=regressor.evaluate(test_data)

In [35]:
# Print the test rows alongside the model's predicted Salary.
pred.predictions.show()

+-------------------+------+------------------+
|Independent feature|Salary|        prediction|
+-------------------+------+------------------+
|         [14.0,3.0]| 30000|23896.551724137957|
|         [32.0,1.0]| 20000|15068.965517241373|
+-------------------+------+------------------+



In [36]:
# Mean absolute error and mean squared error on the test split, shown as a
# (MAE, MSE) tuple.
pred.meanAbsoluteError,pred.meanSquaredError

(5517.241379310335, 30783590.963138994)