In [1]:
import os
src_path = os.getcwd()

In [2]:
from pyspark.context import SparkContext
sc = SparkContext('local', 'test')

In [3]:
from pyspark.sql import SQLContext
sql = SQLContext(sc)

In [4]:
#훈련 데이터로부터 dataframe을 만듬
housing_df = sql.read.csv(src_path + '/train.csv', header=True, inferSchema=True)

In [5]:
housing_df.show(4)

+---+-------+----+-----+----+-----+-----+----+------+---+---+-------+------+-----+----+
| ID|   crim|  zn|indus|chas|  nox|   rm| age|   dis|rad|tax|ptratio| black|lstat|medv|
+---+-------+----+-----+----+-----+-----+----+------+---+---+-------+------+-----+----+
|  1|0.00632|18.0| 2.31|   0|0.538|6.575|65.2|  4.09|  1|296|   15.3| 396.9| 4.98|24.0|
|  2|0.02731| 0.0| 7.07|   0|0.469|6.421|78.9|4.9671|  2|242|   17.8| 396.9| 9.14|21.6|
|  3|0.02729| 0.0| 7.07|   0|0.469|7.185|61.1|4.9671|  2|242|   17.8|392.83| 4.03|34.7|
|  4|0.03237| 0.0| 2.18|   0|0.458|6.998|45.8|6.0622|  3|222|   18.7|394.63| 2.94|33.4|
+---+-------+----+-----+----+-----+-----+----+------+---+---+-------+------+-----+----+
only showing top 4 rows



In [6]:
#spark에서는 입력 데이터셋에 모든 훈련 특징을 나타내는 숫자들의 벡터가 있는 단일 열이 필요
#CountVectorizer에서 비슷한 기능
from pyspark.ml.feature import VectorAssembler
training_features = ['crim', 'zn', 'indus', 'chas', 'nox', 'rm', 'age', 'dis', 'tax', 'ptratio', 'lstat']
vector_assembler = VectorAssembler(inputCols=training_features, outputCol='features')
df_with_features_vector = vector_assembler.transform(housing_df)

In [7]:
df_with_features_vector.show(4)

+---+-------+----+-----+----+-----+-----+----+------+---+---+-------+------+-----+----+--------------------+
| ID|   crim|  zn|indus|chas|  nox|   rm| age|   dis|rad|tax|ptratio| black|lstat|medv|            features|
+---+-------+----+-----+----+-----+-----+----+------+---+---+-------+------+-----+----+--------------------+
|  1|0.00632|18.0| 2.31|   0|0.538|6.575|65.2|  4.09|  1|296|   15.3| 396.9| 4.98|24.0|[0.00632,18.0,2.3...|
|  2|0.02731| 0.0| 7.07|   0|0.469|6.421|78.9|4.9671|  2|242|   17.8| 396.9| 9.14|21.6|[0.02731,0.0,7.07...|
|  3|0.02729| 0.0| 7.07|   0|0.469|7.185|61.1|4.9671|  2|242|   17.8|392.83| 4.03|34.7|[0.02729,0.0,7.07...|
|  4|0.03237| 0.0| 2.18|   0|0.458|6.998|45.8|6.0622|  3|222|   18.7|394.63| 2.94|33.4|[0.03237,0.0,2.18...|
+---+-------+----+-----+----+-----+-----+----+------+---+---+-------+------+-----+----+--------------------+
only showing top 4 rows



In [9]:
#데이터 프레임을 훈련용과 테스트용으로 분리
train_df, test_df = df_with_features_vector.randomSplit([0.8, 0.2], seed=17)

In [10]:
#선형회귀기를 인스턴스화 하고 모델을 피팅
from pyspark.ml.regression import LinearRegression

linear = LinearRegression(featuresCol='features', labelCol='medv')
linear_model = linear.fit(train_df)

In [11]:
#예측 값 구함
predictions_df = linear_model.transform(test_df)

In [12]:
predictions_df.show(4)

+---+-------+----+-----+----+-----+-----+----+------+---+---+-------+------+-----+----+--------------------+------------------+
| ID|   crim|  zn|indus|chas|  nox|   rm| age|   dis|rad|tax|ptratio| black|lstat|medv|            features|        prediction|
+---+-------+----+-----+----+-----+-----+----+------+---+---+-------+------+-----+----+--------------------+------------------+
|  1|0.00632|18.0| 2.31|   0|0.538|6.575|65.2|  4.09|  1|296|   15.3| 396.9| 4.98|24.0|[0.00632,18.0,2.3...| 31.32098687370162|
|  7|0.08829|12.5| 7.87|   0|0.524|6.012|66.6|5.5605|  5|311|   15.2| 395.6|12.43|22.9|[0.08829,12.5,7.8...|22.702179170212247|
| 12|0.11747|12.5| 7.87|   0|0.524|6.009|82.9|6.2267|  5|311|   15.2| 396.9|13.27|18.9|[0.11747,12.5,7.8...| 21.19338299760661|
| 21|1.25179| 0.0| 8.14|   0|0.538| 5.57|98.1|3.7979|  4|307|   21.0|376.57|21.02|13.6|[1.25179,0.0,8.14...|13.026016734659294|
+---+-------+----+-----+----+-----+-----+----+------+---+---+-------+------+-----+----+-----------------

In [13]:
#Regression Evaluator을 이용한 R2값 구하기
from pyspark.ml.evaluation import RegressionEvaluator
evaluator = RegressionEvaluator(labelCol='medv', predictionCol='prediction',
                               metricName='r2')
evaluator.evaluate(predictions_df)

0.6599665255387566

In [14]:
#파이프라인을 이용한 훈련
from pyspark.ml import Pipeline

linear = LinearRegression(featuresCol='features', labelCol='medv')
pipeline = Pipeline(stages=[vector_assembler, linear])

In [15]:
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
param_grid = ParamGridBuilder() \
            .addGrid(linear.elasticNetParam, [0.01, 0.02, 0.05]) \
            .addGrid(linear.solver, ['normal', 'l-bfgs']) \
            .addGrid(linear.regParam, [0.4, 0.5, 0.6]).build()

crossval = CrossValidator(estimator=pipeline,
                          estimatorParamMaps=param_grid,
                          evaluator=evaluator,
                          numFolds=10)

optimized_model = crossval.fit(housing_df)

In [16]:
[(k.name, v) for (k, v) in optimized_model.bestModel.stages[1].extractParamMap().items()]

[('aggregationDepth', 2),
 ('elasticNetParam', 0.05),
 ('epsilon', 1.35),
 ('featuresCol', 'features'),
 ('fitIntercept', True),
 ('labelCol', 'medv'),
 ('loss', 'squaredError'),
 ('maxIter', 100),
 ('predictionCol', 'prediction'),
 ('regParam', 0.4),
 ('solver', 'l-bfgs'),
 ('standardization', True),
 ('tol', 1e-06)]

In [17]:
optimized_model.avgMetrics

[0.6923323960037392,
 0.6923959499732874,
 0.6923502431104525,
 0.6923330103241978,
 0.692396061514483,
 0.692349562517069,
 0.6924196977934123,
 0.6924732106548477,
 0.6924168566051652,
 0.6924201717986757,
 0.6924732826313917,
 0.6924160898881038,
 0.692678590380535,
 0.6926286783262435,
 0.6923471608287199,
 0.6926789836249727,
 0.6926276596010446,
 0.6923446609885943]

In [18]:
_, evaluation_df = housing_df.randomSplit([0.8, 0.2], seed=17)
evaluator.evaluate(optimized_model.transform(evaluation_df))

0.6810905508925078