In [35]:
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName("241212_01_MLlib_regression").getOrCreate()

## DATA LOAD

In [36]:
train_df = spark.read.format('csv')\
    .option('header', 'true')\
    .option('inferSchema', 'true')\
    .load('data/house_train.csv')

In [37]:
test_df = spark.read.format('csv')\
    .option('header', 'true')\
    .option('inferSchema', 'true')\
    .load('data/house_test.csv')

In [38]:
# sale price : label
# features : ????

In [39]:
train_df.printSchema()

root
 |-- Id: integer (nullable = true)
 |-- MSSubClass: integer (nullable = true)
 |-- MSZoning: string (nullable = true)
 |-- LotFrontage: string (nullable = true)
 |-- LotArea: integer (nullable = true)
 |-- Street: string (nullable = true)
 |-- Alley: string (nullable = true)
 |-- LotShape: string (nullable = true)
 |-- LandContour: string (nullable = true)
 |-- Utilities: string (nullable = true)
 |-- LotConfig: string (nullable = true)
 |-- LandSlope: string (nullable = true)
 |-- Neighborhood: string (nullable = true)
 |-- Condition1: string (nullable = true)
 |-- Condition2: string (nullable = true)
 |-- BldgType: string (nullable = true)
 |-- HouseStyle: string (nullable = true)
 |-- OverallQual: integer (nullable = true)
 |-- OverallCond: integer (nullable = true)
 |-- YearBuilt: integer (nullable = true)
 |-- YearRemodAdd: integer (nullable = true)
 |-- RoofStyle: string (nullable = true)
 |-- RoofMatl: string (nullable = true)
 |-- Exterior1st: string (nullable = true)
 |--

## 전처리 - 결측치 = 0 / 숫자 타입 통일

In [40]:
# 결측치 0
train_df = train_df.fillna(0)
test_df = test_df.fillna(0)

## 인코딩 : 문자형 -> 숫자형

In [41]:
from pyspark.ml.feature import StringIndexer, VectorAssembler, OneHotEncoder

In [42]:
string_columns = ['Neighborhood']
# 문자 -> 1,2,3,4로 값을 단순화
indexers = [StringIndexer(inputCol=col, outputCol=col+"_index")   for col in string_columns]

In [43]:
# onehotencoding : 범주형변수 값의 종류만큼 컬럼 생성-> 1 -> 1, 2->2 로 바꾸는 인코딩
# 모두 1로 바꾸기
encoders = [OneHotEncoder(inputCol=col+"_index", outputCol=col+"_encoded")   for col in string_columns]

In [44]:
numeric_columns = [ "LotArea", "OverallQual", "OverallCond", "YearBuilt", "YearRemodAdd", 
    "1stFlrSF", "2ndFlrSF", "GrLivArea", "GarageCars", "GarageArea"]


## features selection

In [61]:
assembler_inputs = [col+"_encoded"  for col in string_columns] + numeric_columns

## asembler

In [46]:
assembler = VectorAssembler(inputCols= assembler_inputs, outputCol='features')

## label selection

In [47]:
train_df = train_df.withColumnRenamed("SalePrice", "label")

## pipeline 설정

StringIndex + OneHotEncoder + Assembler >> 하나의 SparkML Pipeline으로 결합

In [48]:
from pyspark.ml import Pipeline

In [54]:
pipeline = Pipeline(stages = indexers + encoders + [assembler] )

In [55]:
pipeline_model = pipeline.fit(train_df)

In [56]:
train_transformed = pipeline_model.transform(train_df) 

## 예측 > 회귀 모델 학습 > 평가 > 예측

In [57]:
from pyspark.ml.regression import LinearRegression

In [59]:
lr = LinearRegression(featuresCol='features', labelCol='label')
lr_model = lr.fit(train_transformed)

24/12/12 16:48:39 WARN Instrumentation: [7112c24f] regParam is zero, which might cause numerical instability and overfitting.
24/12/12 16:48:40 WARN BLAS: Failed to load implementation from: com.github.fommil.netlib.NativeSystemBLAS
24/12/12 16:48:40 WARN BLAS: Failed to load implementation from: com.github.fommil.netlib.NativeRefBLAS
24/12/12 16:48:40 WARN LAPACK: Failed to load implementation from: com.github.fommil.netlib.NativeSystemLAPACK
24/12/12 16:48:40 WARN LAPACK: Failed to load implementation from: com.github.fommil.netlib.NativeRefLAPACK


In [60]:
# 평가 데이터를 이용한 평가

test_transformed = pipeline_model.transform(test_df) 
predictions = lr_model.transform(test_transformed)

IllegalArgumentException: Data type string of column GarageCars is not supported.
Data type string of column GarageArea is not supported.