In [3]:
from pyspark.sql import SparkSession
# Create (or reuse) the Spark session that every later cell in this notebook relies on.
spark = (
    SparkSession.builder
    .appName('241212_01_MLlib_regression')
    .getOrCreate()
)

In [4]:
# Load the training CSV: the first row is the header and column types are inferred.
train_df = spark.read.csv('data/house_train.csv', header=True, inferSchema=True)

                                                                                

In [5]:
# Load the test CSV with the same reader settings as the training file.
test_df = spark.read.csv('data/house_test.csv', header=True, inferSchema=True)

In [6]:
# Print the inferred column names and types of the freshly loaded training frame.
train_df.printSchema()

root
 |-- Id: integer (nullable = true)
 |-- MSSubClass: integer (nullable = true)
 |-- MSZoning: string (nullable = true)
 |-- LotFrontage: string (nullable = true)
 |-- LotArea: integer (nullable = true)
 |-- Street: string (nullable = true)
 |-- Alley: string (nullable = true)
 |-- LotShape: string (nullable = true)
 |-- LandContour: string (nullable = true)
 |-- Utilities: string (nullable = true)
 |-- LotConfig: string (nullable = true)
 |-- LandSlope: string (nullable = true)
 |-- Neighborhood: string (nullable = true)
 |-- Condition1: string (nullable = true)
 |-- Condition2: string (nullable = true)
 |-- BldgType: string (nullable = true)
 |-- HouseStyle: string (nullable = true)
 |-- OverallQual: integer (nullable = true)
 |-- OverallCond: integer (nullable = true)
 |-- YearBuilt: integer (nullable = true)
 |-- YearRemodAdd: integer (nullable = true)
 |-- RoofStyle: string (nullable = true)
 |-- RoofMatl: string (nullable = true)
 |-- Exterior1st: string (nullable = true)
 |--

### 전처리

In [None]:
#GarageArea, GarageCars integer로 변환 -> cast()

In [13]:
# Cast GarageArea to integer in both frames so the column has the same numeric type
# in train and test (presumably it was inferred as string in at least one file — confirm).
train_df = train_df.withColumn( "GarageArea", train_df["GarageArea"].cast("integer") )
test_df = test_df.withColumn( "GarageArea",  test_df["GarageArea"].cast("integer") )

In [14]:
# Cast GarageCars to integer in both frames, mirroring the GarageArea conversion.
train_df = train_df.withColumn( "GarageCars", train_df["GarageCars"].cast("integer") )
test_df = test_df.withColumn( "GarageCars",  test_df["GarageCars"].cast("integer") )

In [15]:
# Re-print the schema to check the column types after the integer casts.
train_df.printSchema()

root
 |-- Id: integer (nullable = true)
 |-- MSSubClass: integer (nullable = true)
 |-- MSZoning: string (nullable = true)
 |-- LotFrontage: string (nullable = true)
 |-- LotArea: integer (nullable = true)
 |-- Street: string (nullable = true)
 |-- Alley: string (nullable = true)
 |-- LotShape: string (nullable = true)
 |-- LandContour: string (nullable = true)
 |-- Utilities: string (nullable = true)
 |-- LotConfig: string (nullable = true)
 |-- LandSlope: string (nullable = true)
 |-- Neighborhood: string (nullable = true)
 |-- Condition1: string (nullable = true)
 |-- Condition2: string (nullable = true)
 |-- BldgType: string (nullable = true)
 |-- HouseStyle: string (nullable = true)
 |-- OverallQual: integer (nullable = true)
 |-- OverallCond: integer (nullable = true)
 |-- YearBuilt: integer (nullable = true)
 |-- YearRemodAdd: integer (nullable = true)
 |-- RoofStyle: string (nullable = true)
 |-- RoofMatl: string (nullable = true)
 |-- Exterior1st: string (nullable = true)
 |--

In [16]:
# Replace missing values with 0 in both frames.
# NOTE(review): with an integer fill value Spark only fills numeric columns;
# null string columns are left untouched — confirm that is intended.
train_df = train_df.na.fill(0)
test_df = test_df.na.fill(0)

In [17]:
#encoding : 문자형 -> 숫자형 1,2,3,4

In [18]:
from pyspark.ml.feature import StringIndexer, VectorAssembler, OneHotEncoder

In [19]:
# Categorical (string) columns that will be encoded for the model.
string_columns = ['Neighborhood']

# One StringIndexer per categorical column: each one reads the string column and
# writes a numeric index to "<column>_index".
indexers = [
    StringIndexer(inputCol=name, outputCol=f"{name}_index")
    for name in string_columns
]

In [20]:
# One OneHotEncoder per categorical column: it reads the numeric index from
# "<column>_index" and writes a one-hot vector to "<column>_encoded"
# (at most a single 1.0 at the category's position, instead of the raw index value).
encoders = [
    OneHotEncoder(inputCol=f"{name}_index", outputCol=f"{name}_encoded")
    for name in string_columns
]

## features selection

In [21]:
# Numeric columns fed to the model as-is (no encoding step).
numeric_columns = [
    "LotArea",
    "OverallQual",
    "OverallCond",
    "YearBuilt",
    "YearRemodAdd",
    "1stFlrSF",
    "2ndFlrSF",
    "GrLivArea",
    "GarageCars",
    "GarageArea",
]

In [23]:
# Model inputs: the encoded categorical columns first, followed by the numeric columns.
assembler_inputs = [
    *(f"{name}_encoded" for name in string_columns),
    *numeric_columns,
]

In [24]:
# Combine every input column listed in assembler_inputs into one vector column named "features".
assembler = VectorAssembler(inputCols = assembler_inputs, outputCol = "features")

## Label Selection

In [25]:
# Rename the target column SalePrice to "label", the label column name the
# regression model is configured with (labelCol="label").
train_df = train_df.withColumnRenamed("SalePrice","label")

## Pipeline setting

In [26]:
from pyspark.ml import Pipeline

In [27]:
# Preprocessing pipeline: index the string columns, one-hot encode the indexes,
# then assemble everything into the "features" vector.
pipeline = Pipeline(stages=[*indexers, *encoders, assembler])

In [28]:
# Fit the preprocessing stages on the training data only.
pipeline_model = pipeline.fit( train_df )

                                                                                

In [29]:
# Apply the fitted preprocessing to the training data, adding the "features" column.
train_transformed = pipeline_model.transform( train_df )

## 예측 -> 회귀 모델 학습 -> 평가 -> 예측

In [30]:
from pyspark.ml.regression import LinearRegression

In [31]:
# Train a linear regression on the assembled "features" vector against "label".
# No regularisation is configured; the log output below warns that regParam is zero.
lr = LinearRegression(featuresCol="features", labelCol="label")
lr_model = lr.fit(train_transformed)

24/12/13 10:28:28 WARN Instrumentation: [1a254bcd] regParam is zero, which might cause numerical instability and overfitting.
24/12/13 10:28:28 WARN BLAS: Failed to load implementation from: com.github.fommil.netlib.NativeSystemBLAS
24/12/13 10:28:28 WARN BLAS: Failed to load implementation from: com.github.fommil.netlib.NativeRefBLAS
24/12/13 10:28:28 WARN LAPACK: Failed to load implementation from: com.github.fommil.netlib.NativeSystemLAPACK
24/12/13 10:28:28 WARN LAPACK: Failed to load implementation from: com.github.fommil.netlib.NativeRefLAPACK


In [35]:
# fit() learns the preprocessing rules from the training data only.
# The test data is therefore passed through transform() without re-fitting,
# so it is prepared with exactly the rules learned from the training data.

test_transformed = pipeline_model.transform(test_df)
predictions= lr_model.transform(test_transformed) # transform: apply the trained model to produce the "prediction" column

In [37]:
# Show the first 10 predictions with the feature vectors printed in full.
# NOTE(review): the schema spells this column "Id"; "id" presumably resolves through
# Spark's default case-insensitive column matching — confirm.
predictions.select("id","features","prediction").show(10, truncate = False)

+----+-------------------------------------------------------------------------------------------------------+------------------+
|id  |features                                                                                               |prediction        |
+----+-------------------------------------------------------------------------------------------------------+------------------+
|1461|(34,[0,24,25,26,27,28,29,31,32,33],[1.0,11622.0,5.0,6.0,1961.0,1961.0,896.0,896.0,1.0,730.0])          |114113.60325331613|
|1462|(34,[0,24,25,26,27,28,29,31,32,33],[1.0,14267.0,6.0,6.0,1958.0,1958.0,1329.0,1329.0,1.0,312.0])        |156145.8445868329 |
|1463|(34,[5,24,25,26,27,28,29,30,31,32,33],[1.0,13830.0,5.0,5.0,1997.0,1998.0,928.0,701.0,1629.0,2.0,482.0])|168254.6697326172 |
|1464|(34,[5,24,25,26,27,28,29,30,31,32,33],[1.0,9978.0,6.0,6.0,1998.0,1998.0,926.0,678.0,1604.0,2.0,470.0]) |186898.44701529457|
|1465|(34,[18,24,25,26,27,28,29,31,32,33],[1.0,5005.0,8.0,5.0,1992.0,1992.0,1280.0,1280.0,

In [39]:
# Save the predictions as CSV with the columns Id and SalePrice.
# "Id" is spelled exactly as in the source schema, so the output header matches the
# dataset and the lookup does not depend on case-insensitive column resolution.
# mode="overwrite" replaces any previous output at this path on each run.
(
    predictions
    .select("Id", "prediction")
    .withColumnRenamed("prediction", "SalePrice")
    .write.csv('data/output/house_prediction.csv', header=True, mode="overwrite")
)

In [47]:
# Output locations for the trained regression model and the fitted preprocessing pipeline.
# NOTE(review): "pipline" in the second path looks like a typo of "pipeline"; it is left
# unchanged because the load cell reads from this exact path.
model_save_path = 'data/output/boston_housing_lr_model'
pipeline_save_path = 'data/output/boston_housing_pipline_model'

# Persist the fitted preprocessing pipeline, replacing any previous save.
pipeline_model.write().overwrite().save(pipeline_save_path)

# Persist the trained linear regression model, replacing any previous save.
lr_model.write().overwrite().save(model_save_path)
print('model_saved...')

model_saved...


## Load Model & Pipeline

In [63]:
from pyspark.ml.regression import LinearRegressionModel
from pyspark.ml import PipelineModel

In [64]:
# Reload the fitted preprocessing pipeline from disk.
loaded_pipeline = PipelineModel.load(pipeline_save_path)
loaded_pipeline  # not rendered: only the last expression of a cell is displayed


# Reload the trained regression model; as the cell's last expression it is the value shown below.
loaded_model = LinearRegressionModel.load(model_save_path)
loaded_model

LinearRegressionModel: uid=LinearRegression_27be3aa8f05f, numFeatures=34

In [53]:
#pip install pandas

Collecting pandas
  Downloading pandas-2.0.3-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (18 kB)
Collecting tzdata>=2022.1 (from pandas)
  Downloading tzdata-2024.2-py2.py3-none-any.whl.metadata (1.4 kB)
Downloading pandas-2.0.3-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (12.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.4/12.4 MB[0m [31m146.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading tzdata-2024.2-py2.py3-none-any.whl (346 kB)
Installing collected packages: tzdata, pandas
Successfully installed pandas-2.0.3 tzdata-2024.2
Note: you may need to restart the kernel to use updated packages.


In [54]:
import pandas as pd 

# Build one new sample row (a single value per column) and write it to CSV so it can
# be scored with the reloaded pipeline and model.
# The model-input values match test row Id 1461 shown in the predictions output above.
data = {
    "Id": [1461],
    "MSSubClass": [20],
    "MSZoning": ["RH"],
    "LotFrontage": [80],
    "LotArea": [11622],
    "Street": ["Pave"],
    "Alley": [None],  # NA is represented as None
    "LotShape": ["Reg"],
    "LandContour": ["Lvl"],
    "Utilities": ["AllPub"],
    "LotConfig": ["Inside"],
    "LandSlope": ["Gtl"],
    "Neighborhood": ["NAmes"],
    "Condition1": ["Feedr"],
    "Condition2": ["Norm"],
    "BldgType": ["1Fam"],
    "HouseStyle": ["1Story"],
    "OverallQual": [5],
    "OverallCond": [6],
    "YearBuilt": [1961],
    "YearRemodAdd": [1961],
    "RoofStyle": ["Gable"],
    "RoofMatl": ["CompShg"],
    "Exterior1st": ["VinylSd"],
    "Exterior2nd": ["VinylSd"],
    "MasVnrType": [None],  # None stands for NA
    "MasVnrArea": [0],
    "ExterQual": ["TA"],
    "ExterCond": ["TA"],
    "Foundation": ["CBlock"],
    "BsmtQual": ["TA"],
    "BsmtCond": ["TA"],
    "BsmtExposure": ["No"],
    "BsmtFinType1": ["Rec"],
    "BsmtFinSF1": [468],
    "BsmtFinType2": ["LwQ"],
    "BsmtFinSF2": [144],
    "BsmtUnfSF": [270],
    "TotalBsmtSF": [882],
    "Heating": ["GasA"],
    "HeatingQC": ["TA"],
    "CentralAir": ["Y"],
    "Electrical": ["SBrkr"],
    "1stFlrSF": [896],
    "2ndFlrSF": [0],
    "LowQualFinSF": [0],
    "GrLivArea": [896],
    "BsmtFullBath": [0],
    "BsmtHalfBath": [0],
    "FullBath": [1],
    "HalfBath": [0],
    "BedroomAbvGr": [2],
    "KitchenAbvGr": [1],
    "KitchenQual": ["TA"],
    "TotRmsAbvGrd": [5],
    "Functional": ["Typ"],
    "Fireplaces": [0],
    "FireplaceQu": [None],  # NA is represented as None
    "GarageType": ["Attchd"],
    "GarageYrBlt": [1961],
    "GarageFinish": ["Unf"],
    "GarageCars": [1],
    "GarageArea": [730],
    "GarageQual": ["TA"],
    "GarageCond": ["TA"],
    "PavedDrive": ["Y"],
    "WoodDeckSF": [140],
    "OpenPorchSF": [0],
    "EnclosedPorch": [0],
    "3SsnPorch": [0],
    "ScreenPorch": [120],
    "PoolArea": [0],
    "PoolQC": [None],  # NA is represented as None
    "Fence": ["MnPrv"],
    "MiscFeature": [None],  # NA is represented as None
    "MiscVal": [0],
    "MoSold": [6],
    "YrSold": [2010],
    "SaleType": ["WD"],
    "SaleCondition":["Normal"]
}

# Write the single-row frame without the pandas index column.
pd.DataFrame(data).to_csv('data/new_test_data.csv', index = False)

In [55]:
# Read the new sample back through Spark, with a header row and inferred column types.
new_test_data = spark.read.csv('data/new_test_data.csv', header = True, inferSchema = True)

In [56]:
# Columns the preprocessing pipeline consumes: the numeric inputs plus the
# categorical "Neighborhood" column.
# NOTE(review): this list is not referenced by any later cell visible here.
selected_features = [
    "LotArea",
    "OverallQual",
    "OverallCond",
    "YearBuilt",
    "YearRemodAdd",
    "1stFlrSF",
    "2ndFlrSF",
    "GrLivArea",
    "GarageCars",
    "GarageArea",
    "Neighborhood",
]

In [58]:
# Prepare the new data the same way the train/test frames were prepared before
# scoring: cast the garage columns to integer, then replace missing numeric values
# with 0. No column selection happens here; the pipeline reads the columns it needs.
new_test_data = new_test_data.withColumn("GarageCars", new_test_data["GarageCars"].cast("integer"))
new_test_data = new_test_data.withColumn("GarageArea", new_test_data["GarageArea"].cast("integer"))
# Same missing-value rule as the training data, so a null model input becomes 0
# instead of reaching the pipeline as null.
new_test_data = new_test_data.fillna(0)

In [None]:
# Input New Data to Pipeline

In [65]:
# Run the new data through the reloaded preprocessing pipeline (transform only, no fitting).
new_pipe = loaded_pipeline.transform(new_test_data)

In [66]:
# Input the pipeline output to the model

In [67]:
# Score the preprocessed new data with the reloaded regression model.
new_pred = loaded_model.transform(new_pipe)

In [68]:
# Display the predicted value for the new sample.
new_pred.select("prediction").show()

+------------------+
|        prediction|
+------------------+
|114113.60325331613|
+------------------+



In [69]:
#csv 저장

In [70]:
#Log를 남긴다 - csv 저장, DB에 저장 

In [71]:
# Shut down the Spark session now that the notebook is finished.
spark.stop()