In [2]:
from pyspark.sql import SparkSession

# Create (or reuse) the SparkSession that every later cell runs against.
spark = (
    SparkSession.builder
    .appName("241212_01_MLlib_classification")
    .getOrCreate()
)

In [4]:
# Read the Titanic CSV: the first row is treated as a header and column
# types are inferred from the data.
csv_reader = (
    spark.read.format("csv")
    .option("header", 'true')
    .option('inferSchema', 'true')
)
df = csv_reader.load('data/titanic.csv')

                                                                                

In [5]:
# Preview the first 10 rows of the raw data.
df.show(n=10)

+--------+------+------+----+-----+-----+-------+-----------+
|survived|pclass|   sex| age|sibsp|parch|   fare|embark_town|
+--------+------+------+----+-----+-----+-------+-----------+
|       0|     3|  male|22.0|    1|    0|   7.25|Southampton|
|       1|     1|female|38.0|    1|    0|71.2833|  Cherbourg|
|       1|     3|female|26.0|    0|    0|  7.925|Southampton|
|       1|     1|female|35.0|    1|    0|   53.1|Southampton|
|       0|     3|  male|35.0|    0|    0|   8.05|Southampton|
|       0|     3|  male|null|    0|    0| 8.4583| Queenstown|
|       0|     1|  male|54.0|    0|    0|51.8625|Southampton|
|       0|     3|  male| 2.0|    3|    1| 21.075|Southampton|
|       1|     3|female|27.0|    0|    2|11.1333|Southampton|
|       1|     2|female|14.0|    1|    0|30.0708|  Cherbourg|
+--------+------+------+----+-----+-----+-------+-----------+
only showing top 10 rows



# Missing Values

In [9]:
from pyspark.sql.functions import col, isnan, when
# Import Spark's aggregate under an alias so Python's builtin sum() is not
# shadowed for the rest of the notebook.
from pyspark.sql.functions import sum as spark_sum

# NaN can only occur in floating-point columns, so isnan() is applied only
# there; other columns are checked for null alone. This avoids relying on an
# implicit string/int -> double cast inside isnan().
float_columns = {name for name, dtype in df.dtypes if dtype in ("float", "double")}


def _is_missing(name):
    """Return a boolean Column that is true where column `name` is null (or NaN for float columns)."""
    missing = col(name).isNull()
    if name in float_columns:
        missing = missing | isnan(col(name))
    return missing


# One-row DataFrame holding the number of missing values per column.
# Keep the DataFrame in `null_counts`; DataFrame.show() returns None, so the
# display call must not be part of the assignment.
null_counts = df.select(
    [spark_sum(when(_is_missing(c), 1).otherwise(0)).alias(c) for c in df.columns]
)
null_counts.show()

+--------+------+---+---+-----+-----+----+-----------+
|survived|pclass|sex|age|sibsp|parch|fare|embark_town|
+--------+------+---+---+-----+-----+----+-----------+
|       0|     0|  0|177|    0|    0|   0|          2|
+--------+------+---+---+-----+-----+----+-----------+



# Feature Selection

In [10]:
# Keep the label (`survived`) together with the candidate predictor columns.
selected_columns = ["survived", "pclass", "sex", "age", "sibsp", "parch", "fare"]
data = df.select(*selected_columns)
data.show()

+--------+------+------+----+-----+-----+-------+
|survived|pclass|   sex| age|sibsp|parch|   fare|
+--------+------+------+----+-----+-----+-------+
|       0|     3|  male|22.0|    1|    0|   7.25|
|       1|     1|female|38.0|    1|    0|71.2833|
|       1|     3|female|26.0|    0|    0|  7.925|
|       1|     1|female|35.0|    1|    0|   53.1|
|       0|     3|  male|35.0|    0|    0|   8.05|
|       0|     3|  male|null|    0|    0| 8.4583|
|       0|     1|  male|54.0|    0|    0|51.8625|
|       0|     3|  male| 2.0|    3|    1| 21.075|
|       1|     3|female|27.0|    0|    2|11.1333|
|       1|     2|female|14.0|    1|    0|30.0708|
|       1|     3|female| 4.0|    1|    1|   16.7|
|       1|     1|female|58.0|    0|    0|  26.55|
|       0|     3|  male|20.0|    0|    0|   8.05|
|       0|     3|  male|39.0|    1|    5| 31.275|
|       0|     3|female|14.0|    0|    0| 7.8542|
|       1|     2|female|55.0|    0|    0|   16.0|
|       0|     3|  male| 2.0|    4|    1| 29.125|


In [17]:
# Null ages will be replaced by the column mean; compute that mean here.
age_mean_row = data.select("age").agg({"age": "mean"}).collect()[0]
mean_age = age_mean_row[0]
mean_age

29.69911764705882

In [20]:
# Replace null values in `age` with the previously computed mean age.
data = data.fillna(value=mean_age, subset=["age"])
data.show(n=10)

+--------+------+------+-----------------+-----+-----+-------+
|survived|pclass|   sex|              age|sibsp|parch|   fare|
+--------+------+------+-----------------+-----+-----+-------+
|       0|     3|  male|             22.0|    1|    0|   7.25|
|       1|     1|female|             38.0|    1|    0|71.2833|
|       1|     3|female|             26.0|    0|    0|  7.925|
|       1|     1|female|             35.0|    1|    0|   53.1|
|       0|     3|  male|             35.0|    0|    0|   8.05|
|       0|     3|  male|29.69911764705882|    0|    0| 8.4583|
|       0|     1|  male|             54.0|    0|    0|51.8625|
|       0|     3|  male|              2.0|    3|    1| 21.075|
|       1|     3|female|             27.0|    0|    2|11.1333|
|       1|     2|female|             14.0|    1|    0|30.0708|
+--------+------+------+-----------------+-----+-----+-------+
only showing top 10 rows



In [32]:
#encoding :: category type -> numeric
#StringIndexer는 레이블의 문자열 컬럼을 레이블 인덱스의 컬럼으로 인코딩

# from pyspark.ml.feature import StringIndexer, VectorAssembler

# indexer = StringIndexer(inputCol = "sex", outputCol = "SexIndex")
# data =indexer.fit(data).transform(data)
data.show(5)


+--------+------+------+----+-----+-----+-------+--------+
|survived|pclass|   sex| age|sibsp|parch|   fare|SexIndex|
+--------+------+------+----+-----+-----+-------+--------+
|       0|     3|  male|22.0|    1|    0|   7.25|     0.0|
|       1|     1|female|38.0|    1|    0|71.2833|     1.0|
|       1|     3|female|26.0|    0|    0|  7.925|     1.0|
|       1|     1|female|35.0|    1|    0|   53.1|     1.0|
|       0|     3|  male|35.0|    0|    0|   8.05|     0.0|
+--------+------+------+----+-----+-----+-------+--------+
only showing top 5 rows



## Feature vector

In [43]:
# Assemble the independent variables (every column used for training except
# the `survived` target) into a single vector column named "features".
# Expects `data` to already contain the numeric SexIndex column.
# The import is local to this cell so the cell does not depend on a
# commented-out import elsewhere in the notebook.
from pyspark.ml.feature import VectorAssembler

assembler = VectorAssembler(
    inputCols=["pclass", "SexIndex", "age", "sibsp", "parch", "fare"],
    outputCol="features",
)
# Drop a stale "features" column before transforming so the cell can be
# re-run; VectorAssembler fails when its output column already exists.
data = assembler.transform(data.drop("features"))
data

DataFrame[survived: int, pclass: int, sex: string, age: double, sibsp: int, parch: int, fare: double, SexIndex: double, features: vector]

In [44]:
#지도학습, 학습 데이터 생성 (분류모델)
# Supervised-learning view of the data: the label plus the feature vector.
training_view = data.select("survived", "features")
training_view.show(5)

+--------+--------------------+
|survived|            features|
+--------+--------------------+
|       0|[3.0,0.0,22.0,1.0...|
|       1|[1.0,1.0,38.0,1.0...|
|       1|[3.0,1.0,26.0,0.0...|
|       1|[1.0,1.0,35.0,1.0...|
|       0|[3.0,0.0,35.0,0.0...|
+--------+--------------------+
only showing top 5 rows



## Create Model

In [42]:
# ML model workflow: train on the data -> evaluate -> finalize the model
# Split the dataset

In [46]:
# 80% of the rows go to training and 20% to testing; the seed is fixed at 42.
splits = data.randomSplit(weights=[0.8, 0.2], seed=42)
train_data, test_data = splits

In [49]:
# Row counts of the (train, test) splits.
tuple(subset.count() for subset in (train_data, test_data))

(746, 145)

In [53]:
from pyspark.ml.classification import LogisticRegression

# Logistic regression reading the "features" vector and predicting "survived".
lr = LogisticRegression(
    labelCol="survived",
    featuresCol="features",
)

In [54]:
# Train the model on the training split.

lr_model = lr.fit(dataset=train_data)

24/12/12 14:14:45 WARN BLAS: Failed to load implementation from: com.github.fommil.netlib.NativeSystemBLAS
24/12/12 14:14:45 WARN BLAS: Failed to load implementation from: com.github.fommil.netlib.NativeRefBLAS


In [56]:
# Test: score the held-out split with the fitted model.

predictions = lr_model.transform(dataset=test_data)
predictions.show(n=5)

+--------+------+------+-----------------+-----+-----+-------+--------+--------------------+--------------------+--------------------+----------+
|survived|pclass|   sex|              age|sibsp|parch|   fare|SexIndex|            features|       rawPrediction|         probability|prediction|
+--------+------+------+-----------------+-----+-----+-------+--------+--------------------+--------------------+--------------------+----------+
|       0|     1|female|             50.0|    0|    0|28.7125|     1.0|[1.0,1.0,50.0,0.0...|[-1.9520246347246...|[0.12433276014445...|       1.0|
|       0|     1|  male|             21.0|    0|    1|77.2875|     0.0|[1.0,0.0,21.0,0.0...|[-0.5063684917057...|[0.37604522093222...|       1.0|
|       0|     1|  male|             24.0|    0|    0|   79.2|     0.0|[1.0,0.0,24.0,0.0...|[-0.5000163743656...|[0.37753682076914...|       1.0|
|       0|     1|  male|             29.0|    0|    0|   30.0|     0.0|[1.0,0.0,29.0,0.0...|[-0.1615623337462...|[0.45969704

In [57]:
# Last five test rows: feature vector, true label and predicted label.
prediction_view = predictions.select("features", "survived", "prediction")
prediction_view.tail(5)

[Row(features=DenseVector([3.0, 0.0, 29.6991, 0.0, 0.0, 56.4958]), survived=1, prediction=0.0),
 Row(features=DenseVector([3.0, 0.0, 29.6991, 2.0, 0.0, 23.25]), survived=1, prediction=0.0),
 Row(features=DenseVector([3.0, 0.0, 31.0, 0.0, 0.0, 7.925]), survived=1, prediction=0.0),
 Row(features=DenseVector([3.0, 0.0, 32.0, 0.0, 0.0, 56.4958]), survived=1, prediction=0.0),
 Row(features=DenseVector([3.0, 0.0, 39.0, 0.0, 0.0, 7.925]), survived=1, prediction=0.0)]

In [60]:
# Count correct answers: first cast the label and the prediction to integers
# so they can be compared directly.

from pyspark.sql.types import IntegerType
from pyspark.sql.functions import expr

predictions = (
    predictions
    .withColumn("survived", col("survived").cast(IntegerType()))
    .withColumn("prediction", col("prediction").cast(IntegerType()))
)

In [64]:
# Flag each row with 1 when the prediction equals the label and 0 otherwise.
is_correct = expr("case when survived = prediction then 1 else 0 end")
compare = predictions.withColumn("correct", is_correct)
compare.where('correct = 0').count()  # number of wrong predictions

28

In [66]:
correct_rows = compare.where('correct = 1')
correct_rows.count()  # number of correct predictions

117

### 정확도 계산 -> 모델의 점수

In [65]:
# Accuracy is the mean of the 0/1 `correct` flag over the test rows.
accuracy_row = compare.selectExpr('avg(correct) as accuracy').collect()[0]
accuracy_row['accuracy']

0.8068965517241379

In [67]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator

In [72]:

# Compute AUC (area under the ROC curve), one of the evaluation metrics for
# classification models.
evaluator = BinaryClassificationEvaluator(
    rawPredictionCol='rawPrediction',
    labelCol="survived",
    metricName="areaUnderROC",
)

In [73]:
# Evaluate the configured metric on the scored test rows.
auc = evaluator.evaluate(dataset=predictions)
auc

0.8664129586260734

In [74]:
# Stop the SparkSession now that the analysis is finished.
spark.stop()