In [1]:
import numpy as np
import pandas as pd
import os
import pyspark
from pyspark.ml import Pipeline
from pyspark.ml.feature import VectorAssembler, StringIndexer
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.sql.functions import lit,col
from pyspark.sql import functions as F
from pyspark.sql import SparkSession

sc = pyspark.SparkContext.getOrCreate()
spark = SparkSession.builder.getOrCreate()

In [2]:
## 데이터프레임으로 테이블 생성 ##

trainDF = spark.read.format('csv')\
.option('header','true')\
.option('inferSchema','true')\
.load('C:/Users/Cloudy/spark_code/Spark-The-Definitive-Guide-master/data/final-data/train.csv')

testDF = spark.read.format('csv')\
.option('header','true')\
.option('inferSchema','true')\
.load('C:/Users/Cloudy/spark_code/Spark-The-Definitive-Guide-master/data/final-data/test.csv')

In [3]:
#1. 데이터의 결측치는 0으로 채우기

trainDF = trainDF.na.fill(0)
testDF = testDF.na.fill(0)

train = trainDF.withColumn('testOrtrain',lit('train'))
test  = testDF.withColumn('testOrtrain',lit('test'))

In [4]:
## 데이터 입력을 위한 벡터화

df = train.union(test)
va = VectorAssembler(inputCols = ['Pclass','Age','SibSp','Parch','Fare'], outputCol='features')
pipeline = Pipeline(stages=[va])
model = pipeline.fit(df)
myDF = model.transform(df)
myDF.show(5)

+-----------+--------+------+--------------------+------+----+-----+-----+----------------+-------+-----+--------+-----------+--------------------+
|PassengerId|Survived|Pclass|                Name|   Sex| Age|SibSp|Parch|          Ticket|   Fare|Cabin|Embarked|testOrtrain|            features|
+-----------+--------+------+--------------------+------+----+-----+-----+----------------+-------+-----+--------+-----------+--------------------+
|          1|       0|     3|Braund, Mr. Owen ...|  male|22.0|    1|    0|       A/5 21171|   7.25| null|       S|      train|[3.0,22.0,1.0,0.0...|
|          2|       1|     1|Cumings, Mrs. Joh...|female|38.0|    1|    0|        PC 17599|71.2833|  C85|       C|      train|[1.0,38.0,1.0,0.0...|
|          3|       1|     3|Heikkinen, Miss. ...|female|26.0|    0|    0|STON/O2. 3101282|  7.925| null|       S|      train|[3.0,26.0,0.0,0.0...|
|          4|       1|     1|Futrelle, Mrs. Ja...|female|35.0|    1|    0|          113803|   53.1| C123|       

In [5]:
## 데이터 검증을 위한 데이터 구분

train = myDF.filter(myDF['testOrtrain']=='train')
trainDF, validateDF = train.randomSplit([0.7,0.3], seed = 11)
testDF = myDF.filter(myDF['testOrtrain']=='test')

In [6]:
#2. Train data 중 datatype이 string인 칼럼들 제거

train_ = trainDF

for i in range(len(trainDF.dtypes)):
    if trainDF.dtypes[i][1] == 'string':
        train_ = train_.drop(trainDF.dtypes[i][0])
        
trainDF = train_
trainDF.dtypes 

[('PassengerId', 'int'),
 ('Survived', 'int'),
 ('Pclass', 'int'),
 ('Age', 'double'),
 ('SibSp', 'int'),
 ('Parch', 'int'),
 ('Fare', 'double'),
 ('features', 'vector')]

In [7]:
#3. Test data 중 성별 칼럼을 제외한 datatype이 string인 칼럼들 제거

test_ = testDF

for i in range(len(testDF.dtypes)):
    if (testDF.dtypes[i][1] == 'string') and (testDF.dtypes[i][0]!='Sex'):
        test_ = test_.drop(test.dtypes[i][0])

testDF = test_
testDF.dtypes  

[('PassengerId', 'int'),
 ('Survived', 'int'),
 ('Pclass', 'int'),
 ('Sex', 'string'),
 ('Age', 'double'),
 ('SibSp', 'int'),
 ('Parch', 'int'),
 ('Fare', 'double'),
 ('features', 'vector')]

In [8]:
#4. 학습에는 Pclass, Age, SibSp, Parch, Fare 칼럼만 사용('features')

## LogisticRegression 모델 생성
lr = (LogisticRegression().
      setLabelCol('Survived').
      setFeaturesCol('features').
      setRegParam(0.0).
      setMaxIter(100).
      setElasticNetParam(0.))

lrModel = lr.fit(trainDF)

In [9]:
#5. 테스트 데이터 중 여성 승객의 데이터만 테스트

femailDF = testDF.filter(testDF.Sex == 'female')
femailDF.show()

## 모델예측 & 모델평가
lrDF = lrModel.transform(femailDF)
lrDF.groupBy('prediction','Survived').count().show()
evaluator = BinaryClassificationEvaluator(rawPredictionCol = 'prediction', labelCol = 'Survived')
print(evaluator.evaluate(lrDF)*100,'%')

+-----------+--------+------+------+----+-----+-----+--------+--------------------+
|PassengerId|Survived|Pclass|   Sex| Age|SibSp|Parch|    Fare|            features|
+-----------+--------+------+------+----+-----+-----+--------+--------------------+
|        698|       1|     3|female| 0.0|    0|    0|  7.7333|(5,[0,4],[3.0,7.7...|
|        701|       1|     1|female|18.0|    1|    0| 227.525|[1.0,18.0,1.0,0.0...|
|        703|       0|     3|female|18.0|    0|    1| 14.4542|[3.0,18.0,0.0,1.0...|
|        707|       1|     2|female|45.0|    0|    0|    13.5|[2.0,45.0,0.0,0.0...|
|        709|       1|     1|female|22.0|    0|    0|  151.55|[1.0,22.0,0.0,0.0...|
|        711|       1|     1|female|24.0|    0|    0| 49.5042|[1.0,24.0,0.0,0.0...|
|        717|       1|     1|female|38.0|    0|    0| 227.525|[1.0,38.0,0.0,0.0...|
|        718|       1|     2|female|27.0|    0|    0|    10.5|[2.0,27.0,0.0,0.0...|
|        721|       1|     2|female| 6.0|    0|    1|    33.0|[2.0,6.0,0.0,1

In [10]:
#1) train 데이터와 test 데이터 개수

df.groupBy('testOrtrain').count().show()

+-----------+-----+
|testOrtrain|count|
+-----------+-----+
|      train|  697|
|       test|  192|
+-----------+-----+



In [11]:
# 2) 학습에 사용된 train 데이터 테이블(모든 칼럼 5개까지 표시)

trainDF.show(5)

+-----------+--------+------+----+-----+-----+-------+--------------------+
|PassengerId|Survived|Pclass| Age|SibSp|Parch|   Fare|            features|
+-----------+--------+------+----+-----+-----+-------+--------------------+
|          1|       0|     3|22.0|    1|    0|   7.25|[3.0,22.0,1.0,0.0...|
|          2|       1|     1|38.0|    1|    0|71.2833|[1.0,38.0,1.0,0.0...|
|          3|       1|     3|26.0|    0|    0|  7.925|[3.0,26.0,0.0,0.0...|
|          4|       1|     1|35.0|    1|    0|   53.1|[1.0,35.0,1.0,0.0...|
|          5|       0|     3|35.0|    0|    0|   8.05|[3.0,35.0,0.0,0.0...|
+-----------+--------+------+----+-----+-----+-------+--------------------+
only showing top 5 rows



In [12]:
# 3) 학습에 사용된 test 데이터 테이블(모든 칼럼 5개까지 표시)

testDF.show(5)

+-----------+--------+------+------+----+-----+-----+--------+--------------------+
|PassengerId|Survived|Pclass|   Sex| Age|SibSp|Parch|    Fare|            features|
+-----------+--------+------+------+----+-----+-----+--------+--------------------+
|        698|       1|     3|female| 0.0|    0|    0|  7.7333|(5,[0,4],[3.0,7.7...|
|        699|       0|     1|  male|49.0|    1|    1|110.8833|[1.0,49.0,1.0,1.0...|
|        700|       0|     3|  male|42.0|    0|    0|    7.65|[3.0,42.0,0.0,0.0...|
|        701|       1|     1|female|18.0|    1|    0| 227.525|[1.0,18.0,1.0,0.0...|
|        702|       1|     1|  male|35.0|    0|    0| 26.2875|[1.0,35.0,0.0,0.0...|
+-----------+--------+------+------+----+-----+-----+--------+--------------------+
only showing top 5 rows



In [13]:
# 4) Lab8에서 다룬 Logistic regression 모델을 사용한 예측 모델의 정확도 결과

## 모델 예측
lrDF = lrModel.transform(validateDF)
lrDF.groupBy('prediction','Survived').count().show()

## 모델 평가
evaluator = BinaryClassificationEvaluator(rawPredictionCol = 'prediction', labelCol = 'Survived')
print(evaluator.evaluate(lrDF)*100,'%')

+----------+--------+-----+
|prediction|Survived|count|
+----------+--------+-----+
|       1.0|       0|   20|
|       0.0|       0|  104|
|       0.0|       1|   41|
|       1.0|       1|   37|
+----------+--------+-----+

65.65343258891647 %
