In [1]:
import pyspark

# Start from an empty SparkConf; no settings are overridden in this cell.
myConf = pyspark.SparkConf()

# Build the session for this notebook, or reuse one that is already running
# (getOrCreate), using the 'local' master and the application name 'myApp'.
spark = (
    pyspark.sql.SparkSession.builder
    .master('local')
    .appName('myApp')
    .config(conf=myConf)
    .getOrCreate()
)

In [2]:
# Raw score records. Each entry is a single "name, subject, score" string
# whose fields are separated by a comma followed by one space; the score is
# written either as an integer or as a decimal.
marks = [
    "김하나, English, 100",
    "김하나, Math, 80",
    "임하나, English, 70",
    "임하나, Math, 100",
    "김갑돌, English, 82.3",
    "김갑돌, Math, 98.5",
]

### 1-1 성적데이터로 DataFrame을 생성

In [3]:
# Turn the list of raw score strings into an RDD (one string per element).
_rdd = spark.sparkContext.parallelize(marks)

In [4]:
def _parse_mark(line):
    """Split one 'name, subject, score' record and cast the score to float."""
    fields = line.split(', ')
    return (fields[0], fields[1], float(fields[2]))


# Parse every record on the RDD, then pull the parsed rows back to the driver.
# NOTE: because of collect(), `myrdd` is a plain Python list of tuples, not an
# RDD, despite its name.
myrdd = _rdd.map(_parse_mark).collect()

In [5]:
from pyspark.sql.types import StructType, StructField
from pyspark.sql.types import StringType, FloatType

# Explicit schema for the parsed score rows: two string columns and one
# numeric column, all nullable.
# NOTE: FloatType is single precision, which is why aggregates computed from
# `score` later show values such as 88.46666717529297 instead of 88.4666...67.
_schema = (
    StructType()
    .add("name", StringType(), True)
    .add("subject", StringType(), True)
    .add("score", FloatType(), True)
)

In [6]:
# Build the DataFrame from the parsed (name, subject, score) rows, applying
# the explicit schema instead of letting Spark infer the column types.
myDf = spark.createDataFrame(myrdd, schema=_schema)

In [7]:
# Preview the DataFrame to confirm the name/subject/score columns were parsed.
myDf.show()

+------+-------+-----+
|  name|subject|score|
+------+-------+-----+
|김하나|English|100.0|
|김하나|   Math| 80.0|
|임하나|English| 70.0|
|임하나|   Math|100.0|
|김갑돌|English| 82.3|
|김갑돌|   Math| 98.5|
+------+-------+-----+



### 1-2 zscore 컬럼을 생성

In [8]:
from pyspark.sql.functions import mean
from pyspark.sql.functions import stddev

# One-row summary of the whole `score` column: its mean ('avg') and its
# standard deviation ('std'). These two numbers are what the z-score needs.
zsDf = myDf.agg(
    mean('score').alias('avg'),
    stddev('score').alias('std'),
)
zsDf.show()

+-----------------+------------------+
|              avg|               std|
+-----------------+------------------+
|88.46666717529297|12.786190172956093|
+-----------------+------------------+



In [9]:
# Bring the summary row(s) to the driver as plain Python lists so the values
# can be used as constants in column expressions: [[avg, std]].
avg_std = [[row['avg'], row['std']] for row in zsDf.collect()]
print(avg_std)

[[88.46666717529297, 12.786190172956093]]


In [10]:
# Standardise every score: zscore = (score - mean) / standard deviation,
# using the driver-side constants computed from the whole column.
score_mean, score_std = avg_std[0]
myDf = myDf.withColumn('zscore', (myDf.score - score_mean) / score_std)
myDf.show()

+------+-------+-----+-------------------+
|  name|subject|score|             zscore|
+------+-------+-----+-------------------+
|김하나|English|100.0|  0.902014804151829|
|김하나|   Math| 80.0| -0.662172786480269|
|임하나|English| 70.0| -1.444266581796318|
|임하나|   Math|100.0|  0.902014804151829|
|김갑돌|English| 82.3|-0.4822909748814927|
|김갑돌|   Math| 98.5| 0.7847007348544217|
+------+-------+-----+-------------------+



### 1-3 cdf 컬럼을 생성

In [11]:
from scipy.stats import norm
from pyspark.sql.functions import udf

from pyspark.sql.types import DoubleType


def _normal_cdf(zscore):
    """Return the standard normal CDF at ``zscore`` as a built-in float.

    A null input is passed through as None instead of raising inside scipy.
    """
    if zscore is None:
        return None
    # norm.cdf() returns a numpy.float64; Spark needs a built-in Python float
    # here, so the value is converted explicitly before it is returned.
    return float(norm.cdf(zscore))


# Declare the return type explicitly: without it udf() defaults to StringType
# and the resulting column would hold strings rather than numbers.
cdfudf = udf(_normal_cdf, DoubleType())

In [12]:
# Add the cumulative probability of each z-score under the standard normal
# distribution as a new 'cdf' column.
myDf = myDf.withColumn('cdf', cdfudf(myDf.zscore))
myDf.show()

+------+-------+-----+-------------------+-------------------+
|  name|subject|score|             zscore|                cdf|
+------+-------+-----+-------------------+-------------------+
|김하나|English|100.0|  0.902014804151829| 0.8164754981807292|
|김하나|   Math| 80.0| -0.662172786480269| 0.2539302463290559|
|임하나|English| 70.0| -1.444266581796318| 0.0743320011235712|
|임하나|   Math|100.0|  0.902014804151829| 0.8164754981807292|
|김갑돌|English| 82.3|-0.4822909748814927|0.31479962882028223|
|김갑돌|   Math| 98.5| 0.7847007348544217| 0.7836854740814176|
+------+-------+-----+-------------------+-------------------+

