In [1]:
from pyspark.context import SparkContext
from pyspark.sql.session import SparkSession
# Build (or reuse) the local Spark session.
# getOrCreate() keeps this cell safe to re-run: constructing SparkContext('local')
# a second time in the same kernel raises ValueError because only one
# SparkContext may be active at a time.
spark = SparkSession.builder.master('local').getOrCreate()
# Keep the `sc` name available for any code that uses the context directly.
sc = spark.sparkContext

In [2]:
# Raw grade records: each entry is a single "name, subject, score" string.
# Two entries (English, Math) per student.
marks = [
    "김하나, English, 100", "김하나, Math, 80",
    "임하나, English, 70", "임하나, Math, 100",
    "김갑돌, English, 82.3", "김갑돌, Math, 98.5",
]

## 1-1 성적데이터로 DataFrame을 생성.

In [5]:
# Distribute the raw strings and split each one on commas into
# [name, subject, score]. Only the comma is removed, so the second and third
# fields keep the space that followed it (e.g. ' English', ' 100').
_marksRdd = (
    spark.sparkContext
    .parallelize(marks)
    .map(lambda record: record.split(','))
)

In [6]:
# Preview the first three parsed records to check the result of the split.
_marksRdd.take(3)

[['김하나', ' English', ' 100'],
 ['김하나', ' Math', ' 80'],
 ['임하나', ' English', ' 70']]

In [9]:
# Turn the parsed records into a DataFrame, naming the three fields.
# Only column names are supplied here, no column types.
_marksDf = spark.createDataFrame(
    _marksRdd,
    schema=["name", "subject", "score"],
)

In [10]:
# Print the schema to see which type each column received.
_marksDf.printSchema()

root
 |-- name: string (nullable = true)
 |-- subject: string (nullable = true)
 |-- score: string (nullable = true)



In [17]:
from pyspark.sql.types import FloatType

# Add a numeric copy of the string `score` column as `scoref`.
# FloatType is single precision, which is why 82.3 shows up as
# 82.30000305175781 once the values are collected into Python.
_marksDf = _marksDf.withColumn(
    'scoref',
    _marksDf.score.cast(FloatType()),
)

In [18]:
# Display the rows, including the newly added scoref column.
_marksDf.show()

+------+--------+-----+------+
|  name| subject|score|scoref|
+------+--------+-----+------+
|김하나| English|  100| 100.0|
|김하나|    Math|   80|  80.0|
|임하나| English|   70|  70.0|
|임하나|    Math|  100| 100.0|
|김갑돌| English| 82.3|  82.3|
|김갑돌|    Math| 98.5|  98.5|
+------+--------+-----+------+



In [19]:
# Print the schema again to confirm the type of the scoref column.
_marksDf.printSchema()

root
 |-- name: string (nullable = true)
 |-- subject: string (nullable = true)
 |-- score: string (nullable = true)
 |-- scoref: float (nullable = true)



## 1-2 zscore 컬럼을 생성.

In [27]:
# Bring the numeric scores to the driver as a plain Python list.
# The column is selected by name instead of by position (index 3), so the
# result does not silently change if columns are added or reordered.
_s = [row['scoref'] for row in _marksDf.select('scoref').collect()]

In [28]:
# Echo the collected score list.
_s

[100.0, 80.0, 70.0, 100.0, 82.30000305175781, 98.5]

In [29]:
import numpy as np

# Summary statistics of the collected scores.
xbar = np.mean(_s)
# Population standard deviation (divide by n); ddof=0 is NumPy's default.
sigmax = np.std(_s, ddof=0)
# Sample standard deviation (divide by n-1); ddof=1 must be requested explicitly.
sx = np.std(_s, ddof=1)

In [36]:
# Standardize each score: (score - mean) / sample standard deviation.
# The result is assigned back to _marksDf (rather than to a separate _zscore
# variable) so the zscore column stays on the main DataFrame.
_marksDf = _marksDf.withColumn(
    'zscore',
    (_marksDf['scoref'] - xbar) / sx,
)

In [37]:
# _zscore.show() # that variant would create a separate _zscore DataFrame instead of updating _marksDf
# Display the DataFrame, which now carries the zscore column.
_marksDf.show()

+------+--------+-----+------+-------------------+
|  name| subject|score|scoref|             zscore|
+------+--------+-----+------+-------------------+
|김하나| English|  100| 100.0|  0.902014804151829|
|김하나|    Math|   80|  80.0| -0.662172786480269|
|임하나| English|   70|  70.0| -1.444266581796318|
|임하나|    Math|  100| 100.0|  0.902014804151829|
|김갑돌| English| 82.3|  82.3|-0.4822909748814927|
|김갑돌|    Math| 98.5|  98.5| 0.7847007348544217|
+------+--------+-----+------+-------------------+



## 1-3 cdf 컬럼을 생성

In [38]:
from scipy.stats import norm

# Sanity check of the function used below: the standard normal CDF
# (mean 0, standard deviation 1) evaluated at 0.
norm.cdf(0.0, loc=0.0, scale=1.0)

0.5

In [40]:
from pyspark.sql.functions import udf
from pyspark.sql.types import DoubleType

# Wrap scipy's standard-normal CDF as a Spark UDF.
# The return type is declared explicitly: when omitted, udf() defaults to
# StringType, so the resulting column would hold text instead of numbers and
# could not be used directly in numeric expressions.
# Null inputs are passed through as null instead of being handed to norm.cdf.
contocdf = udf(
    lambda z: None if z is None else float(norm.cdf(z)),
    DoubleType(),
)

In [42]:
# Apply the CDF UDF to every z-score and store the result in a new `cdf` column.
_marksDf = _marksDf.withColumn(
    "cdf",
    contocdf(_marksDf.zscore),
)

In [43]:
# Display the DataFrame with the added cdf column.
_marksDf.show()

+------+--------+-----+------+-------------------+-------------------+
|  name| subject|score|scoref|             zscore|                cdf|
+------+--------+-----+------+-------------------+-------------------+
|김하나| English|  100| 100.0|  0.902014804151829| 0.8164754981807292|
|김하나|    Math|   80|  80.0| -0.662172786480269| 0.2539302463290559|
|임하나| English|   70|  70.0| -1.444266581796318| 0.0743320011235712|
|임하나|    Math|  100| 100.0|  0.902014804151829| 0.8164754981807292|
|김갑돌| English| 82.3|  82.3|-0.4822909748814927|0.31479962882028223|
|김갑돌|    Math| 98.5|  98.5| 0.7847007348544217| 0.7836854740814176|
+------+--------+-----+------+-------------------+-------------------+

