# 문제: zscore, cdf 계산

## pyspark 기본 셋팅

In [1]:
import os
import pyspark

myConf=pyspark.SparkConf()
spark = pyspark.sql.SparkSession\
    .builder\
    .master("local")\
    .appName("myApp")\
    .config(conf=myConf)\
    .getOrCreate()

### 1-1: 성적데이터로 DataFrame을 생성.

In [3]:
from pyspark.sql.types import StructField, StructType, StringType, FloatType
import numpy as np
import pandas as pd

marks = [
    ('김하나','English', 100.0),
    ('김하나','Math', 80.0),
    ('임하나','English', 70.0),
    ('임하나','Math', 100.0),
    ('김갑돌','English', 82.3),
    ('김갑돌','Math', 98.5)
]

marksRdd = spark.sparkContext.parallelize(marks)

marksSchema = StructType([
    StructField("NAME", StringType(), True),
    StructField("SUBJECT", StringType(), True),
    StructField("MARKS", FloatType(), True)
])

marksDf = spark.createDataFrame(marksRdd, marksSchema)

marksDf.printSchema()

marksDf=marksDf.toPandas()
marksDf

root
 |-- NAME: string (nullable = true)
 |-- SUBJECT: string (nullable = true)
 |-- MARKS: float (nullable = true)



Unnamed: 0,NAME,SUBJECT,MARKS
0,김하나,English,100.0
1,김하나,Math,80.0
2,임하나,English,70.0
3,임하나,Math,100.0
4,김갑돌,English,82.300003
5,김갑돌,Math,98.5


### 1-2 zscore 컬럼을 생성.

In [4]:
from scipy import stats

In [5]:
X=marksDf['MARKS']
zs=stats.zscore(X)
marksDf['zscore']=zs
marksDf

Unnamed: 0,NAME,SUBJECT,MARKS,zscore
0,김하나,English,100.0,0.988108
1,김하나,Math,80.0,-0.725374
2,임하나,English,70.0,-1.582115
3,임하나,Math,100.0,0.988108
4,김갑돌,English,82.300003,-0.528323
5,김갑돌,Math,98.5,0.859597


### 1-3 cdf 컬럼을 생성.

In [6]:
from scipy.stats import norm

marksDf['cdf']=norm.cdf(0, loc=0, scale=1)
marksDf

Unnamed: 0,NAME,SUBJECT,MARKS,zscore,cdf
0,김하나,English,100.0,0.988108,0.5
1,김하나,Math,80.0,-0.725374,0.5
2,임하나,English,70.0,-1.582115,0.5
3,임하나,Math,100.0,0.988108,0.5
4,김갑돌,English,82.300003,-0.528323,0.5
5,김갑돌,Math,98.5,0.859597,0.5
