# 데이터프레임

In [1]:
import pyspark
myConf = pyspark.SparkConf()
spark = pyspark.sql.SparkSession\
    .builder\
    .master('local')\
    .appName('myApp')\
    .config(conf = myConf)\
    .getOrCreate()

21/11/22 03:44:43 WARN Utils: Your hostname, Kritiasui-MacBookAir.local resolves to a loopback address: 127.0.0.1; using 172.30.1.23 instead (on interface en0)
21/11/22 03:44:43 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
21/11/22 03:44:44 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
21/11/22 03:44:46 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


# 데이터프레임 생성

In [5]:
myList = [('1', 'kim, js', 170),
          ('1', 'lee, sm', 175),
          ('2', 'lim, yg', 180),
          ('2', 'lee', 170)]
# 자동으로 Schema 설정됨
myDf = spark.createDataFrame(myList)
myDf.printSchema()
print(myDf.take(1))

root
 |-- _1: string (nullable = true)
 |-- _2: string (nullable = true)
 |-- _3: long (nullable = true)

[Row(_1='1', _2='kim, js', _3=170)]


In [6]:
myDf = spark.createDataFrame(myList, ['year', 'name', 'height'])
myDf.printSchema()

root
 |-- year: string (nullable = true)
 |-- name: string (nullable = true)
 |-- height: long (nullable = true)



### Row객체를 사용해서 생성
> Row : 이름이 붙여진 행

In [7]:
from pyspark.sql import Row
Person = Row('year', 'name', 'height')
row1 = Person('1', 'kim, js', 170)

In [10]:
print("row1:", row1.year, row1.name, row1.height)

row1: 1 kim, js 170


In [11]:
row1_dict = row1.asDict()
print(row1_dict.keys())
print(row1_dict.values())

dict_keys(['year', 'name', 'height'])
dict_values(['1', 'kim, js', 170])


- Row객체를 이용해서 데이터를 만들면, 별개의 Schema인수가 필요하지 않다.

In [24]:
myRows = [Person(*x) for x in myList]
myDf = spark.createDataFrame(myRows)
myDf.printSchema()

root
 |-- year: string (nullable = true)
 |-- name: string (nullable = true)
 |-- height: long (nullable = true)



### DataType 정의

In [26]:
from pyspark.sql.types import *
mySchema = StructType([
    StructField('year', StringType(), nullable = True),
    StructField('name', StringType(), nullable = True),
    StructField('height', IntegerType(), nullable = True)
])

In [28]:
myDf = spark.createDataFrame(myRows, mySchema)
myDf.printSchema()

root
 |-- year: string (nullable = true)
 |-- name: string (nullable = true)
 |-- height: integer (nullable = true)



## RDD에서 생성

In [35]:
# toDF를 이용한 생성
myRdd = spark.sparkContext.parallelize(myList)
myDfFromRdd = myRdd.toDF()
myDfFromRdd.printSchema()

root
 |-- _1: string (nullable = true)
 |-- _2: string (nullable = true)
 |-- _3: long (nullable = true)



In [36]:
# createDataFrame을 이용한 생성
myDfFromRdd = spark.createDataFrame(myRdd)
myDfFromRdd.printSchema()

root
 |-- _1: string (nullable = true)
 |-- _2: string (nullable = true)
 |-- _3: long (nullable = true)



### Row를 사용

In [37]:
from pyspark.sql import Row
_myRdd = myRdd.map(lambda x : Row(year = int(x[0]), name = x[1], height = int(x[2])))
_myDf = spark.createDataFrame(_myRdd)
_myDf.printSchema()

root
 |-- year: long (nullable = true)
 |-- name: string (nullable = true)
 |-- height: long (nullable = true)



In [38]:
# Row를 사용해서 RDD생성
r1 = Row(name = 'a', age = 10)
r2 = Row(name = 'b', age = 20)
rRdd = spark.sparkContext.parallelize([r1, r2])
rRdd.collect()

[Row(name='a', age=10), Row(name='b', age=20)]

In [40]:
# Row타입의 RDD를 DF로 생성
rRdd.toDF().printSchema()

root
 |-- name: string (nullable = true)
 |-- age: long (nullable = true)



### Schema를 완전히 정해서 생성

In [44]:
myRdd = spark.sparkContext.parallelize([(1, 'kim', 50.0), (2, 'lee', 60.0), (3, 'park', 70.0)])
schema = StructType([
    StructField('id', IntegerType(), True),
    StructField('name', StringType(), True),
    StructField('height', DoubleType(), True)
])
myDf = spark.createDataFrame(myRdd, schema)
myDf.printSchema()
myDf.show()

root
 |-- id: integer (nullable = true)
 |-- name: string (nullable = true)
 |-- height: double (nullable = true)

+---+----+------+
| id|name|height|
+---+----+------+
|  1| kim|  50.0|
|  2| lee|  60.0|
|  3|park|  70.0|
+---+----+------+



## Pandas에서 생성

In [46]:
myDf.toPandas()

Unnamed: 0,id,name,height
0,1,kim,50.0
1,2,lee,60.0
2,3,park,70.0


보류

## CSV, TSV에서 생성

In [48]:
import os
cfile = os.path.join('data', 'spark_2cols.csv')

RDD 통해서 읽기

In [49]:
lines = spark.sparkContext.textFile(cfile)
_col12 = lines.map(lambda l : l.split(','))
col12 = _col12.map(lambda p : Row(col1 = int(p[0].strip()), col2 = int(p[1].strip())))
_myDf = spark.createDataFrame(col12)
_myDf.printSchema()

root
 |-- col1: long (nullable = true)
 |-- col2: long (nullable = true)



DataFrame 직접 읽기

In [54]:
df = spark.read.format('com.databricks.spark.csv').options(header = 'false', inferschama = 'true', delimiter = ',')\
    .load(cfile)
df.printSchema()

root
 |-- _c0: string (nullable = true)
 |-- _c1: string (nullable = true)



In [55]:
df = spark.read.options(header = 'false', inferschema = 'true', delimiter = ',').csv(cfile)
df.printSchema()

root
 |-- _c0: integer (nullable = true)
 |-- _c1: double (nullable = true)



### tsv파일
- delimiter를 \t으로 설정해서 읽을 수 있다.

## JSON에서 생성

In [57]:
import os
fname = os.path.join('data', 'LandActualPriceInfo.json')
with open(fname, 'rb') as f:
    data = f.readlines()

In [74]:
import json
data_json_str = json.loads(data[0])

In [76]:
data_json_str = b'[' + b','.join(data) + b']'

In [77]:
import pandas as pd
infoPd = pd.read_json(data_json_str)

In [94]:
infoDf = spark.read.json(fname)
infoDf.printSchema()

root
 |-- landActualPriceInfo: struct (nullable = true)
 |    |-- RESULT: struct (nullable = true)
 |    |    |-- CODE: string (nullable = true)
 |    |    |-- MESSAGE: string (nullable = true)
 |    |-- list_total_count: long (nullable = true)
 |    |-- row: array (nullable = true)
 |    |    |-- element: struct (containsNull = true)
 |    |    |    |-- ACC_YEAR: string (nullable = true)
 |    |    |    |-- BJDONG10_CD: string (nullable = true)
 |    |    |    |-- BJDONG_NM: string (nullable = true)
 |    |    |    |-- BLDG_AREA: string (nullable = true)
 |    |    |    |-- BLDG_MUSE_CD: string (nullable = true)
 |    |    |    |-- BLDG_MUSE_NM: string (nullable = true)
 |    |    |    |-- BLDG_NM: string (nullable = true)
 |    |    |    |-- BUILD_YEAR: string (nullable = true)
 |    |    |    |-- DEAL_YMD: string (nullable = true)
 |    |    |    |-- FLR_INFO: string (nullable = true)
 |    |    |    |-- JOB_GBN: string (nullable = true)
 |    |    |    |-- JOB_GBN_NM: string (nulla

In [96]:
infoDf.select('landActualPriceInfo').show()

+------------------------+
|     landActualPriceInfo|
+------------------------+
|{{INFO-000, 정상 처리...|
+------------------------+





# 데이터프레임 조작

### 빈 데이터프레임

In [98]:
from pyspark.sql.types import *

schema = StructType([])
emptyDf = spark.createDataFrame(spark.sparkContext.emptyRDD(), schema)
emptyDf.printSchema()

root



### Range를 이용한 데이터프레임
- 데이터프레임을 만들지 않고 함수를 실행하기에 적절하다

In [99]:
spark.range(0, 10, 2).show()

+---+
| id|
+---+
|  0|
|  2|
|  4|
|  6|
|  8|
+---+



In [102]:
from pyspark.sql import functions as F
spark.range(1).select(F.unix_timestamp().alias('current_timestamp')).show()
# 결과값은 rdd로 변환한 후 indexing을 통해 가져올 수 있다.

+-----------------+
|current_timestamp|
+-----------------+
|       1637540753|
+-----------------+



### 행 추가 및 삭제

#### withColumn & drop

In [117]:
df = spark.read\
    .options(header = 'false', inferschema = 'true', delimiter = '\t')\
    .csv(os.path.join('data', 'spark_heightweight.txt'))
df.printSchema()

root
 |-- _c0: integer (nullable = true)
 |-- _c1: double (nullable = true)
 |-- _c2: double (nullable = true)



In [118]:
df = df.withColumn('id', df['_c0'].cast('integer')).drop('_c0')
df = df.withColumn('height', df['_c1'].cast('double')).drop('_c1')
df = df.withColumn('weight', df['_c2'].cast('double')).drop('_c2')
df.printSchema()

root
 |-- id: integer (nullable = true)
 |-- height: double (nullable = true)
 |-- weight: double (nullable = true)



## UDF : User Defined Functions
- 함수명과 반환값, lambda함수를 가지고 만드는 사용자정의함수
- withColumn등의 과정에서는 함수를 udf를 통해 호출해야 한다.
- 코드가 복잡한 경우에 함수를 분리해서 처리할 수도 있다.

In [122]:
myDf.printSchema()
myDf.show()

root
 |-- id: integer (nullable = true)
 |-- name: string (nullable = true)
 |-- height: double (nullable = true)

+---+----+------+
| id|name|height|
+---+----+------+
|  1| kim|  50.0|
|  2| lee|  60.0|
|  3|park|  70.0|
+---+----+------+



#### Upper UDF

In [123]:
from pyspark.sql.types import *
from pyspark.sql.functions import udf

def uppercase(s):
    return s.upper()

upperUdf = udf(uppercase, StringType())

In [124]:
myDf.withColumn('nameUpper', upperUdf(myDf['name'])).show()

+---+----+------+---------+
| id|name|height|nameUpper|
+---+----+------+---------+
|  1| kim|  50.0|      KIM|
|  2| lee|  60.0|      LEE|
|  3|park|  70.0|     PARK|
+---+----+------+---------+



#### 어디까지 줄일 수 있을까?

In [None]:
myDf.withColumn('nameUpper', udf(lambda x : x.upper(),StringType())(myDf['name'])).show()

+---+----+------+---------+
| id|name|height|nameUpper|
+---+----+------+---------+
|  1| kim|  50.0|      KIM|
|  2| lee|  60.0|      LEE|
|  3|park|  70.0|     PARK|
+---+----+------+---------+



#### 분기형 idf

In [128]:
height_udf = udf(lambda height : 'taller' if height >= 60 else 'shorter', StringType())
heightDf = myDf.withColumn('height > 175', height_udf(myDf['height']))
heightDf.show()

+---+----+------+------------+
| id|name|height|height > 175|
+---+----+------+------------+
|  1| kim|  50.0|     shorter|
|  2| lee|  60.0|      taller|
|  3|park|  70.0|      taller|
+---+----+------+------------+



### 컬럼 명 변경

In [133]:
df.withColumnRenamed('id', 'ID').show(5)

+---+------+------+
| ID|height|weight|
+---+------+------+
|  1| 65.78|112.99|
|  2| 71.52|136.49|
|  3|  69.4|153.03|
|  4| 68.22|142.34|
|  5| 67.79| 144.3|
+---+------+------+
only showing top 5 rows



### aggregate functions

#### dictionary형식
- agg() 함수에 dictionary형식으로 컬럼명: aggregate functions로 적어준다.

In [136]:
df.agg({'height' : 'avg', 'weight' : 'sum'}).show()

+-----------+-----------------+
|sum(weight)|      avg(height)|
+-----------+-----------------+
|     6442.1|68.05240000000002|
+-----------+-----------------+



#### F함수
- pyspark.sql.functions에 내장된 함수

In [138]:
from pyspark.sql import functions as F
df.agg(F.min('height')).show()

+-----------+
|min(height)|
+-----------+
|      63.48|
+-----------+



## 조회 - select, where, groupby

### Select
- 부분 데이터프레임을 복사해 반환한다.

In [143]:
query = ['name', 'id']
withoutHeight = myDf.select('name', 'id') # *query 를 넣어주어도 작동함
withoutHeight.show()

+----+---+
|name| id|
+----+---+
| kim|  1|
| lee|  2|
|park|  3|
+----+---+



#### Column을 List로 변환하는 방법
문제점: Row()로 구성된 컬럼에서 값만 빼기가 불편하다.  

1. select한 결과를 rdd로 만든다.
2. rdd를 x[0]으로 인덱싱하는 map을 거친다. (flatMap을 사용해도 된다.)
3. collect해준다.

In [144]:
myDf.select('name').rdd.map(lambda x : x[0]).collect()

['kim', 'lee', 'park']

#### select like

In [147]:
myDf.select('name', 'height', myDf['name'].like('%lee%')).show()

+----+------+---------------+
|name|height|name LIKE %lee%|
+----+------+---------------+
| kim|  50.0|          false|
| lee|  60.0|           true|
|park|  70.0|          false|
+----+------+---------------+



#### select startswith | endswith

In [151]:
myDf.select('name', 'height', myDf.name.startswith('k')).show()
myDf.select('name', 'height', myDf.name.endswith('k')).show()

+----+------+-------------------+
|name|height|startswith(name, k)|
+----+------+-------------------+
| kim|  50.0|               true|
| lee|  60.0|              false|
|park|  70.0|              false|
+----+------+-------------------+

+----+------+-----------------+
|name|height|endswith(name, k)|
+----+------+-----------------+
| kim|  50.0|            false|
| lee|  60.0|            false|
|park|  70.0|             true|
+----+------+-----------------+



#### alias

In [154]:
# dataframe의 이름도 바꿀 수 있다.
myDf1 = myDf.alias('myDf1')

In [155]:
# columns의 이름을 바꾸는 데에 주로 쓰인다.
myDf1.select(myDf1.name.substr(1, 2).alias('short name')).show()

+----------+
|short name|
+----------+
|        ki|
|        le|
|        pa|
+----------+



#### when - otherwise : if - else문의 DF버전

In [157]:
from pyspark.sql.functions import when
myDf.select('height', when(myDf.height < 60, 1).otherwise(0)).show()

+------+-----------------------------------------+
|height|CASE WHEN (height < 60) THEN 1 ELSE 0 END|
+------+-----------------------------------------+
|  50.0|                                        1|
|  60.0|                                        0|
|  70.0|                                        0|
+------+-----------------------------------------+



#### where | filter

In [160]:
myDf.where(myDf['height'] <= 60).show()

+---+----+------+
| id|name|height|
+---+----+------+
|  1| kim|  50.0|
|  2| lee|  60.0|
+---+----+------+



In [162]:
myDf.filter(myDf['height'] <= 60).show()

+---+----+------+
| id|name|height|
+---+----+------+
|  1| kim|  50.0|
|  2| lee|  60.0|
+---+----+------+



#### regexp_replace : regular expression - 컬럼 내용 변경
- regexp_replace(참조할 컬럼 명, 바뀌어질 대상 문자열, 바뀌는 결과 문자열)

In [163]:
from pyspark.sql.functions import *
_heightDf = myDf.withColumn('nameNew', regexp_replace('name', 'lee', 'lim'))
_heightDf.show()

+---+----+------+-------+
| id|name|height|nameNew|
+---+----+------+-------+
|  1| kim|  50.0|    kim|
|  2| lee|  60.0|    lim|
|  3|park|  70.0|   park|
+---+----+------+-------+



### groupby
- groupby나, groupBy나 같은 함수이다.
- 특정 컬럼을 기준으로 구분지어서 각종 함수를 적용할 수 있다.

In [165]:
myDf.groupby(myDf['height']).max().show()



+------+-------+-----------+
|height|max(id)|max(height)|
+------+-------+-----------+
|  70.0|      3|       70.0|
|  50.0|      1|       50.0|
|  60.0|      2|       60.0|
+------+-------+-----------+





#### agg를 사용하는 groupby
- avg, max, min, sum, count를 지원한다.

In [167]:
myDf.groupby('height').agg({'height' : 'avg'}).show()



+------+-----------+
|height|avg(height)|
+------+-----------+
|  70.0|       70.0|
|  50.0|       50.0|
|  60.0|       60.0|
+------+-----------+





#### pivot을 이용해 2차원 테이블 만들기

In [168]:
myDf.groupBy('height').pivot('name').count().show()



+------+----+----+----+
|height| kim| lee|park|
+------+----+----+----+
|  70.0|null|null|   1|
|  50.0|   1|null|null|
|  60.0|null|   1|null|
+------+----+----+----+



### 행 추가 : union()

In [171]:
toAppendDf = spark.createDataFrame([Row(4, 'keen', 78.0)])
myDf = myDf.union(toAppendDf)
myDf.show()

+---+----+------+
| id|name|height|
+---+----+------+
|  1| kim|  50.0|
|  2| lee|  60.0|
|  3|park|  70.0|
|  4|keen|  78.0|
+---+----+------+



### Partition 조작

In [174]:
# partition 개수
print(myDf.rdd.getNumPartitions())
# partition 개수 조작
_myDf = myDf.repartition(4) # 늘리거나 줄일 때
# partition 개수 조작
_myDf = _myDf.coalesce(2)


2


## 통계 요약

In [175]:
myDf.describe().show()

+-------+------------------+----+------------------+
|summary|                id|name|            height|
+-------+------------------+----+------------------+
|  count|                 4|   4|                 4|
|   mean|               2.5|null|              64.5|
| stddev|1.2909944487358056|null|12.151817422372122|
|    min|                 1|keen|              50.0|
|    max|                 4|park|              78.0|
+-------+------------------+----+------------------+



## 결측값

### df.na.fill()
> 모든 컬럼의 na값을 0으로 교체

In [179]:
pivotDf = myDf.groupBy('height').pivot('name').count()
pivotDf.where(F.col('lee').isNull()).show()



+------+----+----+----+----+
|height|keen| kim| lee|park|
+------+----+----+----+----+
|  70.0|null|null|null|   1|
|  50.0|null|   1|null|null|
|  78.0|   1|null|null|null|
+------+----+----+----+----+



# Spark SQL

In [185]:
import os
import json

fname = os.path.join('data', 'LandActualPriceInfo.json')
with open(fname, 'rb') as file:
    data = file.readlines()
data = json.loads(data[0])['landActualPriceInfo']['row']
myRdd = spark.sparkContext.parallelize(data)
myRowRdd = myRdd.map(lambda x: Row(gu = x['SGG_NM'], area = float(x['BLDG_AREA']), cost = int(x['OBJ_AMT']), kind = x['BLDG_MUSE_NM']))
df = spark.createDataFrame(myRowRdd)
df.printSchema()

root
 |-- gu: string (nullable = true)
 |-- area: double (nullable = true)
 |-- cost: long (nullable = true)
 |-- kind: string (nullable = true)



21/11/22 12:49:40 WARN TaskSetManager: Stage 196 contains a task of very large size (1264 KiB). The maximum recommended task size is 1000 KiB.


In [186]:
from pyspark.sql.functions import isnan, when, count, col

cols = df.columns
df.select([count(when(isnan(c) | col(c).isNull(), c)).alias(c) for c in cols]).show()

21/11/22 12:51:57 WARN TaskSetManager: Stage 197 contains a task of very large size (1264 KiB). The maximum recommended task size is 1000 KiB.


+---+----+----+----+
| gu|area|cost|kind|
+---+----+----+----+
|  0|   0|   0|   0|
+---+----+----+----+



### sql string
- sql을 문자열로 적으면 알아서 해준다.

In [196]:
df.createOrReplaceTempView('my')
spark.sql('select gu, area, cost from my').show(1)

21/11/22 14:57:46 WARN TaskSetManager: Stage 203 contains a task of very large size (1264 KiB). The maximum recommended task size is 1000 KiB.


+------+-----+---------+
|    gu| area|     cost|
+------+-----+---------+
|송파구|23.88|340000000|
+------+-----+---------+
only showing top 1 row



#### catalog로 view목록 확인

In [197]:
spark.catalog.listTables()

[Table(name='my', database=None, description=None, tableType='TEMPORARY', isTemporary=True)]

In [198]:
for e in df.rdd.map(lambda x: '구: ' + x[0]).take(5):
    print(e)

구: 송파구
구: 중구
구: 동작구
구: 성북구
구: 송파구


21/11/22 15:43:41 WARN TaskSetManager: Stage 205 contains a task of very large size (1264 KiB). The maximum recommended task size is 1000 KiB.


#### truncate
- 출력할 때 긴 문자열을 잘라 출력할지 결정하는 인자이다.

In [202]:
bucketDf = spark.createDataFrame([[1, ['orange', 'apple', 'pineapple']],
                                   [2, ['watermelon', 'apple', 'bananas']]],
                                 ['bucketId', 'items'])

In [204]:
bucketDf.show(3, truncate = False)
bucketDf.show(3, truncate = True)

+--------+----------------------------+
|bucketId|items                       |
+--------+----------------------------+
|1       |[orange, apple, pineapple]  |
|2       |[watermelon, apple, bananas]|
+--------+----------------------------+

+--------+--------------------+
|bucketId|               items|
+--------+--------------------+
|       1|[orange, apple, p...|
|       2|[watermelon, appl...|
+--------+--------------------+



#### explode
- 컬럼에 List나 배열이 포함된 경우 flat해서 새로운 컬럼을 생성한다.

In [206]:
from pyspark.sql.functions import explode
bDf = bucketDf.select(bucketDf.bucketId, explode(bucketDf.items).alias('item'))
bDf.show()

+--------+----------+
|bucketId|      item|
+--------+----------+
|       1|    orange|
|       1|     apple|
|       1| pineapple|
|       2|watermelon|
|       2|     apple|
|       2|   bananas|
+--------+----------+



#### join
- inner기준으로, item이 중복되는 것만 병합한다.

In [208]:
fDf = spark.createDataFrame([['orange', 'F1'],
                             ['', 'F2'],
                             ['pineapple', 'F3'],
                             ['watermelon', 'F4'],
                             ['bananas', 'F5']], 
                            ['item', 'itemId'])
fDf.show()

+----------+------+
|      item|itemId|
+----------+------+
|    orange|    F1|
|          |    F2|
| pineapple|    F3|
|watermelon|    F4|
|   bananas|    F5|
+----------+------+



In [210]:
joinDf = fDf.join(bDf, fDf.item == bDf.item, 'inner') # 합칠 대상, 조건, 속성
joinDf.show()



+----------+------+--------+----------+
|      item|itemId|bucketId|      item|
+----------+------+--------+----------+
|   bananas|    F5|       2|   bananas|
|    orange|    F1|       1|    orange|
| pineapple|    F3|       1| pineapple|
|watermelon|    F4|       2|watermelon|
+----------+------+--------+----------+





### 형변환

#### DateType 형변환

In [187]:
from datetime import datetime
print(datetime.strptime('11/25/1991', '%m/%d/%Y'))

1991-11-25 00:00:00


In [189]:
from pyspark.sql.functions import to_date
df = df.withColumn('date', to_date(df['myDate'], 'yyyy-MM-dd'))

#### 범용적 해결책: cast()

In [194]:
newDf = df.withColumn('areaInt', df['area'].cast(IntegerType()))
newDf.show(5)

+------+-----+---------+--------+-------+
|    gu| area|     cost|    kind|areaInt|
+------+-----+---------+--------+-------+
|송파구|23.88|340000000|  아파트|     23|
|  중구|46.74|420000000|오피스텔|     46|
|동작구| 68.1|279000000|연립주택|     68|
|성북구|29.96|330000000|연립주택|     29|
|송파구|36.35|154000000|연립주택|     36|
+------+-----+---------+--------+-------+
only showing top 5 rows



21/11/22 14:54:28 WARN TaskSetManager: Stage 201 contains a task of very large size (1264 KiB). The maximum recommended task size is 1000 KiB.


In [195]:
newDf = df.withColumn('areaInt', df['area'].cast('integer'))
newDf.show(5)

+------+-----+---------+--------+-------+
|    gu| area|     cost|    kind|areaInt|
+------+-----+---------+--------+-------+
|송파구|23.88|340000000|  아파트|     23|
|  중구|46.74|420000000|오피스텔|     46|
|동작구| 68.1|279000000|연립주택|     68|
|성북구|29.96|330000000|연립주택|     29|
|송파구|36.35|154000000|연립주택|     36|
+------+-----+---------+--------+-------+
only showing top 5 rows



21/11/22 14:55:06 WARN TaskSetManager: Stage 202 contains a task of very large size (1264 KiB). The maximum recommended task size is 1000 KiB.
