In [1]:
import os
import requests
import pyspark

In [2]:
# Create (or reuse) a SparkSession that runs on the 'local' master under the
# application name 'myApp', configured from a default SparkConf.
spark = (
    pyspark.sql.SparkSession.builder
    .master('local')
    .appName('myApp')
    .config(conf=pyspark.SparkConf())
    .getOrCreate()
)

21/10/11 04:46:11 WARN Utils: Your hostname, Kritiasui-MacBookAir.local resolves to a loopback address: 127.0.0.1; using 172.30.1.24 instead (on interface en0)
21/10/11 04:46:11 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
21/10/11 04:46:12 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
21/10/11 04:46:13 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


In [3]:
# Issue a SQL SET statement that switches spark.sql.legacy.timeParserPolicy to
# LEGACY for this session; the statement returns a key/value DataFrame.
# NOTE(review): no cell visible here parses dates with a pattern -- confirm this
# setting is still needed.
spark.sql('set spark.sql.legacy.timeParserPolicy=LEGACY')

DataFrame[key: string, value: string]

## Setting

In [6]:
from pyspark.sql.types import StructType

# A DataFrame with no columns and no rows: an empty StructType laid over an
# empty RDD. printSchema() therefore prints only the bare "root" line.
schema = StructType([])
emptyDf = spark.createDataFrame(
    data=spark.sparkContext.emptyRDD(),
    schema=schema,
)
emptyDf.printSchema()

root



## range : 일련의 수를 가진 DF 생성

- 데이터프레임을 만들지 않고 함수를 실행해보기에 유용하다.

In [7]:
# Single-column ('id') DataFrame holding 0, 2, 4, 6, 8 (end is exclusive).
spark.range(start=0, end=10, step=2).show()



+---+
| id|
+---+
|  0|
|  2|
|  4|
|  6|
|  8|
+---+





In [8]:
from pyspark.sql import functions as F

# spark.range(1) yields a one-row frame, which is enough to evaluate a column
# expression such as current_date() without loading any data.
(
    spark.range(1)
    .select(F.current_date())
    .show()
)

+--------------+
|current_date()|
+--------------+
|    2021-10-11|
+--------------+



In [14]:
# Evaluate unix_timestamp() on a one-row frame and show it under the column
# name 'current_timestamp'.
(
    spark.range(1)
    .select(F.unix_timestamp().alias('current_timestamp'))
    .show()
)

+-----------------+
|current_timestamp|
+-----------------+
|       1633900290|
+-----------------+



In [17]:
# Pull the timestamp out as a plain Python value. DataFrame.first() returns the
# first Row directly, so there is no need to convert to an RDD and collect every
# row just to index element 0.
(
    spark.range(1)
    .select(F.unix_timestamp().alias('current_timestamp'))
    .first()['current_timestamp']
)

1633900349

## withColumn, drop

In [18]:
# Read the tab-delimited, header-less height/weight file and let Spark infer
# the column types. Without a header the columns get positional names.
tDf = (
    spark.read
    .options(header='false', inferschema='true', delimiter='\t')
    .csv(os.path.join('data', 'ds_spark_heightweight.txt'))
)
tDf.columns

['_c0', '_c1', '_c2']

In [20]:
# Add explicitly typed, meaningfully named copies of the positional columns.
# The original _c0/_c1/_c2 columns are kept here.
tDf = (
    tDf
    .withColumn('id', tDf['_c0'].cast('integer'))
    .withColumn('height', tDf['_c1'].cast('double'))
    .withColumn('weight', tDf['_c2'].cast('double'))
)
tDf.printSchema()

root
 |-- _c0: integer (nullable = true)
 |-- _c1: double (nullable = true)
 |-- _c2: double (nullable = true)
 |-- id: integer (nullable = true)
 |-- height: double (nullable = true)
 |-- weight: double (nullable = true)



In [22]:
# Remove the three positional columns in one call; only id/height/weight remain.
tDf = tDf.drop('_c0', '_c1', '_c2')
tDf.printSchema()

root
 |-- id: integer (nullable = true)
 |-- height: double (nullable = true)
 |-- weight: double (nullable = true)



In [23]:
# Fetch the first two rows to the driver as a list of Row objects.
tDf.limit(2).collect()

[Row(id=1, height=65.78, weight=112.99),
 Row(id=2, height=71.52, weight=136.49)]

# UDF : User Defined Functions

In [40]:
# Load myDf.csv with a header row and inferred column types.
# 'com.databricks.spark.csv' is the name of the external spark-csv package from
# Spark 1.x; CSV support has been built in since Spark 2.0, so the built-in
# 'csv' source is used instead of the legacy alias.
myDf = (
    spark.read
    .format('csv')
    .options(header='true', inferschema='true')
    .load(os.path.join('data', 'myDf.csv'))
)
myDf.printSchema()

root
 |-- _c0: integer (nullable = true)
 |-- year: integer (nullable = true)
 |-- name: string (nullable = true)
 |-- height: integer (nullable = true)



In [41]:
# Display the loaded rows. The CSVHeaderChecker warning printed with this cell
# arises because the file's first header field is empty while the schema names
# that column `_c0`.
myDf.show()

+---+----+-------+------+
|_c0|year|   name|height|
+---+----+-------+------+
|  0|   1|kim, js|   170|
|  1|   1|lee, sm|   175|
|  2|   2|lim, yg|   180|
|  3|   2|    lee|   170|
+---+----+-------+------+



21/10/11 06:25:01 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: , year, name, height
 Schema: _c0, year, name, height
Expected: _c0 but found: 
CSV file: file:///Users/elplaguister/Workspace/Univ_BigDataAnalysis/Week6/data/myDf.csv


In [43]:
def uppercase(s):
    """Return ``s`` converted to upper case.

    ``None`` is passed through unchanged. When this function is wrapped as a
    Spark UDF, SQL NULL values arrive as ``None``; without the guard a single
    NULL would raise ``AttributeError`` and fail the job.

    Args:
        s: The string to convert, or ``None``.

    Returns:
        The upper-cased string, or ``None`` when ``s`` is ``None``.
    """
    if s is None:
        return None
    return s.upper()

uppercase('hello World')

'HELLO WORLD'

In [36]:
# Demonstration: a plain Python function cannot be applied to a Column
# directly; the call raises TypeError. The error is caught and printed so the
# notebook still runs top to bottom.
try:
    myDf = myDf.withColumn('NAME', uppercase(myDf.name))
except TypeError as err:
    print(f'TypeError: {err}')

TypeError: 'Column' object is not callable

In [44]:
from pyspark.sql.functions import udf
from pyspark.sql.types import StringType

# Wrap the plain Python function as a Spark UDF that returns a string, so it
# can be applied to a Column.
upperUDF = udf(uppercase, returnType=StringType())

In [45]:
# Apply the UDF to the 'name' column and store the result as 'NAM_E'.
myDf = myDf.withColumn('NAM_E', upperUDF(myDf['name']))

In [46]:
# Show the frame with the UDF-generated upper-case column appended.
myDf.show()


+---+----+-------+------+-------+
|_c0|year|   name|height|  NAM_E|
+---+----+-------+------+-------+
|  0|   1|kim, js|   170|KIM, JS|
|  1|   1|lee, sm|   175|LEE, SM|
|  2|   2|lim, yg|   180|LIM, YG|
|  3|   2|    lee|   170|    LEE|
+---+----+-------+------+-------+



21/10/11 06:25:33 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: , year, name, height
 Schema: _c0, year, name, height
Expected: _c0 but found: 
CSV file: file:///Users/elplaguister/Workspace/Univ_BigDataAnalysis/Week6/data/myDf.csv


In [48]:
from pyspark.sql.types import DoubleType

# UDF converting a value to a Python float (Spark double). The 'height' column
# is nullable and NULLs reach the UDF as None; float(None) would raise
# TypeError, so None is passed through untouched.
toDoublefunc = udf(lambda x: None if x is None else float(x), DoubleType())
myDf = myDf.withColumn('heightD', toDoublefunc(myDf.height))
myDf.dtypes

[('_c0', 'int'),
 ('year', 'int'),
 ('name', 'string'),
 ('height', 'int'),
 ('NAM_E', 'string'),
 ('heightD', 'double')]

## UDF로 조건에 따라 컬럼 추가하기 (withColumn)

In [50]:
# UDF labelling each height as 'taller' (>= 175) or 'shorter'. A NULL height
# arrives as None, and `None >= 175` raises TypeError in Python 3, so None is
# mapped to None (NULL) instead of being compared.
# NOTE(review): the output column is named 'height>175' but the test is
# `>= 175` (a height of exactly 175 is labelled 'taller') -- confirm which is
# intended.
height_udf = udf(
    lambda x: None if x is None else ('taller' if x >= 175 else 'shorter'),
    StringType(),
)
heightDf = myDf.withColumn('height>175', height_udf(myDf.height))
heightDf.show()

+---+----+-------+------+-------+-------+----------+
|_c0|year|   name|height|  NAM_E|heightD|height>175|
+---+----+-------+------+-------+-------+----------+
|  0|   1|kim, js|   170|KIM, JS|  170.0|   shorter|
|  1|   1|lee, sm|   175|LEE, SM|  175.0|    taller|
|  2|   2|lim, yg|   180|LIM, YG|  180.0|    taller|
|  3|   2|    lee|   170|    LEE|  170.0|   shorter|
+---+----+-------+------+-------+-------+----------+



21/10/11 06:29:29 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: , year, name, height
 Schema: _c0, year, name, height
Expected: _c0 but found: 
CSV file: file:///Users/elplaguister/Workspace/Univ_BigDataAnalysis/Week6/data/myDf.csv


## 컬럼 명 변경 : withColumnRenamed

In [53]:
# Rename the 'id' column to 'ID' and preview the first three rows.
tDf = tDf.withColumnRenamed(existing='id', new='ID')
tDf.show(n=3)

+---+------+------+
| ID|height|weight|
+---+------+------+
|  1| 65.78|112.99|
|  2| 71.52|136.49|
|  3|  69.4|153.03|
+---+------+------+
only showing top 3 rows

