In [1]:
from pyspark import SparkConf, SparkContext
conf = SparkConf().setMaster("local").setAppName("spark_sql_basic2")
sc   = SparkContext(conf=conf)

RDD 만을 이용한 데이터 추출

In [2]:
movies_rdd = sc.parallelize([
    (1, ("어벤져스", "마블")),
    (2, ("슈퍼맨", "DC")),
    (3, ("배트맨", "DC")),
    (4, ("겨울왕국", "디즈니")),
    (5, ("아이언맨", "마블"))
])


attendances_rdd = sc.parallelize([
    (1, (13934592, "KR")),
    (2, (2182227,"KR")),
    (3, (4226242, "KR")),
    (4, (10303058, "KR")),
    (5, (4300365, "KR"))
])

# 마블 영화 중 관객 수가 500만 이상인 영화를 가져오기

In [3]:
movie_att = movies_rdd.join(attendances_rdd)
movie_att = movie_att.sortBy(lambda x: x[0])
movie_att.take(10)

[(1, (('어벤져스', '마블'), (13934592, 'KR'))),
 (2, (('슈퍼맨', 'DC'), (2182227, 'KR'))),
 (3, (('배트맨', 'DC'), (4226242, 'KR'))),
 (4, (('겨울왕국', '디즈니'), (10303058, 'KR'))),
 (5, (('아이언맨', '마블'), (4300365, 'KR')))]

In [4]:
movie_att.filter(
    lambda x: x[1][0][1] == "마블" and x[1][1][0] > 5000000
).collect()

[(1, (('어벤져스', '마블'), (13934592, 'KR')))]

In [5]:
sc.stop()

In [6]:
from pyspark.sql import SparkSession
spark = SparkSession.builder.master("local").appName("spark-sql").getOrCreate()

In [7]:
# 컬럼 추가
movies = [
    (1, "어벤져스", "마블", 2012, 4, 26),
    (2, "슈퍼맨", "DC", 2013, 6, 13),
    (3, "배트맨", "DC", 2008, 8, 6),
    (4, "겨울왕국", "디즈니", 2014, 1, 16),
    (5, "아이언맨", "마블", 2008, 4, 30)
]

### 스키마를 알아야 한다.

In [8]:
movie_schema = ["id", "name", "company", "year", "month", "day"]

### 2. 데이터 프레임 만들기

In [9]:
df = spark.createDataFrame(data=movies, schema=movie_schema)

df.show()

+---+--------+-------+----+-----+---+
| id|    name|company|year|month|day|
+---+--------+-------+----+-----+---+
|  1|어벤져스|   마블|2012|    4| 26|
|  2|  슈퍼맨|     DC|2013|    6| 13|
|  3|  배트맨|     DC|2008|    8|  6|
|  4|겨울왕국| 디즈니|2014|    1| 16|
|  5|아이언맨|   마블|2008|    4| 30|
+---+--------+-------+----+-----+---+



In [10]:
df.select("name").show()

+--------+
|    name|
+--------+
|어벤져스|
|  슈퍼맨|
|  배트맨|
|겨울왕국|
|아이언맨|
+--------+



In [11]:
df.filter(df.year >= 2010).show()

+---+--------+-------+----+-----+---+
| id|    name|company|year|month|day|
+---+--------+-------+----+-----+---+
|  1|어벤져스|   마블|2012|    4| 26|
|  2|  슈퍼맨|     DC|2013|    6| 13|
|  4|겨울왕국| 디즈니|2014|    1| 16|
+---+--------+-------+----+-----+---+



In [12]:
# 2013 이후 영화
df.filter(df.year > 2013).show()

+---+--------+-------+----+-----+---+
| id|    name|company|year|month|day|
+---+--------+-------+----+-----+---+
|  4|겨울왕국| 디즈니|2014|    1| 16|
+---+--------+-------+----+-----+---+



In [13]:
# 마블 or DC
df.filter((df.company == "마블") | (df.company == "DC")).show()

+---+--------+-------+----+-----+---+
| id|    name|company|year|month|day|
+---+--------+-------+----+-----+---+
|  1|어벤져스|   마블|2012|    4| 26|
|  2|  슈퍼맨|     DC|2013|    6| 13|
|  3|  배트맨|     DC|2008|    8|  6|
|  5|아이언맨|   마블|2008|    4| 30|
+---+--------+-------+----+-----+---+



In [14]:
df.createOrReplaceTempView("movies")

### 영화 이름만 가져오기

In [15]:
query = """

SELECT name
  FROM movies

"""
spark.sql(query).show()

+--------+
|    name|
+--------+
|어벤져스|
|  슈퍼맨|
|  배트맨|
|겨울왕국|
|아이언맨|
+--------+



### 2010년 이후에 개봉한 영화를 조회

In [16]:
query = """

SELECT *
FROM movies
WHERE year >= 2010

"""
spark.sql(query).show()

+---+--------+-------+----+-----+---+
| id|    name|company|year|month|day|
+---+--------+-------+----+-----+---+
|  1|어벤져스|   마블|2012|    4| 26|
|  2|  슈퍼맨|     DC|2013|    6| 13|
|  4|겨울왕국| 디즈니|2014|    1| 16|
+---+--------+-------+----+-----+---+



### 2012년도 이전에 개봉한 영화의 이름과 회사를 출력

In [17]:
query = """

SELECT name, company
FROM movies
WHERE year < 2012

"""
spark.sql(query).show()

+--------+-------+
|    name|company|
+--------+-------+
|  배트맨|     DC|
|아이언맨|   마블|
+--------+-------+



### like 문자열 데이터에서 특정 단어나 문장을 포함한 데이터를 찾을 때

### % 기호를 사용해서 문장이 매칭되는지 확인 가!
### 제목이 ~맨으로 끝나는 데이터의 모든 정보를 조회

In [18]:
query = """

SELECT *
FROM movies
WHERE name like "%맨"

"""
spark.sql(query).show()

+---+--------+-------+----+-----+---+
| id|    name|company|year|month|day|
+---+--------+-------+----+-----+---+
|  2|  슈퍼맨|     DC|2013|    6| 13|
|  3|  배트맨|     DC|2008|    8|  6|
|  5|아이언맨|   마블|2008|    4| 30|
+---+--------+-------+----+-----+---+



### BETWEEN 특정 데이터와 데이터 사이를 조회

### 개봉 월이 4 ~ 8월 사이. 4 <= 개봉월 <= 8

In [19]:
query = """

SELECT *
FROM movies
WHERE month BETWEEN 4 and 8

"""
spark.sql(query).show()

+---+--------+-------+----+-----+---+
| id|    name|company|year|month|day|
+---+--------+-------+----+-----+---+
|  1|어벤져스|   마블|2012|    4| 26|
|  2|  슈퍼맨|     DC|2013|    6| 13|
|  3|  배트맨|     DC|2008|    8|  6|
|  5|아이언맨|   마블|2008|    4| 30|
+---+--------+-------+----+-----+---+



### Join 구현하기

In [20]:
attendances = [
    (1, 13934592., "KR"),
    (2, 2182227.,"KR"),
    (3, 4226242., "KR"),
    (4, 10303058., "KR"),
    (5, 4300365., "KR")
]

In [21]:
# 직접 스키마 지정해 보기
from pyspark.sql.types import StringType, FloatType\
    , IntegerType\
    , StructType, StructField

In [22]:
# 모든 컬럼의 타입을 통칭 - 컬럼 데이터의 집합
# StructField : 컬럼
att_schema = StructType([
    StructField("id", IntegerType(), True),
    StructField("att", FloatType(), True),
    StructField("theater_country", StringType(), True)
])

In [23]:
att_df = spark.createDataFrame(
    data=attendances,
    schema=att_schema
)

att_df.dtypes

[('id', 'int'), ('att', 'float'), ('theater_country', 'string')]

In [24]:
att_df.createOrReplaceTempView("att")

In [26]:
att_df.show()

+---+-----------+---------------+
| id|        att|theater_country|
+---+-----------+---------------+
|  1|1.3934592E7|             KR|
|  2|  2182227.0|             KR|
|  3|  4226242.0|             KR|
|  4|1.0303058E7|             KR|
|  5|  4300365.0|             KR|
+---+-----------+---------------+



In [27]:
q = '''
select movies.id, movies.name, movies.company, att.att
from movies
join att ON movies.id = att.id
'''

spark.sql(q).show()

+---+--------+-------+-----------+
| id|    name|company|        att|
+---+--------+-------+-----------+
|  1|어벤져스|   마블|1.3934592E7|
|  2|  슈퍼맨|     DC|  2182227.0|
|  3|  배트맨|     DC|  4226242.0|
|  4|겨울왕국| 디즈니|1.0303058E7|
|  5|아이언맨|   마블|  4300365.0|
+---+--------+-------+-----------+



In [48]:
spark.stop()

In [61]:
from pyspark.sql.functions import desc

In [49]:
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName("trop_count_sql").getOrCreate()

In [56]:
zone_data = spark.read.format("csv")\
    .option("header", 'true')\
    .option('inferSchema', 'true')\
    .load('../learning_spark_data/taxi+_zone_lookup.csv')

In [62]:
zone_data.orderBy(desc('LocationID')).show()

+----------+-------------+--------------------+------------+
|LocationID|      Borough|                Zone|service_zone|
+----------+-------------+--------------------+------------+
|       265|      Unknown|                  NA|         N/A|
|       264|      Unknown|                  NV|         N/A|
|       263|    Manhattan|      Yorkville West| Yellow Zone|
|       262|    Manhattan|      Yorkville East| Yellow Zone|
|       261|    Manhattan|  World Trade Center| Yellow Zone|
|       260|       Queens|            Woodside|   Boro Zone|
|       259|        Bronx|  Woodlawn/Wakefield|   Boro Zone|
|       258|       Queens|           Woodhaven|   Boro Zone|
|       257|     Brooklyn|     Windsor Terrace|   Boro Zone|
|       256|     Brooklyn|Williamsburg (Sou...|   Boro Zone|
|       255|     Brooklyn|Williamsburg (Nor...|   Boro Zone|
|       254|        Bronx|Williamsbridge/Ol...|   Boro Zone|
|       253|       Queens|       Willets Point|   Boro Zone|
|       252|       Queen

In [50]:
df  = spark.read.format('csv')\
            .option('header', 'true')\
            .option('inferSchema', 'true').load('../learning_spark_data/fhvhv_tripdata_2020-03.csv')

In [51]:
df.createOrReplaceTempView("df")

In [104]:
zone_data.createOrReplaceTempView("zone")

In [63]:
q = '''
select hvfhs_license_num, PULocationID, DOLocationID
from df
'''
df = spark.sql(q)
df.show()

+-----------------+------------+------------+
|hvfhs_license_num|PULocationID|DOLocationID|
+-----------------+------------+------------+
|           HV0005|          81|         159|
|           HV0005|         168|         119|
|           HV0003|         137|         209|
|           HV0003|         209|          80|
|           HV0003|         256|         226|
|           HV0003|          79|         263|
|           HV0003|          61|          29|
|           HV0003|         150|         150|
|           HV0003|         150|         210|
|           HV0003|          60|         167|
|           HV0003|          47|         213|
|           HV0003|         213|         235|
|           HV0003|         243|         153|
|           HV0003|         127|          18|
|           HV0003|          18|         169|
|           HV0003|          94|         169|
|           HV0003|         211|         158|
|           HV0003|         246|         107|
|           HV0003|         234|  

In [64]:
q = '''
select hvfhs_license_num
from df
group by hvfhs_license_num
'''
spark.sql(q).show()

+-----------------+
|hvfhs_license_num|
+-----------------+
|           HV0004|
|           HV0005|
|           HV0003|
+-----------------+



In [65]:
# 승차별 갯수 PULocationID
q = '''
select PULocationID, count(*) as cnt
from df
group by PULocationID
'''
spark.sql(q).show()

+------------+------+
|PULocationID|   cnt|
+------------+------+
|         148|116205|
|         243| 87431|
|          31|  5285|
|         137| 85552|
|          85| 46120|
|         251|  9080|
|          65| 66622|
|         255|113947|
|          53| 17571|
|         133| 27200|
|          78| 76155|
|         108| 20378|
|         155| 39527|
|         211| 61075|
|         193| 20111|
|          34| 11823|
|         115| 10806|
|         126| 52833|
|         101|  8983|
|          81| 41425|
+------------+------+
only showing top 20 rows



In [66]:
# 하차별 갯수 DOLocationID
q = '''
select DOLocationID, count(*) as cnt
from df
group by DOLocationID
'''
spark.sql(q).show()

+------------+------+
|DOLocationID|   cnt|
+------------+------+
|         148| 91601|
|         243| 86795|
|          31|  5526|
|          85| 44509|
|         137| 80098|
|         251|  8525|
|          65| 58888|
|         255|105051|
|          53| 19013|
|         133| 27760|
|          78| 74447|
|         155| 42239|
|         108| 21354|
|         211| 54176|
|         193| 19104|
|          34| 12392|
|         115|  9809|
|         101|  7218|
|         126| 59027|
|          81| 38445|
+------------+------+
only showing top 20 rows



In [90]:
# HV0003 운송사업자의 승차 지역별 트립 건수 집계하고
q = '''
select hvfhs_license_num, PULocationID, count(*) as cnt
from df
group by PULocationID, hvfhs_license_num
having hvfhs_license_num = 'HV0003'
'''
spark.sql(q).show()

+-----------------+------------+-----+
|hvfhs_license_num|PULocationID|  cnt|
+-----------------+------------+-----+
|           HV0003|         118|10500|
|           HV0003|           3|28232|
|           HV0003|         225|88749|
|           HV0003|          43|12074|
|           HV0003|          30|  429|
|           HV0003|         101| 6933|
|           HV0003|          72|67917|
|           HV0003|         239|50320|
|           HV0003|         171|20282|
|           HV0003|          98| 9257|
|           HV0003|         114|48781|
|           HV0003|         218|20759|
|           HV0003|          53|14645|
|           HV0003|          26|67620|
|           HV0003|          34| 8236|
|           HV0003|          94|33487|
|           HV0003|         129|81748|
|           HV0003|         146|23536|
|           HV0003|          47|51693|
|           HV0003|         216|64413|
+-----------------+------------+-----+
only showing top 20 rows



In [97]:
# 가장 많은 운송 사업자 순으로 정렬하는 분석 쿼리
# hvfhs_license_num

q = '''
select hvfhs_license_num, count(*) as cnt
from df
group by hvfhs_license_num
order by count(*) desc
'''
spark.sql(q).show()


+-----------------+-------+
|hvfhs_license_num|    cnt|
+-----------------+-------+
|           HV0003|9836763|
|           HV0005|3219535|
|           HV0004| 336606|
+-----------------+-------+



In [58]:
# 운송사별 운행 건수 비교
q = '''
select hvfhs_license_num, count(*) as cnt
from df
group by hvfhs_license_num
'''
spark.sql(q).show()

+-----------------+-------+
|hvfhs_license_num|    cnt|
+-----------------+-------+
|           HV0004| 336606|
|           HV0005|3219535|
|           HV0003|9836763|
+-----------------+-------+



In [115]:
q = '''
select *
from zone
'''
spark.sql(q).show(1)

+----------+-------+--------------+------------+
|LocationID|Borough|          Zone|service_zone|
+----------+-------+--------------+------------+
|         1|    EWR|Newark Airport|         EWR|
+----------+-------+--------------+------------+
only showing top 1 row



In [114]:
# 승차 위치 Borough 별 운행 건수

q = '''
select df.PULocationID, zone.Borough, count(*) as cnt
from df
join zone on zone.LocationID = df.PULocationID
group by zone.Borough, df.PULocationID
'''
spark.sql(q).show()

+------------+-------------+------+
|PULocationID|      Borough|   cnt|
+------------+-------------+------+
|         129|       Queens|102997|
|         116|    Manhattan| 70601|
|         250|        Bronx| 36232|
|         230|    Manhattan|121628|
|         145|       Queens| 82701|
|         244|    Manhattan|126621|
|         184|        Bronx|  2166|
|          27|       Queens|   348|
|         216|       Queens| 77366|
|         123|     Brooklyn| 42106|
|          52|     Brooklyn| 18144|
|           8|       Queens|   297|
|         152|    Manhattan| 36993|
|          93|       Queens|  3276|
|          72|     Brooklyn| 84575|
|          50|    Manhattan| 80762|
|         154|     Brooklyn|  1242|
|          63|     Brooklyn| 47676|
|         241|        Bronx| 65961|
|         214|Staten Island|  9204|
+------------+-------------+------+
only showing top 20 rows



In [117]:
# 서비스 존 별 승하차 건수

q = '''
select zone.service_zone, count(df.DOLocationID), count(df.PULocationID)
from df
join zone on zone.LocationID = df.PULocationID
group by zone.service_zone
'''
spark.sql(q).show()

+------------+-------------------+-------------------+
|service_zone|count(DOLocationID)|count(PULocationID)|
+------------+-------------------+-------------------+
|         EWR|                362|                362|
|         N/A|                845|                845|
| Yellow Zone|            4025190|            4025190|
|    Airports|             319610|             319610|
|   Boro Zone|            9046897|            9046897|
+------------+-------------------+-------------------+

