In [5]:
from pyspark import SparkConf, SparkContext

In [6]:
from pyspark.sql import SparkSession
# `builder` is the object used to construct a SparkSession; getOrCreate()
# returns the session configured with the application name below.
spark = (
    SparkSession.builder
    .appName("FirstSparkSessionApp")
    .getOrCreate()
)

In [8]:
# Single-column DataFrame (column `id`) holding the 1000 integers from 0.
myRange = spark.range(start=0, end=1000)
# Bare expression so the notebook displays the DataFrame's repr.
myRange

DataFrame[id: bigint]

In [9]:
# Print only the first 10 rows of the range DataFrame.
myRange.show(n=10)

+---+
| id|
+---+
|  0|
|  1|
|  2|
|  3|
|  4|
|  5|
|  6|
|  7|
|  8|
|  9|
+---+
only showing top 10 rows



                                                                                

### 데이터프레임 생성

createDataFrame

In [40]:
# Fifty sample records, each a 3-tuple of (name, age, gender code 'M'/'F').
# Row order matters: the DataFrame previews below show the first 20 rows.
data = [
    ("Brook", 20, "M"), ("Denny", 31, "F"), ("Jules", 30, "F"), ("Avery", 25, "M"), ("Jordan", 28, "F"),
    ("Casey", 27, "M"), ("Taylor", 22, "F"), ("Morgan", 29, "F"), ("Jamie", 24, "M"), ("Peyton", 26, "F"),
    ("Riley", 19, "M"), ("Quinn", 32, "F"), ("Alex", 18, "M"), ("Charlie", 34, "F"), ("Sam", 17, "M"),
    ("Blake", 15, "M"), ("Harper", 14, "F"), ("Cameron", 13, "M"), ("Addison", 16, "F"), ("Elliot", 12, "M"),
    ("Parker", 40, "F"), ("Rowan", 41, "M"), ("Drew", 45, "F"), ("Dakota", 50, "M"), ("Hunter", 51, "F"),
    ("Skylar", 35, "M"), ("Emerson", 36, "F"), ("Reese", 55, "F"), ("Kendall", 60, "M"), ("Phoenix", 61, "F"),
    ("Sage", 62, "M"), ("Ashton", 65, "F"), ("Sawyer", 70, "M"), ("River", 71, "F"), ("Lennon", 75, "M"),
    ("Marley", 80, "F"), ("Hayden", 81, "M"), ("Logan", 85, "F"), ("Finley", 86, "M"), ("Remy", 90, "F"),
    ("Tatum", 91, "M"), ("Sydney", 94, "F"), ("Eden", 95, "M"), ("Spencer", 88, "M"), ("Carter", 48, "M"),
    ("Mackenzie", 37, "F"), ("Ellis", 20, "M"), ("Greer", 22, "F"), ("Wren", 23, "F"), ("Bailey", 60, "M"),
]


In [41]:
# No schema is supplied, so Spark infers the column types and assigns the
# positional column names _1, _2, _3.
df = spark.createDataFrame(data=data)
# show() with no arguments prints the first 20 rows.
df.show()

+-------+---+---+
|     _1| _2| _3|
+-------+---+---+
|  Brook| 20|  M|
|  Denny| 31|  F|
|  Jules| 30|  F|
|  Avery| 25|  M|
| Jordan| 28|  F|
|  Casey| 27|  M|
| Taylor| 22|  F|
| Morgan| 29|  F|
|  Jamie| 24|  M|
| Peyton| 26|  F|
|  Riley| 19|  M|
|  Quinn| 32|  F|
|   Alex| 18|  M|
|Charlie| 34|  F|
|    Sam| 17|  M|
|  Blake| 15|  M|
| Harper| 14|  F|
|Cameron| 13|  M|
|Addison| 16|  F|
| Elliot| 12|  M|
+-------+---+---+
only showing top 20 rows



In [42]:
# Print the inferred schema as a tree: column name, type and nullability.
df.printSchema()

root
 |-- _1: string (nullable = true)
 |-- _2: long (nullable = true)
 |-- _3: string (nullable = true)



#### 테이블 뷰 (SQL 접근 가능)

createOrReplaceTempView

In [43]:
# Register the DataFrame as a temporary view called 'people' so it can be
# queried with spark.sql(); an existing view of that name is replaced.
df.createOrReplaceTempView(name='people')

In [51]:
# SQL against the 'people' temp view: keep the first two auto-named columns
# for rows whose _2 value is greater than 30.
result = spark.sql(
    sqlQuery='select _1, _2 from people where _2 > 30'
)
result.show()

+-------+---+
|     _1| _2|
+-------+---+
|  Denny| 31|
|  Quinn| 32|
|Charlie| 34|
| Parker| 40|
|  Rowan| 41|
|   Drew| 45|
| Dakota| 50|
| Hunter| 51|
| Skylar| 35|
|Emerson| 36|
|  Reese| 55|
|Kendall| 60|
|Phoenix| 61|
|   Sage| 62|
| Ashton| 65|
| Sawyer| 70|
|  River| 71|
| Lennon| 75|
| Marley| 80|
| Hayden| 81|
+-------+---+
only showing top 20 rows



#### 테이블 구조 설정

StructType

In [28]:
from pyspark.sql.types import *

In [44]:
# Explicit schema for the (name, age, gender) tuples: three named columns,
# all declared non-nullable.
schema = StructType(fields=[
    StructField(name='Author', dataType=StringType(), nullable=False),
    StructField(name='Age', dataType=IntegerType(), nullable=False),
    StructField(name='gender', dataType=StringType(), nullable=False),
])

In [45]:
# Bare expression so the notebook displays the StructType's repr.
schema

StructType(List(StructField(Author,StringType,false),StructField(Age,IntegerType,false),StructField(gender,StringType,false)))

In [47]:
# Same rows as before, but with the explicit schema applied, so the columns
# are named Author / Age / gender instead of _1 / _2 / _3.
a_df = spark.createDataFrame(data=data, schema=schema)
a_df.show()

+-------+---+------+
| Author|Age|gender|
+-------+---+------+
|  Brook| 20|     M|
|  Denny| 31|     F|
|  Jules| 30|     F|
|  Avery| 25|     M|
| Jordan| 28|     F|
|  Casey| 27|     M|
| Taylor| 22|     F|
| Morgan| 29|     F|
|  Jamie| 24|     M|
| Peyton| 26|     F|
|  Riley| 19|     M|
|  Quinn| 32|     F|
|   Alex| 18|     M|
|Charlie| 34|     F|
|    Sam| 17|     M|
|  Blake| 15|     M|
| Harper| 14|     F|
|Cameron| 13|     M|
|Addison| 16|     F|
| Elliot| 12|     M|
+-------+---+------+
only showing top 20 rows



In [48]:
# Schema for the blog-author rows defined below. Every column is declared
# non-nullable; Campaigns is an array of strings.
# NOTE: this rebinds `schema`, replacing the three-column schema defined
# earlier in the notebook.
schema = StructType(fields=[
    StructField(name="Id", dataType=IntegerType(), nullable=False),
    StructField(name="First", dataType=StringType(), nullable=False),
    StructField(name="Last", dataType=StringType(), nullable=False),
    StructField(name="Url", dataType=StringType(), nullable=False),
    StructField(name="Published", dataType=StringType(), nullable=False),
    StructField(name="Hits", dataType=IntegerType(), nullable=False),
    StructField(name="Campaigns", dataType=ArrayType(StringType()), nullable=False),
])

# Six sample rows; within each row the values are ordered as
# Id, First, Last, Url, Published, Hits, Campaigns.
# NOTE: this rebinds `data`, replacing the list of tuples defined earlier
# in the notebook.
data = [
    [
        1, "Jules", "Damji",
        "https://tinyurl.1", "1/4/2016", 4535,
        ["twitter", "LinkedIn"],
    ],
    [
        2, "Brooke", "Wenig",
        "https://tinyurl.2", "5/5/2018", 8908,
        ["twitter", "LinkedIn"],
    ],
    [
        3, "Denny", "Lee",
        "https://tinyurl.3", "6/7/2019", 7659,
        ["web", "twitter", "FB", "LinkedIn"],
    ],
    [
        4, "Tathagata", "Das",
        "https://tinyurl.4", "5/12/2018", 10568,
        ["twitter", "FB"],
    ],
    [
        5, "Matei", "Zaharia",
        "https://tinyurl.5", "5/14/2014", 40578,
        ["web", "twitter", "FB", "LinkedIn"],
    ],
    [
        6, "Reynold", "Xin",
        "https://tinyurl.6", "3/2/2015", 25568,
        ["twitter", "LinkedIn"],
    ],
]

In [49]:
# Build the blog-author DataFrame from the rows and the explicit schema
# defined in the previous cell, then preview it.
b_df = spark.createDataFrame(data=data, schema=schema)
b_df.show()

+---+---------+-------+-----------------+---------+-----+--------------------+
| Id|    First|   Last|              Url|Published| Hits|           Campaigns|
+---+---------+-------+-----------------+---------+-----+--------------------+
|  1|    Jules|  Damji|https://tinyurl.1| 1/4/2016| 4535| [twitter, LinkedIn]|
|  2|   Brooke|  Wenig|https://tinyurl.2| 5/5/2018| 8908| [twitter, LinkedIn]|
|  3|    Denny|    Lee|https://tinyurl.3| 6/7/2019| 7659|[web, twitter, FB...|
|  4|Tathagata|    Das|https://tinyurl.4|5/12/2018|10568|       [twitter, FB]|
|  5|    Matei|Zaharia|https://tinyurl.5|5/14/2014|40578|[web, twitter, FB...|
|  6|  Reynold|    Xin|https://tinyurl.6| 3/2/2015|25568| [twitter, LinkedIn]|
+---+---------+-------+-----------------+---------+-----+--------------------+



In [50]:
# Print the schema tree; the columns report nullable = false because the
# schema was declared that way rather than inferred.
b_df.printSchema()

root
 |-- Id: integer (nullable = false)
 |-- First: string (nullable = false)
 |-- Last: string (nullable = false)
 |-- Url: string (nullable = false)
 |-- Published: string (nullable = false)
 |-- Hits: integer (nullable = false)
 |-- Campaigns: array (nullable = false)
 |    |-- element: string (containsNull = true)



In [53]:
# Projection: keep only the Id column and print its first 2 rows.
b_df.select('Id').show(n=2)

+---+
| Id|
+---+
|  1|
|  2|
+---+
only showing top 2 rows



In [57]:
from pyspark.sql.functions import *

In [58]:
# Project the Hits column (referenced as a Column object) and print it.
b_df.select(b_df['Hits']).show()

+-----+
| Hits|
+-----+
| 4535|
| 8908|
| 7659|
|10568|
|40578|
|25568|
+-----+



In [59]:
# expr() turns the string 'Hits' into a column expression; multiplying it
# by 2 yields a derived column that Spark labels "(Hits * 2)".
b_df.select(
    expr('Hits') * 2
).show()

+----------+
|(Hits * 2)|
+----------+
|      9070|
|     17816|
|     15318|
|     21136|
|     81156|
|     51136|
+----------+



In [60]:
# Stop the Spark session; DataFrames created from it above can no longer be
# evaluated after this call.
spark.stop()

The history saving thread hit an unexpected error (OperationalError('attempt to write a readonly database')).History will not be written to the database.
