In [1]:
!which python

/opt/conda/bin/python


In [2]:
!which java

/usr/bin/java


In [3]:
!python --version

Python 3.11.6


In [5]:
!java -version

openjdk version "17.0.8.1" 2023-08-24
OpenJDK Runtime Environment (build 17.0.8.1+1-Ubuntu-0ubuntu122.04)
OpenJDK 64-Bit Server VM (build 17.0.8.1+1-Ubuntu-0ubuntu122.04, mixed mode, sharing)


In [6]:
pip show pyspark

Name: pyspark
Version: 3.5.0
Summary: Apache Spark Python API
Home-page: https://github.com/apache/spark/tree/master/python
Author: Spark Developers
Author-email: dev@spark.apache.org
License: http://www.apache.org/licenses/LICENSE-2.0
Location: /usr/local/spark/python
Requires: py4j
Required-by: 
Note: you may need to restart the kernel to use updated packages.


In [7]:
# SPARK_HOME: look up the Spark installation directory in the kernel's
# environment variables.
import os
os.environ.get('SPARK_HOME')

'/usr/local/spark'

In [8]:
# JAVA_HOME: .get() returns None when the variable is not set, and the
# notebook displays nothing for a None result.
os.environ.get('JAVA_HOME')

In [9]:
# PYTHONPATH: extra module search paths taken from the kernel's environment.
os.environ.get('PYTHONPATH')

'/usr/local/spark/python/lib/py4j-0.10.9.7-src.zip:/usr/local/spark/python:'

In [10]:
import pyspark
from pyspark.sql import SparkSession

# Fluent builder chain: name the application, then create the session
# (or reuse one that is already active).
spark = (
    SparkSession.builder
    .appName('pyspark example1')
    .getOrCreate()
)
# The underlying SparkContext is reachable as spark.sparkContext.

In [11]:
spark

In [12]:
spark.stop()

In [13]:
# Build a session again after the spark.stop() call in the previous cell.
spark = SparkSession.builder.appName('pyspark example1').getOrCreate() # builder method chaining

In [15]:
spark

In [16]:
data = [('Alice',1), ('Bob', 2), ('Charlie',3) ]
type(data)

list

### DataFrame 객체(분산 객체)를 생성 — 판다스의 데이터프레임이 아님.

In [18]:
# Build a Spark DataFrame from the local list, naming its two columns.
data1 = spark.createDataFrame( data, ['Name','Value'])
# Not pandas-style cell access: this evaluates to a Column expression
# (displayed as Column<'Value[1]'>), not to a value from the data.
data1[1][1]

Column<'Value[1]'>

In [22]:
data1.show()

+-------+-----+
|   Name|Value|
+-------+-----+
|  Alice|    1|
|    Bob|    2|
|Charlie|    3|
+-------+-----+



In [23]:
data1.filter(data1.Name == 'Bob').show()

+----+-----+
|Name|Value|
+----+-----+
| Bob|    2|
+----+-----+



In [24]:
data1.filter(data1.Value > 2).show()

+-------+-----+
|   Name|Value|
+-------+-----+
|Charlie|    3|
+-------+-----+



In [25]:
# Register data1 under the name 'people' so it can be queried through spark.sql().
data1.createOrReplaceTempView('people')

In [26]:
spark.sql('select * from people').show()

+-------+-----+
|   Name|Value|
+-------+-----+
|  Alice|    1|
|    Bob|    2|
|Charlie|    3|
+-------+-----+



In [27]:
spark.sql( 'select * from people where Name="Bob"' ).show()

+----+-----+
|Name|Value|
+----+-----+
| Bob|    2|
+----+-----+



In [29]:
spark.sql( 'select * from people where Value>2' ).show()

+-------+-----+
|   Name|Value|
+-------+-----+
|Charlie|    3|
+-------+-----+



# RDD 객체 생성

In [30]:
# getOrCreate() hands back the session that is already running, if any,
# so executing this cell again does not start a second one.
spark = SparkSession.builder.appName('pyspark example1').getOrCreate()

In [31]:
rdd = spark.sparkContext.parallelize([1,2,3,4,5]) # created directly from a local Python list
rdd

ParallelCollectionRDD[26] at readRDDFromFile at PythonRDD.scala:289

### rdd 객체를 출력하는 함수 - n개를 지정

In [34]:
rdd.take(5)

[1, 2, 3, 4, 5]

### map 연산 : rdd 값으로 연산

In [35]:
# Square every element. The cell displays the resulting RDD object itself;
# use take()/collect() to see the values.
squared_rdd = rdd.map(lambda value: value * value)
squared_rdd

PythonRDD[32] at RDD at PythonRDD.scala:53

In [36]:
rdd.take(3)

[1, 2, 3]

In [37]:
squared_rdd.take(3)

[1, 4, 9]

In [38]:
squared_rdd.collect()

[1, 4, 9, 16, 25]

# MLlib

In [39]:
from pyspark.ml.regression import LinearRegression
import numpy as np
from pyspark.ml.feature import VectorAssembler

In [40]:
# Toy (name, age) rows turned into a Spark DataFrame with columns Name and Age.
data_age = [('Alice',25), ('Bob', 30), ('Charlie',33) ]
data2 = spark.createDataFrame( data_age, ['Name','Age'])
data2

DataFrame[Name: string, Age: bigint]

In [41]:
data2.show()

+-------+---+
|   Name|Age|
+-------+---+
|  Alice| 25|
|    Bob| 30|
|Charlie| 33|
+-------+---+



In [42]:
# Pack the 'Age' column into a one-element vector column named 'features',
# which is the column the regression below is told to read.
assembler = VectorAssembler(inputCols = ['Age'], outputCol='features')
vector_df = assembler.transform(data2)
vector_df

DataFrame[Name: string, Age: bigint, features: vector]

In [44]:
vector_df.show()

+-------+---+--------+
|   Name|Age|features|
+-------+---+--------+
|  Alice| 25|  [25.0]|
|    Bob| 30|  [30.0]|
|Charlie| 33|  [33.0]|
+-------+---+--------+



In [45]:
# NOTE(review): the label ('Age') is the same column that was packed into
# 'features', so this fit can only learn the identity mapping. That is fine
# as an API walkthrough, but it is not a meaningful model.
lr = LinearRegression(featuresCol='features', labelCol='Age')
model = lr.fit(vector_df)

In [46]:
pred = model.transform(vector_df)
pred

DataFrame[Name: string, Age: bigint, features: vector, prediction: double]

In [47]:
pred.show()

+-------+---+--------+-----------------+
|   Name|Age|features|       prediction|
+-------+---+--------+-----------------+
|  Alice| 25|  [25.0]|24.99999999999993|
|    Bob| 30|  [30.0]|30.00000000000001|
|Charlie| 33|  [33.0]|33.00000000000006|
+-------+---+--------+-----------------+



In [48]:
spark.stop()

# Streaming

In [60]:
from pyspark.sql.functions import explode
from pyspark.sql.functions import split

In [61]:
spark = SparkSession.builder.appName('pyspark example1').getOrCreate() #chaining

In [62]:
# Streaming source: lines of text read from a socket at localhost:9999.
lines = (
    spark.readStream
    .format('socket')
    .option('host', 'localhost')
    .option('port', 9999)
    .load()
)

In [70]:
# Split each incoming line on single spaces and emit one row per word.
# (The original cell was missing the parenthesis that closes select().)
words = lines.select(explode(split(lines.value, ' ')).alias('word'))
words

DataFrame[word: string]

In [71]:
spark.stop()