In [6]:
from pyspark.sql import SparkSession

# Build the Spark entry point; getOrCreate() reuses a session that is already
# running in this kernel instead of starting a second one.
spark = (
    SparkSession.builder
    .appName('PySpark Example')
    .getOrCreate()
)

24/12/03 10:29:02 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/12/03 10:29:03 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


In [9]:
# Create an RDD -- Spark's distributed data object -- from a local Python list.
rdd = spark.sparkContext.parallelize([1, 2, 3, 4, 5])

# Displaying the RDD shows only its description, not its elements.
rdd

ParallelCollectionRDD[0] at readRDDFromFile at PythonRDD.scala:274

In [12]:
# Fetch up to five elements of the RDD as a plain Python list.
rdd.take(5)

[1, 2, 3, 4, 5]

In [14]:
# Square every element; map() yields a new RDD and leaves `rdd` untouched.
squared_rdd = rdd.map(lambda value: value * value)

# Displaying it shows the RDD description rather than the squared values.
squared_rdd

PythonRDD[8] at RDD at PythonRDD.scala:53

In [15]:
# Fetch up to five of the squared values as a Python list.
squared_rdd.take(5)

[1, 4, 9, 16, 25]

In [16]:
# Return every element of the RDD as a Python list (fine here: only five values).
squared_rdd.collect()

[1, 4, 9, 16, 25]

# 데이터프레임 객체

In [19]:
# Build a small DataFrame from (name, value) tuples with explicit column names.
data = [
    ("Alice", 1),
    ("Bob", 2),
    ("Charlie", 3),
]
df = spark.createDataFrame(data, schema=["Name", "Value"])

# The repr lists only the column names and their types.
df

DataFrame[Name: string, Value: bigint]

In [20]:
# Print the DataFrame rows as a text table.
df.show()

+-------+-----+
|   Name|Value|
+-------+-----+
|  Alice|    1|
|    Bob|    2|
|Charlie|    3|
+-------+-----+



# RDBMS 데이터 - 테이블

In [21]:
# Register df as a temporary view named "people" so it can be queried with SQL.
df.createOrReplaceTempView("people")

In [26]:
# Query text: keep the rows of the "people" view whose Value is strictly greater than 1.
select_sql = "SELECT * FROM people WHERE Value > 1"

In [27]:
# Run the query; spark.sql() returns a DataFrame, which show() prints as a table.
result_sql = spark.sql(select_sql)
result_sql.show()

+-------+-----+
|   Name|Value|
+-------+-----+
|    Bob|    2|
|Charlie|    3|
+-------+-----+



In [28]:
# Create a DataFrame -> extract a DataFrame from it with a SELECT query

In [47]:
# Student (name, score) records loaded into a DataFrame.
student = [
    ("김철수", 70),
    ("이영희", 100),
    ("홍길동", 80),
    ("김영숙", 90),
]
d_frame = spark.createDataFrame(student, schema=["Name", "Score"])
d_frame.show()

+------+-----+
|  Name|Score|
+------+-----+
|김철수|   70|
|이영희|  100|
|홍길동|   80|
|김영숙|   90|
+------+-----+



In [59]:
# Expose the student DataFrame to SQL as the temporary view "passer".
d_frame.createOrReplaceTempView("passer")

In [60]:
# Strictly greater than 80, so a score of exactly 80 does not qualify.
pass_sql = "SELECT * FROM passer WHERE Score > 80"

In [61]:
# Execute the query. The Python variable `passer` holds the resulting DataFrame
# and is separate from the temp view that happens to share its name.
passer = spark.sql(pass_sql)
passer.show()

+------+-----+
|  Name|Score|
+------+-----+
|이영희|  100|
|김영숙|   90|
+------+-----+



# MLlib

In [63]:
# Install NumPy into the kernel's own environment via the %pip magic (does not
# rely on automagic). Pinned to the version this notebook was originally run
# with; presumably needed by the pyspark.ml imports that follow.
%pip install numpy==1.24.4

Collecting numpy
  Downloading numpy-1.24.4-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (5.6 kB)
Downloading numpy-1.24.4-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (17.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m17.3/17.3 MB[0m [31m161.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: numpy
Successfully installed numpy-1.24.4
Note: you may need to restart the kernel to use updated packages.


In [64]:
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import LinearRegression

In [65]:
# Create the DataFrame.
# NOTE: this rebinds `df`, which an earlier cell defined with Name/Value columns.
df = spark.createDataFrame(
    [("Alice", 25), ("Bob", 30), ("Charlie", 35)],
    schema=["name", "age"],
)

# DataFrame operation: keep only the rows whose age is above 28.
df_filtered = df.filter(df["age"] > 28)
df_filtered.show()

+-------+---+
|   name|age|
+-------+---+
|    Bob| 30|
|Charlie| 35|
+-------+---+



In [70]:
# VectorAssembler:
#   inputCols -- list of input column names; here just ['age'].
#   outputCol -- name of the column that receives the assembled vector; here 'features'.
#
# transform:
#   Converts the listed columns of df into a vector and returns a new
#   DataFrame with the 'features' column appended.

assembler = VectorAssembler(
    inputCols=['age'],
    outputCol='features',
)
vector_df = assembler.transform(df)
vector_df

DataFrame[name: string, age: bigint, features: vector]

In [74]:
# LinearRegression:
#   featuresCol -- feature-vector column used to train the model.
#   labelCol    -- column the model tries to predict.
#
# NOTE(review): 'features' is assembled from 'age' alone and the label is also
# 'age', so the model just learns to reproduce its input. Fine as an API
# walkthrough, but not a meaningful regression.

lr = LinearRegression(
    featuresCol='features',
    labelCol='age',
)

# fit() trains the model on the rows of vector_df.
model = lr.fit(vector_df)

24/12/03 11:42:52 WARN Instrumentation: [a0c936eb] regParam is zero, which might cause numerical instability and overfitting.


In [76]:
# transform is how a fitted model predicts on (or otherwise transforms) data;
# here it returns vector_df's rows with a 'prediction' column added.

pred = model.transform(vector_df)
pred.show()

+-------+---+--------+------------------+
|   name|age|features|        prediction|
+-------+---+--------+------------------+
|  Alice| 25|  [25.0]|25.000000000000036|
|    Bob| 30|  [30.0]|30.000000000000004|
|Charlie| 35|  [35.0]| 34.99999999999997|
+-------+---+--------+------------------+



In [77]:
# Shut down the SparkSession now that the walkthrough is finished.
spark.stop()