In [0]:
spark.sql("USE dev")
titanic_sdf = spark.table("titanic_train")

In [0]:
titanic_pdf = titanic_sdf.select('*').toPandas()

display(titanic_sdf.limit(10))


PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Thayer)",female,38.0,1,0,PC 17599,71.2833,C85,C
3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S
6,0,3,"Moran, Mr. James",male,,0,0,330877,8.4583,,Q
7,0,1,"McCarthy, Mr. Timothy J",male,54.0,0,0,17463,51.8625,E46,S
8,0,3,"Palsson, Master. Gosta Leonard",male,2.0,3,1,349909,21.075,,S
9,1,3,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",female,27.0,0,2,347742,11.1333,,S
10,1,2,"Nasser, Mrs. Nicholas (Adele Achem)",female,14.0,1,0,237736,30.0708,,C


### Spark DataFrame의 withColumn() 메소드 알아보기
* pandas DataFrame은 [] 을 이용하여 기존 컬럼값을 update, 또는 신규 컬럼 추가, 묵시적으로 컬럼 타입을 변경할 수 있음. 컬럼명 변경시는 rename()을 사용. 명시적인 컬럼 타입 변경은 astype()적용
* spark DataFrame은 withColumn() 메소드를 이용하여 기존 컬럼값을 update, 컬럼 타입 변경, 신규 컬럼값을 추가할 수 있음. 
* withColumn('신규 또는 Update되는 컬럼명', '신규 또는 update되는 값')을 인자로 가짐. 
* 신규 또는 update되는 값을 생성 시에 기존 컬럼을 기반으로 한다면 신규 컬럼은 문자열로, 기존 컬럼은 반드시 컬럼형(col('컬럼명'))을 이용하여 적용.
* 신규 컬럼값을 추가하는 것은 select() 메소드로도 가능
* 컬럼명을 변경하는 것은 withColumnRename() 메소드로 수행.

In [0]:
titanic_pdf.info()
titanic_pdf.head(10)

In [0]:
import numpy as np

titanic_pdf_copied = titanic_pdf.copy()
# Pandas DataFrame 신규 컬럼 추가
titanic_pdf_copied['Extra_Fare'] = titanic_pdf_copied['Fare'] * 10
# 기존 컬럼 update
titanic_pdf_copied['Fare'] = titanic_pdf_copied['Fare'] + 20
# 기존 컬럼의 Data Type 변경.  
titanic_pdf_copied['Fare'] = titanic_pdf_copied['Fare'].astype(np.int64)

titanic_pdf_copied.info()
titanic_pdf_copied.head()

In [0]:
from pyspark.sql.functions import col

titanic_sdf_copied = titanic_sdf.select('*') # copy titanic_sdf 

titanic_sdf_copied = titanic_sdf_copied.withColumn('Extra_Fare', col('Fare') * 10) 

titanic_sdf_copied = titanic_sdf_copied.withColumn('Fare', col('Fare') + 20)

titanic_sdf_copied = titanic_sdf_copied.withColumn('Fare', col('Fare').cast('Integer'))

# titanic_sdf_copied = titanic_sdf_copied.withColumn('Extra_Fare', col('Fare') * 10)  \
#                                 .withColumn('Fare', col('Fare') + 20)   \
#                                 .withColumn('Fare', col('Fare').cast('Integer'))

titanic_sdf_copied.printSchema()
display(titanic_sdf_copied.limit(10))

root
 |-- PassengerId: long (nullable = true)
 |-- Survived: long (nullable = true)
 |-- Pclass: long (nullable = true)
 |-- Name: string (nullable = true)
 |-- Sex: string (nullable = true)
 |-- Age: double (nullable = true)
 |-- SibSp: long (nullable = true)
 |-- Parch: long (nullable = true)
 |-- Ticket: string (nullable = true)
 |-- Fare: integer (nullable = true)
 |-- Cabin: string (nullable = true)
 |-- Embarked: string (nullable = true)
 |-- Extra_Fare: integer (nullable = true)



PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Extra_Fare
1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,47,,S,270
2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Thayer)",female,38.0,1,0,PC 17599,111,C85,C,910
3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,47,,S,270
4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,93,C123,S,730
5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,48,,S,280
6,0,3,"Moran, Mr. James",male,,0,0,330877,48,,Q,280
7,0,1,"McCarthy, Mr. Timothy J",male,54.0,0,0,17463,91,E46,S,710
8,0,3,"Palsson, Master. Gosta Leonard",male,2.0,3,1,349909,61,,S,410
9,1,3,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",female,27.0,0,2,347742,51,,S,310
10,1,2,"Nasser, Mrs. Nicholas (Adele Achem)",female,14.0,1,0,237736,70,,C,500


In [0]:
# Argument `col` should be a Column
titanic_sdf_copied = titanic_sdf_copied.withColumn('Extra_Fare', 10)

[0;31m---------------------------------------------------------------------------[0m
[0;31mPySparkTypeError[0m                          Traceback (most recent call last)
File [0;32m<command-7378208024298002>, line 2[0m
[1;32m      1[0m [38;5;66;03m# 상수 값으로 update 시에 아래와 같이 수행하면 오류가 발생. 반드시 update할 값은 컬럼형이 되어야 함. [39;00m
[0;32m----> 2[0m titanic_sdf_copied [38;5;241m=[39m [43mtitanic_sdf_copied[49m[38;5;241;43m.[39;49m[43mwithColumn[49m[43m([49m[38;5;124;43m'[39;49m[38;5;124;43mExtra_Fare[39;49m[38;5;124;43m'[39;49m[43m,[49m[43m [49m[38;5;241;43m10[39;49m[43m)[49m

File [0;32m/databricks/python/lib/python3.10/site-packages/pyspark/sql/connect/dataframe.py:934[0m, in [0;36mDataFrame.withColumn[0;34m(self, colName, col)[0m
[1;32m    932[0m [38;5;28;01mdef[39;00m [38;5;21mwithColumn[39m([38;5;28mself[39m, colName: [38;5;28mstr[39m, col: Column) [38;5;241m-[39m[38;5;241m>[39m [38;5;124m"[39m[38;5;124mDataFrame[39m[38;5;124m"[39m:


In [0]:
from pyspark.sql.functions import lit

# 상수 값으로 update 시 반드시 lit() 함수를 적용하여야 함. 
titanic_sdf_copied = titanic_sdf_copied.withColumn('Extra_Fare', lit(10))

# 상수 값으로 신규 컬럼 생성시에도 반드시 lit() 함수를 적용해야 함. 
titanic_sdf_copied = titanic_sdf_copied.withColumn('New_Name', lit('Test_name'))

display(titanic_sdf_copied.limit(10))

PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Extra_Fare,New_Name
1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,47,,S,10,Test_name
2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Thayer)",female,38.0,1,0,PC 17599,111,C85,C,10,Test_name
3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,47,,S,10,Test_name
4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,93,C123,S,10,Test_name
5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,48,,S,10,Test_name
6,0,3,"Moran, Mr. James",male,,0,0,330877,48,,Q,10,Test_name
7,0,1,"McCarthy, Mr. Timothy J",male,54.0,0,0,17463,91,E46,S,10,Test_name
8,0,3,"Palsson, Master. Gosta Leonard",male,2.0,3,1,349909,61,,S,10,Test_name
9,1,3,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",female,27.0,0,2,347742,51,,S,10,Test_name
10,1,2,"Nasser, Mrs. Nicholas (Adele Achem)",female,14.0,1,0,237736,70,,C,10,Test_name


In [0]:
from pyspark.sql.functions import col, substring

titanic_sdf_copied = titanic_sdf_copied.select('*', col('Sex').alias('Gender')) 
# select a.*, Sex as Gender from titanic_sdf a
titanic_sdf_copied = titanic_sdf_copied.select('*', substring('Cabin', 0, 1).alias('Cabin_First')) 
# select a.*, substring(Cabin, 0, 1) as Cabin_First from titanic_sdf a

# Add column by withColumn()
titanic_sdf_copied = titanic_sdf_copied.withColumn('Gender_01', col('Sex')).withColumn('Cabin_First_01', substring('Cabin', 0, 1))

display(titanic_sdf_copied.limit(10))

PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Extra_Fare,New_Name,Gender,Cabin_First,Gender_01,Cabin_First_01
1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,47,,S,10,Test_name,male,,male,
2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Thayer)",female,38.0,1,0,PC 17599,111,C85,C,10,Test_name,female,C,female,C
3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,47,,S,10,Test_name,female,,female,
4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,93,C123,S,10,Test_name,female,C,female,C
5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,48,,S,10,Test_name,male,,male,
6,0,3,"Moran, Mr. James",male,,0,0,330877,48,,Q,10,Test_name,male,,male,
7,0,1,"McCarthy, Mr. Timothy J",male,54.0,0,0,17463,91,E46,S,10,Test_name,male,E,male,E
8,0,3,"Palsson, Master. Gosta Leonard",male,2.0,3,1,349909,61,,S,10,Test_name,male,,male,
9,1,3,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",female,27.0,0,2,347742,51,,S,10,Test_name,female,,female,
10,1,2,"Nasser, Mrs. Nicholas (Adele Achem)",female,14.0,1,0,237736,70,,C,10,Test_name,female,,female,


In [0]:
from pyspark.sql.functions import split

titanic_sdf_copied = titanic_sdf_copied.withColumn('Name1', split(col('Name'), ',').getItem(0))
titanic_sdf_copied = titanic_sdf_copied.withColumn('Name2', split(col('Name'), ',').getItem(1))

# OR 
#titanic_sdf_copied = titanic_sdf_copied.withColumn('Name1', split(col('Name'), ',').getItem(0)).withColumn('Name2', split(col('Name'), ',').getItem(1))

display(titanic_sdf_copied.limit(10))


In [0]:
# withColumnRenamed('existing, 'new_name')
titanic_sdf_copied = titanic_sdf_copied.withColumnRenamed('Gender', 'Gender_Renamed')


display(titanic_sdf_copied.limit(10))

PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Extra_Fare,New_Name,Gender_Renamed,Cabin_First,Gender_01,Cabin_First_01
1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,47,,S,10,Test_name,male,,male,
2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Thayer)",female,38.0,1,0,PC 17599,111,C85,C,10,Test_name,female,C,female,C
3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,47,,S,10,Test_name,female,,female,
4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,93,C123,S,10,Test_name,female,C,female,C
5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,48,,S,10,Test_name,male,,male,
6,0,3,"Moran, Mr. James",male,,0,0,330877,48,,Q,10,Test_name,male,,male,
7,0,1,"McCarthy, Mr. Timothy J",male,54.0,0,0,17463,91,E46,S,10,Test_name,male,E,male,E
8,0,3,"Palsson, Master. Gosta Leonard",male,2.0,3,1,349909,61,,S,10,Test_name,male,,male,
9,1,3,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",female,27.0,0,2,347742,51,,S,10,Test_name,female,,female,
10,1,2,"Nasser, Mrs. Nicholas (Adele Achem)",female,14.0,1,0,237736,70,,C,10,Test_name,female,,female,


### Spark DataFrame의 컬럼 삭제와 로우(레코드) 삭제
* Pandas DataFrame은 drop() 메소드의 axis를 기반으로 컬럼(axis=1) 또는 로우(axis=0)를 삭제할 수 있으나
* Spark DataFrame drop() 메소드는 컬럼 삭제만 가능. 단일/여러개의 컬럼을 삭제 할 수 있음. 단 여러개의 컬럼 삭제 시 list로 입력 할 수 없으며 개별 컬럼명들이 입력되어야 함. 
* Spark DataFrame은 기본적으로는 특정 조건에 따른 로우 삭제가 어려움. 로우 삭제 대신 filter() 메소드를 이용하여 해당 조건의 데이터를 다시 만들어냄. 
* Pandas의 None 값을 Null을 의미하여 Spark에서는 null로 변환됨. 
* 값이 있는 record는 dropna() 메소드 또는 DataFrame.na.drop()을 이용하여 삭제 할 수 있음. 또는 filter() 조건에서 Not null조건으로 다시 만들어 냄.
* DataFrame.na는 DataFrameNaFunctions 객체임.

In [0]:
titanic_pdf_dropped = titanic_pdf.drop('Name', axis=1, inplace=False)
display(titanic_pdf_dropped.head())
display(titanic_pdf.head())

In [0]:
from pyspark.sql.functions import col

titanic_sdf_copied = titanic_sdf.select('*')

titanic_sdf_copied = titanic_sdf_copied.drop('Name')
titanic_sdf_copied = titanic_sdf_copied.drop(col('Sex'))

titanic_sdf_copied.limit(10).show(truncate=False)

In [0]:
from pyspark.sql.functions import col

#여러개의 컬럼을 삭제할 시 list가 아니라 단일 컬럼명들을 각각 인자로 넣어 주어야 함. 
titanic_sdf_copied.drop('Age', 'SibSp').limit(10).show()
titanic_sdf_copied.drop(col('Age'), col('SibSp')).limit(10).show(truncate=False)

+-----------+--------+------+--------------------+------+-----+----------------+----+-----+--------+----------+---------+--------------+-----------+---------+--------------+
|PassengerId|Survived|Pclass|                Name|   Sex|Parch|          Ticket|Fare|Cabin|Embarked|Extra_Fare| New_Name|Gender_Renamed|Cabin_First|Gender_01|Cabin_First_01|
+-----------+--------+------+--------------------+------+-----+----------------+----+-----+--------+----------+---------+--------------+-----------+---------+--------------+
|          1|       0|     3|Braund, Mr. Owen ...|  male|    0|       A/5 21171|  47| NULL|       S|        10|Test_name|          male|       NULL|     male|          NULL|
|          2|       1|     1|Cumings, Mrs. Joh...|female|    0|        PC 17599| 111|  C85|       C|        10|Test_name|        female|          C|   female|             C|
|          3|       1|     3|Heikkinen, Miss. ...|female|    0|STON/O2. 3101282|  47| NULL|       S|        10|Test_name|        f

In [0]:
# List cannot be interpreted as a Column object by drop   
titanic_sdf_copied.drop(['Age', 'SibSp']).limit(10).show()

[0;31m---------------------------------------------------------------------------[0m
[0;31mPySparkTypeError[0m                          Traceback (most recent call last)
File [0;32m<command-7378208024298013>, line 2[0m
[1;32m      1[0m [38;5;66;03m# 아래는 오류 발생. 여러개의 컬럼들을 삭제 시 list 입력은 안됨.  [39;00m
[0;32m----> 2[0m [43mtitanic_sdf_copied[49m[38;5;241;43m.[39;49m[43mdrop[49m[43m([49m[43m[[49m[38;5;124;43m'[39;49m[38;5;124;43mAge[39;49m[38;5;124;43m'[39;49m[43m,[49m[43m [49m[38;5;124;43m'[39;49m[38;5;124;43mSibSp[39;49m[38;5;124;43m'[39;49m[43m][49m[43m)[49m[38;5;241m.[39mlimit([38;5;241m10[39m)[38;5;241m.[39mshow()

File [0;32m/databricks/python/lib/python3.10/site-packages/pyspark/sql/connect/dataframe.py:484[0m, in [0;36mDataFrame.drop[0;34m(self, *cols)[0m
[1;32m    482[0m _cols [38;5;241m=[39m [38;5;28mlist[39m(cols)
[1;32m    483[0m [38;5;28;01mif[39;00m [38;5;28many[39m([38;5;129;01mnot[39;00m [38;5;28misinstance[3

In [0]:
drop_columns = ['Age', 'SibSp']
print(*drop_columns)

Age SibSp


In [0]:
drop_columns = ['Age', 'SibSp']
drop_columns_col = [col('Age'), col('SibSp')]

titanic_sdf_copied.drop(*drop_columns).limit(10).show()
titanic_sdf_copied.drop(*drop_columns_col).limit(10).show()

+-----------+--------+------+--------------------+------+-----+----------------+----+-----+--------+----------+---------+--------------+-----------+---------+--------------+
|PassengerId|Survived|Pclass|                Name|   Sex|Parch|          Ticket|Fare|Cabin|Embarked|Extra_Fare| New_Name|Gender_Renamed|Cabin_First|Gender_01|Cabin_First_01|
+-----------+--------+------+--------------------+------+-----+----------------+----+-----+--------+----------+---------+--------------+-----------+---------+--------------+
|          1|       0|     3|Braund, Mr. Owen ...|  male|    0|       A/5 21171|  47| NULL|       S|        10|Test_name|          male|       NULL|     male|          NULL|
|          2|       1|     1|Cumings, Mrs. Joh...|female|    0|        PC 17599| 111|  C85|       C|        10|Test_name|        female|          C|   female|             C|
|          3|       1|     3|Heikkinen, Miss. ...|female|    0|STON/O2. 3101282|  47| NULL|       S|        10|Test_name|        f

In [0]:
titanic_sdf_copied.dtypes

In [0]:
# 아래와 같이 logic으로 조건에 맞는 여러개의 컬럼들을 삭제할 수 있음. 
drop_string_columns = [ column_name for column_name, column_type in titanic_sdf_copied.dtypes if column_type == 'string']
print('drop 컬럼명:', drop_string_columns)
titanic_sdf_copied.drop(*drop_string_columns).limit(10).show()


In [0]:
display(titanic_sdf.limit(10))

In [0]:
# Spark DataFrame은 특정 조건으로 로우를 삭제하기가 어려우므로 filter()로 특정 조건에 해당하지 않는 로우를 걸러내는 방식을 적용. 
titanic_sdf_removed_Embarked_C = titanic_sdf.filter(col('Embarked') != 'C')
titanic_sdf_removed_Embarked_C.show()

In [0]:
titanic_pdf.info()

In [0]:
import pyspark.sql.functions as f
titanic_sdf.select(
    [f.count(f.when(f.col(col_name).isNull(), col_name)).alias(col_name) for col_name in titanic_sdf.columns]
).show()

+-----------+--------+------+----+---+---+-----+-----+------+----+-----+--------+
|PassengerId|Survived|Pclass|Name|Sex|Age|SibSp|Parch|Ticket|Fare|Cabin|Embarked|
+-----------+--------+------+----+---+---+-----+-----+------+----+-----+--------+
|          0|       0|     0|   0|  0|177|    0|    0|     0|   0|  687|       2|
+-----------+--------+------+----+---+---+-----+-----+------+----+-----+--------+



In [0]:
from pyspark.sql.functions import isnan

print('titanic_sdf count:', titanic_sdf.count())

# DataFrame.dropna() delete all 'rows' that include any null values
titanic_sdf_dropna_01 = titanic_sdf.dropna()
print('after dropna() count:', titanic_sdf_dropna_01.count())


titanic_sdf count: 891
after dropna() count: 183


In [0]:
# 특정 컬럼에 Null 이 있는 경우에만 삭제할 경우
titanic_sdf_dropna_03 = titanic_sdf.na.drop(subset=["Age", "Embarked"]) 
print('DataFrame.na.drop()을 Age와 Embarked 컬럼 적용 후 count:', titanic_sdf_dropna_03.count())

# select count(*) from titanic_sdf where age is not null and embarked is not null
print('Age와 Embarked가 모두 Not Null인 count:', titanic_sdf.filter(col('Age').isNotNull() & col('Embarked').isNotNull()).count())

In [0]:
# dropna() 메소드를 로직으로 구현. 
where_str = ''
column_count = len(titanic_sdf.columns)
for index, column_name in enumerate(titanic_sdf.columns):
    where_str += (column_name +' IS NOT NULL ') 
    if index < column_count - 1:
        where_str += 'and '
print(where_str)

In [0]:
# 위에서 만든 where_str을 filter() 인자로 입력하여 해당 조건을 만족하는 DataFrame 추출. 
titanic_sdf.filter(where_str).count()

### Pandas와 Spark에서의 None, Null, NaN 의 구분 - 1
* Python은 None이라는 값이 없는 내장 상수가 있음. None 객체라고도 부리면 이는 NoneType 클래스임. 
* SQL은 원론적으로 None이 아니라 Null 임. 
* numpy는 python None은 처리하기 위해 object 형으로 None을 할당할수 있고, float 형으로 NaN을 할당 할 수 있음. 
* NaN은 원래 Not a Number라는 의미임. 숫자형 array에 값이 없을 경우에는 NaN을 할당함. 
* pandas는 csv와 같은 파일에서 로드 시 특정 컬럼에 데이터가 없을 경우에 문자열 컬럼일 경우 None으로 숫자형 컬럼일 경우 NaN으로 할당. 단 NaN 으로 할당 시에는 int형 컬럼이라도 float형으로 변경됨. 
* Spark는 csv와 같은 파일에서 로드 시 모든 컬럼을 다 Null로 변환. 기본적으로 None은 Null에 할당. 이는 SQL사상과 동일. 
* 하지만 Spark는 pandas DataFrame의 NaN 처리와 어느정도 호환성을 유지하기 위해 NaN도 함께 지원.
* 과거 버전 Spark(Spark 3.0 이하)는 pandas DataFrame을 spark로 변환 시에 NaN 값을 동일하게 NaN으로 변환하였으나 현재는 null로 변환함. 하지만 NaN 값을 명확하게 지정하여 spark DataFrame을 만들 수 있음.
* 결론적으로 NaN은 고려하지 않고 Null만 고려할 수 있도록 Spark DataFrame을 만드는 것이 중요. isnan()은 사용하지 않고, isNull()만 사용할 수 있도록 유도.

In [0]:
val = None
print(type(val), val)

<class 'NoneType'> None


In [0]:
titanic_pdf.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int32  
 1   Survived     891 non-null    int32  
 2   Pclass       891 non-null    int32  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int32  
 7   Parch        891 non-null    int32  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int32(5), object(5)
memory usage: 66.3+ KB


In [0]:
# Spark DataFrame은 숫자형 컬럼인 Age와 문자열 컬럼인 Cabin 모두 Null로 변환. 
titanic_sdf.limit(10).show(truncate=False)

# pandas DataFrame은 Age 컬럼의 경우 숫자형이므로 NaN으로, Cabin 컬럼은 문자열이므로 None으로 변환됨. 
titanic_pdf.head(10)


+-----------+--------+------+---------------------------------------------------+------+----+-----+-----+----------------+-------+-----+--------+
|PassengerId|Survived|Pclass|Name                                               |Sex   |Age |SibSp|Parch|Ticket          |Fare   |Cabin|Embarked|
+-----------+--------+------+---------------------------------------------------+------+----+-----+-----+----------------+-------+-----+--------+
|1          |0       |3     |Braund, Mr. Owen Harris                            |male  |22.0|1    |0    |A/5 21171       |7.25   |null |S       |
|2          |1       |1     |Cumings, Mrs. John Bradley (Florence Briggs Thayer)|female|38.0|1    |0    |PC 17599        |71.2833|C85  |C       |
|3          |1       |3     |Heikkinen, Miss. Laina                             |female|26.0|0    |0    |STON/O2. 3101282|7.925  |null |S       |
|4          |1       |1     |Futrelle, Mrs. Jacques Heath (Lily May Peel)       |female|35.0|1    |0    |113803          |53

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S
5,6,0,3,"Moran, Mr. James",male,,0,0,330877,8.4583,,Q
6,7,0,1,"McCarthy, Mr. Timothy J",male,54.0,0,0,17463,51.8625,E46,S
7,8,0,3,"Palsson, Master. Gosta Leonard",male,2.0,3,1,349909,21.075,,S
8,9,1,3,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",female,27.0,0,2,347742,11.1333,,S
9,10,1,2,"Nasser, Mrs. Nicholas (Adele Achem)",female,14.0,1,0,237736,30.0708,,C


In [0]:
import pandas as pd
import numpy as np

# None은 object형 array에만 넣을 수 있음. 
array = np.array([0, 1, 2, None], dtype=np.object)
print(array)

# None을 숫자형(np.int는 안되고 np.float)로 입력할 경우는 NaN으로 입력됨. 
array = np.array([0, 1, 2, None], dtype=np.float)
print(array)

array = np.array([0, 1, 2, np.NaN], dtype=np.float)
print(array)

[0 1 2 None]
[ 0.  1.  2. nan]
[ 0.  1.  2. nan]


In [0]:
print('Cabin 컬럼을 numpy 변환 시 array의 type:', titanic_pdf['Cabin'].to_numpy().dtype)
print('Age 컬럼을 numpy 변환 시 array의 type:', titanic_pdf['Age'].to_numpy().dtype)

In [0]:
# spark 3.2 버전에서는 pandas DataFrame으로 Spark DataFrame을 생성해도 NaN, Null 모두 null로 변환. 
titanic_sdf_from_pandas = spark.createDataFrame(titanic_pdf)
titanic_sdf_from_pandas.show(truncate=False)

+-----------+--------+------+-------------------------------------------------------+------+----+-----+-----+----------------+-------+-----+--------+
|PassengerId|Survived|Pclass|Name                                                   |Sex   |Age |SibSp|Parch|Ticket          |Fare   |Cabin|Embarked|
+-----------+--------+------+-------------------------------------------------------+------+----+-----+-----+----------------+-------+-----+--------+
|1          |0       |3     |Braund, Mr. Owen Harris                                |male  |22.0|1    |0    |A/5 21171       |7.25   |null |S       |
|2          |1       |1     |Cumings, Mrs. John Bradley (Florence Briggs Thayer)    |female|38.0|1    |0    |PC 17599        |71.2833|C85  |C       |
|3          |1       |3     |Heikkinen, Miss. Laina                                 |female|26.0|0    |0    |STON/O2. 3101282|7.925  |null |S       |
|4          |1       |1     |Futrelle, Mrs. Jacques Heath (Lily May Peel)           |female|35.0|1  

In [0]:
# Spark 3.2 에서는 pandas DataFrame의 NaN을 spark DataFrame으로 변환 시 Null로 변환. spark 3.0 이하 버전에서는 NaN 으로 변환
pdf = pd.DataFrame({
    "x": [1, np.NaN], "y": [None, "foo"]
})
sdf = spark.createDataFrame(pdf)

sdf.show()

+----+----+
|   x|   y|
+----+----+
| 1.0|null|
|null| foo|
+----+----+



In [0]:
# 직접 NaN값을 지정하여 입력할 경우 Spark DataFrame에 NaN 입력 가능. 
sdf = spark.createDataFrame([(1.0, None), (float('nan'), 'foo')], ("x", "y"))
sdf.show()

+---+----+
|  x|   y|
+---+----+
|1.0|null|
|NaN| foo|
+---+----+



### Spark DataFrame에서 Null과 NaN 찾기
* pandas DataFrame의 isnull()과 isna()는 서로 동일한 메소드임. isnull(), isna() 모두 None과 NaN을 모두 찾음. 
* spark DataFrame isNull()은 null만 찾아줌, isnan()은 NaN만 찾음. 또한 isNull()은 컬럼 조건에 붙어서 filter()메소드와 함께 사용되며, isnan()은 pyspark.sql.functions의 함수로 정의됨.
* spark DataFrame의 dropna() 메소드는 NaN과 Null 모두를 찾아서 삭제해줌.
* Not Null 조건으로 찾을 때는 isNotNull() 적용.

In [0]:
print(titanic_pdf[['Age', 'Cabin']].head(10))
print('### isna() 적용 결과 ### ')
print(titanic_pdf[['Age', 'Cabin']].isna().head(10))

print('### isnull() 적용 결과 ### ')
print(titanic_pdf[['Age', 'Cabin']].isnull().head(10))

    Age Cabin
0  22.0  None
1  38.0   C85
2  26.0  None
3  35.0  C123
4  35.0  None
5   NaN  None
6  54.0   E46
7   2.0  None
8  27.0  None
9  14.0  None
### isna() 적용 결과 ### 
     Age  Cabin
0  False   True
1  False  False
2  False   True
3  False  False
4  False   True
5   True   True
6  False  False
7  False   True
8  False   True
9  False   True
### isnull() 적용 결과 ### 
     Age  Cabin
0  False   True
1  False  False
2  False   True
3  False  False
4  False   True
5   True   True
6  False  False
7  False   True
8  False   True
9  False   True


In [0]:
titanic_sdf.limit(10).show()

+-----------+--------+------+--------------------+------+----+-----+-----+----------------+-------+-----+--------+
|PassengerId|Survived|Pclass|                Name|   Sex| Age|SibSp|Parch|          Ticket|   Fare|Cabin|Embarked|
+-----------+--------+------+--------------------+------+----+-----+-----+----------------+-------+-----+--------+
|          1|       0|     3|Braund, Mr. Owen ...|  male|22.0|    1|    0|       A/5 21171|   7.25| null|       S|
|          2|       1|     1|Cumings, Mrs. Joh...|female|38.0|    1|    0|        PC 17599|71.2833|  C85|       C|
|          3|       1|     3|Heikkinen, Miss. ...|female|26.0|    0|    0|STON/O2. 3101282|  7.925| null|       S|
|          4|       1|     1|Futrelle, Mrs. Ja...|female|35.0|    1|    0|          113803|   53.1| C123|       S|
|          5|       0|     3|Allen, Mr. Willia...|  male|35.0|    0|    0|          373450|   8.05| null|       S|
|          6|       0|     3|    Moran, Mr. James|  male|null|    0|    0|      

In [0]:
from pyspark.sql.functions import col, isnan

#isNull()은 컬럼 조건에 붙어서 filter()메소드와 함께 사용. isnan()은 pyspark.sql.functions의 함수로 사용. 
print('##### isNull() 적용 결과 #####')
titanic_sdf.filter(col('Age').isNull()).show(10) # select * from titanic_sdf where age is Null
#titanic_sdf.filter('Age is Null').show(10)
print('##### isnan() 함수 적용 결과 #####')
titanic_sdf.where(isnan(col('Age'))).show()

##### isNull() 적용 결과 #####
+-----------+--------+------+--------------------+------+----+-----+-----+--------+--------+-----+--------+
|PassengerId|Survived|Pclass|                Name|   Sex| Age|SibSp|Parch|  Ticket|    Fare|Cabin|Embarked|
+-----------+--------+------+--------------------+------+----+-----+-----+--------+--------+-----+--------+
|          6|       0|     3|    Moran, Mr. James|  male|null|    0|    0|  330877|  8.4583| null|       Q|
|         18|       1|     2|Williams, Mr. Cha...|  male|null|    0|    0|  244373|    13.0| null|       S|
|         20|       1|     3|Masselmani, Mrs. ...|female|null|    0|    0|    2649|   7.225| null|       C|
|         27|       0|     3|Emir, Mr. Farred ...|  male|null|    0|    0|    2631|   7.225| null|       C|
|         29|       1|     3|"O'Dwyer, Miss. E...|female|null|    0|    0|  330959|  7.8792| null|       Q|
|         30|       0|     3| Todoroff, Mr. Lalio|  male|null|    0|    0|  349216|  7.8958| null|       S|
|

In [0]:
# spark DataFrame의 dropna()와 DataFrame.na.drop()은 Null 또는 NaN 모두를 찾아서 삭제해줌. 
sdf = spark.createDataFrame([(1.0, None), (float('nan'), 'foo')], ("x", "y"))
print(sdf.show())
sdf.dropna().show()
sdf.na.drop().show()

+---+----+
|  x|   y|
+---+----+
|1.0|null|
|NaN| foo|
+---+----+

None
+---+---+
|  x|  y|
+---+---+
+---+---+

+---+---+
|  x|  y|
+---+---+
+---+---+



In [0]:
titanic_pdf.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int32  
 1   Survived     891 non-null    int32  
 2   Pclass       891 non-null    int32  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int32  
 7   Parch        891 non-null    int32  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int32(5), object(5)
memory usage: 66.3+ KB


#### Null이 있는 컬럼명과 Null 건수를 찾기

In [0]:
from pyspark.sql.functions import isnan

print([col(column_name) for column_name in titanic_sdf.columns])

display(titanic_sdf.select([col(column_name).isNull() for column_name in titanic_sdf.columns]))
display(titanic_sdf.select([col(column_name).isNull().alias(column_name) for column_name in titanic_sdf.columns]))

[Column<'PassengerId'>, Column<'Survived'>, Column<'Pclass'>, Column<'Name'>, Column<'Sex'>, Column<'Age'>, Column<'SibSp'>, Column<'Parch'>, Column<'Ticket'>, Column<'Fare'>, Column<'Cabin'>, Column<'Embarked'>]


(PassengerId IS NULL),(Survived IS NULL),(Pclass IS NULL),(Name IS NULL),(Sex IS NULL),(Age IS NULL),(SibSp IS NULL),(Parch IS NULL),(Ticket IS NULL),(Fare IS NULL),(Cabin IS NULL),(Embarked IS NULL)
False,False,False,False,False,False,False,False,False,False,True,False
False,False,False,False,False,False,False,False,False,False,False,False
False,False,False,False,False,False,False,False,False,False,True,False
False,False,False,False,False,False,False,False,False,False,False,False
False,False,False,False,False,False,False,False,False,False,True,False
False,False,False,False,False,True,False,False,False,False,True,False
False,False,False,False,False,False,False,False,False,False,False,False
False,False,False,False,False,False,False,False,False,False,True,False
False,False,False,False,False,False,False,False,False,False,True,False
False,False,False,False,False,False,False,False,False,False,True,False


PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
False,False,False,False,False,False,False,False,False,False,True,False
False,False,False,False,False,False,False,False,False,False,False,False
False,False,False,False,False,False,False,False,False,False,True,False
False,False,False,False,False,False,False,False,False,False,False,False
False,False,False,False,False,False,False,False,False,False,True,False
False,False,False,False,False,True,False,False,False,False,True,False
False,False,False,False,False,False,False,False,False,False,False,False
False,False,False,False,False,False,False,False,False,False,True,False
False,False,False,False,False,False,False,False,False,False,True,False
False,False,False,False,False,False,False,False,False,False,True,False


In [0]:
from pyspark.sql.functions import count, when

# select count(case when passengerid is null then passengerId), count(case when survived is null then survived),,,,, from titanic_sdf
display(titanic_sdf.select([count(when (col(column_name).isNull(), column_name)) for column_name in titanic_sdf.columns]))

count(CASE WHEN (PassengerId IS NULL) THEN PassengerId END),count(CASE WHEN (Survived IS NULL) THEN Survived END),count(CASE WHEN (Pclass IS NULL) THEN Pclass END),count(CASE WHEN (Name IS NULL) THEN Name END),count(CASE WHEN (Sex IS NULL) THEN Sex END),count(CASE WHEN (Age IS NULL) THEN Age END),count(CASE WHEN (SibSp IS NULL) THEN SibSp END),count(CASE WHEN (Parch IS NULL) THEN Parch END),count(CASE WHEN (Ticket IS NULL) THEN Ticket END),count(CASE WHEN (Fare IS NULL) THEN Fare END),count(CASE WHEN (Cabin IS NULL) THEN Cabin END),count(CASE WHEN (Embarked IS NULL) THEN Embarked END)
0,0,0,0,0,177,0,0,0,0,687,2


In [0]:
from pyspark.sql.functions import isnan

display(titanic_sdf.select([count(when (col(c).isNull(), c)).alias(c) for c in titanic_sdf.columns]))

# 일반적으로 isnan()은 거의 존재 하지 않음. 아예 isnan()이 존재하는지 먼저 확인하는게 필요. 
display(titanic_sdf.select([count(when(isnan(c) | col(c).isNull(), c)).alias(c) for c in titanic_sdf.columns]))

PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,0,0,0,0,177,0,0,0,0,687,2


PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,0,0,0,0,177,0,0,0,0,687,2


In [0]:
display(titanic_sdf.select([count(when(isnan(c), c)).alias(c) for c in titanic_sdf.columns]))

PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,0,0,0,0,0,0,0,0,0,0,0


In [0]:
titanic_sdf.filter(col('Age').isNotNull()).show(10) # select * from titanic_sdf where age is not null

+-----------+--------+------+--------------------+------+----+-----+-----+----------------+-------+-----+--------+
|PassengerId|Survived|Pclass|                Name|   Sex| Age|SibSp|Parch|          Ticket|   Fare|Cabin|Embarked|
+-----------+--------+------+--------------------+------+----+-----+-----+----------------+-------+-----+--------+
|          1|       0|     3|Braund, Mr. Owen ...|  male|22.0|    1|    0|       A/5 21171|   7.25| null|       S|
|          2|       1|     1|Cumings, Mrs. Joh...|female|38.0|    1|    0|        PC 17599|71.2833|  C85|       C|
|          3|       1|     3|Heikkinen, Miss. ...|female|26.0|    0|    0|STON/O2. 3101282|  7.925| null|       S|
|          4|       1|     1|Futrelle, Mrs. Ja...|female|35.0|    1|    0|          113803|   53.1| C123|       S|
|          5|       0|     3|Allen, Mr. Willia...|  male|35.0|    0|    0|          373450|   8.05| null|       S|
|          7|       0|     1|McCarthy, Mr. Tim...|  male|54.0|    0|    0|      

In [0]:
# Spark DataFrame의 NaN을 Null로 변환하기. 
sdf = spark.createDataFrame([(1.0, None), (float('nan'), 'foo')], ("x", "y"))
print(sdf.show())
sdf.replace(float('nan'), None).show()
# fillna(None) 은 오류 발생. 
sdf.fillna(value=None)

+---+----+
|  x|   y|
+---+----+
|1.0|NULL|
|NaN| foo|
+---+----+

None
+----+----+
|   x|   y|
+----+----+
| 1.0|NULL|
|NULL| foo|
+----+----+



[0;31m---------------------------------------------------------------------------[0m
[0;31mPySparkTypeError[0m                          Traceback (most recent call last)
File [0;32m<command-7378208024298047>, line 6[0m
[1;32m      4[0m sdf[38;5;241m.[39mreplace([38;5;28mfloat[39m([38;5;124m'[39m[38;5;124mnan[39m[38;5;124m'[39m), [38;5;28;01mNone[39;00m)[38;5;241m.[39mshow()
[1;32m      5[0m [38;5;66;03m# fillna(None) 은 오류 발생. [39;00m
[0;32m----> 6[0m sdf[38;5;241m.[39mfillna(value[38;5;241m=[39m[38;5;28;01mNone[39;00m)

File [0;32m/databricks/python/lib/python3.10/site-packages/pyspark/sql/connect/dataframe.py:1251[0m, in [0;36mDataFrame.fillna[0;34m(self, value, subset)[0m
[1;32m   1245[0m [38;5;28;01mdef[39;00m [38;5;21mfillna[39m(
[1;32m   1246[0m     [38;5;28mself[39m,
[1;32m   1247[0m     value: Union[[38;5;124m"[39m[38;5;124mLiteralType[39m[38;5;124m"[39m, Dict[[38;5;28mstr[39m, [38;5;124m"[39m[38;5;124mLiteralType[39

### 결손(Null) 데이터 처리하기
* DataFrame의 fillna() 메소드, 또는 DataFrameNaFunctions 객체인 DataFrame.na의 fill() 메소드를 이용
* DataFrame.fillna(value=값, subset=['컬럼1', 컬럼2])로 형태로 사용. value는 결측값에 입력될 값, subset은 대상 컬럼. subset을 지정하지 않으면 전체 컬럼에 적용. 
* subset을 지정하지 않고 value에 숫자값을 입력하면 숫자형 컬럼만 결손값을 처리함. 비슷하게 value에 문자값을 입력하면 문자형 컬럼만 결손값을 처리함.
* value는 반드시 단일 값이 들어가야함. 단일 값을 가지는 DataFrame은 안됨.

In [0]:
titanic_pdf['Age'] = titanic_pdf['Age'].fillna(titanic_pdf['Age'].mean(), inplace=False)

titanic_pdf.head(10)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Age_filled
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,22.0
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,38.0
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,26.0
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S,35.0
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S,35.0
5,6,0,3,"Moran, Mr. James",male,29.699118,0,0,330877,8.4583,,Q,999.0
6,7,0,1,"McCarthy, Mr. Timothy J",male,54.0,0,0,17463,51.8625,E46,S,54.0
7,8,0,3,"Palsson, Master. Gosta Leonard",male,2.0,3,1,349909,21.075,,S,2.0
8,9,1,3,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",female,27.0,0,2,347742,11.1333,,S,27.0
9,10,1,2,"Nasser, Mrs. Nicholas (Adele Achem)",female,14.0,1,0,237736,30.0708,,C,14.0


In [0]:
type(titanic_sdf.na)

Out[84]: pyspark.sql.dataframe.DataFrameNaFunctions

In [0]:
print('subset을 지정하지 않고 숫자형 컬럼에 결측치 처리')
titanic_sdf.fillna(value=999).show(10)
titanic_sdf.na.fill(value=999).show(10)

print('subset을 지정하지 않고 문자형 컬럼에 결측치 처리')
titanic_sdf.fillna(value='NA').show(10)

subset을 지정하지 않고 숫자형 컬럼에 결측치 처리
+-----------+--------+------+--------------------+------+-----+-----+-----+----------------+-------+-----+--------+
|PassengerId|Survived|Pclass|                Name|   Sex|  Age|SibSp|Parch|          Ticket|   Fare|Cabin|Embarked|
+-----------+--------+------+--------------------+------+-----+-----+-----+----------------+-------+-----+--------+
|          1|       0|     3|Braund, Mr. Owen ...|  male| 22.0|    1|    0|       A/5 21171|   7.25| null|       S|
|          2|       1|     1|Cumings, Mrs. Joh...|female| 38.0|    1|    0|        PC 17599|71.2833|  C85|       C|
|          3|       1|     3|Heikkinen, Miss. ...|female| 26.0|    0|    0|STON/O2. 3101282|  7.925| null|       S|
|          4|       1|     1|Futrelle, Mrs. Ja...|female| 35.0|    1|    0|          113803|   53.1| C123|       S|
|          5|       0|     3|Allen, Mr. Willia...|  male| 35.0|    0|    0|          373450|   8.05| null|       S|
|          6|       0|     3|    Moran, M

In [0]:
print('Age 컬럼 결측치 처리')
titanic_sdf.fillna(value=999, subset=['Age']).show(10) # titanic_pdf['Age'].fillna(999, inplace=False)

print('Cabin 컬럼 결측치 처리')
titanic_sdf.fillna(value='NA', subset=['Cabin']).show(10) # titanic_pdf['Cabin'].fillna('NA', inplace=False)


Age 컬럼 결측치 처리
+-----------+--------+------+--------------------+------+-----+-----+-----+----------------+-------+-----+--------+
|PassengerId|Survived|Pclass|                Name|   Sex|  Age|SibSp|Parch|          Ticket|   Fare|Cabin|Embarked|
+-----------+--------+------+--------------------+------+-----+-----+-----+----------------+-------+-----+--------+
|          1|       0|     3|Braund, Mr. Owen ...|  male| 22.0|    1|    0|       A/5 21171|   7.25| null|       S|
|          2|       1|     1|Cumings, Mrs. Joh...|female| 38.0|    1|    0|        PC 17599|71.2833|  C85|       C|
|          3|       1|     3|Heikkinen, Miss. ...|female| 26.0|    0|    0|STON/O2. 3101282|  7.925| null|       S|
|          4|       1|     1|Futrelle, Mrs. Ja...|female| 35.0|    1|    0|          113803|   53.1| C123|       S|
|          5|       0|     3|Allen, Mr. Willia...|  male| 35.0|    0|    0|          373450|   8.05| null|       S|
|          6|       0|     3|    Moran, Mr. James|  male|9

In [0]:
import pyspark.sql.functions as F
from pyspark.sql.functions import avg, col

avg_age = titanic_sdf.select(F.avg(F.col('Age'))) # select avg(age) from titanic_sdf
print(avg_age.show())
print('### avg_age type:', type(avg_age))


+-----------------+
|         avg(Age)|
+-----------------+
|29.69911764705882|
+-----------------+

None
### avg_age type: <class 'pyspark.sql.connect.dataframe.DataFrame'>


In [0]:
# 아래는 오류를 발생시킴. value 인자로 단일 값이 입력되어야 함. DataFrame은 입력 될 수 없음. 
titanic_sdf.fillna(value=avg_age, subset=['Age'])

[0;31m---------------------------------------------------------------------------[0m
[0;31mTypeError[0m                                 Traceback (most recent call last)
[0;32m<command-1656680289675160>[0m in [0;36m<module>[0;34m[0m
[1;32m      1[0m [0;31m# 아래는 오류를 발생시킴. value 인자로 단일 값이 입력되어야 함. DataFrame은 입력 될 수 없음.[0m[0;34m[0m[0;34m[0m[0;34m[0m[0m
[0;32m----> 2[0;31m [0mtitanic_sdf[0m[0;34m.[0m[0mfillna[0m[0;34m([0m[0mvalue[0m[0;34m=[0m[0mavg_age[0m[0;34m,[0m [0msubset[0m[0;34m=[0m[0;34m[[0m[0;34m'Age'[0m[0;34m][0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m
[0;32m/databricks/spark/python/pyspark/sql/dataframe.py[0m in [0;36mfillna[0;34m(self, value, subset)[0m
[1;32m   2109[0m         """
[1;32m   2110[0m         [0;32mif[0m [0;32mnot[0m [0misinstance[0m[0;34m([0m[0mvalue[0m[0;34m,[0m [0;34m([0m[0mfloat[0m[0;34m,[0m [0mint[0m[0;34m,[0m [0mstr[0m[0;34m,[0m [0mbool[0m[0;34m,[0m [0mdict[0m[0;34m)[

In [0]:
# first()는 head()와 동일하게 동작. 하지만 first(N)은 존재하지 않으며 first()는 맨 처음 Row만 가져옴. 
avg_age_row = avg_age.first()
print(avg_age_row, type(avg_age_row))

# 아래는 DataFrame의 단일 Row에서 맨 첫번째 개별 value를 가져옴. 
avg_age_value = avg_age.first()[0]
print(avg_age_value, type(avg_age_value))

Row(avg(Age)=29.69911764705882) <class 'pyspark.sql.types.Row'>
29.69911764705882 <class 'float'>


In [0]:
titanic_sdf.fillna(value=avg_age_value, subset=['Age']).show()

+-----------+--------+------+--------------------+------+-----------------+-----+-----+----------------+-------+-----+--------+
|PassengerId|Survived|Pclass|                Name|   Sex|              Age|SibSp|Parch|          Ticket|   Fare|Cabin|Embarked|
+-----------+--------+------+--------------------+------+-----------------+-----+-----+----------------+-------+-----+--------+
|          1|       0|     3|Braund, Mr. Owen ...|  male|             22.0|    1|    0|       A/5 21171|   7.25| null|       S|
|          2|       1|     1|Cumings, Mrs. Joh...|female|             38.0|    1|    0|        PC 17599|71.2833|  C85|       C|
|          3|       1|     3|Heikkinen, Miss. ...|female|             26.0|    0|    0|STON/O2. 3101282|  7.925| null|       S|
|          4|       1|     1|Futrelle, Mrs. Ja...|female|             35.0|    1|    0|          113803|   53.1| C123|       S|
|          5|       0|     3|Allen, Mr. Willia...|  male|             35.0|    0|    0|          373450|

In [0]:
# DataFrame.head(N)은 DataFrame의 맨 위에서 부터 N개 까지의 Row를 List로 가져옴. 
# 만약 head()로 호출 시, 즉 N이 없으면 1개 Row를 가져오되 List가 아닌 단일 Row로 가져옴. 만약 head(1)로 하면 1개의 Row이어도 list로 가져옴.
# 단일 Row는 Tuple과 유사.
avg_age_row = avg_age.head()
print(avg_age_row, type(avg_age_row))

# 아래는 DataFrame의 단일 Row에서 맨 첫번째 개별 value를 가져옴. 
avg_age_value = avg_age.head()[0]
print(avg_age_value, type(avg_age_value))

Row(avg(Age)=29.69911764705882) <class 'pyspark.sql.types.Row'>
29.69911764705882 <class 'float'>


In [0]:
titanic_sdf.fillna(value=avg_age_value, subset=['Age']).show()

+-----------+--------+------+--------------------+------+-----------------+-----+-----+----------------+-------+-----+--------+
|PassengerId|Survived|Pclass|                Name|   Sex|              Age|SibSp|Parch|          Ticket|   Fare|Cabin|Embarked|
+-----------+--------+------+--------------------+------+-----------------+-----+-----+----------------+-------+-----+--------+
|          1|       0|     3|Braund, Mr. Owen ...|  male|             22.0|    1|    0|       A/5 21171|   7.25| null|       S|
|          2|       1|     1|Cumings, Mrs. Joh...|female|             38.0|    1|    0|        PC 17599|71.2833|  C85|       C|
|          3|       1|     3|Heikkinen, Miss. ...|female|             26.0|    0|    0|STON/O2. 3101282|  7.925| null|       S|
|          4|       1|     1|Futrelle, Mrs. Ja...|female|             35.0|    1|    0|          113803|   53.1| C123|       S|
|          5|       0|     3|Allen, Mr. Willia...|  male|             35.0|    0|    0|          373450|

In [0]:
print("head()에 N이 없으므로 1개의 단일 Row를 반환:\n", titanic_sdf.head())
print("head(1)로 1개의 Row를 가져오지만 List로 반환:\n", titanic_sdf.head(1))

print("단일 Row의 첫번째 요소를 가져옴:\n", titanic_sdf.head()[0])
print("List에서 첫번째 Row를 가져옴:\n", titanic_sdf.head(1)[0])
print("List의 첫번째 Row에서 첫번째 요소를 가져옴:\n", titanic_sdf.head(1)[0][0])


head()에 N이 없으므로 1개의 단일 Row를 반환:
 Row(PassengerId=1, Survived=0, Pclass=3, Name='Braund, Mr. Owen Harris', Sex='male', Age=22.0, SibSp=1, Parch=0, Ticket='A/5 21171', Fare=7.25, Cabin=None, Embarked='S')
head(1)로 1개의 Row를 가져오지만 List로 반환:
 [Row(PassengerId=1, Survived=0, Pclass=3, Name='Braund, Mr. Owen Harris', Sex='male', Age=22.0, SibSp=1, Parch=0, Ticket='A/5 21171', Fare=7.25, Cabin=None, Embarked='S')]
단일 Row의 첫번째 요소를 가져옴:
 1
List에서 첫번째 Row를 가져옴:
 Row(PassengerId=1, Survived=0, Pclass=3, Name='Braund, Mr. Owen Harris', Sex='male', Age=22.0, SibSp=1, Parch=0, Ticket='A/5 21171', Fare=7.25, Cabin=None, Embarked='S')
List의 첫번째 Row에서 첫번째 요소를 가져옴:
 1


### 사용자 정의 함수(User Defined Function)을 DataFrame 가공 시 적용하는 법과 when 사용법
* UDF를 Spark DataFrame에 적용하려면 먼저 일반 함수를 만든 후에 이를 spark의 udf() 함수를 이용하여 DataFrame에서 사용할 수 있도록 변환해야 함. 
* pyspark.sql.functions의 when()은 SQL의 Case When Then... Else 구문과 동일하게 동작.

In [0]:
import pyspark.sql.functions as F

avg_age = titanic_sdf.select(F.avg(F.col('Age')))
avg_age_row = avg_age.head()
avg_age_value = avg_age.head()[0]

# Spark DataFrame의 fillna()에 인자로 Dict를 입력하여 여러개의 컬럼들에 대해서 결측치 값을 입력할 수 있게 만들어줌. 
titanic_sdf_filled = titanic_sdf.fillna({'Age': avg_age_value, 
                                         'Cabin': 'C000',
                                         'Embarked': 'S'
})

In [0]:
titanic_sdf_filled.show()

+-----------+--------+------+--------------------+------+-----------------+-----+-----+----------------+-------+-----+--------+
|PassengerId|Survived|Pclass|                Name|   Sex|              Age|SibSp|Parch|          Ticket|   Fare|Cabin|Embarked|
+-----------+--------+------+--------------------+------+-----------------+-----+-----+----------------+-------+-----+--------+
|          1|       0|     3|Braund, Mr. Owen ...|  male|             22.0|    1|    0|       A/5 21171|   7.25| C000|       S|
|          2|       1|     1|Cumings, Mrs. Joh...|female|             38.0|    1|    0|        PC 17599|71.2833|  C85|       C|
|          3|       1|     3|Heikkinen, Miss. ...|female|             26.0|    0|    0|STON/O2. 3101282|  7.925| C000|       S|
|          4|       1|     1|Futrelle, Mrs. Ja...|female|             35.0|    1|    0|          113803|   53.1| C123|       S|
|          5|       0|     3|Allen, Mr. Willia...|  male|             35.0|    0|    0|          373450|

In [0]:
from pyspark.sql.functions import StringType

@udf(StringType())
def get_category(age):
    cat = ''
    
    if age <= 5: cat = 'Baby'
    elif age <= 12: cat = 'Child'
    elif age <= 18: cat = 'Teenager'
    elif age <= 25: cat = 'Student'
    elif age <= 35: cat = 'Young Adult'
    elif age <= 60: cat = 'Adult'
    else : cat = 'Elderly'
    
    return cat

get_category = spark.udf.register("get_category", get_category)

titanic_sdf_filled_01 = titanic_sdf_filled.withColumn("Age_Category", get_category(col("Age")))
display(titanic_sdf_filled_01)

PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Age_Category
1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,C000,S,Student
2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Thayer)",female,38.0,1,0,PC 17599,71.2833,C85,C,Adult
3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,C000,S,Young Adult
4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S,Young Adult
5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,C000,S,Young Adult
6,0,3,"Moran, Mr. James",male,29.69911764705882,0,0,330877,8.4583,C000,Q,Young Adult
7,0,1,"McCarthy, Mr. Timothy J",male,54.0,0,0,17463,51.8625,E46,S,Adult
8,0,3,"Palsson, Master. Gosta Leonard",male,2.0,3,1,349909,21.075,C000,S,Baby
9,1,3,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",female,27.0,0,2,347742,11.1333,C000,S,Young Adult
10,1,2,"Nasser, Mrs. Nicholas (Adele Achem)",female,14.0,1,0,237736,30.0708,C000,C,Teenager


In [0]:
# pandas DataFrame에서 apply lambda 식으로 데이터 가공하기. age 값이 None/NaN 일 경우에도 else 조건에 의해 Elderly로 변환. 
titanic_pdf['Age_category'] = titanic_pdf['Age'].apply(lambda x: get_category(x))
titanic_pdf.head(10)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Age_category
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,Student
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,Adult
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,Young Adult
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S,Young Adult
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S,Young Adult
5,6,0,3,"Moran, Mr. James",male,,0,0,330877,8.4583,,Q,Elderly
6,7,0,1,"McCarthy, Mr. Timothy J",male,54.0,0,0,17463,51.8625,E46,S,Adult
7,8,0,3,"Palsson, Master. Gosta Leonard",male,2.0,3,1,349909,21.075,,S,Baby
8,9,1,3,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",female,27.0,0,2,347742,11.1333,,S,Young Adult
9,10,1,2,"Nasser, Mrs. Nicholas (Adele Achem)",female,14.0,1,0,237736,30.0708,,C,Teenager


In [0]:
from pyspark.sql.functions import udf,col
from pyspark.sql.types import StringType

# 일반 python용 UDF를 pyspark용 UDF로 변환. udf(lambda 입력변수: 일반 UDF, 해당 일반 UDF의 반환형)
udf_get_category = udf(lambda x:get_category(x), StringType() )

In [0]:
# udf_get_category()에 Age 컬럼값을 입력하여 반환되는 값으로 새로운 컬럼 Age_Category를 생성
titanic_sdf_filled_01 = titanic_sdf_filled.withColumn("Age_Category",udf_get_category(col("Age")))
titanic_sdf_filled_01.show()

+-----------+--------+------+--------------------+------+----+-----+-----+----------------+-------+-----+--------+------------+
|PassengerId|Survived|Pclass|                Name|   Sex| Age|SibSp|Parch|          Ticket|   Fare|Cabin|Embarked|Age_Category|
+-----------+--------+------+--------------------+------+----+-----+-----+----------------+-------+-----+--------+------------+
|          1|       0|     3|Braund, Mr. Owen ...|  male|22.0|    1|    0|       A/5 21171|   7.25| null|       S|     Student|
|          2|       1|     1|Cumings, Mrs. Joh...|female|38.0|    1|    0|        PC 17599|71.2833|  C85|       C|       Adult|
|          3|       1|     3|Heikkinen, Miss. ...|female|26.0|    0|    0|STON/O2. 3101282|  7.925| null|       S| Young Adult|
|          4|       1|     1|Futrelle, Mrs. Ja...|female|35.0|    1|    0|          113803|   53.1| C123|       S| Young Adult|
|          5|       0|     3|Allen, Mr. Willia...|  male|35.0|    0|    0|          373450|   8.05| null

In [0]:
from pyspark.sql.functions import when
                   
titanic_sdf_filled_02 = titanic_sdf_filled.withColumn('Age_category', when(F.col('Age') <= 5, 'Baby')
                                                                      .when(F.col('Age') <= 12, 'Child')
                                                                      .when(F.col('Age') <= 18, 'Teenage')
                                                                      .when(F.col('Age') <= 25, 'Student')
                                                                      .when(F.col('Age') <= 35, 'Young Adult')
                                                                      .when(F.col('Age') <= 60, 'Adult')
                                                                      .when(F.col('Age').isNull(), 'NA')
                                                                      .otherwise('Elderly'))

titanic_sdf_filled_02.limit(10).show()

''' Select a.*, CASE WHEN age <=6 THEN 'Baby'
                 WHEN age <=12 Then 'Child'
                 WHEN age <= 18 THEN 'Teenage'
                 WHEN age <= 25 THEN 'Student'
                 WHEN age <=35 THEN 'Young Adult'
                 WHEN age <=60 THEN 'Adult'
                 WHEN age is Null THEN 'NA'
                 ELSE 'Elderly' END from titanic_sdf a;
'''

+-----------+--------+------+--------------------+------+----+-----+-----+----------------+-------+-----+--------+------------+
|PassengerId|Survived|Pclass|                Name|   Sex| Age|SibSp|Parch|          Ticket|   Fare|Cabin|Embarked|Age_category|
+-----------+--------+------+--------------------+------+----+-----+-----+----------------+-------+-----+--------+------------+
|          1|       0|     3|Braund, Mr. Owen ...|  male|22.0|    1|    0|       A/5 21171|   7.25| null|       S|     Student|
|          2|       1|     1|Cumings, Mrs. Joh...|female|38.0|    1|    0|        PC 17599|71.2833|  C85|       C|       Adult|
|          3|       1|     3|Heikkinen, Miss. ...|female|26.0|    0|    0|STON/O2. 3101282|  7.925| null|       S| Young Adult|
|          4|       1|     1|Futrelle, Mrs. Ja...|female|35.0|    1|    0|          113803|   53.1| C123|       S| Young Adult|
|          5|       0|     3|Allen, Mr. Willia...|  male|35.0|    0|    0|          373450|   8.05| null

In [0]:
from pyspark.sql.functions import expr, col

titanic_sdf_filled_03 = titanic_sdf.withColumn('Age_category', expr("CASE WHEN age = 12 THEN 'Child' " + 
                                               " WHEN Age <= 18 THEN 'Teenage' " +
                                               " WHEN Age <= 25 THEN 'Student' " +
                                               " WHEN Age <= 35 THEN 'Young Adult' " + 
                                               " WHEN Age <= 60 THEN 'Adult' " + 
                                               " WHEN Age IS NULL THEN 'NA' " +
                                               " ELSE 'Elderly' END "))
titanic_sdf_filled_03.limit(10).show()

+-----------+--------+------+--------------------+------+----+-----+-----+----------------+-------+-----+--------+------------+
|PassengerId|Survived|Pclass|                Name|   Sex| Age|SibSp|Parch|          Ticket|   Fare|Cabin|Embarked|Age_category|
+-----------+--------+------+--------------------+------+----+-----+-----+----------------+-------+-----+--------+------------+
|          1|       0|     3|Braund, Mr. Owen ...|  male|22.0|    1|    0|       A/5 21171|   7.25| NULL|       S|     Student|
|          2|       1|     1|Cumings, Mrs. Joh...|female|38.0|    1|    0|        PC 17599|71.2833|  C85|       C|       Adult|
|          3|       1|     3|Heikkinen, Miss. ...|female|26.0|    0|    0|STON/O2. 3101282|  7.925| NULL|       S| Young Adult|
|          4|       1|     1|Futrelle, Mrs. Ja...|female|35.0|    1|    0|          113803|   53.1| C123|       S| Young Adult|
|          5|       0|     3|Allen, Mr. Willia...|  male|35.0|    0|    0|          373450|   8.05| NULL