In [3]:
# [+] SparkSession 설정
from pyspark.sql import SparkSession

ss = SparkSession.builder.master('local').appName('udf').getOrCreate()

In [4]:
# 샘플 데이터: 한신대 맛집 및 대표메뉴
hsu_restaurants = [
    ('진현가든', '삼치돌솥밥', '경기 오산시 양산로 340딩', 3.5),
    ('짜장면가', '중화비빔밥, 마파두부밥', '경기 오산시 한신대길 123 경원빌딩', 4.0),
    ('찌개동아리', '제육전골', '경기 오산시 한신대133번길 4', 3.5),
    ('한판삼겹', '항정살', '경기 오산시 양산로410번길 8', 3.5),
    ('화락', '초밥', '경기 오산시 양산로 347 대성빌딩1층', 3.5),
    ('해우리', '고기 덮밥, 해물 라면', '경기 오산시 한신대길 135 1층', 4.5),
    ('행복한콩박사', '맑은순두부', '경기 오산시 양산로398번길 8-11', 3.5)
]


In [5]:
# [+] 스키마 정의
schema=['restaurant_name','speciality','adress','score']

In [6]:
# [+] 데이터프레임 생성
df = ss.createDataFrame(data=hsu_restaurants, schema=schema)

In [7]:
# [+] 데이터프레임 출력
df.show()

+---------------+----------------------+---------------------------------+-----+
|restaurant_name|            speciality|                           adress|score|
+---------------+----------------------+---------------------------------+-----+
|       진현가든|            삼치돌솥밥|         경기 오산시 양산로 340딩|  3.5|
|       짜장면가|중화비빔밥, 마파두부밥|경기 오산시 한신대길 123 경원빌딩|  4.0|
|     찌개동아리|              제육전골|      경기 오산시 한신대133번길 4|  3.5|
|       한판삼겹|                항정살|      경기 오산시 양산로410번길 8|  3.5|
|           화락|                  초밥|   경기 오산시 양산로 347 대성...|  3.5|
|         해우리|  고기 덮밥, 해물 라면|     경기 오산시 한신대길 135 1층|  4.5|
|   행복한콩박사|            맑은순두부|   경기 오산시 양산로398번길 8-11|  3.5|
+---------------+----------------------+---------------------------------+-----+



In [8]:
# [+] 데이터프레임 스키마 출력
df.printSchema()

root
 |-- restaurant_name: string (nullable = true)
 |-- speciality: string (nullable = true)
 |-- adress: string (nullable = true)
 |-- score: double (nullable = true)



In [9]:
# [+] Temporary View 생성
df.createOrReplaceTempView('restaurants')

### User Defined Function 생성
1. translate(): Google Translation API를 이용하여 한글 식당 이름을 영문으로 번역하는 함수
2. scale_score(): 5점 만점 평점을 100점 스케일로 변환하는 함수

In [10]:
# Google translation 라이브러리 설치
!pip install googletrans==4.0.0-rc1

Collecting googletrans==4.0.0-rc1
  Downloading googletrans-4.0.0rc1.tar.gz (20 kB)
Collecting httpx==0.13.3
  Downloading httpx-0.13.3-py3-none-any.whl (55 kB)
Collecting hstspreload
  Downloading hstspreload-2021.12.1-py3-none-any.whl (1.3 MB)
Collecting httpcore==0.9.*
  Downloading httpcore-0.9.1-py3-none-any.whl (42 kB)
Collecting chardet==3.*
  Downloading chardet-3.0.4-py2.py3-none-any.whl (133 kB)
Collecting rfc3986<2,>=1.3
  Downloading rfc3986-1.5.0-py2.py3-none-any.whl (31 kB)
Collecting h11<0.10,>=0.8
  Downloading h11-0.9.0-py2.py3-none-any.whl (53 kB)
Collecting h2==3.*
  Downloading h2-3.2.0-py2.py3-none-any.whl (65 kB)
Collecting hyperframe<6,>=5.2.0
  Downloading hyperframe-5.2.0-py2.py3-none-any.whl (12 kB)
Collecting hpack<4,>=3.0
  Downloading hpack-3.0.0-py2.py3-none-any.whl (38 kB)
Building wheels for collected packages: googletrans
  Building wheel for googletrans (setup.py): started
  Building wheel for googletrans (setup.py): finished with status 'done'
  Creat

ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
spyder 4.2.5 requires pyqt5<5.13, which is not installed.
spyder 4.2.5 requires pyqtwebengine<5.13, which is not installed.
conda-repo-cli 1.0.4 requires pathlib, which is not installed.
anaconda-project 0.9.1 requires ruamel-yaml, which is not installed.


In [11]:
# [+] Google translator 임포트
from googletrans import Translator

In [12]:
# [+] 번역 테스트
translator = Translator()
result = translator.translate('방학이 빨리 왔으면 좋겠습니다.',src='ko',dest='en')

In [13]:
# [+] 번역 결과 출력
print(result)

Translated(src=ko, dest=en, text=I hope the vacation comes quickly., pronunciation=None, extra_data="{'confiden...")


In [15]:
result.text

'I hope the vacation comes quickly.'

In [16]:
# [+] extra_data 출력
result.extra_data

{'confidence': None,
 'parts': [<googletrans.models.TranslatedPart at 0x225c1e0e7f0>],
 'origin_pronunciation': 'banghag-i ppalli wass-eumyeon johgessseubnida.',
 'parsed': [['banghag-i ppalli wass-eumyeon johgessseubnida.',
   None,
   None,
   [[[0, [[[None, 17]], [True]]]], 17],
   [['방학이 빨리 왔으면 좋겠습니다.', None, None, 17]]],
  [[[None,
     None,
     None,
     None,
     None,
     [['I hope the vacation comes quickly.',
       None,
       None,
       None,
       [['I hope the vacation comes quickly.', [5]],
        ['I hope the vacation is coming soon.', [11]]]]]]],
   'en',
   1,
   'ko',
   ['방학이 빨리 왔으면 좋겠습니다.', 'ko', 'en', True]],
  'ko']}

In [17]:
# [+] 영어 발음(pronunciation) 출력
result.extra_data.get('origin_pronunciation')

'banghag-i ppalli wass-eumyeon johgessseubnida.'

In [18]:
# [+] UDF 1: 한글->영문 번역 함수

def translate(text):
    from googletrans import Translator
    translator=Translator()
    result = translator.translate(text, src='ko', dest='en')
    result.extra_data.get('origin_pronunciation')
    

In [19]:
# [+] UDF 등록
ss.udf.register('translate',translate)

<function __main__.translate(text)>

In [23]:
# [+] SQL문 처리
ss.sql('SELECT restaurant_name, \
        translate(restaurant_name)AS restaurant_eng_name, \
        speciality, address, score 
        FROM restaurants').show()

SyntaxError: EOL while scanning string literal (<ipython-input-23-acb1f97ffb36>, line 4)

In [None]:
# [+] Annotation 방식으로 UDF 등록하기


In [24]:
# [+] UDF 2: score -> 100점 스케일 변환 함수
def scale_score(score):
    return score*20

In [25]:
ss.udf.register('scale_score',scale_score)

<function __main__.scale_score(score)>

In [28]:
# [+] SQL문처리
ss.sql('SELECT restaurant_name, translate(restaurant_name)AS restaurant_eng_name, \
        speciality, address, scale_score(score) AS scaled_score \
        FROM restaurants').show()

AnalysisException: cannot resolve 'address' given input columns: [restaurants.adress, restaurants.restaurant_name, restaurants.score, restaurants.speciality]; line 1 pos 94;
'Project [restaurant_name#0, translate(restaurant_name#0) AS restaurant_eng_name#37, speciality#1, 'address, scale_score(score#3) AS scaled_score#38]
+- SubqueryAlias restaurants
   +- View (`restaurants`, [restaurant_name#0,speciality#1,adress#2,score#3])
      +- LogicalRDD [restaurant_name#0, speciality#1, adress#2, score#3], false
