In [1]:
import os
import sys

# Point both PySpark interpreter settings at the Python executable that is
# running this kernel.
os.environ.update({
    'PYSPARK_PYTHON': sys.executable,
    'PYSPARK_DRIVER_PYTHON': sys.executable,
})

In [2]:
from pyspark.sql import SparkSession

# Obtain a session via getOrCreate() under the given application name, then
# leave it as the last expression so the notebook displays it.
spark = (
    SparkSession.builder
    .appName("Python Spark SQL basic example")
    .getOrCreate()
)
spark

In [4]:
from datetime import datetime, date
import pandas as pd
from pyspark.sql import Row

# Eager evaluation is convenient for displaying DataFrames in Jupyter.
spark.conf.set('spark.sql.repl.eagerEval.enabled', True)

# Three sample rows mixing int, float, str, date and datetime values.
df = spark.createDataFrame([
    Row(a=a, b=b, c=c, d=d, e=e)
    for a, b, c, d, e in (
        (1, 2., 'string1', date(2000, 1, 1), datetime(2000, 1, 1, 12, 0)),
        (2, 3., 'string2', date(2000, 2, 1), datetime(2000, 1, 2, 12, 0)),
        (4, 5., 'string3', date(2000, 3, 1), datetime(2000, 1, 3, 12, 0)),
    )
])
df

a,b,c,d,e
1,2.0,string1,2000-01-01,2000-01-01 12:00:00
2,3.0,string2,2000-02-01,2000-01-02 12:00:00
4,5.0,string3,2000-03-01,2000-01-03 12:00:00


### 用户自定义函数（UDF）

In [7]:
import pyspark.sql.functions as F
from pyspark.sql.types import *
from pyspark.sql import Window
from pyspark.sql.functions import pandas_udf, PandasUDFType
from pyspark.sql.types import IntegerType
from pyspark.sql.functions import udf

In [8]:
# UDF returning the length of a string value as an integer.
# A null cell reaches the Python function as None; len(None) would raise a
# TypeError, so null input is mapped to a null result instead.
slen = udf(lambda s: None if s is None else len(s), IntegerType())

# Show the length of column 'c' for every row.
df.select(slen('c').alias('len_c')).show()

+-----+
|len_c|
+-----+
|    7|
|    7|
|    7|
+-----+



In [11]:
@udf
def to_upper(s):
    """Return ``s`` converted to upper case; a null input gives a null result.

    No return type is passed to ``@udf``, so the decorator's default return
    type applies.

    Args:
        s: The string value of the column for one row, or ``None`` for null.

    Returns:
        ``s.upper()``, or ``None`` when ``s`` is ``None``.
    """
    # Test for None explicitly: a truthiness check would also turn an empty
    # string into null instead of passing it through unchanged.
    if s is not None:
        return s.upper()
    return None
    
@udf(returnType=IntegerType())
def add_one(x):
    """Return ``x + 1`` as an integer; a null input gives a null result.

    Args:
        x: The numeric value of the column for one row, or ``None`` for null.

    Returns:
        ``x + 1``, or ``None`` when ``x`` is ``None``.
    """
    # Test for None explicitly: a truthiness check would treat 0 as missing
    # and return null instead of 1.
    if x is not None:
        return x + 1
    return None

In [12]:
# Apply the three UDFs in a single projection and print the result.
df.select(
    slen('c').alias('len_c'),
    to_upper('c').alias('upper_c'),
    add_one('a').alias('a+1'),
).show()

+-----+-------+---+
|len_c|upper_c|a+1|
+-----+-------+---+
|    7|STRING1|  2|
|    7|STRING2|  3|
|    7|STRING3|  5|
+-----+-------+---+



In [13]:
# Same projection as before, this time kept in a variable so the resulting
# object can be inspected afterwards.
df2 = df.select(
    slen('c').alias('len_c'),
    to_upper('c').alias('upper_c'),
    add_one('a').alias('a+1'),
)
df2.show()

+-----+-------+---+
|len_c|upper_c|a+1|
+-----+-------+---+
|    7|STRING1|  2|
|    7|STRING2|  3|
|    7|STRING3|  5|
+-----+-------+---+



In [14]:
# Inspect the type of the object returned by select().
type(df2)

pyspark.sql.dataframe.DataFrame

----------------------------------------
Exception occurred during processing of request from ('127.0.0.1', 56720)
Traceback (most recent call last):
  File "d:\Anaconda\Lib\socketserver.py", line 317, in _handle_request_noblock
    self.process_request(request, client_address)
  File "d:\Anaconda\Lib\socketserver.py", line 348, in process_request
    self.finish_request(request, client_address)
  File "d:\Anaconda\Lib\socketserver.py", line 361, in finish_request
    self.RequestHandlerClass(request, client_address, self)
  File "d:\Anaconda\Lib\socketserver.py", line 755, in __init__
    self.handle()
  File "d:\Anaconda\Lib\site-packages\pyspark\accumulators.py", line 295, in handle
    poll(accum_updates)
  File "d:\Anaconda\Lib\site-packages\pyspark\accumulators.py", line 267, in poll
    if self.rfile in r and func():
                           ^^^^^^
  File "d:\Anaconda\Lib\site-packages\pyspark\accumulators.py", line 271, in accum_updates
    num_updates = read_int(self.rfile)
