In [1]:
! pip install pyspark

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pyspark
  Downloading pyspark-3.3.0.tar.gz (281.3 MB)
[K     |████████████████████████████████| 281.3 MB 50 kB/s 
[?25hCollecting py4j==0.10.9.5
  Downloading py4j-0.10.9.5-py2.py3-none-any.whl (199 kB)
[K     |████████████████████████████████| 199 kB 67.6 MB/s 
[?25hBuilding wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.3.0-py2.py3-none-any.whl size=281764026 sha256=a3cb4d945ba6d841654fb566c9c87e5c074e08f05a0dace02f9843a8d6d44a2e
  Stored in directory: /root/.cache/pip/wheels/7a/8e/1b/f73a52650d2e5f337708d9f6a1750d451a7349a867f928b885
Successfully built pyspark
Installing collected packages: py4j, pyspark
Successfully installed py4j-0.10.9.5 pyspark-3.3.0


In [2]:
from pyspark.sql import SparkSession, Row
# Reuse the active SparkSession if one exists, otherwise create it.
ss = SparkSession.builder.getOrCreate()

In [3]:
# Start an ssh reverse tunnel (-R) in the background; nohup appends its output to nohup.out.
# NOTE(review): local port 4040 is presumably the Spark web UI being exposed on remote port 40014 — confirm.
# NOTE(review): StrictHostKeyChecking=no turns off host-key verification for this connection — confirm that is acceptable.
!nohup ssh -o StrictHostKeyChecking=no -R mars.ru77.ru:40014:*:4040 aig@mars.ru77.ru -p 2222 &

nohup: appending output to 'nohup.out'


In [4]:
import pandas as pd

# Small integer Series reused as sample input throughout the notebook.
x = pd.Series([1, 2, 3])

In [5]:
def multiply_func(a, b):
  """Return the product ``a * b`` (element-wise when both are pandas Series)."""
  product = a * b
  return product

In [7]:
# Sanity check on plain pandas: multiply the sample Series by itself element-wise.
multiply_func(x, x)

0    1
1    4
2    9
dtype: int64

In [6]:
# Wrap the sample Series in a one-column pandas DataFrame ("x") and hand it to Spark.
df = ss.createDataFrame(pd.DataFrame(x, columns=["x"]))

In [8]:
# Print the Spark DataFrame's rows as a text table.
df.show()

+---+
|  x|
+---+
|  1|
|  2|
|  3|
+---+



In [9]:
from pyspark.sql.functions import pandas_udf, col
from pyspark.sql.types import LongType

In [10]:
# Wrap the plain Python function as a pandas UDF that returns longs.
# (The stray `from numpy.ma.core import multiply` was dropped: its binding was
# overwritten by this assignment on the very next line and never used.)
multiply = pandas_udf(multiply_func, returnType=LongType())

In [12]:
# Call the UDF with column x as both arguments and print the resulting column.
df.select(multiply(col("x"), col("x"))).show()

+-------------------+
|multiply_func(x, x)|
+-------------------+
|                  1|
|                  4|
|                  9|
+-------------------+



In [15]:
def multiply_func(a: pd.Series, b: pd.Series) -> pd.Series:
  """Element-wise product of two Series; this variant carries explicit type hints."""
  product = a * b
  return product

# Rebuild the UDF from the type-hinted definition above.
multiply = pandas_udf(multiply_func, returnType=LongType())

In [13]:
from typing import Iterator

In [14]:
@pandas_udf("long")
def plus_one(data: Iterator[pd.Series]) -> Iterator[pd.Series]:
  """Yield each incoming Series batch with 1 added to every element."""
  yield from (batch + 1 for batch in data)

In [16]:
# Apply plus_one to column x and print the resulting column.
df.select(plus_one("x")).show()

+-----------+
|plus_one(x)|
+-----------+
|          2|
|          3|
|          4|
+-----------+



In [17]:
from typing import Tuple

def multiply_func(data: Iterator[Tuple[pd.Series, pd.Series]]) -> Iterator[pd.Series]:
  """Yield the element-wise product of each (a, b) pair of Series batches.

  Iterator-of-tuples variant: every item of ``data`` is a tuple holding one
  batch per input column.
  """
  for a, b in data:
    yield a * b

# Re-wrap the new definition. Without this, `multiply` still refers to the UDF
# built from the previous Series-to-Series function and this iterator variant
# would never actually run.
multiply = pandas_udf(multiply_func, returnType=LongType())

In [18]:
# Call `multiply` with column x as both arguments and print the resulting column.
df.select(multiply(col("x"), col("x"))).show()

+-------------------+
|multiply_func(x, x)|
+-------------------+
|                  1|
|                  4|
|                  9|
+-------------------+



In [21]:
from pyspark.sql.functions import PandasUDFType

In [19]:
# Toy (id, v) rows used by the grouped examples; note that this rebinds `df`.
df = ss.createDataFrame(
    data=[(1, 1.0), (1, 2.0), (2, 3.0), (2, 5.0), (2, 10.0)],
    schema=("id", "v"),
)

In [22]:
# Print the (id, v) DataFrame as a text table.
df.show()

+---+----+
| id|   v|
+---+----+
|  1| 1.0|
|  1| 2.0|
|  2| 3.0|
|  2| 5.0|
|  2|10.0|
+---+----+



In [23]:
@pandas_udf("id long, v double", functionType=PandasUDFType.GROUPED_MAP)
def substract_mean(data: pd.DataFrame):
  """Return the group's DataFrame with column v shifted by the group's mean of v."""
  centered = data.v - data.v.mean()
  return data.assign(v=centered)

In [24]:
# Run the grouped-map UDF once per id group and print the combined result.
# NOTE(review): PySpark's docs prefer GroupedData.applyInPandas over apply() with a
# GROUPED_MAP pandas_udf — consider migrating together with the UDF definition.
df.groupby("id").apply(substract_mean).show()



+---+----+
| id|   v|
+---+----+
|  1|-0.5|
|  1| 0.5|
|  2|-3.0|
|  2|-1.0|
|  2| 4.0|
+---+----+



In [28]:
@pandas_udf("double", functionType=PandasUDFType.GROUPED_AGG)
def mean(data: pd.Series) -> float:
  """Return the mean of one group's values as a single scalar.

  A grouped-aggregate UDF receives each input column as a pandas Series, so
  the parameter is annotated as ``pd.Series`` (it was previously mislabeled
  as ``pd.DataFrame``). The UDF type is still set explicitly via
  ``functionType``.
  """
  return data.mean()



In [29]:
# Aggregate v per id with the `mean` pandas UDF and print one row per group.
df.groupby("id").agg(mean(col("v"))).show()

+---+-------+
| id|mean(v)|
+---+-------+
|  1|    1.5|
|  2|    6.0|
+---+-------+



In [30]:
from pyspark.sql import Window

# Window covering every row of each id partition (unbounded on both sides).
w = (
    Window.partitionBy("id")
    .rowsBetween(Window.unboundedPreceding, Window.unboundedFollowing)
)

In [31]:
# Evaluate the `mean` UDF over window w and attach it as a new "mean" column,
# so each row carries the mean of v for its id partition.
df.withColumn("mean", mean(col("v")).over(w)).show()

+---+----+----+
| id|   v|mean|
+---+----+----+
|  1| 1.0| 1.5|
|  1| 2.0| 1.5|
|  2| 3.0| 6.0|
|  2| 5.0| 6.0|
|  2|10.0| 6.0|
+---+----+----+



In [32]:
# With this (type-hint) style you can also pass in several columns.
@pandas_udf("double")
def mean(data: pd.Series) -> float:
    """Return the mean of the given Series as a scalar."""
    average = data.mean()
    return average

In [33]:
# Aggregate v per id with `mean` and print one row per group.
df.groupby("id").agg(mean(col("v"))).show()

+---+-------+
| id|mean(v)|
+---+-------+
|  1|    1.5|
|  2|    6.0|
+---+-------+



In [35]:
from pyspark import pandas as spd

In [36]:
# Write the sample Series to data.csv as a single column "x", without the index.
pd.DataFrame(x, columns=["x"]).to_csv("data.csv", index=False)

In [37]:
# Show the raw contents of the file that was just written.
! cat data.csv

x
1
2
3


In [39]:
# Read the CSV through pyspark.pandas (imported as spd), so `pdf` is a
# pandas-on-Spark DataFrame rather than a plain pandas one.
pdf = spd.read_csv("data.csv")



In [43]:
# Convert to a regular Spark DataFrame; as the emitted warning says, the index
# is lost unless index_col is passed to to_spark.
pdf.to_spark()


If `index_col` is not specified for `to_spark`, the existing index is lost when converting to Spark DataFrame.



DataFrame[x: int]