In [1]:
!pip install pyspark

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pyspark
  Downloading pyspark-3.3.0.tar.gz (281.3 MB)
[K     |████████████████████████████████| 281.3 MB 44 kB/s 
[?25hCollecting py4j==0.10.9.5
  Downloading py4j-0.10.9.5-py2.py3-none-any.whl (199 kB)
[K     |████████████████████████████████| 199 kB 52.1 MB/s 
[?25hBuilding wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.3.0-py2.py3-none-any.whl size=281764026 sha256=8fed93942e41ff3cbf4ce4e004236dce200f5ee92ddaf8e86a8feff2c72b406d
  Stored in directory: /root/.cache/pip/wheels/7a/8e/1b/f73a52650d2e5f337708d9f6a1750d451a7349a867f928b885
Successfully built pyspark
Installing collected packages: py4j, pyspark
Successfully installed py4j-0.10.9.5 pyspark-3.3.0


In [7]:
from pyspark.sql import SparkSession, Row

In [3]:
# Reuse the active Spark session if one exists; otherwise start a new one
# with default settings.
ss = (
    SparkSession.builder
    .getOrCreate()
)

In [5]:
!nohup ssh -o StrictHostKeyChecking=no -R mars.ru77.ru:40014:*:4040 aig@mars.ru77.ru -p 2222 &

nohup: appending output to 'nohup.out'


In [8]:
# Left side of the join demo: 1000 rows with a=0 followed by one row for each
# of a=0..9, i.e. 1010 rows heavily skewed towards key 0.
skewed_keys = [0] * 1000 + list(range(10))
df1 = ss.createDataFrame([Row(a=key) for key in skewed_keys])

In [10]:
# Right side of the join demo: exactly one row per key a=0..9.
unique_key_rows = [Row(a=key) for key in range(10)]
df2 = ss.createDataFrame(unique_key_rows)

In [13]:
# Equi-join the skewed frame with the unique-key frame on column "a".
# "inner" is what DataFrame.join uses when `how` is omitted; spelled out here.
df3 = df1.join(other=df2, on="a", how="inner")

In [15]:
# Trigger execution of the join and pull every resulting row back to the driver.
res = df3.collect()

In [16]:
# Expect 1010: each of the 1010 rows in df1 matches exactly one key in df2.
len(res)

1010

In [17]:
from sklearn import datasets

In [18]:
# Synthetic binary classification data: 1000 samples with scikit-learn's
# default feature count. random_state is fixed so the generated data -- and
# therefore every downstream split, model and metric -- is reproducible on a
# fresh kernel (the train/test split later in the notebook is seeded with 42).
X, y = datasets.make_classification(n_samples=1000, random_state=42)

In [19]:
# Convert the NumPy arrays to a Spark DataFrame: one Row per sample, with the
# feature columns named by their stringified index ("0", "1", ...) followed by
# an integer "label" column. Values are cast to built-in float/int so Spark can
# infer the column types.
sample_rows = [
    Row(**{str(idx): float(value) for idx, value in enumerate(sample)}, label=int(target))
    for sample, target in zip(X, y)
]
df = ss.createDataFrame(sample_rows)

In [20]:
# Preview the frame; show() prints only the first 20 rows by default.
df.show()

+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+-------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+-----+
|                   0|                   1|                   2|                   3|                   4|                   5|                   6|                   7|                   8|                   9|                  10|                 11|                  12|                  13|                  14|                  15|                  16|                  17|                  18|                  19|label|
+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+-------------------

In [21]:
# Random 80/20 split into training and hold-out sets; the seed is fixed so the
# split is repeatable.
train, test = df.randomSplit(weights=[0.8, 0.2], seed=42)

In [22]:
from pyspark.ml.feature import VectorAssembler

In [23]:
# Pack the 20 numeric feature columns ("0".."19") into the single vector column
# "features".
feature_columns = list(map(str, range(20)))
vecAss = VectorAssembler(inputCols=feature_columns, outputCol="features")

In [25]:
# Inspect the assembler's class (pyspark.ml.feature.VectorAssembler).
type(vecAss)

pyspark.ml.feature.VectorAssembler

In [27]:
# Add the assembled "features" vector column to the training split.
train_vec = vecAss.transform(train)

In [28]:
# Preview the training rows, now including the "features" column.
train_vec.show()

+-------------------+--------------------+--------------------+-------------------+--------------------+--------------------+--------------------+--------------------+-------------------+------------------+--------------------+--------------------+-------------------+--------------------+--------------------+--------------------+-------------------+-------------------+--------------------+--------------------+-----+--------------------+
|                  0|                   1|                   2|                  3|                   4|                   5|                   6|                   7|                  8|                 9|                  10|                  11|                 12|                  13|                  14|                  15|                 16|                 17|                  18|                  19|label|            features|
+-------------------+--------------------+--------------------+-------------------+--------------------+--------------

In [29]:
# Check column types: 20 double feature columns, a long label and the vector
# "features" column.
train_vec.printSchema()

root
 |-- 0: double (nullable = true)
 |-- 1: double (nullable = true)
 |-- 2: double (nullable = true)
 |-- 3: double (nullable = true)
 |-- 4: double (nullable = true)
 |-- 5: double (nullable = true)
 |-- 6: double (nullable = true)
 |-- 7: double (nullable = true)
 |-- 8: double (nullable = true)
 |-- 9: double (nullable = true)
 |-- 10: double (nullable = true)
 |-- 11: double (nullable = true)
 |-- 12: double (nullable = true)
 |-- 13: double (nullable = true)
 |-- 14: double (nullable = true)
 |-- 15: double (nullable = true)
 |-- 16: double (nullable = true)
 |-- 17: double (nullable = true)
 |-- 18: double (nullable = true)
 |-- 19: double (nullable = true)
 |-- label: long (nullable = true)
 |-- features: vector (nullable = true)



In [30]:
from pyspark.ml.classification import RandomForestClassifier

In [31]:
# Random forest that reads the assembled "features" vector and the "label"
# column. The seed is set explicitly (matching the seed used for the train/test
# split) because a forest trained without one is not guaranteed to be identical
# across kernel sessions, which would make the reported AUC non-reproducible.
rf = RandomForestClassifier(featuresCol="features", labelCol="label", seed=42)

In [32]:
# Train the forest on the assembled training data.
rf_model = rf.fit(train_vec)

In [33]:
# Apply the same assembler to the hold-out split so it gains the "features"
# column the fitted model reads.
test_vec = vecAss.transform(test)

In [34]:
# Score the hold-out rows with the fitted forest.
prediction = rf_model.transform(test_vec)

In [35]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator

In [36]:
# Evaluator computing area under the ROC curve from the "rawPrediction" and
# "label" columns.
# NOTE(review): the name `eval` shadows Python's builtin eval() for the rest of
# the session; it is kept because later cells call `eval.evaluate(...)`.
# Renaming it (e.g. to `evaluator`) requires updating those cells too.
eval = BinaryClassificationEvaluator(rawPredictionCol="rawPrediction", labelCol="label", metricName="areaUnderROC")

In [38]:
# Area under ROC for the standalone forest on the hold-out set; the value is
# stored in `roc` but not displayed by this cell.
roc = eval.evaluate(prediction)

In [40]:
from pyspark.ml import Pipeline, PipelineModel

In [41]:
# Chain feature assembly and the classifier so both steps are fitted and
# applied as a single unit.
pipeline = Pipeline(stages=[vecAss, rf])

In [42]:
# Fit the whole pipeline on the raw (not yet assembled) training split.
model = pipeline.fit(train)

In [43]:
# Run the fitted pipeline on the raw hold-out split.
preds = model.transform(test)

In [44]:
# Area under ROC for the pipeline's hold-out predictions (about 0.94 in the
# recorded run).
eval.evaluate(preds)

0.9414215686274511

In [45]:
# Persist the fitted pipeline to the "pipeline_model" directory, replacing any
# copy left there by a previous run.
pipeline_writer = model.write().overwrite()
pipeline_writer.save("pipeline_model")

In [46]:
# Load the persisted pipeline back from disk.
# NOTE(review): model2 is not used in the cells visible here; evaluating it on
# `test` would confirm the saved model round-trips.
model2 = PipelineModel.load("pipeline_model/")