In [24]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, count, when
from pyspark.ml.feature import StringIndexer, OneHotEncoder, Tokenizer, HashingTF, VectorAssembler, StandardScaler
from pyspark.ml import Pipeline
from pyspark.ml.classification import RandomForestClassifier


In [25]:
spark = SparkSession.builder.appName("MLlibDemo").getOrCreate()

df = spark.read.csv('data/train.csv', header=True, inferSchema=True)

df.printSchema()

df.describe().show()

root
 |-- name: string (nullable = true)
 |-- category_list: string (nullable = true)
 |-- funding_total_usd: double (nullable = true)
 |-- status: integer (nullable = true)
 |-- country_code: string (nullable = true)
 |-- state_code: string (nullable = true)
 |-- region: string (nullable = true)
 |-- city: string (nullable = true)
 |-- funding_rounds: integer (nullable = true)
 |-- founded_at: date (nullable = true)
 |-- first_funding_at: date (nullable = true)
 |-- last_funding_at: date (nullable = true)
 |-- closed_at: date (nullable = true)





+-------+--------------------+--------------------+--------------------+-------------------+------------+------------------+--------+----------------+------------------+
|summary|                name|       category_list|   funding_total_usd|             status|country_code|        state_code|  region|            city|    funding_rounds|
+-------+--------------------+--------------------+--------------------+-------------------+------------+------------------+--------+----------------+------------------+
|  count|               52515|               50051|               42447|              52516|       47014|             45753|   46157|           46157|             52516|
|   mean|              2689.2|                 3.0|1.8247480657256044E7|0.09362860842409933|        NULL| 18.23984345881476|    NULL|            NULL| 1.740669510244497|
| stddev|  3519.5965251715993|                 0.0|1.8711730132136232E8| 0.2913141055142605|        NULL|16.644637317863157|    NULL|            NULL|

                                                                                

In [26]:
spark.version

'4.0.1'

In [7]:
df.select([count(when(col(c).isNull(), c)).alias(c) for c in df.columns]).show()

df = df.fillna({
    "category_list": "Unknown",
    "funding_total_usd": 0.0,
    "country_code": "UNK",
    "state_code": "UNK",
    "region": "Unknown",
    "city": "Unknown",
})

+----+-------------+-----------------+------+------------+----------+------+----+--------------+----------+----------------+---------------+---------+
|name|category_list|funding_total_usd|status|country_code|state_code|region|city|funding_rounds|founded_at|first_funding_at|last_funding_at|closed_at|
+----+-------------+-----------------+------+------------+----------+------+----+--------------+----------+----------------+---------------+---------+
|   1|         2465|            10069|     0|        5502|      6763|  6359|6359|             0|         0|               0|              0|    47599|
+----+-------------+-----------------+------+------------+----------+------+----+--------------+----------+----------------+---------------+---------+



In [10]:
cat_cols = ["country_code", "state_code", "region"]

indexers = [
    StringIndexer(
        inputCol=c,
        outputCol=f'{c}_idx',
        handleInvalid="keep",
    )
    for c in cat_cols
]

encoder = OneHotEncoder(
    inputCols=[f'{c}_idx' for c in cat_cols],
    outputCols=[f'{c}_vec' for c in cat_cols]
)

In [12]:
tokenizer = Tokenizer(inputCol="category_list", outputCol="categories")

hasher = HashingTF(inputCol='categories', outputCol='category_vec', numFeatures=1000)

In [14]:
num_cols = ['funding_total_usd', 'funding_rounds']
assembler = VectorAssembler(inputCols=num_cols, outputCol="num_features")
scaler = StandardScaler(inputCol="num_features", outputCol="scaled_num_features")


In [19]:
stages = [
    *indexers,
    encoder,
    tokenizer,
    hasher,
    assembler,
    scaler,
    VectorAssembler(inputCols=["scaled_num_features"] + [f"{c}_vec" for c in cat_cols] + ["category_vec"],
        outputCol="features"),
    RandomForestClassifier(featuresCol="features", labelCol="status")
]
# Создание пайплайна
pipeline = Pipeline(stages=stages)


In [20]:
train, test = df.randomSplit([0.8, 0.2], seed=42)

model = pipeline.fit(train)

predictions = model.transform(test)

25/12/13 20:56:17 WARN MemoryStore: Not enough space to cache rdd_138_0 in memory! (computed 254.1 MiB so far)
25/12/13 20:56:17 WARN BlockManager: Persisting block rdd_138_0 to disk instead.
25/12/13 20:56:18 WARN MemoryStore: Not enough space to cache rdd_138_0 in memory! (computed 254.1 MiB so far)
25/12/13 20:56:18 WARN MemoryStore: Not enough space to cache rdd_138_0 in memory! (computed 254.1 MiB so far)
25/12/13 20:56:18 WARN MemoryStore: Not enough space to cache rdd_138_0 in memory! (computed 254.1 MiB so far)
25/12/13 20:56:19 WARN MemoryStore: Not enough space to cache rdd_138_0 in memory! (computed 254.1 MiB so far)
25/12/13 20:56:19 WARN MemoryStore: Not enough space to cache rdd_138_0 in memory! (computed 254.1 MiB so far)


In [21]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator

evaluator = BinaryClassificationEvaluator(labelCol='status')
print(f'Test AUC: {evaluator.evaluate(predictions)}')

Test AUC: 0.6592234276763352


In [22]:
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator

param_grid = ParamGridBuilder() \
    .addGrid(RandomForestClassifier.maxDepth, [5, 10, 15]) \
    .addGrid(RandomForestClassifier.numTrees, [20, 35, 50]) \
    .build()

cv = CrossValidator(
    estimator=pipeline,
    estimatorParamMaps=param_grid,
    evaluator=evaluator,
    numFolds=3
)

cv_model = cv.fit(train)

25/12/13 21:05:42 WARN MemoryStore: Not enough space to cache rdd_3826_0 in memory! (computed 254.1 MiB so far)
25/12/13 21:05:42 WARN BlockManager: Persisting block rdd_3826_0 to disk instead.
25/12/13 21:05:42 WARN MemoryStore: Not enough space to cache rdd_3826_0 in memory! (computed 254.1 MiB so far)
25/12/13 21:05:42 WARN MemoryStore: Not enough space to cache rdd_3826_0 in memory! (computed 254.1 MiB so far)
25/12/13 21:05:43 WARN MemoryStore: Not enough space to cache rdd_3826_0 in memory! (computed 254.1 MiB so far)
25/12/13 21:05:43 WARN MemoryStore: Not enough space to cache rdd_3826_0 in memory! (computed 254.1 MiB so far)
25/12/13 21:05:44 WARN MemoryStore: Not enough space to cache rdd_3826_0 in memory! (computed 254.1 MiB so far)


In [None]:
cv_model.bestModel

[np.float64(0.6486736261856394),
 np.float64(0.6486736261856395),
 np.float64(0.6486736261856395),
 np.float64(0.6486736261856395),
 np.float64(0.6486736261856395),
 np.float64(0.6486736261856395),
 np.float64(0.6486736261856394),
 np.float64(0.6486736261856394),
 np.float64(0.6486736261856395)]