In [1]:
!pip install pyspark



In [3]:
# Team number; it is used as the prefix of the Spark application name.
TEAM: int = 29

In [4]:
from pyspark.sql import SparkSession

# Create (or reuse) the Spark session; the application name starts with the
# team number.
spark = (
    SparkSession.builder
    .appName(f"{TEAM} - spark ML")
    .getOrCreate()
)

# Builder options that are currently disabled.
# NOTE(review): these look like cluster settings (YARN master, Hive
# metastore); `warehouse` is not defined in the cells shown here, so it must
# be set before re-enabling them.
#        .master("yarn")\
#        .config("hive.metastore.uris", "thrift://hadoop-02.uni.innopolis.ru:9883")\
#        .config("spark.sql.warehouse.dir", warehouse)\
#        .config("spark.sql.avro.compression.codec", "snappy")\
#        .enableHiveSupport()\

In [6]:
# Load the raw records; the first line is a header and column types are
# inferred from the data.
df = spark.read.load(
    "records.csv",
    format="csv",
    header="true",
    inferSchema="true",
)

In [None]:
# `F` is otherwise only imported in a later cell, so import it here to keep
# the notebook runnable top to bottom on a fresh kernel.
from pyspark.sql import functions as F

# Give the organic-carbon column a shorter name. Copy + drop (rather than a
# rename) places the new column at the end of the frame.
df = df.withColumn("cmaq_oc", F.col("cmaq_organic_carbon")).drop("cmaq_organic_carbon")

# Measurement columns that are cast to float.
FLOAT_COLUMNS = [
    "airnow_ozone",
    "cmaq_ozone",
    "cmaq_no2",
    "cmaq_co",
    "cmaq_oc",
    "pressure",
    "pbl",
    "temperature",
    "wind_speed",
    "wind_direction",
    "radiation",
]
for column_name in FLOAT_COLUMNS:
    df = df.withColumn(column_name, F.col(column_name).cast("float"))

In [35]:
# Preview the first rows (these are the defaults of `show`, spelled out).
df.show(n=20, truncate=True)

+------------+------------+----------+--------+-------+--------+------+-----------+----------+--------------+---------+--------------+-----+---+----+-------+
|  station_id|airnow_ozone|cmaq_ozone|cmaq_no2|cmaq_co|pressure|   pbl|temperature|wind_speed|wind_direction|radiation|cloud_fraction|month|day|hour|cmaq_oc|
+------------+------------+----------+--------+-------+--------+------+-----------+----------+--------------+---------+--------------+-----+---+----+-------+
|   120350004|        35.0|      25.0|     0.0|   84.0|101384.0|1319.0|      305.0|       6.0|         242.0|    576.0|           0.0|    8|  1|  21|    1.0|
|   360850111|        52.0|      45.0|     2.0|  147.0|100797.0|1163.0|      299.0|       4.0|         189.0|    416.0|           0.0|    8|  1|  21|    2.0|
|   390610040|        41.0|      45.0|     1.0|  137.0| 98891.0|2170.0|      302.0|       6.0|         339.0|    696.0|           0.0|    8|  1|  21|    5.0|
|840450070006|        50.0|      42.0|     0.0|  154

In [36]:
# Print the column types after the casts above.
df.printSchema()

# Expose the frame to Spark SQL under the name "record".
# NOTE(review): no later cell shown here queries this view.
df.createOrReplaceTempView("record")

root
 |-- station_id: string (nullable = true)
 |-- airnow_ozone: float (nullable = true)
 |-- cmaq_ozone: float (nullable = true)
 |-- cmaq_no2: float (nullable = true)
 |-- cmaq_co: float (nullable = true)
 |-- pressure: float (nullable = true)
 |-- pbl: float (nullable = true)
 |-- temperature: float (nullable = true)
 |-- wind_speed: float (nullable = true)
 |-- wind_direction: float (nullable = true)
 |-- radiation: float (nullable = true)
 |-- cloud_fraction: double (nullable = true)
 |-- month: integer (nullable = true)
 |-- day: integer (nullable = true)
 |-- hour: integer (nullable = true)
 |-- cmaq_oc: float (nullable = true)



In [57]:
from pyspark import keyword_only
from pyspark.ml import Transformer
from pyspark.ml.param.shared import (
    HasInputCol,
    HasOutputCol,
    Param,
    Params,
    TypeConverters,
)
from pyspark.ml.util import DefaultParamsReadable, DefaultParamsWritable
from pyspark.sql import functions as F
from pyspark.sql.types import ArrayType, StringType
import math


class SinCosTransformer(
    Transformer, HasInputCol, HasOutputCol, DefaultParamsReadable, DefaultParamsWritable
):
    """Encode a periodic numeric column as a sine/cosine pair.

    For an input column ``x`` and a period ``p`` the transformer appends two
    columns to the dataset:

    * ``<outputCol>_sin`` = ``sin(2 * pi * x / p)``
    * ``<outputCol>_cos`` = ``cos(2 * pi * x / p)``

    ``outputCol`` is used only as a name prefix; no column named exactly
    ``outputCol`` is created.
    """

    period = Param(
        Params._dummy(),
        "period",
        "Period for sin/cos",
        typeConverter=TypeConverters.toFloat,
    )

    @keyword_only
    def __init__(self, inputCol=None, outputCol=None, period=None):
        """Create the transformer.

        Args:
            inputCol: name of the column to encode.
            outputCol: prefix for the generated ``_sin`` / ``_cos`` columns.
            period: length of one full cycle of the input values; defaults
                to 1 when not given.
        """
        super().__init__()
        self._setDefault(period=1)
        # `keyword_only` records the keyword arguments that were actually
        # passed in `_input_kwargs`; only those override the defaults.
        kwargs = self._input_kwargs
        self.setParams(**kwargs)

    @keyword_only
    def setParams(self, inputCol=None, outputCol=None, period=None):
        """Set any of ``inputCol``, ``outputCol`` and ``period``; returns ``self._set(...)``."""
        kwargs = self._input_kwargs
        return self._set(**kwargs)

    def setPeriod(self, value):
        """Set the period; ``value`` is converted with ``float()`` first."""
        return self._set(period=float(value))

    def getPeriod(self):
        """Return the configured period, or its default when unset."""
        return self.getOrDefault(self.period)

    def setInputCol(self, value):
        """Set the name of the column to encode."""
        return self._set(inputCol=value)

    def setOutputCol(self, value):
        """Set the prefix used for the generated columns."""
        return self._set(outputCol=value)

    def _transform(self, dataset):
        """Append the ``_sin`` and ``_cos`` columns to ``dataset``.

        Raises:
            ValueError: if the period is zero or was explicitly set to None.
        """
        period = self.getPeriod()
        # A missing or zero period makes the division below meaningless;
        # fail early with a clear message instead of producing bad columns.
        if not period:
            raise ValueError(f"period must be a non-zero number, got {period!r}")

        input_col = self.getInputCol()
        output_col = self.getOutputCol()

        # Map the value onto the unit circle: one period == one full turn.
        angle = 2 * math.pi * F.col(input_col) / period
        dataset = dataset.withColumn(f"{output_col}_sin", F.sin(angle))
        dataset = dataset.withColumn(f"{output_col}_cos", F.cos(angle))

        return dataset

In [58]:
# Random 70% / 30% train/test split with a fixed seed.
train, test = df.randomSplit(weights=[0.7, 0.3], seed=42)

In [59]:
from pyspark.ml.feature import VectorAssembler, StandardScaler
from pyspark.ml import Pipeline

# Feature pipeline:
#   1. assemble the continuous columns into one vector and standardise it;
#   2. encode month / day / hour as sin/cos pairs;
#   3. assemble the scaled vector and the sin/cos columns into "features".
pipeline = Pipeline(
    stages=[
        VectorAssembler(
            outputCol="linear_features",
            inputCols=[
                "cmaq_ozone",
                "cmaq_oc",
                "pressure",
                "pbl",
                "temperature",
                "cloud_fraction",
                "wind_speed",
            ],
        ),
        StandardScaler(
            withMean=True,
            withStd=True,
            inputCol="linear_features",
            outputCol="scaled_features",
        ),
        # One transformer per calendar field; each writes <field>_sin / <field>_cos.
        *[
            SinCosTransformer(inputCol=field, outputCol=field, period=cycle)
            for field, cycle in (("month", 12), ("day", 31), ("hour", 24))
        ],
        VectorAssembler(
            outputCol="features",
            inputCols=[
                "scaled_features",
                "month_sin",
                "month_cos",
                "day_sin",
                "day_cos",
                "hour_sin",
                "hour_cos",
            ],
        ),
    ]
)

In [61]:
# Fit the stages on the training split only, then apply them to both splits.
# NOTE: `pipeline` is rebound from the estimator to the fitted model, and
# `train` / `test` are overwritten, so this cell cannot be re-run without
# re-running the cells that build the pipeline and the split.
pipeline = pipeline.fit(train)
train, test = (pipeline.transform(split) for split in (train, test))

In [66]:
# Keep only the assembled feature vector and the radiation column.
train, test = (
    split.select("features", "radiation") for split in (train, test)
)

In [69]:
# Radiation value that separates the two classes.
RADIATION_THRESHOLD = 100

# Binary label: 0 when radiation is below the threshold, 1 otherwise.
# The expression is not called `bin`, so the Python builtin is not shadowed.
# NOTE: rows with a null radiation fall through to `otherwise` and get label 1.
label_expr = F.when(F.col("radiation") < RADIATION_THRESHOLD, 0).otherwise(1)
train = train.withColumn("label", label_expr)
test = test.withColumn("label", label_expr)

In [74]:
from pyspark.ml.classification import RandomForestClassifier

# Pass an explicit seed: without one the forest is not guaranteed to be the
# same across kernel restarts, so the reported metrics could change.
# The estimator and the fitted model get separate names; `model` is the
# fitted model used by the cells below.
rf_classifier = RandomForestClassifier(
    featuresCol="features",
    labelCol="label",
    seed=42,
)
model = rf_classifier.fit(train)

In [83]:
# Score the held-out split and preview the first rows of the result
# (the arguments are the defaults of `show`, spelled out).
predictions = model.transform(test)
predictions.show(n=20, truncate=True)

+--------------------+---------+-----+--------------------+--------------------+----------+
|            features|radiation|label|       rawPrediction|         probability|prediction|
+--------------------+---------+-----+--------------------+--------------------+----------+
|[-0.9234800013694...|      0.0|    0|[19.1970121039161...|[0.95985060519580...|       0.0|
|[-1.1718951721444...|      0.0|    0|[19.1929842380791...|[0.95964921190395...|       0.0|
|[-1.0890901152194...|      0.0|    0|[19.1521884601229...|[0.95760942300614...|       0.0|
|[-1.0062850582944...|      0.0|    0|[19.1929842380791...|[0.95964921190395...|       0.0|
|[-0.8406749444444...|      0.0|    0|[19.1970121039161...|[0.95985060519580...|       0.0|
|[-0.8406749444444...|      0.0|    0|[18.7629127616238...|[0.93814563808119...|       0.0|
|[-0.8406749444444...|      0.0|    0|[19.1970121039161...|[0.95985060519580...|       0.0|
|[-1.5031153998443...|      0.0|    0|[19.1929842380791...|[0.95964921190395...|

In [85]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# Accuracy on the test split, computed from the "label" and "prediction" columns.
evaluator = MulticlassClassificationEvaluator(
    metricName="accuracy",
    predictionCol="prediction",
    labelCol="label",
)
accuracy = evaluator.evaluate(predictions)

print(f"Accuracy: {accuracy:.4f}")

Accuracy: 0.9071
