In [2]:
from pyspark.sql import SparkSession

In [3]:
# Create the Spark session (or reuse one that is already running) under the
# application name 'data_processing'.
spark = (
    SparkSession.builder
    .appName('data_processing')
    .getOrCreate()
)

In [4]:
# Read iris.csv: the first row supplies the column names and the column
# types are inferred from the data.
df = spark.read.csv(
    'iris.csv',
    header=True,
    inferSchema=True,
)

In [5]:
# Column names of the loaded DataFrame, as a Python list.
df.columns

['sepal_length', 'sepal_width', 'petal_length', 'petal_width', 'class']

In [6]:
# Number of rows in the DataFrame.
df.count()

150

In [7]:
# Print every column's name, data type and nullability.
df.printSchema()

root
 |-- sepal_length: double (nullable = true)
 |-- sepal_width: double (nullable = true)
 |-- petal_length: double (nullable = true)
 |-- petal_width: double (nullable = true)
 |-- class: string (nullable = true)



In [8]:
# Preview the first 3 rows.
df.show(n=3)

+------------+-----------+------------+-----------+------+
|sepal_length|sepal_width|petal_length|petal_width| class|
+------------+-----------+------------+-----------+------+
|         5.1|        3.5|         1.4|        0.2|SETOSA|
|         4.9|        3.0|         1.4|        0.2|SETOSA|
|         4.7|        3.2|         1.3|        0.2|SETOSA|
+------------+-----------+------------+-----------+------+
only showing top 3 rows



In [12]:
# Project two columns only and preview the first 5 rows.
df.select(['sepal_length', 'class']).show(n=5)

+------------+------+
|sepal_length| class|
+------------+------+
|         5.1|SETOSA|
|         4.9|SETOSA|
|         4.7|SETOSA|
|         4.6|SETOSA|
|         5.0|SETOSA|
+------------+------+
only showing top 5 rows



In [13]:
# Summary statistics (count, mean, stddev, min, max) for every column;
# mean and stddev come back as null for the string column "class".
df.describe().show()

+-------+------------------+-------------------+------------------+------------------+---------+
|summary|      sepal_length|        sepal_width|      petal_length|       petal_width|    class|
+-------+------------------+-------------------+------------------+------------------+---------+
|  count|               150|                150|               150|               150|      150|
|   mean| 5.843333333333335| 3.0540000000000007|3.7586666666666693|1.1986666666666672|     null|
| stddev|0.8280661279778637|0.43359431136217375| 1.764420419952262|0.7631607417008414|     null|
|    min|               4.3|                2.0|               1.0|               0.1|   SETOSA|
|    max|               7.9|                4.4|               6.9|               2.5|VIRGINICA|
+-------+------------------+-------------------+------------------+------------------+---------+



添加新列(withColumn)

In [14]:
# Derive a new column holding petal_width multiplied by 10. The result is
# only displayed, not assigned, so `df` itself keeps its original columns.
df.withColumn("petal_width_*10", df.petal_width * 10).show(n=5)

+------------+-----------+------------+-----------+------+---------------+
|sepal_length|sepal_width|petal_length|petal_width| class|petal_width_*10|
+------------+-----------+------------+-----------+------+---------------+
|         5.1|        3.5|         1.4|        0.2|SETOSA|            2.0|
|         4.9|        3.0|         1.4|        0.2|SETOSA|            2.0|
|         4.7|        3.2|         1.3|        0.2|SETOSA|            2.0|
|         4.6|        3.1|         1.5|        0.2|SETOSA|            2.0|
|         5.0|        3.6|         1.4|        0.2|SETOSA|            2.0|
+------------+-----------+------------+-----------+------+---------------+
only showing top 5 rows



使用 filter 筛选行

In [17]:
# Keep only the rows whose sepal_length is below 4.5.
df.filter(df.sepal_length < 4.5).show()

+------------+-----------+------------+-----------+------+
|sepal_length|sepal_width|petal_length|petal_width| class|
+------------+-----------+------------+-----------+------+
|         4.4|        2.9|         1.4|        0.2|SETOSA|
|         4.3|        3.0|         1.1|        0.1|SETOSA|
|         4.4|        3.0|         1.3|        0.2|SETOSA|
|         4.4|        3.2|         1.3|        0.2|SETOSA|
+------------+-----------+------------+-----------+------+



In [20]:
# Two filters applied one after the other: a row must pass both to remain.
# The second condition is an exact equality test on a double column.
(
    df
    .filter(df.sepal_length < 4.5)
    .filter(df.petal_length == 1.3)
    .show()
)

+------------+-----------+------------+-----------+------+
|sepal_length|sepal_width|petal_length|petal_width| class|
+------------+-----------+------------+-----------+------+
|         4.4|        3.0|         1.3|        0.2|SETOSA|
|         4.4|        3.2|         1.3|        0.2|SETOSA|
+------------+-----------+------------+-----------+------+



In [21]:
# Same selection expressed as a single filter. Each comparison needs its own
# parentheses because `&` binds more tightly than `<` and `==` in Python.
df.filter(
    (df.sepal_length < 4.5)
    & (df.petal_length == 1.3)
).show()

+------------+-----------+------------+-----------+------+
|sepal_length|sepal_width|petal_length|petal_width| class|
+------------+-----------+------------+-----------+------+
|         4.4|        3.0|         1.3|        0.2|SETOSA|
|         4.4|        3.2|         1.3|        0.2|SETOSA|
+------------+-----------+------------+-----------+------+



In [22]:
# Distinct values that occur in the "class" column.
df.select(df["class"]).distinct().show()

+----------+
|     class|
+----------+
|VERSICOLOR|
| VIRGINICA|
|    SETOSA|
+----------+



分组聚合(groupBy)

In [24]:
# Number of rows for each distinct value of "class".
df.groupBy("class").count().show()

+----------+-----+
|     class|count|
+----------+-----+
|VERSICOLOR|   50|
| VIRGINICA|   50|
|    SETOSA|   50|
+----------+-----+



In [26]:
# Per-class average of every numeric column; the result columns are
# named avg(<column>).
df.groupBy("class").mean().show()

+----------+-----------------+------------------+-----------------+------------------+
|     class|avg(sepal_length)|  avg(sepal_width)|avg(petal_length)|  avg(petal_width)|
+----------+-----------------+------------------+-----------------+------------------+
|VERSICOLOR|            5.936|2.7700000000000005|             4.26|1.3259999999999998|
| VIRGINICA|6.587999999999998|2.9739999999999998|            5.552|             2.026|
|    SETOSA|5.005999999999999|3.4180000000000006|            1.464|0.2439999999999999|
+----------+-----------------+------------------+-----------------+------------------+



In [27]:
# Per-class maximum of every numeric column; the result columns are
# named max(<column>).
df.groupBy("class").max().show()

+----------+-----------------+----------------+-----------------+----------------+
|     class|max(sepal_length)|max(sepal_width)|max(petal_length)|max(petal_width)|
+----------+-----------------+----------------+-----------------+----------------+
|VERSICOLOR|              7.0|             3.4|              5.1|             1.8|
| VIRGINICA|              7.9|             3.8|              6.9|             2.5|
|    SETOSA|              5.8|             4.4|              1.9|             0.6|
+----------+-----------------+----------------+-----------------+----------------+

