In [14]:
!pip install pyspark



**Setting Up a SparkSession and Loading the Data**

In [2]:
from pyspark.sql import SparkSession

In [3]:
# Obtain the SparkSession for this notebook; getOrCreate() returns the
# already-active session if one exists instead of building a second one.
spark = (
    SparkSession.builder
    .appName('PySpark Column Ops')
    .getOrCreate()
)

In [6]:
# Load the CSV into a DataFrame: the first row supplies the column names
# and Spark infers each column's type from the data.
df = spark.read.csv(
    '/content/Fish.csv',
    sep=',',
    inferSchema=True,
    header=True,
)

In [7]:
# Preview the loaded data (first 20 rows, long cell values truncated).
df.show(n=20, truncate=True)

+-------+------+-------+-------+-------+-------+------+
|Species|Weight|Length1|Length2|Length3| Height| Width|
+-------+------+-------+-------+-------+-------+------+
|  Bream| 242.0|   23.2|   25.4|   30.0|  11.52|  4.02|
|  Bream| 290.0|   24.0|   26.3|   31.2|  12.48|4.3056|
|  Bream| 340.0|   23.9|   26.5|   31.1|12.3778|4.6961|
|  Bream| 363.0|   26.3|   29.0|   33.5|  12.73|4.4555|
|  Bream| 430.0|   26.5|   29.0|   34.0| 12.444| 5.134|
|  Bream| 450.0|   26.8|   29.7|   34.7|13.6024|4.9274|
|  Bream| 500.0|   26.8|   29.7|   34.5|14.1795|5.2785|
|  Bream| 390.0|   27.6|   30.0|   35.0|  12.67|  4.69|
|  Bream| 450.0|   27.6|   30.0|   35.1|14.0049|4.8438|
|  Bream| 500.0|   28.5|   30.7|   36.2|14.2266|4.9594|
|  Bream| 475.0|   28.4|   31.0|   36.2|14.2628|5.1042|
|  Bream| 500.0|   28.7|   31.0|   36.2|14.3714|4.8146|
|  Bream| 500.0|   29.1|   31.5|   36.4|13.7592| 4.368|
|  Bream| 340.0|   29.5|   32.0|   37.3|13.9129|5.0728|
|  Bream| 600.0|   29.4|   32.0|   37.2|14.9544|

**Creating a New Column in PySpark DataFrame**

In [8]:
# Derive a new column by scaling the existing Weight column down by 1000
# (assumes Weight is recorded in grams — TODO confirm against the dataset).
df = df.withColumn(
    'Weight in Kg',
    df['Weight'] / 1000,
)

In [9]:
# Display the DataFrame again to inspect the newly added column
# (first 20 rows, long cell values truncated).
df.show(n=20, truncate=True)

+-------+------+-------+-------+-------+-------+------+------------+
|Species|Weight|Length1|Length2|Length3| Height| Width|Weight in Kg|
+-------+------+-------+-------+-------+-------+------+------------+
|  Bream| 242.0|   23.2|   25.4|   30.0|  11.52|  4.02|       0.242|
|  Bream| 290.0|   24.0|   26.3|   31.2|  12.48|4.3056|        0.29|
|  Bream| 340.0|   23.9|   26.5|   31.1|12.3778|4.6961|        0.34|
|  Bream| 363.0|   26.3|   29.0|   33.5|  12.73|4.4555|       0.363|
|  Bream| 430.0|   26.5|   29.0|   34.0| 12.444| 5.134|        0.43|
|  Bream| 450.0|   26.8|   29.7|   34.7|13.6024|4.9274|        0.45|
|  Bream| 500.0|   26.8|   29.7|   34.5|14.1795|5.2785|         0.5|
|  Bream| 390.0|   27.6|   30.0|   35.0|  12.67|  4.69|        0.39|
|  Bream| 450.0|   27.6|   30.0|   35.1|14.0049|4.8438|        0.45|
|  Bream| 500.0|   28.5|   30.7|   36.2|14.2266|4.9594|         0.5|
|  Bream| 475.0|   28.4|   31.0|   36.2|14.2628|5.1042|       0.475|
|  Bream| 500.0|   28.7|   31.0|  

**Renaming an Existing Column in PySpark DataFrame**

In [10]:
# Give the derived column a more descriptive name; all other columns
# are left untouched.
df = df.withColumnRenamed(
    existing="Weight in Kg",
    new="Weight in Kilograms",
)

In [11]:
# Display the DataFrame to confirm the column now carries its new name
# (first 20 rows, long cell values truncated).
df.show(n=20, truncate=True)

+-------+------+-------+-------+-------+-------+------+-------------------+
|Species|Weight|Length1|Length2|Length3| Height| Width|Weight in Kilograms|
+-------+------+-------+-------+-------+-------+------+-------------------+
|  Bream| 242.0|   23.2|   25.4|   30.0|  11.52|  4.02|              0.242|
|  Bream| 290.0|   24.0|   26.3|   31.2|  12.48|4.3056|               0.29|
|  Bream| 340.0|   23.9|   26.5|   31.1|12.3778|4.6961|               0.34|
|  Bream| 363.0|   26.3|   29.0|   33.5|  12.73|4.4555|              0.363|
|  Bream| 430.0|   26.5|   29.0|   34.0| 12.444| 5.134|               0.43|
|  Bream| 450.0|   26.8|   29.7|   34.7|13.6024|4.9274|               0.45|
|  Bream| 500.0|   26.8|   29.7|   34.5|14.1795|5.2785|                0.5|
|  Bream| 390.0|   27.6|   30.0|   35.0|  12.67|  4.69|               0.39|
|  Bream| 450.0|   27.6|   30.0|   35.1|14.0049|4.8438|               0.45|
|  Bream| 500.0|   28.5|   30.7|   36.2|14.2266|4.9594|                0.5|
|  Bream| 47

**Selecting One or More Columns and Creating a Column Alias in PySpark DataFrame**

In [12]:
# Project only the two weight columns and print the result.
(
    df
    .select(df['Weight'], df['Weight in Kilograms'])
    .show()
)

+------+-------------------+
|Weight|Weight in Kilograms|
+------+-------------------+
| 242.0|              0.242|
| 290.0|               0.29|
| 340.0|               0.34|
| 363.0|              0.363|
| 430.0|               0.43|
| 450.0|               0.45|
| 500.0|                0.5|
| 390.0|               0.39|
| 450.0|               0.45|
| 500.0|                0.5|
| 475.0|              0.475|
| 500.0|                0.5|
| 500.0|                0.5|
| 340.0|               0.34|
| 600.0|                0.6|
| 600.0|                0.6|
| 700.0|                0.7|
| 700.0|                0.7|
| 610.0|               0.61|
| 650.0|               0.65|
+------+-------------------+
only showing top 20 rows



In [13]:
# Select the kilogram column under a shorter display name; alias() only
# affects this projection, `df` itself keeps the original column name.
(
    df
    .select(df['Weight in Kilograms'].alias("Kilograms"))
    .show()
)

+---------+
|Kilograms|
+---------+
|    0.242|
|     0.29|
|     0.34|
|    0.363|
|     0.43|
|     0.45|
|      0.5|
|     0.39|
|     0.45|
|      0.5|
|    0.475|
|      0.5|
|      0.5|
|     0.34|
|      0.6|
|      0.6|
|      0.7|
|      0.7|
|     0.61|
|     0.65|
+---------+
only showing top 20 rows

