In [1]:
from pyspark.sql import SparkSession

# Create the Spark session used by every cell below, or reuse one that is
# already running under the application name 'DataFrame'.
spark = (
    SparkSession.builder
    .appName('DataFrame')
    .getOrCreate()
)

# Leave the session as the last expression so the notebook displays it.
spark

#### Read the dataset

In [2]:
# Load cars.csv, treating its first line as the column names. No schema
# inference is requested, so every column comes back typed as string
# (see the printSchema output further down).
df_spark_cars = (
    spark.read
    .option('header', 'true')
    .csv('cars.csv')
)

In [3]:
# Print a tabular preview of the loaded data; long cell values are shortened with '...'.
df_spark_cars.show()

+-----+--------------------+------+------+----------+------+
| Make|               Model|  Type|Origin|DriveTrain|Length|
+-----+--------------------+------+------+----------+------+
|Acura|                 MDX|   SUV|  Asia|       All|  4451|
|Acura|      RSX Type S 2dr| Sedan|  Asia|     Front|  2778|
|Acura|             TSX 4dr| Sedan|  Asia|     Front|  3230|
|Acura|              TL 4dr| Sedan|  Asia|     Front|  3575|
|Acura|          3.5 RL 4dr| Sedan|  Asia|     Front|  3880|
|Acura|3.5 RL w/Navigati...| Sedan|  Asia|     Front|  3893|
|Acura|NSX coupe 2dr man...|Sports|  Asia|      Rear|  3153|
| Audi|         A4 1.8T 4dr| Sedan|Europe|     Front|  3252|
| Audi|A41.8T convertibl...| Sedan|Europe|     Front|  3638|
| Audi|          A4 3.0 4dr| Sedan|Europe|     Front|  3462|
| Audi|A4 3.0 Quattro 4d...| Sedan|Europe|       All|  3583|
| Audi|A4 3.0 Quattro 4d...| Sedan|Europe|       All|  3627|
| Audi|          A6 3.0 4dr| Sedan|Europe|     Front|  3561|
| Audi|  A6 3.0 Quattro 

#### Check the schema

In [4]:
# Print each column's name, data type and nullability as a tree.
df_spark_cars.printSchema()

root
 |-- Make: string (nullable = true)
 |-- Model: string (nullable = true)
 |-- Type: string (nullable = true)
 |-- Origin: string (nullable = true)
 |-- DriveTrain: string (nullable = true)
 |-- Length: string (nullable = true)



<font color='blue'>Note: In the schema above, every column is read as a string. To have numeric columns such as Length detected as integers, pass `inferSchema=True` to the `csv` function, as shown below:</font>

In [5]:
# Re-read the same file, this time asking Spark to infer column types from the
# data. Both reader options are passed as keyword arguments to csv() so the
# configuration lives in one place instead of being split between .option()
# and a keyword argument.
df_spark_cars1 = spark.read.csv('cars.csv', header=True, inferSchema=True)

# Confirm the inferred types: Length is now reported as integer rather than string.
df_spark_cars1.printSchema()

root
 |-- Make: string (nullable = true)
 |-- Model: string (nullable = true)
 |-- Type: string (nullable = true)
 |-- Origin: string (nullable = true)
 |-- DriveTrain: string (nullable = true)
 |-- Length: integer (nullable = true)



In [6]:
# Get the column names as a plain Python list of strings.

df_spark_cars.columns

['Make', 'Model', 'Type', 'Origin', 'DriveTrain', 'Length']

In [7]:
# Fetch the first five records as a list of Row objects. This is the
# string-typed DataFrame, so Length values appear as quoted strings.
df_spark_cars.head(5)

[Row(Make='Acura', Model='MDX', Type='SUV', Origin='Asia', DriveTrain='All', Length='4451'),
 Row(Make='Acura', Model='RSX Type S 2dr', Type='Sedan', Origin='Asia', DriveTrain='Front', Length='2778'),
 Row(Make='Acura', Model='TSX 4dr', Type='Sedan', Origin='Asia', DriveTrain='Front', Length='3230'),
 Row(Make='Acura', Model='TL 4dr', Type='Sedan', Origin='Asia', DriveTrain='Front', Length='3575'),
 Row(Make='Acura', Model='3.5 RL 4dr', Type='Sedan', Origin='Asia', DriveTrain='Front', Length='3880')]

In [8]:
# select() yields a new DataFrame holding only 'Make'. Without .show() the
# notebook displays just the DataFrame's repr, not its rows.
df_spark_cars.select('Make')

DataFrame[Make: string]

In [9]:
# Project the 'Make' and 'Type' columns and print the resulting rows.
make_and_type = df_spark_cars.select('Make', 'Type')
make_and_type.show()

+-----+------+
| Make|  Type|
+-----+------+
|Acura|   SUV|
|Acura| Sedan|
|Acura| Sedan|
|Acura| Sedan|
|Acura| Sedan|
|Acura| Sedan|
|Acura|Sports|
| Audi| Sedan|
| Audi| Sedan|
| Audi| Sedan|
| Audi| Sedan|
| Audi| Sedan|
| Audi| Sedan|
| Audi| Sedan|
| Audi| Sedan|
| Audi| Sedan|
| Audi| Sedan|
| Audi| Sedan|
| Audi| Sedan|
| Audi| Sedan|
+-----+------+
only showing top 20 rows



In [10]:
# Indexing by name returns a Column object (a reference to the column), not a DataFrame.
df_spark_cars['Make']

Column<'Make'>

In [11]:
# List (column name, type name) pairs. This is the DataFrame read without
# schema inference, so every column is reported as 'string'.
df_spark_cars.dtypes

[('Make', 'string'),
 ('Model', 'string'),
 ('Type', 'string'),
 ('Origin', 'string'),
 ('DriveTrain', 'string'),
 ('Length', 'string')]

In [12]:
# Summary statistics (count, mean, stddev, min, max) on the schema-inferred
# DataFrame. mean/stddev are null for string columns and numeric for Length.
df_spark_cars1.describe().show()

+-------+-----+----------+------+------+----------+------------------+
|summary| Make|     Model|  Type|Origin|DriveTrain|            Length|
+-------+-----+----------+------+------+----------+------------------+
|  count|  428|       428|   428|   428|       428|               428|
|   mean| null|      null|  null|  null|      null|3577.9532710280373|
| stddev| null|      null|  null|  null|      null| 758.9832146098707|
|    min|Acura|3.5 RL 4dr|Hybrid|  Asia|       All|              1850|
|    max|Volvo|        xB| Wagon|   USA|      Rear|              7190|
+-------+-----+----------+------+------+----------+------------------+



#### Adding columns in dataframe

In [13]:
# Preview a derived 'Price' column computed as twice 'Length'. The result of
# withColumn is only displayed here, not assigned, so df_spark_cars1 itself
# keeps its original columns.
(
    df_spark_cars1
    .withColumn('Price', df_spark_cars1['Length'] * 2)
    .show()
)

+-----+--------------------+------+------+----------+------+-----+
| Make|               Model|  Type|Origin|DriveTrain|Length|Price|
+-----+--------------------+------+------+----------+------+-----+
|Acura|                 MDX|   SUV|  Asia|       All|  4451| 8902|
|Acura|      RSX Type S 2dr| Sedan|  Asia|     Front|  2778| 5556|
|Acura|             TSX 4dr| Sedan|  Asia|     Front|  3230| 6460|
|Acura|              TL 4dr| Sedan|  Asia|     Front|  3575| 7150|
|Acura|          3.5 RL 4dr| Sedan|  Asia|     Front|  3880| 7760|
|Acura|3.5 RL w/Navigati...| Sedan|  Asia|     Front|  3893| 7786|
|Acura|NSX coupe 2dr man...|Sports|  Asia|      Rear|  3153| 6306|
| Audi|         A4 1.8T 4dr| Sedan|Europe|     Front|  3252| 6504|
| Audi|A41.8T convertibl...| Sedan|Europe|     Front|  3638| 7276|
| Audi|          A4 3.0 4dr| Sedan|Europe|     Front|  3462| 6924|
| Audi|A4 3.0 Quattro 4d...| Sedan|Europe|       All|  3583| 7166|
| Audi|A4 3.0 Quattro 4d...| Sedan|Europe|       All|  3627| 7

In [14]:
# Show that the previous cell did not modify df_spark_cars1: there is no 'Price' column.
df_spark_cars1.show()

+-----+--------------------+------+------+----------+------+
| Make|               Model|  Type|Origin|DriveTrain|Length|
+-----+--------------------+------+------+----------+------+
|Acura|                 MDX|   SUV|  Asia|       All|  4451|
|Acura|      RSX Type S 2dr| Sedan|  Asia|     Front|  2778|
|Acura|             TSX 4dr| Sedan|  Asia|     Front|  3230|
|Acura|              TL 4dr| Sedan|  Asia|     Front|  3575|
|Acura|          3.5 RL 4dr| Sedan|  Asia|     Front|  3880|
|Acura|3.5 RL w/Navigati...| Sedan|  Asia|     Front|  3893|
|Acura|NSX coupe 2dr man...|Sports|  Asia|      Rear|  3153|
| Audi|         A4 1.8T 4dr| Sedan|Europe|     Front|  3252|
| Audi|A41.8T convertibl...| Sedan|Europe|     Front|  3638|
| Audi|          A4 3.0 4dr| Sedan|Europe|     Front|  3462|
| Audi|A4 3.0 Quattro 4d...| Sedan|Europe|       All|  3583|
| Audi|A4 3.0 Quattro 4d...| Sedan|Europe|       All|  3627|
| Audi|          A6 3.0 4dr| Sedan|Europe|     Front|  3561|
| Audi|  A6 3.0 Quattro 

In [15]:
# Keep the derived 'Price' column by rebinding df_spark_cars1 to the DataFrame
# that withColumn returns.
# NOTE(review): a later cell renames 'Length' to 'Real Length'; re-running this
# cell after that rename would presumably fail because 'Length' no longer
# exists — confirm, and prefer Restart & Run All when re-executing.
df_spark_cars1 = df_spark_cars1.withColumn('Price', df_spark_cars1['Length']*2)

#### Drop the columns

In [17]:
# Remove the 'Price' column again; the result of drop() is assigned back to
# the same name so the change sticks.
df_spark_cars1 = df_spark_cars1.drop('Price')

In [18]:
# Verify the drop: the table is back to the original six columns.
df_spark_cars1.show()

+-----+--------------------+------+------+----------+------+
| Make|               Model|  Type|Origin|DriveTrain|Length|
+-----+--------------------+------+------+----------+------+
|Acura|                 MDX|   SUV|  Asia|       All|  4451|
|Acura|      RSX Type S 2dr| Sedan|  Asia|     Front|  2778|
|Acura|             TSX 4dr| Sedan|  Asia|     Front|  3230|
|Acura|              TL 4dr| Sedan|  Asia|     Front|  3575|
|Acura|          3.5 RL 4dr| Sedan|  Asia|     Front|  3880|
|Acura|3.5 RL w/Navigati...| Sedan|  Asia|     Front|  3893|
|Acura|NSX coupe 2dr man...|Sports|  Asia|      Rear|  3153|
| Audi|         A4 1.8T 4dr| Sedan|Europe|     Front|  3252|
| Audi|A41.8T convertibl...| Sedan|Europe|     Front|  3638|
| Audi|          A4 3.0 4dr| Sedan|Europe|     Front|  3462|
| Audi|A4 3.0 Quattro 4d...| Sedan|Europe|       All|  3583|
| Audi|A4 3.0 Quattro 4d...| Sedan|Europe|       All|  3627|
| Audi|          A6 3.0 4dr| Sedan|Europe|     Front|  3561|
| Audi|  A6 3.0 Quattro 

#### Rename columns

In [20]:
# Rename the 'Length' column to 'Real Length', rebinding df_spark_cars1 to the
# renamed DataFrame. From this point on, 'Length' is no longer a valid column name.
df_spark_cars1 = df_spark_cars1.withColumnRenamed('Length', 'Real Length')

In [21]:
# Verify the rename: the last column header now reads 'Real Length'.
df_spark_cars1.show()

+-----+--------------------+------+------+----------+-----------+
| Make|               Model|  Type|Origin|DriveTrain|Real Length|
+-----+--------------------+------+------+----------+-----------+
|Acura|                 MDX|   SUV|  Asia|       All|       4451|
|Acura|      RSX Type S 2dr| Sedan|  Asia|     Front|       2778|
|Acura|             TSX 4dr| Sedan|  Asia|     Front|       3230|
|Acura|              TL 4dr| Sedan|  Asia|     Front|       3575|
|Acura|          3.5 RL 4dr| Sedan|  Asia|     Front|       3880|
|Acura|3.5 RL w/Navigati...| Sedan|  Asia|     Front|       3893|
|Acura|NSX coupe 2dr man...|Sports|  Asia|      Rear|       3153|
| Audi|         A4 1.8T 4dr| Sedan|Europe|     Front|       3252|
| Audi|A41.8T convertibl...| Sedan|Europe|     Front|       3638|
| Audi|          A4 3.0 4dr| Sedan|Europe|     Front|       3462|
| Audi|A4 3.0 Quattro 4d...| Sedan|Europe|       All|       3583|
| Audi|A4 3.0 Quattro 4d...| Sedan|Europe|       All|       3627|
| Audi|   