In [34]:
import pandas as pd
from pyspark.sql import SparkSession

In [36]:
# Preview the semicolon-separated sample file with pandas.
pd.read_csv("example_1.csv", sep=";")

Unnamed: 0,Name,Age,Experience
0,John,14,1
1,Paul,12,2
2,Bob,11,2


In [37]:
# Reuse the active Spark session if there is one; otherwise start a new one
# registered under the application name "example_1".
spark = (
    SparkSession.builder
    .appName("example_1")
    .getOrCreate()
)

In [38]:
# Load the semicolon-delimited CSV into a Spark DataFrame. The first row
# supplies the column names, and inferSchema asks Spark to derive column
# types from the data instead of reading every column as a string.
df = spark.read.csv("example_1.csv", sep=";", header=True, inferSchema=True)
df.show()

+----+---+----------+
|Name|Age|Experience|
+----+---+----------+
|John| 14|         1|
|Paul| 12|         2|
| Bob| 11|         2|
+----+---+----------+



In [39]:
# Print each column's name, type and nullability. Age and Experience come out
# as integers because the file was read with inferSchema=True.
df.printSchema()

root
 |-- Name: string (nullable = true)
 |-- Age: integer (nullable = true)
 |-- Experience: integer (nullable = true)



In [40]:
# Return the first two rows as a list of Row objects.
df.head(2)

[Row(Name='John', Age=14, Experience=1),
 Row(Name='Paul', Age=12, Experience=2)]

In [41]:
# Column names as a plain Python list of strings.
df.columns

['Name', 'Age', 'Experience']

In [42]:
# select() yields a new DataFrame containing only the requested column;
# show() prints it.
df.select("Name").show()

+----+
|Name|
+----+
|John|
|Paul|
| Bob|
+----+



In [43]:
# Indexing by name gives a Column object (a reference to the column), not the
# values themselves; use select(...).show() to see the data.
df["Name"]

Column<'Name'>

In [44]:
# Summary statistics (count, mean, stddev, min, max) per column. For the
# string column Name, mean and stddev are NULL and min/max compare strings.
df.describe().show()

+-------+----+------------------+------------------+
|summary|Name|               Age|        Experience|
+-------+----+------------------+------------------+
|  count|   3|                 3|                 3|
|   mean|NULL|12.333333333333334|1.6666666666666667|
| stddev|NULL|1.5275252316519468|0.5773502691896258|
|    min| Bob|                11|                 1|
|    max|Paul|                14|                 2|
+-------+----+------------------+------------------+



In [45]:
# Display a derived column holding Experience + 5. The result is not assigned
# back, so df itself stays as it was.
df.withColumn(
    "Experience after 5 years",
    df["Experience"] + 5,
).show()

+----+---+----------+------------------------+
|Name|Age|Experience|Experience after 5 years|
+----+---+----------+------------------------+
|John| 14|         1|                       6|
|Paul| 12|         2|                       7|
| Bob| 11|         2|                       7|
+----+---+----------+------------------------+



In [46]:
# Show df again: it still has only its original three columns, because the
# withColumn result in the previous cell was displayed but never assigned.
df.show()

+----+---+----------+
|Name|Age|Experience|
+----+---+----------+
|John| 14|         1|
|Paul| 12|         2|
| Bob| 11|         2|
+----+---+----------+



In [47]:
# Display df without the Experience column. The result is not assigned, so
# df is left unchanged.
df.drop("Experience").show()

+----+---+
|Name|Age|
+----+---+
|John| 14|
|Paul| 12|
| Bob| 11|
+----+---+



In [48]:
# Display df with the Name column renamed to "New Name". The result is not
# assigned, so df keeps its original column names.
df.withColumnRenamed("Name", "New Name").show()

+--------+---+----------+
|New Name|Age|Experience|
+--------+---+----------+
|    John| 14|         1|
|    Paul| 12|         2|
|     Bob| 11|         2|
+--------+---+----------+

