In [1]:
# Build spark session

from pyspark.sql import SparkSession

In [2]:
# Create the Spark session for this notebook under the application name
# 'Dataframe', or reuse one that already exists.
spark = (
    SparkSession.builder
    .appName('Dataframe')
    .getOrCreate()
)

22/09/29 10:57:24 WARN Utils: Your hostname, EKTAs-MacBook-Air.local resolves to a loopback address: 127.0.0.1; using 192.168.100.4 instead (on interface en0)
22/09/29 10:57:24 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


22/09/29 10:57:24 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
22/09/29 10:57:24 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


In [3]:
# Bare expression: lets the notebook display the session object created above.
spark

## Read the CSV file into a DataFrame

In [7]:
# Read the CSV once, using its first line as the column names, and keep the
# result in df_pyspark so later cells can reuse it. No schema inference is
# requested here.
df_pyspark = spark.read.option('header', 'true').csv('Cricket.csv')

# Display the DataFrame that was just loaded instead of reading and parsing
# the file a second time.
df_pyspark.show()

+------+---+----------+
|  Name|Age|Experience|
+------+---+----------+
| Virat| 34|        10|
| Rohit| 35|        13|
|Pandya| 29|         5|
| Panth| 25|         2|
| Dhoni| 40|        19|
| Rahul| 31|         6|
+------+---+----------+



## Check the column data types (schema)

In [8]:
# Print each column's name, data type and nullability.
df_pyspark.printSchema()

root
 |-- Name: string (nullable = true)
 |-- Age: string (nullable = true)
 |-- Experience: string (nullable = true)



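Even though Age and Experience hold integers, the schema above shows that they were read as strings, because every column is loaded as a string by default. To have Spark infer the column types from the data, pass `inferSchema=True` when reading the file: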

In [10]:
# Re-read the same file with the header option, this time asking Spark to
# infer the column types, then print the resulting schema.
df_pyspark = (
    spark.read
    .option('header', 'true')
    .csv('Cricket.csv', inferSchema=True)
)
df_pyspark.printSchema()

root
 |-- Name: string (nullable = true)
 |-- Age: integer (nullable = true)
 |-- Experience: integer (nullable = true)



## Another way to read csv file in pyspark

In [11]:
# Same load as before, expressed purely through keyword arguments of csv()
# rather than chained .option() calls.
df_pyspark = spark.read.csv(
    'Cricket.csv',
    header=True,
    inferSchema=True,
)
df_pyspark.show()

+------+---+----------+
|  Name|Age|Experience|
+------+---+----------+
| Virat| 34|        10|
| Rohit| 35|        13|
|Pandya| 29|         5|
| Panth| 25|         2|
| Dhoni| 40|        19|
| Rahul| 31|         6|
+------+---+----------+



## Selecting columns and indexing

In [13]:
# List the column names of the DataFrame.
df_pyspark.columns

['Name', 'Age', 'Experience']

In [14]:
# Fetch the first 3 rows; the result is a Python list of Row objects.
df_pyspark.head(3)

[Row(Name='Virat', Age=34, Experience=10),
 Row(Name='Rohit', Age=35, Experience=13),
 Row(Name='Pandya', Age=29, Experience=5)]

In [16]:
# To view a single column only (here, the Name column), select it by name.

df_pyspark.select('Name').show()

+------+
|  Name|
+------+
| Virat|
| Rohit|
|Pandya|
| Panth|
| Dhoni|
| Rahul|
+------+



In [17]:
# Several columns can be selected in one call; here Name and Experience
# are passed as separate arguments.

df_pyspark.select('Name', 'Experience').show()

+------+----------+
|  Name|Experience|
+------+----------+
| Virat|        10|
| Rohit|        13|
|Pandya|         5|
| Panth|         2|
| Dhoni|        19|
| Rahul|         6|
+------+----------+



In [18]:
# List (column name, data type) pairs for every column.
df_pyspark.dtypes

[('Name', 'string'), ('Age', 'int'), ('Experience', 'int')]

## Summary statistics with `describe()` (similar to pandas)

In [21]:
# Show count, mean, stddev, min and max per column; for the string column
# Name, mean and stddev come back as null.
df_pyspark.describe().show()

+-------+-----+------------------+-----------------+
|summary| Name|               Age|       Experience|
+-------+-----+------------------+-----------------+
|  count|    6|                 6|                6|
|   mean| null|32.333333333333336|9.166666666666666|
| stddev| null| 5.202563470700445|6.177917664283546|
|    min|Dhoni|                25|                2|
|    max|Virat|                40|               19|
+-------+-----+------------------+-----------------+



## Adding columns

In [23]:
# Derive a new column holding each player's Experience plus 2. The DataFrame
# is built once and kept in df_pyspark for the following cells, then displayed,
# rather than evaluating the same withColumn expression twice.
df_pyspark = df_pyspark.withColumn('Experience after 2 years', df_pyspark['Experience'] + 2)
df_pyspark.show()

+------+---+----------+------------------------+
|  Name|Age|Experience|Experience after 2 years|
+------+---+----------+------------------------+
| Virat| 34|        10|                      12|
| Rohit| 35|        13|                      15|
|Pandya| 29|         5|                       7|
| Panth| 25|         2|                       4|
| Dhoni| 40|        19|                      21|
| Rahul| 31|         6|                       8|
+------+---+----------+------------------------+



In [24]:
# Confirm that df_pyspark itself now carries the added column.
df_pyspark.show()

+------+---+----------+------------------------+
|  Name|Age|Experience|Experience after 2 years|
+------+---+----------+------------------------+
| Virat| 34|        10|                      12|
| Rohit| 35|        13|                      15|
|Pandya| 29|         5|                       7|
| Panth| 25|         2|                       4|
| Dhoni| 40|        19|                      21|
| Rahul| 31|         6|                       8|
+------+---+----------+------------------------+



## Drop the columns

In [25]:
# Remove the derived column and keep the result in df_pyspark.
df_pyspark = df_pyspark.drop('Experience after 2 years')

# Show the updated DataFrame directly; dropping the same column a second
# time before displaying would be redundant because it is already gone.
df_pyspark.show()

+------+---+----------+
|  Name|Age|Experience|
+------+---+----------+
| Virat| 34|        10|
| Rohit| 35|        13|
|Pandya| 29|         5|
| Panth| 25|         2|
| Dhoni| 40|        19|
| Rahul| 31|         6|
+------+---+----------+



## Renaming the columns

In [26]:
# Display the data with the 'Name' column renamed to 'Participants'.
# The result is only shown here, not assigned back to df_pyspark.
df_pyspark.withColumnRenamed('Name','Participants').show()

+------------+---+----------+
|Participants|Age|Experience|
+------------+---+----------+
|       Virat| 34|        10|
|       Rohit| 35|        13|
|      Pandya| 29|         5|
|       Panth| 25|         2|
|       Dhoni| 40|        19|
|       Rahul| 31|         6|
+------------+---+----------+

