In [1]:
import pyspark

In [2]:
from pyspark.sql import SparkSession

In [3]:
# Get (or create) the Spark session used by every cell below,
# registered under the application name 'Part I - Learning'.
spark = (
    SparkSession.builder
    .appName('Part I - Learning')
    .getOrCreate()
)

In [4]:
# Bare expression: show the session object as this cell's output.
spark

In [5]:
# Way 1 of loading a CSV: configure the reader through chained .option()
# calls, then read the file. The first row supplies the column names and
# the column types are inferred from the data.
df = (
    spark.read
    .option('header', 'True')
    .option('inferSchema', True)
    .csv('example2.csv')
)

# show() functionality

In [7]:
# Print the DataFrame as a text table using show()'s default arguments.
df.show()

+----+---+----------+
|Name|age|Experience|
+----+---+----------+
|  x1| 10|         1|
|  x2| 20|         2|
|  x3| 30|         3|
|  x4| 40|         4|
|  x5| 50|         5|
|  x6| 60|         6|
|  x7| 70|         7|
|  x8| 80|         8|
|  x9| 90|         9|
| x10|100|        10|
+----+---+----------+



In [8]:
# Limit the printed table to the first 3 rows.
df.show(3)

+----+---+----------+
|Name|age|Experience|
+----+---+----------+
|  x1| 10|         1|
|  x2| 20|         2|
|  x3| 30|         3|
+----+---+----------+
only showing top 3 rows



In [9]:
# truncate=1 cuts every displayed value down to its first character
# (e.g. 'x10' -> 'x', 100 -> '1'); only the printout is affected.
df.show(truncate=1)

+----+---+----------+
|Name|age|Experience|
+----+---+----------+
|   x|  1|         1|
|   x|  2|         2|
|   x|  3|         3|
|   x|  4|         4|
|   x|  5|         5|
|   x|  6|         6|
|   x|  7|         7|
|   x|  8|         8|
|   x|  9|         9|
|   x|  1|         1|
+----+---+----------+



In [10]:
# vertical=True prints each of the first 3 rows as its own
# "RECORD n" block with one "column | value" line per field.
df.show(3,vertical=True)

-RECORD 0---------
 Name       | x1  
 age        | 10  
 Experience | 1   
-RECORD 1---------
 Name       | x2  
 age        | 20  
 Experience | 2   
-RECORD 2---------
 Name       | x3  
 age        | 30  
 Experience | 3   
only showing top 3 rows



In [11]:
# Print the column names, their inferred types and nullability as a tree.
df.printSchema()

root
 |-- Name: string (nullable = true)
 |-- age: integer (nullable = true)
 |-- Experience: integer (nullable = true)



# Multiple ways to import Data

In [13]:
# Way 2 of loading a CSV: pass the reader settings directly to csv()
# as keyword arguments instead of chaining .option() calls.
data = spark.read.csv(
    'example1.csv',
    header=True,
    inferSchema=True,
)

In [14]:
# Print the DataFrame loaded from example1.csv.
data.show()

+----+---+
|Name|age|
+----+---+
|  x1| 10|
|  x2| 20|
|  x3| 30|
|  x4| 40|
|  x5| 50|
+----+---+



In [15]:
# Confirm that the CSV reader returned a PySpark DataFrame.
type(data)

pyspark.sql.dataframe.DataFrame

In [16]:
# Column names of df (the example2.csv frame) as a plain Python list.
df.columns

['Name', 'age', 'Experience']

In [17]:
# Integer indexing selects a column by position: index 0 is 'Name'.
# The result is a Column object, not the column's data.
df[0]

Column<'Name'>

In [18]:
# Project only the 'Name' column and print its first 5 rows.
df.select('Name').show(5)

+----+
|Name|
+----+
|  x1|
|  x2|
|  x3|
|  x4|
|  x5|
+----+
only showing top 5 rows



In [19]:
# Same projection as the previous cell, limited to the first 3 rows.
df.select('Name').show(3)

+----+
|Name|
+----+
|  x1|
|  x2|
|  x3|
+----+
only showing top 3 rows



In [20]:
# List of (column name, type name) pairs for df.
df.dtypes

[('Name', 'string'), ('age', 'int'), ('Experience', 'int')]

In [21]:
# Summary statistics (count, mean, stddev, min, max) for every column.
# 'Name' is a string column, so its mean/stddev are NULL and its min/max
# are compared as strings, which is why max is 'x9' rather than 'x10'.
df.describe().show()

+-------+----+------------------+------------------+
|summary|Name|               age|        Experience|
+-------+----+------------------+------------------+
|  count|  10|                10|                10|
|   mean|NULL|              55.0|               5.5|
| stddev|NULL|30.276503540974915|3.0276503540974917|
|    min|  x1|                10|                 1|
|    max|  x9|               100|                10|
+-------+----+------------------+------------------+



# Add Columns

In [23]:
# Build data_2 from df with one extra column whose value is
# each row's Experience plus 3.
data_2 = df.withColumn(
    'Experience after 3 years',
    df['Experience'] + 3,
)

In [24]:
# Print data_2 to confirm the new column was appended on the right.
data_2.show()

+----+---+----------+------------------------+
|Name|age|Experience|Experience after 3 years|
+----+---+----------+------------------------+
|  x1| 10|         1|                       4|
|  x2| 20|         2|                       5|
|  x3| 30|         3|                       6|
|  x4| 40|         4|                       7|
|  x5| 50|         5|                       8|
|  x6| 60|         6|                       9|
|  x7| 70|         7|                      10|
|  x8| 80|         8|                      11|
|  x9| 90|         9|                      12|
| x10|100|        10|                      13|
+----+---+----------+------------------------+



# Drop and Rename Columns

In [26]:
# Remove the derived column again. The result is rebound to data_2,
# so from here on data_2 no longer has 'Experience after 3 years'.
data_2 = data_2.drop('Experience after 3 years')

In [27]:
# Print data_2 to confirm it is back to the original three columns.
data_2.show()

+----+---+----------+
|Name|age|Experience|
+----+---+----------+
|  x1| 10|         1|
|  x2| 20|         2|
|  x3| 30|         3|
|  x4| 40|         4|
|  x5| 50|         5|
|  x6| 60|         6|
|  x7| 70|         7|
|  x8| 80|         8|
|  x9| 90|         9|
| x10|100|        10|
+----+---+----------+



In [28]:
# Rename column 'age' to 'Person_age'; the values are left as they were.
# The result is rebound to data_2, replacing the previous frame.
data_2 = data_2.withColumnRenamed('age','Person_age')

In [29]:
# Print data_2 to confirm the header now reads 'Person_age'.
data_2.show()

+----+----------+----------+
|Name|Person_age|Experience|
+----+----------+----------+
|  x1|        10|         1|
|  x2|        20|         2|
|  x3|        30|         3|
|  x4|        40|         4|
|  x5|        50|         5|
|  x6|        60|         6|
|  x7|        70|         7|
|  x8|        80|         8|
|  x9|        90|         9|
| x10|       100|        10|
+----+----------+----------+

