# PySpark session

In [24]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as f

In [4]:
# Build (or fetch the already-running) Spark session, registered under the
# application name "DataFrame".
spark = (
    SparkSession.builder
    .appName("DataFrame")
    .getOrCreate()
)

## Read data frame

In [10]:

# Load base_exp.csv: the first line is treated as the header row and the
# column types are inferred from the data.
df = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv("base_exp.csv")
)

## Cast `age` to integer and check the schema

In [26]:
# Replace "age" with an integer version of itself.
# NOTE: the saved outputs of the later show()/head()/select() cells still
# print 24.0 because they were captured at lower execution counts
# (In [12]-In [20]), i.e. before this cell (In [26]) was run.
df = df.withColumn("age", df["age"].cast("Integer"))

In [27]:
# Print each column's name, type and nullability as a tree.
df.printSchema()

root
 |-- name: string (nullable = true)
 |-- age: integer (nullable = true)
 |-- experience: integer (nullable = true)



## Show the data

In [12]:
# Print the rows as a text table. The saved output shows age as 24.0
# because this cell was last run (In [12]) before the integer cast (In [26]).
df.show()

+------+----+----------+
|  name| age|experience|
+------+----+----------+
| Roger|24.0|         2|
| Marie|40.0|        10|
|Fatima|47.0|         7|
|  Alex|26.0|         3|
| sunny|24.0|         2|
+------+----+----------+



In [15]:
# Return the first 2 rows as a list of Row objects.
df.head(2)

[Row(name='Roger', age=24.0, experience=2),
 Row(name='Marie', age=40.0, experience=10)]

In [13]:
# List the column names, in order.
df.columns

['name', 'age', 'experience']

In [20]:
# Project two columns and print them. The selected column is labelled with
# the spelling passed here ("Experience"), as the output header shows.
df.select(["age", "Experience"]).show()

+----+----------+
| age|Experience|
+----+----------+
|24.0|         2|
|40.0|        10|
|47.0|         7|
|26.0|         3|
|24.0|         2|
+----+----------+



## Description

In [31]:
# On its own, describe() only yields another DataFrame (see the repr below);
# the statistics are not printed until .show() is called on it.
df.describe()

DataFrame[summary: string, name: string, age: string, experience: string]

In [30]:
# Print count, mean, stddev, min and max for every column; mean and stddev
# are NULL for the string column "name".
df.describe().show()

+-------+-----+------------------+------------------+
|summary| name|               age|        experience|
+-------+-----+------------------+------------------+
|  count|    5|                 5|                 5|
|   mean| NULL|              32.2|               4.8|
| stddev| NULL|10.639548862616309|3.5637059362410923|
|    min| Alex|                24|                 2|
|    max|sunny|                47|                10|
+-------+-----+------------------+------------------+



## Add columns to the DataFrame

In [32]:
# Add a column holding each row's experience plus 2.
# The source column is referenced by its exact name ("experience", as listed
# by df.columns) so the lookup does not depend on Spark's case-insensitive
# column resolution, which is only the default setting.
df = df.withColumn("Exp_after_2y", f.col("experience") + 2)

In [34]:
# Display the DataFrame, now including the Exp_after_2y column.
df.show()

+------+---+----------+------------+
|  name|age|experience|Exp_after_2y|
+------+---+----------+------------+
| Roger| 24|         2|           4|
| Marie| 40|        10|          12|
|Fatima| 47|         7|           9|
|  Alex| 26|         3|           5|
| sunny| 24|         2|           4|
+------+---+----------+------------+



## Drop columns

In [35]:
# Remove the Exp_after_2y column again; df is rebound to the result.
df = df.drop("Exp_after_2y")

## Rename column

In [36]:
# Rename the "name" column to "first_name"; all other columns are untouched.
df = df.withColumnRenamed(existing="name", new="first_name")

In [37]:
# Display the result: the "name" column now appears as "first_name".
df.show()

+----------+---+----------+
|first_name|age|experience|
+----------+---+----------+
|     Roger| 24|         2|
|     Marie| 40|        10|
|    Fatima| 47|         7|
|      Alex| 26|         3|
|     sunny| 24|         2|
+----------+---+----------+

