<a href="https://colab.research.google.com/github/BhanuSaketh/PySpark/blob/main/PySpark.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**Data Loading**

In [2]:
import pyspark

In [3]:
import pandas as pd
# Load the CSV with pandas and check the resulting object type.
df = pd.read_csv("test1.csv")
type(df)

In [4]:
from pyspark.sql import SparkSession

In [5]:
# Get (or create) the Spark session used by the rest of the notebook.
spark = (
    SparkSession.builder
    .appName("Practise")
    .getOrCreate()
)

In [6]:
# Display the session object as the cell output.
spark

In [36]:
# Read the CSV into a Spark DataFrame: the first line is treated as the
# header and column types are inferred from the data.
df_spark = spark.read.csv("test1.csv", header=True, inferSchema=True)
type(df_spark)

In [8]:
# Print the loaded DataFrame as a text table; missing values appear as NULL.
df_spark.show()

+-------+----+----------+------+
|   Name| Age|Experience|Salary|
+-------+----+----------+------+
|    Ram|  30|        10| 30000|
|   Sita|  31|         8| 25000|
|Hanuman|  29|         4| 20000|
|  Bhanu|  24|         3| 20000|
| Saketh|  21|         1| 15000|
|   Hari|  23|         2| 18000|
|  Sunny|NULL|      NULL| 40000|
|   NULL|  34|        10| 38000|
|   NULL|  36|      NULL|  NULL|
+-------+----+----------+------+



In [9]:
# Return the first three rows as a list of Row objects.
df_spark.head(3)

[Row(Name='Ram', Age=30, Experience=10, Salary=30000),
 Row(Name='Sita', Age=31, Experience=8, Salary=25000),
 Row(Name='Hanuman', Age=29, Experience=4, Salary=20000)]

In [10]:
# Print each column's name, data type and nullability.
df_spark.printSchema()

root
 |-- Name: string (nullable = true)
 |-- Age: integer (nullable = true)
 |-- Experience: integer (nullable = true)
 |-- Salary: integer (nullable = true)



In [11]:
# List the column names.
df_spark.columns

['Name', 'Age', 'Experience', 'Salary']

In [12]:
# Project only the Name and Experience columns and print them.
df_spark.select(["Name", "Experience"]).show()

+-------+----------+
|   Name|Experience|
+-------+----------+
|    Ram|        10|
|   Sita|         8|
|Hanuman|         4|
|  Bhanu|         3|
| Saketh|         1|
|   Hari|         2|
|  Sunny|      NULL|
|   NULL|        10|
|   NULL|      NULL|
+-------+----------+



In [13]:
# List (column name, type name) pairs.
df_spark.dtypes

[('Name', 'string'), ('Age', 'int'), ('Experience', 'int'), ('Salary', 'int')]

**Data Preprocessing**

In [14]:
# Summary statistics per column (count, mean, stddev, min, max).
# count is the number of non-NULL values in that column.
df_spark.describe().show()

+-------+-----+------------------+------------------+-----------------+
|summary| Name|               Age|        Experience|           Salary|
+-------+-----+------------------+------------------+-----------------+
|  count|    7|                 8|                 7|                8|
|   mean| NULL|              28.5| 5.428571428571429|          25750.0|
| stddev| NULL|5.3718844791323335|3.8234863173611093|9361.776388210581|
|    min|Bhanu|                21|                 1|            15000|
|    max|Sunny|                36|                10|            40000|
+-------+-----+------------------+------------------+-----------------+



In [15]:
# Add a derived column computed from Experience.
df_spark = df_spark.withColumn(
    "Experience After 2 years",
    df_spark["Experience"] + 2,
)

In [16]:
# Print the DataFrame to inspect the newly added column.
df_spark.show()

+-------+----+----------+------+------------------------+
|   Name| Age|Experience|Salary|Experience After 2 years|
+-------+----+----------+------+------------------------+
|    Ram|  30|        10| 30000|                      12|
|   Sita|  31|         8| 25000|                      10|
|Hanuman|  29|         4| 20000|                       6|
|  Bhanu|  24|         3| 20000|                       5|
| Saketh|  21|         1| 15000|                       3|
|   Hari|  23|         2| 18000|                       4|
|  Sunny|NULL|      NULL| 40000|                    NULL|
|   NULL|  34|        10| 38000|                      12|
|   NULL|  36|      NULL|  NULL|                    NULL|
+-------+----+----------+------+------------------------+



In [17]:
# Remove the derived column again and rebind df_spark to the result.
df_spark = df_spark.drop("Experience After 2 years")

In [18]:
# Print the DataFrame to confirm the column was removed.
df_spark.show()

+-------+----+----------+------+
|   Name| Age|Experience|Salary|
+-------+----+----------+------+
|    Ram|  30|        10| 30000|
|   Sita|  31|         8| 25000|
|Hanuman|  29|         4| 20000|
|  Bhanu|  24|         3| 20000|
| Saketh|  21|         1| 15000|
|   Hari|  23|         2| 18000|
|  Sunny|NULL|      NULL| 40000|
|   NULL|  34|        10| 38000|
|   NULL|  36|      NULL|  NULL|
+-------+----+----------+------+



In [19]:
# Preview the frame with "Name" renamed to "New Name".
# The result is only displayed, not assigned, so df_spark keeps its original column name.
df_spark.withColumnRenamed("Name","New Name").show()

+--------+----+----------+------+
|New Name| Age|Experience|Salary|
+--------+----+----------+------+
|     Ram|  30|        10| 30000|
|    Sita|  31|         8| 25000|
| Hanuman|  29|         4| 20000|
|   Bhanu|  24|         3| 20000|
|  Saketh|  21|         1| 15000|
|    Hari|  23|         2| 18000|
|   Sunny|NULL|      NULL| 40000|
|    NULL|  34|        10| 38000|
|    NULL|  36|      NULL|  NULL|
+--------+----+----------+------+



In [21]:
# Print the current DataFrame before the NULL-handling examples.
df_spark.show()

+-------+----+----------+------+
|   Name| Age|Experience|Salary|
+-------+----+----------+------+
|    Ram|  30|        10| 30000|
|   Sita|  31|         8| 25000|
|Hanuman|  29|         4| 20000|
|  Bhanu|  24|         3| 20000|
| Saketh|  21|         1| 15000|
|   Hari|  23|         2| 18000|
|  Sunny|NULL|      NULL| 40000|
|   NULL|  34|        10| 38000|
|   NULL|  36|      NULL|  NULL|
+-------+----+----------+------+



In [22]:
# With no arguments, only rows that have no NULL in any column are kept.
df_spark.na.drop().show()

+-------+---+----------+------+
|   Name|Age|Experience|Salary|
+-------+---+----------+------+
|    Ram| 30|        10| 30000|
|   Sita| 31|         8| 25000|
|Hanuman| 29|         4| 20000|
|  Bhanu| 24|         3| 20000|
| Saketh| 21|         1| 15000|
|   Hari| 23|         2| 18000|
+-------+---+----------+------+



In [29]:
## na.drop accepts three options: how, thresh and subset.
## how='any' drops a row if any column is NULL; how='all' drops it only if every column is NULL.
## thresh=n keeps only rows with at least n non-NULL values. When thresh is given it overrides
## how, so how is not passed together with thresh below.
## subset restricts the NULL check to the listed columns.
df_spark.na.drop(thresh=2).show()
df_spark.na.drop(how='any',subset=['Age']).show()

+-------+----+----------+------+
|   Name| Age|Experience|Salary|
+-------+----+----------+------+
|    Ram|  30|        10| 30000|
|   Sita|  31|         8| 25000|
|Hanuman|  29|         4| 20000|
|  Bhanu|  24|         3| 20000|
| Saketh|  21|         1| 15000|
|   Hari|  23|         2| 18000|
|  Sunny|NULL|      NULL| 40000|
|   NULL|  34|        10| 38000|
+-------+----+----------+------+

+-------+---+----------+------+
|   Name|Age|Experience|Salary|
+-------+---+----------+------+
|    Ram| 30|        10| 30000|
|   Sita| 31|         8| 25000|
|Hanuman| 29|         4| 20000|
|  Bhanu| 24|         3| 20000|
| Saketh| 21|         1| 15000|
|   Hari| 23|         2| 18000|
|   NULL| 34|        10| 38000|
|   NULL| 36|      NULL|  NULL|
+-------+---+----------+------+



In [46]:
# A string fill value replaces NULLs only in string columns (here: Name);
# the numeric columns are left untouched.
df_spark.na.fill("Missing Values").show()
# A numeric fill value replaces NULLs only in numeric columns; Name stays NULL.
df_spark.na.fill(-1).show()

+--------------+----+----------+------+
|          Name| Age|Experience|Salary|
+--------------+----+----------+------+
|           Ram|  30|        10| 30000|
|          Sita|  31|         8| 25000|
|       Hanuman|  29|         4| 20000|
|         Bhanu|  24|         3| 20000|
|        Saketh|  21|         1| 15000|
|          Hari|  23|         2| 18000|
|         Sunny|NULL|      NULL| 40000|
|Missing Values|  34|        10| 38000|
|Missing Values|  36|      NULL|  NULL|
+--------------+----+----------+------+

+-------+---+----------+------+
|   Name|Age|Experience|Salary|
+-------+---+----------+------+
|    Ram| 30|        10| 30000|
|   Sita| 31|         8| 25000|
|Hanuman| 29|         4| 20000|
|  Bhanu| 24|         3| 20000|
| Saketh| 21|         1| 15000|
|   Hari| 23|         2| 18000|
|  Sunny| -1|        -1| 40000|
|   NULL| 34|        10| 38000|
|   NULL| 36|        -1|    -1|
+-------+---+----------+------+



In [47]:
from pyspark.ml.feature import Imputer

# Mean imputer for the numeric columns; each result is written to "<column>_imputed".
imputer = Imputer(
    strategy="mean",
    inputCols=["Age", "Experience", "Salary"],
    outputCols=[f"{c}_imputed" for c in ["Age", "Experience", "Salary"]],
)

In [48]:
## Fit the imputer on df_spark and show the frame with the *_imputed columns appended.
## The result is only displayed, not assigned, so df_spark itself is unchanged.
## NOTE(review): imputed values appear as whole numbers (e.g. 28 where the Age mean is 28.5),
## presumably because the output keeps the integer column type; confirm.
imputer.fit(df_spark).transform(df_spark).show()

+-------+----+----------+------+-----------+------------------+--------------+
|   Name| Age|Experience|Salary|Age_imputed|Experience_imputed|Salary_imputed|
+-------+----+----------+------+-----------+------------------+--------------+
|    Ram|  30|        10| 30000|         30|                10|         30000|
|   Sita|  31|         8| 25000|         31|                 8|         25000|
|Hanuman|  29|         4| 20000|         29|                 4|         20000|
|  Bhanu|  24|         3| 20000|         24|                 3|         20000|
| Saketh|  21|         1| 15000|         21|                 1|         15000|
|   Hari|  23|         2| 18000|         23|                 2|         18000|
|  Sunny|NULL|      NULL| 40000|         28|                 5|         40000|
|   NULL|  34|        10| 38000|         34|                10|         38000|
|   NULL|  36|      NULL|  NULL|         36|                 5|         25750|
+-------+----+----------+------+-----------+------------------+--------------+

In [49]:
# Filter using a SQL expression string: keep rows whose Salary is at most 20000.
df_spark.filter("Salary<=20000").show()

+-------+---+----------+------+
|   Name|Age|Experience|Salary|
+-------+---+----------+------+
|Hanuman| 29|         4| 20000|
|  Bhanu| 24|         3| 20000|
| Saketh| 21|         1| 15000|
|   Hari| 23|         2| 18000|
+-------+---+----------+------+



In [50]:
# Apply the Salary filter, then keep only the Name and Age columns.
df_spark.filter("Salary<=20000").select(["Name","Age"]).show()

+-------+---+
|   Name|Age|
+-------+---+
|Hanuman| 29|
|  Bhanu| 24|
| Saketh| 21|
|   Hari| 23|
+-------+---+



In [52]:
# Combine two Column conditions with & (each side must be parenthesised):
# keep rows with 15000 <= Salary <= 20000, then show Name and Age.
df_spark.filter(
    (df_spark["Salary"] >= 15000) & (df_spark["Salary"] <= 20000)
).select(["Name", "Age"]).show()

+-------+---+
|   Name|Age|
+-------+---+
|Hanuman| 29|
|  Bhanu| 24|
| Saketh| 21|
|   Hari| 23|
+-------+---+



In [53]:
# ~ negates a Column condition: keep rows where Salary is NOT <= 20000.
# A row whose Salary is NULL matches neither the condition nor its negation,
# so it is not returned here.
df_spark.filter(~(df_spark["Salary"]<=20000)).show()

+-----+----+----------+------+
| Name| Age|Experience|Salary|
+-----+----+----------+------+
|  Ram|  30|        10| 30000|
| Sita|  31|         8| 25000|
|Sunny|NULL|      NULL| 40000|
| NULL|  34|        10| 38000|
+-----+----+----------+------+

