In [1]:
!pip install pyspark



In [2]:
import pandas as pd
from pyspark.ml.feature import Imputer
from pyspark.sql import SparkSession

In [3]:
# Create the SparkSession for this notebook, or reuse one that already exists.
spark = (
    SparkSession.builder
    .appName("practices")
    .getOrCreate()
)

In [4]:
# Evaluate the session as the cell's last expression so the notebook displays it.
spark

In [5]:
# Print the Spark version backing this session.
print(spark.version)

3.5.2


In [6]:
# For comparison: pandas loads the same file into a pandas DataFrame,
# a different type from the Spark DataFrame created by spark.read.
type(pd.read_csv("Chennai house data.csv")) 

pandas.core.frame.DataFrame

# Reading the CSV and using its header row as the column names

In [37]:
# Read the CSV with its first row as the header and let Spark infer column types.
df_spark = (
    spark.read
    .option("Header", "True")
    .csv("Chennai house data.csv", inferSchema=True)
)
type(df_spark)

pyspark.sql.dataframe.DataFrame

In [38]:
# Preview the first 5 rows; long string values are truncated in the printed table.
df_spark.show(5)

+-----+----+------------------+---+--------+----+----------+--------------------+
|price|area|            status|bhk|bathroom| age|  location|             builder|
+-----+----+------------------+---+--------+----+----------+--------------------+
|37.49| 872|     Ready to move|  2|    NULL|   1| Sembakkam|       MP Developers|
|93.54|1346|Under Construction|  3|       2|NULL|  Selaiyur|       DAC Promoters|
|151.0|2225|Under Construction|  3|    NULL|   0| Mogappair|Casagrand Builder...|
| 49.0|1028|     Ready to move|  2|       2|   3|  Ambattur|Dugar Housing Bui...|
|42.28| 588|Under Construction|  2|       1|   0|Pallavaram|Radiance Realty D...|
+-----+----+------------------+---+--------+----+----------+--------------------+
only showing top 5 rows



In [39]:
# Print each column's inferred type and nullability (similar in spirit to pandas' df.info()).
df_spark.printSchema()

root
 |-- price: double (nullable = true)
 |-- area: integer (nullable = true)
 |-- status: string (nullable = true)
 |-- bhk: integer (nullable = true)
 |-- bathroom: integer (nullable = true)
 |-- age: integer (nullable = true)
 |-- location: string (nullable = true)
 |-- builder: string (nullable = true)



# Another way to read the dataset

In [40]:
# Same load as before, this time passing the reader options as keyword arguments.
df_spark = spark.read.csv(
    "Chennai house data.csv",
    header=True,
    inferSchema=True,
)

In [41]:
# Preview the first 5 rows to confirm this read gives the same data as the previous one.
df_spark.show(5)

+-----+----+------------------+---+--------+----+----------+--------------------+
|price|area|            status|bhk|bathroom| age|  location|             builder|
+-----+----+------------------+---+--------+----+----------+--------------------+
|37.49| 872|     Ready to move|  2|    NULL|   1| Sembakkam|       MP Developers|
|93.54|1346|Under Construction|  3|       2|NULL|  Selaiyur|       DAC Promoters|
|151.0|2225|Under Construction|  3|    NULL|   0| Mogappair|Casagrand Builder...|
| 49.0|1028|     Ready to move|  2|       2|   3|  Ambattur|Dugar Housing Bui...|
|42.28| 588|Under Construction|  2|       1|   0|Pallavaram|Radiance Realty D...|
+-----+----+------------------+---+--------+----+----------+--------------------+
only showing top 5 rows



In [42]:
# Confirm the inferred schema matches the one produced by the option()-based read.
df_spark.printSchema()    

root
 |-- price: double (nullable = true)
 |-- area: integer (nullable = true)
 |-- status: string (nullable = true)
 |-- bhk: integer (nullable = true)
 |-- bathroom: integer (nullable = true)
 |-- age: integer (nullable = true)
 |-- location: string (nullable = true)
 |-- builder: string (nullable = true)



# What is a DataFrame?
A DataFrame is a kind of data structure on which we can perform various types of operations.

In [43]:
# List the column names of the DataFrame.
df_spark.columns

['price', 'area', 'status', 'bhk', 'bathroom', 'age', 'location', 'builder']

In [44]:
# select() returns a DataFrame containing only the named columns;
# it works the same for several columns or just one.
df_spark.select("price", "area").show(5)
df_spark.select("location").show(5)

+-----+----+
|price|area|
+-----+----+
|37.49| 872|
|93.54|1346|
|151.0|2225|
| 49.0|1028|
|42.28| 588|
+-----+----+
only showing top 5 rows

+----------+
|  location|
+----------+
| Sembakkam|
|  Selaiyur|
| Mogappair|
|  Ambattur|
|Pallavaram|
+----------+
only showing top 5 rows



In [45]:
# (column name, type) pairs for every column.
df_spark.dtypes

[('price', 'double'),
 ('area', 'int'),
 ('status', 'string'),
 ('bhk', 'int'),
 ('bathroom', 'int'),
 ('age', 'int'),
 ('location', 'string'),
 ('builder', 'string')]

In [46]:
# Summary statistics per column. A count lower than the others (bathroom, age)
# means that column contains missing values.
df_spark.describe().show(5)

+-------+------------------+------------------+------------------+------------------+------------------+------------------+-------------+------------+
|summary|             price|              area|            status|               bhk|          bathroom|               age|     location|     builder|
+-------+------------------+------------------+------------------+------------------+------------------+------------------+-------------+------------+
|  count|              2620|              2620|              2620|              2620|              1403|              1729|         2620|        2620|
|   mean| 93.83468320610666|1282.9251908396946|              NULL|2.4438931297709923| 2.359230220955096|1.3551185656448814|         NULL|        NULL|
| stddev|113.60934931337222| 692.5663187698266|              NULL|0.8119836251037046|0.8449514283614394|2.1026819845567295|         NULL|        NULL|
|    min|             12.83|               300|     Ready to move|                 1|         

# Adding a column

In [47]:
# withColumn returns a new DataFrame with the extra column; df_spark itself is
# left unchanged. Rows whose age is NULL stay NULL in the derived column.

dummy = df_spark.withColumn("Age after two year",df_spark["age"]+2)
dummy.show(5)

+-----+----+------------------+---+--------+----+----------+--------------------+------------------+
|price|area|            status|bhk|bathroom| age|  location|             builder|Age after two year|
+-----+----+------------------+---+--------+----+----------+--------------------+------------------+
|37.49| 872|     Ready to move|  2|    NULL|   1| Sembakkam|       MP Developers|                 3|
|93.54|1346|Under Construction|  3|       2|NULL|  Selaiyur|       DAC Promoters|              NULL|
|151.0|2225|Under Construction|  3|    NULL|   0| Mogappair|Casagrand Builder...|                 2|
| 49.0|1028|     Ready to move|  2|       2|   3|  Ambattur|Dugar Housing Bui...|                 5|
|42.28| 588|Under Construction|  2|       1|   0|Pallavaram|Radiance Realty D...|                 2|
+-----+----+------------------+---+--------+----+----------+--------------------+------------------+
only showing top 5 rows



# Dropping columns

In [48]:
# "Age after two year" was only ever added to `dummy`, never to df_spark, and
# DataFrame.drop() silently ignores column names that are not present, so
# listing it here was a no-op. Only "bathroom" actually needs to be dropped.
# Several columns can still be removed at once: drop("col_a", "col_b").
df_spark = df_spark.drop("bathroom")
df_spark.show(5)

+-----+----+------------------+---+----+----------+--------------------+
|price|area|            status|bhk| age|  location|             builder|
+-----+----+------------------+---+----+----------+--------------------+
|37.49| 872|     Ready to move|  2|   1| Sembakkam|       MP Developers|
|93.54|1346|Under Construction|  3|NULL|  Selaiyur|       DAC Promoters|
|151.0|2225|Under Construction|  3|   0| Mogappair|Casagrand Builder...|
| 49.0|1028|     Ready to move|  2|   3|  Ambattur|Dugar Housing Bui...|
|42.28| 588|Under Construction|  2|   0|Pallavaram|Radiance Realty D...|
+-----+----+------------------+---+----+----------+--------------------+
only showing top 5 rows



# renaming the columns

In [49]:
# withColumnRenamed returns a new DataFrame; the result is only displayed here,
# not assigned, so df_spark keeps the original lowercase "price" column.
df_spark.withColumnRenamed("price","Price").show(5)

+-----+----+------------------+---+----+----------+--------------------+
|Price|area|            status|bhk| age|  location|             builder|
+-----+----+------------------+---+----+----------+--------------------+
|37.49| 872|     Ready to move|  2|   1| Sembakkam|       MP Developers|
|93.54|1346|Under Construction|  3|NULL|  Selaiyur|       DAC Promoters|
|151.0|2225|Under Construction|  3|   0| Mogappair|Casagrand Builder...|
| 49.0|1028|     Ready to move|  2|   3|  Ambattur|Dugar Housing Bui...|
|42.28| 588|Under Construction|  2|   0|Pallavaram|Radiance Realty D...|
+-----+----+------------------+---+----+----------+--------------------+
only showing top 5 rows



# Handling missing values

In [50]:
# Fill missing `age` values with the column mean, writing the result back into
# the same column (Imputer is imported in the imports cell at the top).
# NOTE: `age` is an integer column and the filled value shows up as a whole
# number in the output (the previously NULL age becomes 1, not the ~1.36 mean
# reported by describe()). Cast `age` to double first if the exact mean matters.
imputer = Imputer(
    strategy="mean",
    inputCols=["age"],
    outputCols=["age"],
)

df = imputer.fit(df_spark).transform(df_spark)
df.show(15)

+-----+----+------------------+---+---+-------------+--------------------+
|price|area|            status|bhk|age|     location|             builder|
+-----+----+------------------+---+---+-------------+--------------------+
|37.49| 872|     Ready to move|  2|  1|    Sembakkam|       MP Developers|
|93.54|1346|Under Construction|  3|  1|     Selaiyur|       DAC Promoters|
|151.0|2225|Under Construction|  3|  0|    Mogappair|Casagrand Builder...|
| 49.0|1028|     Ready to move|  2|  3|     Ambattur|Dugar Housing Bui...|
|42.28| 588|Under Construction|  2|  0|   Pallavaram|Radiance Realty D...|
|188.0|2221|Under Construction|  3|  0|Virugambakkam|Traventure Homes ...|
| 38.0| 885|Under Construction|  3|  0|Thirumazhisai|           Urbanrise|
|72.99| 936|     Ready to move|  3|  6|   Moolakadai|Navin Housing Pro...|
|125.0|2275|     Ready to move|  4|  1|Ottiyambakkam|Jones foundation ...|
|24.56| 622|Under Construction|  2|  0|Perungalathur|          Isha Homes|
|67.87|1305|Under Constru

In [52]:
# Re-check the summary after imputation: the age count should now equal the
# row count, confirming no missing ages remain.
df.describe().show(5)

+-------+------------------+------------------+------------------+------------------+------------------+-------------+------------+
|summary|             price|              area|            status|               bhk|               age|     location|     builder|
+-------+------------------+------------------+------------------+------------------+------------------+-------------+------------+
|  count|              2620|              2620|              2620|              2620|              2620|         2620|        2620|
|   mean| 93.83468320610666|1282.9251908396946|              NULL|2.4438931297709923| 1.234351145038168|         NULL|        NULL|
| stddev|113.60934931337222| 692.5663187698266|              NULL|0.8119836251037046|1.7162285383748437|         NULL|        NULL|
|    min|             12.83|               300|     Ready to move|                 1|                 0|    Adambakam|24K Realtors|
|    max|            1422.0|              6700|Under Construction|          