**PySpark is the Python API for Apache Spark, a distributed data processing framework. It is widely used for big data processing, machine learning, and real-time analytics due to its ability to scale and process large datasets efficiently.**

- PySpark installation
- Reading our Dataset
- Checking the data types of the columns (schema)
- Using `describe()` for summary statistics, similar to pandas
- Adding columns
- Dropping columns
- Renaming the columns


In [1]:
! pip install pyspark



In [2]:
import pyspark

In [53]:
# Read the house-price CSV with pandas. As the last expression in the cell,
# the resulting pandas DataFrame is displayed as the cell output.
import pandas as pd
pd.read_csv('/content/house price.csv')

Unnamed: 0,area,price
0,2600,550000
1,3000,565000
2,3200,610000
3,3600,680000
4,4000,725000


In [54]:
from pyspark.sql import SparkSession

In [55]:
# Build a SparkSession for the application named 'practice'; getOrCreate()
# reuses an already-running session instead of starting a second one.
spark=SparkSession.builder.appName('practice').getOrCreate()

In [56]:
# Display the SparkSession object as the cell output.
spark

In [57]:
# Read the CSV with the reader's default options: the header row is not used
# for column names here (the header option is only set in the following read).
df_pyspark=spark.read.csv('/content/house price.csv')

In [58]:
# header='true' tells Spark to take the column names from the first line of
# the file instead of treating that line as data.
df_pyspark = spark.read.option('header','true').csv('/content/house price.csv')

In [59]:
# Show the object's class: this is a Spark SQL DataFrame, not a pandas one.
type(df_pyspark)

In [60]:
# Like pandas' head(), this fetches the first 5 rows, but here they come back
# as a list of Row objects rather than a DataFrame.
df_pyspark.head(5)

[Row(area='2600', price='550000'),
 Row(area='3000', price='565000'),
 Row(area='3200', price='610000'),
 Row(area='3600', price='680000'),
 Row(area='4000', price='725000')]

In [61]:
# Print the schema: each column's name, data type and nullability.
# No schema inference was requested in the read above, so every column is
# reported as a string.
df_pyspark.printSchema()

root
 |-- area: string (nullable = true)
 |-- price: string (nullable = true)



In [62]:
# inferSchema=True makes Spark work out each column's type from the data
# instead of reading every column as a string.
df_pyspark = spark.read.option('header','true').csv('/content/house price.csv',inferSchema=True)

In [63]:
# Re-check the schema after re-reading with inferSchema=True: both columns
# are now reported as integer.
df_pyspark.printSchema()

root
 |-- area: integer (nullable = true)
 |-- price: integer (nullable = true)



In [64]:
# Same read as before, but passing header and inferSchema as keyword
# arguments of csv() instead of chaining .option().
df_pyspark=spark.read.csv('/content/house price.csv',header=True,inferSchema=True)
# show() prints the DataFrame's rows as a text table.
df_pyspark.show()

+----+------+
|area| price|
+----+------+
|2600|550000|
|3000|565000|
|3200|610000|
|3600|680000|
|4000|725000|
+----+------+



In [65]:
# List the column names as a plain Python list of strings.
df_pyspark.columns

['area', 'price']

In [66]:
# Display the full DataFrame again before selecting columns from it.
df_pyspark.show()

+----+------+
|area| price|
+----+------+
|2600|550000|
|3000|565000|
|3200|610000|
|3600|680000|
|4000|725000|
+----+------+



In [67]:
# Select specific columns by name. select() takes a list, so several columns
# can be requested at once, e.g. ['price', 'area']; the rest of the call is
# unchanged.
df_pyspark.select(['price']).show()

+------+
| price|
+------+
|550000|
|565000|
|610000|
|680000|
|725000|
+------+



In [68]:
# Check the data types, similar to pandas' dtypes: returns a list of
# (column name, type name) pairs.
df_pyspark.dtypes

[('area', 'int'), ('price', 'int')]

In [69]:
# Summary statistics per column (count, mean, stddev, min, max), similar to
# pandas' describe(). describe() returns a DataFrame, hence the .show().
df_pyspark.describe().show()

+-------+-----------------+-----------------+
|summary|             area|            price|
+-------+-----------------+-----------------+
|  count|                5|                5|
|   mean|           3280.0|         626000.0|
| stddev|540.3702434442519|74949.98332221295|
|    min|             2600|           550000|
|    max|             4000|           725000|
+-------+-----------------+-----------------+



In [70]:
# Add a column to the DataFrame. withColumn() returns a new DataFrame, so the
# result is assigned back to df_pyspark to keep it.
# NOTE(review): the expression simply adds the constant 2 to 'price';
# presumably a placeholder for a real two-year price projection - confirm.
df_pyspark = df_pyspark.withColumn('price after 2 year',df_pyspark['price']+2)

In [72]:
# Show the DataFrame, which now includes the 'price after 2 year' column.
df_pyspark.show()

+----+------+------------------+
|area| price|price after 2 year|
+----+------+------------------+
|2600|550000|            550002|
|3000|565000|            565002|
|3200|610000|            610002|
|3600|680000|            680002|
|4000|725000|            725002|
+----+------+------------------+



In [74]:
# Drop a column by name. drop() returns a new DataFrame, so the result is
# assigned back to df_pyspark to keep it.
df_pyspark = df_pyspark.drop('price after 2 year')

In [76]:
# Confirm that only the 'area' and 'price' columns remain after the drop.
df_pyspark.show()

+----+------+
|area| price|
+----+------+
|2600|550000|
|3000|565000|
|3200|610000|
|3600|680000|
|4000|725000|
+----+------+



In [77]:
# Rename a column: withColumnRenamed(existing_name, new_name).
# The renamed DataFrame is only displayed here, not assigned, so df_pyspark
# itself still has the column named 'price'.
df_pyspark.withColumnRenamed('price','price of house').show()

+----+--------------+
|area|price of house|
+----+--------------+
|2600|        550000|
|3000|        565000|
|3200|        610000|
|3600|        680000|
|4000|        725000|
+----+--------------+

