<a href="https://colab.research.google.com/github/AdityaDhuri/PySpark/blob/main/Spark_1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
pip install pyspark

In [3]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType,StructField,StringType,IntegerType

In [4]:
# Start (or reuse) the Spark session that the later cells refer to as `spark`.
spark = (
    SparkSession.builder
    .appName("DataFrame")
    .getOrCreate()
)

# Reading a CSV


In [5]:
# Load the CSV with its first row as the header and let Spark infer the column types.
# Note: `df` is rebound further down when a frame is built from in-memory rows.
df = (
    spark.read
    .option("header", "true")
    .csv("/content/Virat_Kohli_ODI.csv", inferSchema=True)
)

In [17]:
# Preview the first five rows of `df`.
# NOTE(review): the recorded output below shows Name/Age/Gender rows, which match the
# frame built later from `data` and `schema`, and this cell's execution count (17) is
# later than those cells (13-16). The output appears to be stale; re-run the notebook
# top to bottom to see the CSV rows here.
df.show(n=5)

+----+---+------+
|Name|Age|Gender|
+----+---+------+
|  AD| 24|     M|
|  AN| 24|     F|
+----+---+------+



# Creating a DataFrame with a schema and adding values

In [13]:
# Explicit schema for the hand-built frame: two string columns and one integer
# column, all of them nullable.
schema = StructType(
    [
        StructField("Name", StringType(), nullable=True),
        StructField("Age", IntegerType(), nullable=True),
        StructField("Gender", StringType(), nullable=True),
    ]
)

In [14]:
# Rows for the frame, each in schema field order: (Name, Age, Gender).
data = [
    ("AD", 24, "M"),
    ("AN", 24, "F"),
]

In [15]:
# Build the frame from the in-memory rows using the explicit schema.
# This rebinds `df`, replacing the CSV-backed frame loaded earlier.
df = spark.createDataFrame(data=data, schema=schema)

In [16]:
# Display the first two rows of the frame built from `data`.
df.show(n=2)

+----+---+------+
|Name|Age|Gender|
+----+---+------+
|  AD| 24|     M|
|  AN| 24|     F|
+----+---+------+



In [18]:
# Project only the Name and Gender columns and display them.
df.select("Name", "Gender").show()

+----+------+
|Name|Gender|
+----+------+
|  AD|     M|
|  AN|     F|
+----+------+



In [19]:
# Print each column's name, type and nullability for `df`.
df.printSchema()

root
 |-- Name: string (nullable = true)
 |-- Age: integer (nullable = true)
 |-- Gender: string (nullable = true)



In [21]:
# Show summary statistics (count, mean, stddev, min, max) for every column of `df`.
df.describe().show()

+-------+----+----+------+
|summary|Name| Age|Gender|
+-------+----+----+------+
|  count|   2|   2|     2|
|   mean|null|24.0|  null|
| stddev|null| 0.0|  null|
|    min|  AD|  24|     F|
|    max|  AN|  24|     M|
+-------+----+----+------+



# Adding new data to a DataFrame

In [23]:
# One extra row in the same (Name, Age, Gender) layout, wrapped in its own
# single-row frame that uses the same schema as `df`.
add_data = [
    ("ADI", 25, "M"),
]
df2 = spark.createDataFrame(data=add_data, schema=schema)

In [24]:
# Append the new row to the existing rows. union() matches columns by position, not
# by name; that is safe here because both frames were created with `schema`.
df3 = df.union(df2)
df3.show()

+----+---+------+
|Name|Age|Gender|
+----+---+------+
|  AD| 24|     M|
|  AN| 24|     F|
| ADI| 25|     M|
+----+---+------+



# Join


In [30]:
# Two small datasets that share a `name` key. "Charlie" appears only in the first
# and "David" only in the second.
data1 = [
    ("Alice", 25),
    ("Bob", 30),
    ("Charlie", 22),
]
data2 = [
    ("Alice", "Engineer"),
    ("Bob", "Manager"),
    ("David", "Designer"),
]

# When only column names are supplied (no schema), Spark infers each column's
# data type from the values.
df4 = spark.createDataFrame(data=data1, schema=["name", "age"])
df5 = spark.createDataFrame(data=data2, schema=["name", "occupation"])


In [31]:
# Inspect the column types Spark inferred for df4.
df4.printSchema()

root
 |-- name: string (nullable = true)
 |-- age: long (nullable = true)



In [32]:
# The schema printed above shows `age` was inferred as long; cast it to integer.
age_as_int = df4["age"].cast("integer")
df4 = df4.withColumn("age", age_as_int)
df4.printSchema()

root
 |-- name: string (nullable = true)
 |-- age: integer (nullable = true)



In [33]:
# Inner join on `name`: only names present in both df4 and df5 are kept.
joining = df4.join(other=df5, on="name", how="inner")
joining.show()

+-----+---+----------+
| name|age|occupation|
+-----+---+----------+
|Alice| 25|  Engineer|
|  Bob| 30|   Manager|
+-----+---+----------+



## GroupBy

In [34]:
# (name, occupation, salary) records; Alice and Bob each appear twice with
# different occupations.
groupby_data = [
    ("Alice", "Engineer", 2500),
    ("Bob", "Manager", 3500),
    ("Alice", "Designer", 2800),
    ("Bob", "Engineer", 3200),
    ("David", "Manager", 4000),
]

# No schema is given, so the column types are inferred; show the rows and the
# resulting schema.
groupby_df = spark.createDataFrame(
    data=groupby_data,
    schema=["name", "occupation", "salary"],
)
groupby_df.show()
groupby_df.printSchema()

+-----+----------+------+
| name|occupation|salary|
+-----+----------+------+
|Alice|  Engineer|  2500|
|  Bob|   Manager|  3500|
|Alice|  Designer|  2800|
|  Bob|  Engineer|  3200|
|David|   Manager|  4000|
+-----+----------+------+

root
 |-- name: string (nullable = true)
 |-- occupation: string (nullable = true)
 |-- salary: long (nullable = true)



In [46]:
# Group by name only, so the aggregate in the next cell yields one row per person.
# Grouping by ("name", "occupation") would put every row in its own group, because
# each name/occupation pair occurs once in groupby_data. It would also not reproduce
# the per-name totals recorded in the output below.
groupby_df_data = groupby_df.groupBy("name")


In [45]:
# Sum `salary` within each group defined by groupby_df_data; the aggregated
# column is named "sum(salary)".
salary_aggregation = {"salary": "sum"}
groupby_df_data_on_salary = groupby_df_data.agg(salary_aggregation)
groupby_df_data_on_salary.show()

+-----+-----------+
| name|sum(salary)|
+-----+-----------+
|  Bob|       6700|
|Alice|       5300|
|David|       4000|
+-----+-----------+



In [51]:
# Keep only engineers whose salary is at least 2500, then display them.
is_engineer = groupby_df["occupation"] == "Engineer"
earns_at_least_2500 = groupby_df["salary"] >= 2500
groupby_df.filter(is_engineer & earns_at_least_2500).show()

+-----+----------+------+
| name|occupation|salary|
+-----+----------+------+
|Alice|  Engineer|  2500|
|  Bob|  Engineer|  3200|
+-----+----------+------+

