<a href="https://colab.research.google.com/github/Arun-Alpy/PySpark/blob/main/pyspark.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Linux Basic

In [15]:
!git version

git version 2.34.1


In [None]:
print ("PySpark")

PySpark


In [16]:
!whoami

root


In [17]:
!pwd

/content


# PySpark Basics

In [18]:
!pip install pyspark



In [19]:
!pip show pyspark

Name: pyspark
Version: 3.5.1
Summary: Apache Spark Python API
Home-page: https://github.com/apache/spark/tree/master/python
Author: Spark Developers
Author-email: dev@spark.apache.org
License: http://www.apache.org/licenses/LICENSE-2.0
Location: /usr/local/lib/python3.11/dist-packages
Requires: py4j
Required-by: dataproc-spark-connect


In [1]:
from pyspark.sql import SparkSession

In [2]:
spark=SparkSession.builder.appName('Basics').getOrCreate()

In [25]:
#create dataframe
data=[("Hello","World")]
columns=["Word1","Word2"]
df=spark.createDataFrame(data,columns)

In [27]:
df.show()

+-----+-----+
|Word1|Word2|
+-----+-----+
|Hello|World|
+-----+-----+



## Basic Transformations and Actions

In [3]:
columns=["Name","Department","Salary"]
data = [
    ("John", "Sales", 3000),
    ("Jane", "Finance", 4000),
    ("Mike", "Sales", 3500),
    ("Alice", "Finance", 3800),
    ("Bob", "IT", 4500)
]
df = spark.createDataFrame(data, columns)
df.show()

+-----+----------+------+
| Name|Department|Salary|
+-----+----------+------+
| John|     Sales|  3000|
| Jane|   Finance|  4000|
| Mike|     Sales|  3500|
|Alice|   Finance|  3800|
|  Bob|        IT|  4500|
+-----+----------+------+



In [29]:
#filter:employee with salary >3500
df_filtered=df.filter(df.Salary>3500)
df_filtered.show()

+-----+----------+------+
| Name|Department|Salary|
+-----+----------+------+
| Jane|   Finance|  4000|
|Alice|   Finance|  3800|
|  Bob|        IT|  4500|
+-----+----------+------+



In [31]:
#group by and aggregate: Avg salary by Department
df_grouped=df.groupBy("Department").agg({"Salary":"avg"})
df_grouped.show()

+----------+-----------+
|Department|avg(Salary)|
+----------+-----------+
|     Sales|     3250.0|
|   Finance|     3900.0|
|        IT|     4500.0|
+----------+-----------+



In [32]:
#group by and aggregate: Avg salary by Department
df_grouped=df.groupBy("Department").avg("Salary")
df_grouped.show()

+----------+-----------+
|Department|avg(Salary)|
+----------+-----------+
|     Sales|     3250.0|
|   Finance|     3900.0|
|        IT|     4500.0|
+----------+-----------+



In [33]:
#Add a new column: Salary with bonus (10%)
from pyspark.sql.functions import col
exp=col("Salary")*1.1
df_with_bonus=df.withColumn("Salary_10%_bonus",exp)
df_with_bonus.show()

+-----+----------+------+------------------+
| Name|Department|Salary|  Salary_10%_bonus|
+-----+----------+------+------------------+
| John|     Sales|  3000|3300.0000000000005|
| Jane|   Finance|  4000|            4400.0|
| Mike|     Sales|  3500|3850.0000000000005|
|Alice|   Finance|  3800|            4180.0|
|  Bob|        IT|  4500|            4950.0|
+-----+----------+------+------------------+



In [34]:

from pyspark.sql.functions import col,upper,lower,concat_ws,length,when

df.show()

+-----+----------+------+
| Name|Department|Salary|
+-----+----------+------+
| John|     Sales|  3000|
| Jane|   Finance|  4000|
| Mike|     Sales|  3500|
|Alice|   Finance|  3800|
|  Bob|        IT|  4500|
+-----+----------+------+



In [35]:
#Change case transformation
df_upper=df.withColumn("Name_upper",upper(col("Name")))
df_lower=df.withColumn("Name_lower",lower(col("Name")))
df_upper.show()
df_lower.show()

+-----+----------+------+----------+
| Name|Department|Salary|Name_upper|
+-----+----------+------+----------+
| John|     Sales|  3000|      JOHN|
| Jane|   Finance|  4000|      JANE|
| Mike|     Sales|  3500|      MIKE|
|Alice|   Finance|  3800|     ALICE|
|  Bob|        IT|  4500|       BOB|
+-----+----------+------+----------+

+-----+----------+------+----------+
| Name|Department|Salary|Name_lower|
+-----+----------+------+----------+
| John|     Sales|  3000|      john|
| Jane|   Finance|  4000|      jane|
| Mike|     Sales|  3500|      mike|
|Alice|   Finance|  3800|     alice|
|  Bob|        IT|  4500|       bob|
+-----+----------+------+----------+



In [36]:
#concatenate columns
df_concat=df.withColumn("Full_Info",concat_ws(",","Name","Department"))
df_concat.show()

+-----+----------+------+-------------+
| Name|Department|Salary|    Full_Info|
+-----+----------+------+-------------+
| John|     Sales|  3000|   John,Sales|
| Jane|   Finance|  4000| Jane,Finance|
| Mike|     Sales|  3500|   Mike,Sales|
|Alice|   Finance|  3800|Alice,Finance|
|  Bob|        IT|  4500|       Bob,IT|
+-----+----------+------+-------------+



In [37]:
# string length of names in new DF
df_length=df.withColumn("Name_length",length(col("Name")))
df_length.show()

+-----+----------+------+-----------+
| Name|Department|Salary|Name_length|
+-----+----------+------+-----------+
| John|     Sales|  3000|          4|
| Jane|   Finance|  4000|          4|
| Mike|     Sales|  3500|          4|
|Alice|   Finance|  3800|          5|
|  Bob|        IT|  4500|          3|
+-----+----------+------+-----------+



In [38]:
#conditional Columns(Salary category)
df_conditional=df.withColumn("Salary_Category",
                              when(col("Salary")>3500,"High")
                              .when(col("Salary")>3000,"Medium")
                              .otherwise("Low"))
df_conditional.show()

+-----+----------+------+---------------+
| Name|Department|Salary|Salary_Category|
+-----+----------+------+---------------+
| John|     Sales|  3000|            Low|
| Jane|   Finance|  4000|           High|
| Mike|     Sales|  3500|         Medium|
|Alice|   Finance|  3800|           High|
|  Bob|        IT|  4500|           High|
+-----+----------+------+---------------+



In [39]:
# rename column (Salary to Base_Salary)
df_renamed=df.withColumnRenamed("Salary","Base_Salary")
df_renamed.show()

+-----+----------+-----------+
| Name|Department|Base_Salary|
+-----+----------+-----------+
| John|     Sales|       3000|
| Jane|   Finance|       4000|
| Mike|     Sales|       3500|
|Alice|   Finance|       3800|
|  Bob|        IT|       4500|
+-----+----------+-----------+



In [4]:
#group by department and count employees

df.groupBy("Department").count().show()

+----------+-----+
|Department|count|
+----------+-----+
|     Sales|    2|
|   Finance|    2|
|        IT|    1|
+----------+-----+



In [5]:
df.groupBy("Department").agg({"Salary":"sum"}).show()

+----------+-----------+
|Department|sum(Salary)|
+----------+-----------+
|     Sales|       6500|
|   Finance|       7800|
|        IT|       4500|
+----------+-----------+



In [7]:
#group by department and calculate multiple Aggregations
from pyspark.sql.functions import avg, max, min
df.groupBy("Department").agg(
    avg("Salary").alias("avg_salary"),
    max("Salary").alias("max_salary"),
    min("Salary").alias("min_salary")
).show()

+----------+----------+----------+----------+
|Department|avg_salary|max_salary|min_salary|
+----------+----------+----------+----------+
|     Sales|    3250.0|      3500|      3000|
|   Finance|    3900.0|      4000|      3800|
|        IT|    4500.0|      4500|      4500|
+----------+----------+----------+----------+



In [8]:
# Create another DataFrame for department info
dept_data = [
    ("Sales", "Building A"),
    ("Finance", "Building B"),
    ("IT", "Building C")
]
dept_columns = ["Department", "Location"]

In [9]:
df.show()

+-----+----------+------+
| Name|Department|Salary|
+-----+----------+------+
| John|     Sales|  3000|
| Jane|   Finance|  4000|
| Mike|     Sales|  3500|
|Alice|   Finance|  3800|
|  Bob|        IT|  4500|
+-----+----------+------+



In [10]:
dept_df=spark.createDataFrame(dept_data,dept_columns)
dept_df.show()


+----------+----------+
|Department|  Location|
+----------+----------+
|     Sales|Building A|
|   Finance|Building B|
|        IT|Building C|
+----------+----------+



In [11]:
#join employees with department info

joined_df=df.join(dept_df,on="Department",how="inner")
joined_df.show()

+----------+-----+------+----------+
|Department| Name|Salary|  Location|
+----------+-----+------+----------+
|   Finance| Jane|  4000|Building B|
|   Finance|Alice|  3800|Building B|
|        IT|  Bob|  4500|Building C|
|     Sales| John|  3000|Building A|
|     Sales| Mike|  3500|Building A|
+----------+-----+------+----------+



In [12]:
# Employee DataFrame
emp_data = [
    (1, "John", "Sales", 3000),
    (2, "Jane", "Finance", 4000),
    (3, "Mike", "Sales", 3500),
    (4, "Alice", "HR", 3800),
    (5, "Bob", "IT", 4500),
    (6, "Sam", "Support", 3200)
]
emp_cols = ["EmpID", "Name", "Department", "Salary"]
emp_df = spark.createDataFrame(emp_data, emp_cols)

# Department DataFrame
dept_data = [
    ("Sales", "Building A"),
    ("Finance", "Building B"),
    ("IT", "Building C"),
    ("Admin", "Building D")
]
dept_cols = ["Department", "Location"]
dept_df = spark.createDataFrame(dept_data, dept_cols)

# Display both
emp_df.show()
dept_df.show()

+-----+-----+----------+------+
|EmpID| Name|Department|Salary|
+-----+-----+----------+------+
|    1| John|     Sales|  3000|
|    2| Jane|   Finance|  4000|
|    3| Mike|     Sales|  3500|
|    4|Alice|        HR|  3800|
|    5|  Bob|        IT|  4500|
|    6|  Sam|   Support|  3200|
+-----+-----+----------+------+

+----------+----------+
|Department|  Location|
+----------+----------+
|     Sales|Building A|
|   Finance|Building B|
|        IT|Building C|
|     Admin|Building D|
+----------+----------+

