<a href="https://colab.research.google.com/github/Alby-Benny-IBM/PySpark/blob/main/01_Greeting.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# ***Linux Basics***

In [2]:
# Print the OS release file to see which Linux distribution this runtime uses.
!cat /etc/os-release

PRETTY_NAME="Ubuntu 22.04.4 LTS"
NAME="Ubuntu"
VERSION_ID="22.04"
VERSION="22.04.4 LTS (Jammy Jellyfish)"
VERSION_CODENAME=jammy
ID=ubuntu
ID_LIKE=debian
HOME_URL="https://www.ubuntu.com/"
SUPPORT_URL="https://help.ubuntu.com/"
BUG_REPORT_URL="https://bugs.launchpad.net/ubuntu/"
PRIVACY_POLICY_URL="https://www.ubuntu.com/legal/terms-and-policies/privacy-policy"
UBUNTU_CODENAME=jammy


In [3]:
# Print the kernel name of the host.
!uname

Linux


In [4]:
# Print the notebook's current working directory.
!pwd

/content


In [5]:
# List the entries in the filesystem root.
!ls /

bin			    kaggle		      opt		 sys
boot			    lib			      proc		 tmp
content			    lib32		      python-apt	 tools
cuda-keyring_1.1-1_all.deb  lib64		      python-apt.tar.xz  usr
datalab			    libx32		      root		 var
dev			    media		      run
etc			    mnt			      sbin
home			    NGC-DL-CONTAINER-LICENSE  srv


# PySpark DataFrame Basics

In [6]:
# Install PySpark into the running kernel's environment. `%pip` (unlike `!pip`)
# always targets the kernel's own interpreter, and the version is pinned to the
# one this notebook was executed with so a re-run reproduces the same results.
%pip install -q pyspark==3.5.1



In [7]:
# Confirm the installation by printing the installed package's metadata.
!pip show pyspark

Name: pyspark
Version: 3.5.1
Summary: Apache Spark Python API
Home-page: https://github.com/apache/spark/tree/master/python
Author: Spark Developers
Author-email: dev@spark.apache.org
License: http://www.apache.org/licenses/LICENSE-2.0
Location: /usr/local/lib/python3.11/dist-packages
Requires: py4j
Required-by: dataproc-spark-connect


In [8]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, concat_ws, length, lower, upper, when

In [9]:
# Build (or reuse, if one already exists) the Spark session named 'Basics'.
spark = (
    SparkSession.builder
    .appName('Basics')
    .getOrCreate()
)

In [10]:
# A one-row DataFrame built from a plain tuple plus a list of column names.
# The date is passed as an ordinary string, not a date object.
columns = ["firstname", "lastname", "date"]
data = [
    ('James', 'Smith', '1991-04-01'),
]
df = spark.createDataFrame(data, schema=columns)
df.show()

+---------+--------+----------+
|firstname|lastname|      date|
+---------+--------+----------+
|    James|   Smith|1991-04-01|
+---------+--------+----------+



# Basic Transformations and Actions

In [12]:
# Sample employee records, one (name, department, salary) tuple per row.
data = [
    ("John", "Sales", 3000),
    ("Jane", "Finance", 4000),
    ("Mike", "Sales", 3500),
    ("Alice", "Finance", 3800),
    ("Bob", "IT", 4500),
]

# Column names, in the same order as the tuple fields above.
columns = ["Name", "Department", "Salary"]

In [13]:
# Turn the sample records into a DataFrame and preview it.
df = spark.createDataFrame(data, schema=columns)
df.show()

+-----+----------+------+
| Name|Department|Salary|
+-----+----------+------+
| John|     Sales|  3000|
| Jane|   Finance|  4000|
| Mike|     Sales|  3500|
|Alice|   Finance|  3800|
|  Bob|        IT|  4500|
+-----+----------+------+



In [16]:
# Keep only the rows whose salary is strictly greater than 3500.
df_filtered = df.where(df["Salary"] > 3500)
df_filtered.show()

+-----+----------+------+
| Name|Department|Salary|
+-----+----------+------+
| Jane|   Finance|  4000|
|Alice|   Finance|  3800|
|  Bob|        IT|  4500|
+-----+----------+------+



In [18]:
# Average salary per department.
df_grouped = (
    df
    .groupBy("Department")
    .avg("Salary")
)
df_grouped.show()

+----------+-----------+
|Department|avg(Salary)|
+----------+-----------+
|     Sales|     3250.0|
|   Finance|     3900.0|
|        IT|     4500.0|
+----------+-----------+



In [21]:
# Add a column holding each salary scaled by the bonus multiplier (salary plus
# 10%). The multiplier is a named constant so the rate is easy to find and tune.
BONUS_MULTIPLIER = 1.1
exp = col("Salary") * BONUS_MULTIPLIER

# The product is a floating-point value, so binary rounding noise can appear in
# the displayed result (e.g. 3300.0000000000005).
df_with_bonus = df.withColumn("Salary_bonus", exp)
df_with_bonus.show()

+-----+----------+------+------------------+
| Name|Department|Salary|      Salary_bonus|
+-----+----------+------+------------------+
| John|     Sales|  3000|3300.0000000000005|
| Jane|   Finance|  4000|            4400.0|
| Mike|     Sales|  3500|3850.0000000000005|
|Alice|   Finance|  3800|            4180.0|
|  Bob|        IT|  4500|            4950.0|
+-----+----------+------+------------------+



In [22]:
# Add an upper-cased copy of the Name column.
df_upper = df.withColumn(
    "Name_upper",
    upper("Name"),
)
df_upper.show()

+-----+----------+------+----------+
| Name|Department|Salary|Name_upper|
+-----+----------+------+----------+
| John|     Sales|  3000|      JOHN|
| Jane|   Finance|  4000|      JANE|
| Mike|     Sales|  3500|      MIKE|
|Alice|   Finance|  3800|     ALICE|
|  Bob|        IT|  4500|       BOB|
+-----+----------+------+----------+



In [23]:
# Add a lower-cased copy of the Name column.
df_lower = df.withColumn(
    "Name_lower",
    lower("Name"),
)
df_lower.show()

+-----+----------+------+----------+
| Name|Department|Salary|Name_lower|
+-----+----------+------+----------+
| John|     Sales|  3000|      john|
| Jane|   Finance|  4000|      jane|
| Mike|     Sales|  3500|      mike|
|Alice|   Finance|  3800|     alice|
|  Bob|        IT|  4500|       bob|
+-----+----------+------+----------+



In [24]:
# Join Name and Department into a single string, separated by an underscore.
df_concat = df.withColumn(
    "Name_Department",
    concat_ws("_", "Name", "Department"),
)
df_concat.show()

+-----+----------+------+---------------+
| Name|Department|Salary|Name_Department|
+-----+----------+------+---------------+
| John|     Sales|  3000|     John_Sales|
| Jane|   Finance|  4000|   Jane_Finance|
| Mike|     Sales|  3500|     Mike_Sales|
|Alice|   Finance|  3800|  Alice_Finance|
|  Bob|        IT|  4500|         Bob_IT|
+-----+----------+------+---------------+



In [25]:
# Add the character count of each name.
df_length = df.withColumn(
    "Name_length",
    length("Name"),
)
df_length.show()

+-----+----------+------+-----------+
| Name|Department|Salary|Name_length|
+-----+----------+------+-----------+
| John|     Sales|  3000|          4|
| Jane|   Finance|  4000|          4|
| Mike|     Sales|  3500|          4|
|Alice|   Finance|  3800|          5|
|  Bob|        IT|  4500|          3|
+-----+----------+------+-----------+



In [27]:
# Bucket each salary: below 3500 is "Low", 4000 and above is "High",
# everything in between falls through to "Medium".
df_conditional = df.withColumn(
    "Salary_status",
    when(col("Salary") < 3500, "Low")
    .when(col("Salary") >= 4000, "High")
    .otherwise("Medium"),
)
df_conditional.show()

+-----+----------+------+-------------+
| Name|Department|Salary|Salary_status|
+-----+----------+------+-------------+
| John|     Sales|  3000|          Low|
| Jane|   Finance|  4000|         High|
| Mike|     Sales|  3500|       Medium|
|Alice|   Finance|  3800|       Medium|
|  Bob|        IT|  4500|         High|
+-----+----------+------+-------------+



In [28]:
# Same data, with the Salary column exposed under the name Basic_Salary.
df_renamed = df.withColumnRenamed(
    existing="Salary",
    new="Basic_Salary",
)
df_renamed.show()

+-----+----------+------------+
| Name|Department|Basic_Salary|
+-----+----------+------------+
| John|     Sales|        3000|
| Jane|   Finance|        4000|
| Mike|     Sales|        3500|
|Alice|   Finance|        3800|
|  Bob|        IT|        4500|
+-----+----------+------------+

