<a href="https://colab.research.google.com/github/Alanontiveros/PySpark/blob/main/ejemplos_join_%26_lambda.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [66]:
# Install the headless OpenJDK 8 JDK via apt; -qq keeps apt quiet and stdout is discarded
# so the cell produces no output. JAVA_HOME is pointed at this JDK in a later cell.
!apt-get install openjdk-8-jdk-headless -qq > /dev/null

In [67]:
# install spark (change the version number if needed)
!wget -q https://archive.apache.org/dist/spark/spark-3.0.0/spark-3.0.0-bin-hadoop3.2.tgz  

In [68]:
# Extract the downloaded Spark tarball into the current working directory.
!tar xf spark-3.0.0-bin-hadoop3.2.tgz

In [69]:
# Export the Java and Spark install locations as environment variables for this process.
import os

os.environ.update(
    JAVA_HOME="/usr/lib/jvm/java-8-openjdk-amd64",
    SPARK_HOME="/content/spark-3.0.0-bin-hadoop3.2",
)

In [70]:
# install findspark using pip
!pip install -q findspark

In [71]:
# findspark.init() has to run before the pyspark import below.
import findspark

findspark.init()

from pyspark.sql import SparkSession

# Start (or reuse) a Spark session that runs locally with the "local[*]" master.
spark = (
    SparkSession.builder
    .master("local[*]")
    .getOrCreate()
)

In [72]:
from pyspark.sql import SQLContext, SparkSession

# Request a local session labelled "Test_spark".
spark = (
    SparkSession.builder
    .appName("Test_spark")
    .master("local[*]")
    .getOrCreate()
)

# SQLContext bound to the same SparkContext and session as `spark`.
# NOTE(review): SQLContext appears to be deprecated in Spark 3.x - confirm whether
# `sqlCtx` is still needed before relying on it.
sqlCtx = SQLContext(sparkContext=spark.sparkContext, sparkSession=spark)

# Leave the session as the last expression so the notebook renders its summary.
spark

# Primer dataset

In [73]:
# Employee records, one inner list per row: [ID, NAME, Company], all as strings.
data = [
    ["1", "sravan", "company 1"],
    ["2", "ojaswi", "company 1"],
    ["3", "rohith", "company 2"],
    ["4", "sridevi", "company 1"],
    ["5", "bobby", "company 1"],
]

In [74]:
# Column names for the employee records, in the same order as each row's fields.
columns = ["ID", "NAME", "Company"]

In [75]:
# Build the employee DataFrame from the row lists, taking column names from `columns`.
dataframe = spark.createDataFrame(data, schema=columns)

# Print the DataFrame as a text table.
dataframe.show()

+---+-------+---------+
| ID|   NAME|  Company|
+---+-------+---------+
|  1| sravan|company 1|
|  2| ojaswi|company 1|
|  3| rohith|company 2|
|  4|sridevi|company 1|
|  5|  bobby|company 1|
+---+-------+---------+



# Segundo dataset

In [76]:
# Salary records, one inner list per row: [ID, salary, department], all as strings.
data1 = [
    ["1", "45000", "IT"],
    ["2", "145000", "Manager"],
    ["6", "45000", "HR"],
    ["5", "34000", "Sales"],
]

In [77]:
# Column names for the salary records.
# Note: this rebinds `columns`, which until now held the employee column names.
columns = ["ID", "salary", "department"]

In [78]:
# Build the salary DataFrame from the row lists, taking column names from `columns`.
dataframe1 = spark.createDataFrame(data1, schema=columns)

# Print the DataFrame as a text table.
dataframe1.show()

+---+------+----------+
| ID|salary|department|
+---+------+----------+
|  1| 45000|        IT|
|  2|145000|   Manager|
|  6| 45000|        HR|
|  5| 34000|     Sales|
+---+------+----------+



### Sintaxis: `dataframe1.join(dataframe2, dataframe1.column_name == dataframe2.column_name, "inner")`

In [79]:
# Inner join on ID: only rows whose ID appears in both DataFrames, sorted by the
# employee-side ID column.
(
    dataframe
    .join(dataframe1, on=(dataframe.ID == dataframe1.ID), how="inner")
    .orderBy(dataframe.ID)
    .show()
)

+---+------+---------+---+------+----------+
| ID|  NAME|  Company| ID|salary|department|
+---+------+---------+---+------+----------+
|  1|sravan|company 1|  1| 45000|        IT|
|  2|ojaswi|company 1|  2|145000|   Manager|
|  5| bobby|company 1|  5| 34000|     Sales|
+---+------+---------+---+------+----------+



# Sintaxis:

### outer: `dataframe1.join(dataframe2, dataframe1.column_name == dataframe2.column_name, "outer")`
### full: `dataframe1.join(dataframe2, dataframe1.column_name == dataframe2.column_name, "full")`
### fullouter: `dataframe1.join(dataframe2, dataframe1.column_name == dataframe2.column_name, "fullouter")`

In [80]:
# Full outer join requested with how="outer": rows from both sides are kept and the
# side without a matching ID is filled with nulls. Sorted by the employee-side ID.
(
    dataframe
    .join(dataframe1, on=(dataframe.ID == dataframe1.ID), how="outer")
    .orderBy(dataframe.ID)
    .show()
)

+----+-------+---------+----+------+----------+
|  ID|   NAME|  Company|  ID|salary|department|
+----+-------+---------+----+------+----------+
|null|   null|     null|   6| 45000|        HR|
|   1| sravan|company 1|   1| 45000|        IT|
|   2| ojaswi|company 1|   2|145000|   Manager|
|   3| rohith|company 2|null|  null|      null|
|   4|sridevi|company 1|null|  null|      null|
|   5|  bobby|company 1|   5| 34000|     Sales|
+----+-------+---------+----+------+----------+



In [81]:
# Same full outer join, requested with the "full" spelling of the join type.
(
    dataframe
    .join(dataframe1, on=(dataframe.ID == dataframe1.ID), how="full")
    .orderBy(dataframe.ID)
    .show()
)

+----+-------+---------+----+------+----------+
|  ID|   NAME|  Company|  ID|salary|department|
+----+-------+---------+----+------+----------+
|null|   null|     null|   6| 45000|        HR|
|   1| sravan|company 1|   1| 45000|        IT|
|   2| ojaswi|company 1|   2|145000|   Manager|
|   3| rohith|company 2|null|  null|      null|
|   4|sridevi|company 1|null|  null|      null|
|   5|  bobby|company 1|   5| 34000|     Sales|
+----+-------+---------+----+------+----------+



In [82]:
# Same full outer join, requested with the "fullouter" spelling of the join type.
(
    dataframe
    .join(dataframe1, on=(dataframe.ID == dataframe1.ID), how="fullouter")
    .orderBy(dataframe.ID)
    .show()
)

+----+-------+---------+----+------+----------+
|  ID|   NAME|  Company|  ID|salary|department|
+----+-------+---------+----+------+----------+
|null|   null|     null|   6| 45000|        HR|
|   1| sravan|company 1|   1| 45000|        IT|
|   2| ojaswi|company 1|   2|145000|   Manager|
|   3| rohith|company 2|null|  null|      null|
|   4|sridevi|company 1|null|  null|      null|
|   5|  bobby|company 1|   5| 34000|     Sales|
+----+-------+---------+----+------+----------+



# Sintaxis:

### left: `dataframe1.join(dataframe2, dataframe1.column_name == dataframe2.column_name, "left")`
### leftouter: `dataframe1.join(dataframe2, dataframe1.column_name == dataframe2.column_name, "leftouter")`


In [83]:
# Left join on ID: every employee row is kept; salary columns are null where the
# salary DataFrame has no matching ID. Sorted by the employee-side ID.
(
    dataframe
    .join(dataframe1, on=(dataframe.ID == dataframe1.ID), how="left")
    .orderBy(dataframe.ID)
    .show()
)

+---+-------+---------+----+------+----------+
| ID|   NAME|  Company|  ID|salary|department|
+---+-------+---------+----+------+----------+
|  1| sravan|company 1|   1| 45000|        IT|
|  2| ojaswi|company 1|   2|145000|   Manager|
|  3| rohith|company 2|null|  null|      null|
|  4|sridevi|company 1|null|  null|      null|
|  5|  bobby|company 1|   5| 34000|     Sales|
+---+-------+---------+----+------+----------+



In [84]:
# Same left join, requested with the "leftouter" spelling of the join type.
(
    dataframe
    .join(dataframe1, on=(dataframe.ID == dataframe1.ID), how="leftouter")
    .orderBy(dataframe.ID)
    .show()
)

+---+-------+---------+----+------+----------+
| ID|   NAME|  Company|  ID|salary|department|
+---+-------+---------+----+------+----------+
|  1| sravan|company 1|   1| 45000|        IT|
|  2| ojaswi|company 1|   2|145000|   Manager|
|  3| rohith|company 2|null|  null|      null|
|  4|sridevi|company 1|null|  null|      null|
|  5|  bobby|company 1|   5| 34000|     Sales|
+---+-------+---------+----+------+----------+



# Sintaxis de una función lambda
## `lambda argumentos: expresión`


In [85]:
# Lambda function that computes the square of a number.
square = lambda x: x ** 2
print(square(3))  # Result: 9


# Traditional (def-based) function that computes the square of a number.
def square1(num):
    """Return ``num`` squared (``num ** 2``)."""
    return num ** 2


# Call the def-based version here so that both definitions are actually exercised.
print(square1(5))  # Result: 25

9
25


In [86]:
def multiplicar_por(n):
    """Return a one-argument function that multiplies its input by ``n``."""
    return lambda x: x * n


# Build three multipliers from the same factory.
duplicar = multiplicar_por(2)
triplicar = multiplicar_por(3)
diez_veces = multiplicar_por(10)

# Apply each multiplier to 2 and print one result per line.
print(duplicar(2), triplicar(2), diez_veces(2), sep="\n")

4
6
20


In [87]:
# Collect the odd numbers between 1 and 10 with a list comprehension.
filtrado = [x for x in range(1, 11) if x % 2 == 1]

print(filtrado)

[1, 3, 5, 7, 9]


In [88]:
# Same example as the list comprehension, this time using filter() with a lambda.
mi_lista = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]

# filter() returns a one-shot iterator. Materialise it once so that `filtrado` can be
# displayed and reused afterwards without silently turning up empty.
filtrado = list(filter(lambda x: x % 2 != 0, mi_lista))

# Last expression of the cell: shown as the cell output.
filtrado

[1, 3, 5, 7, 9]