In [1]:
import os

from pyspark import SparkContext
from pyspark.sql import SparkSession
from pyspark.sql.functions import ltrim,rtrim,trim,col
from pyspark.sql.functions import *
# Start (or reuse) a local Spark session. The PostgreSQL JDBC driver jar is
# registered through "spark.jars" so it is available to the session.
spark = (
    SparkSession.builder
    .config("spark.jars", "postgresql-42.5.1.jar")
    .master("local")
    .appName("PySpark_Postgres_test")
    .getOrCreate()
)



In [2]:
# Load the two source CSV files into DataFrames. Both have a header row and
# double-quoted fields, and column types are inferred by Spark.
# OIJ.csv is comma-separated; INEC.csv is semicolon-separated.
oij_df = spark.read.csv(
    path="OIJ.csv",
    sep=",",
    header=True,
    quote='"',
    inferSchema=True,
)
inec_df = spark.read.csv(
    path="INEC.csv",
    sep=";",
    header=True,
    quote='"',
    inferSchema=True,
)

In [3]:
def remove_spaces(df, columns=("Provincia", "Canton", "Distrito", "Provincia, cantón y distrito")):
    """Trim leading and trailing whitespace from selected columns of ``df``.

    Args:
        df: DataFrame to clean.
        columns: Name (or iterable of names) of the columns to trim. Names
            that are not present in ``df`` are ignored. Defaults to the
            location columns that were previously hard-coded here.

    Returns:
        A DataFrame in which every selected column that exists in ``df`` has
        been replaced by its trimmed value; all other columns are unchanged.
    """
    # A bare string would otherwise be treated as a substring haystack by `in`.
    if isinstance(columns, str):
        columns = (columns,)
    wanted = set(columns)

    # The loop variable is deliberately not called `col`, so it does not shadow
    # pyspark.sql.functions.col inside this function.
    for column_name in df.columns:
        if column_name in wanted:
            df = df.withColumn(column_name, trim(column_name))
    return df

def to_lower_case(df, columns=("Provincia", "Canton", "Distrito", "Provincia, cantón y distrito")):
    """Convert selected string columns of ``df`` to lower case.

    Args:
        df: DataFrame to clean.
        columns: Name (or iterable of names) of the columns to lower-case.
            Names that are not present in ``df`` are ignored. Defaults to the
            location columns that were previously hard-coded here.

    Returns:
        A DataFrame in which every selected column that exists in ``df`` has
        been replaced by its lower-cased value; all other columns are
        unchanged.
    """
    # A bare string would otherwise be treated as a substring haystack by `in`.
    if isinstance(columns, str):
        columns = (columns,)
    wanted = set(columns)

    # The loop variable is deliberately not called `col`, so it does not shadow
    # pyspark.sql.functions.col inside this function.
    for column_name in df.columns:
        if column_name in wanted:
            df = df.withColumn(column_name, lower(column_name))
    return df


# Normalise the location columns of both data sets: trim first, then lower-case.
oij_df = to_lower_case(remove_spaces(oij_df))
inec_df = to_lower_case(remove_spaces(inec_df))

# Preview the first five rows of each cleaned DataFrame.
oij_df.show(5)
inec_df.show(5)

+------+-----------+-------------------+-------------------+--------+--------------------+-------------+------+------------+----------+----------+--------+----+
|Delito|  SubDelito|              Fecha|               Hora| Victima|          SubVictima|         Edad|Genero|Nacionalidad| Provincia|    Canton|Distrito|_c12|
+------+-----------+-------------------+-------------------+--------+--------------------+-------------+------+------------+----------+----------+--------+----+
|ASALTO|ARMA BLANCA|2021-06-03 00:00:00|00:00:00 - 02:59:59|VEHICULO|SERVICIO PUBLICO/...|Mayor de edad|HOMBRE|  COSTA RICA|  san jose|alajuelita|    null|null|
|ASALTO|ARMA BLANCA|2021-06-10 00:00:00|15:00:00 - 17:59:59| PERSONA|    PEATON [PERSONA]|Mayor de edad| MUJER|  COSTA RICA|  san jose|    escazu|    null|null|
|ASALTO|ARMA BLANCA|2021-06-14 00:00:00|09:00:00 - 11:59:59| PERSONA|    PEATON [PERSONA]|Mayor de edad|HOMBRE|  COSTA RICA|guanacaste|    nicoya|    null|null|
|ASALTO|ARMA BLANCA|2021-06-14 00:

In [4]:
def find_non_matches(df1, df2, columns=("Provincia", "Canton", "Distrito"),
                     reference_column='Provincia, cantón y distrito'):
    """List the values of ``df1`` that do not appear in a column of ``df2``.

    For every column of ``df1`` named in ``columns`` (visited in ``df1``'s
    column order), the distinct values of that column are compared against
    ``df2[reference_column]`` and the values with no equal counterpart are
    collected.

    Args:
        df1: DataFrame whose values are checked.
        df2: DataFrame holding the reference values.
        columns: Name (or iterable of names) of the ``df1`` columns to check;
            names missing from ``df1`` are skipped.
        reference_column: Column of ``df2`` to compare against.

    Returns:
        A Python list of unmatched values. A value that is unmatched in
        several columns appears once per column. A null value is always
        reported (as ``None``) because null never compares equal to anything.
    """
    if isinstance(columns, str):
        columns = (columns,)
    wanted = set(columns)

    # Distinct reference values, computed once and reused for every column.
    reference = df2.select(df2[reference_column].alias("_reference_value")).distinct()

    non_matches = []
    for column_name in df1.columns:
        if column_name not in wanted:
            continue
        candidates = df1.select(df1[column_name].alias("_candidate_value")).distinct()
        # A left-anti join keeps only the candidates without a match, in one
        # Spark job per column instead of one filter/collect job per value.
        unmatched = candidates.join(
            reference,
            candidates["_candidate_value"] == reference["_reference_value"],
            "left_anti",
        )
        non_matches.extend(row["_candidate_value"] for row in unmatched.collect())
    return non_matches

# OIJ location values that have no counterpart in the INEC location column.
# Both DataFrames were already trimmed and lower-cased in the previous cell
# (those steps are idempotent), so the comparison is run a single time here
# and the result is exposed under both names used by this notebook.
non_matches_oij = find_non_matches(oij_df, inec_df)
nonMatches = list(non_matches_oij)
print(nonMatches)


['desconocido', 'san jose', 'limon', 'islas', 'pococi', 'rio cuarto', 'guacimo', 'belen', 'la union', 'desconocido', 'puerto jim&#201;nez', 'leon cortes', 'poas', 'tarrazu', 'san jose', 'san ramon', 'aserri', 'vasquez de coronado', 'sarch&#205;', 'paraiso', 'tilaran', 'canas', 'limon', 'perez zeledon', 'jimenez', 'monteverde', 'tibas', 'sarapiqui', 'escazu', 'santa barbara', None]


In [5]:
def replace_accents(df, column='Provincia, cantón y distrito'):
    """Replace accented Spanish letters in one column with their plain form.

    The letters á, é, í, ó, ú and ñ (and their upper-case forms) are mapped
    to a, e, i, o, u and n. All other characters are left as they are.

    Args:
        df: DataFrame to clean.
        column: Name of the column to rewrite. Defaults to the INEC location
            column.

    Returns:
        A DataFrame in which ``column`` has been replaced by its unaccented
        value; all other columns are unchanged.
    """
    # The two strings are aligned position by position: the character at
    # index i of `accented` is replaced by the character at index i of `plain`.
    accented = "áéíóúñÁÉÍÓÚÑ"
    plain = "aeiounAEIOUN"
    # One character-mapping pass instead of six chained regular-expression passes.
    return df.withColumn(column, translate(column, accented, plain))

# Strip the accents from the INEC location names, then repeat the comparison
# to see which OIJ values are still unmatched.
inec_df = replace_accents(df=inec_df)
nonMatches = find_non_matches(df1=oij_df, df2=inec_df)
print(nonMatches)


['desconocido', 'islas', 'desconocido', 'puerto jim&#201;nez', 'leon cortes', 'vasquez de coronado', 'sarch&#205;', 'monteverde', None]


In [6]:
#Function to split the column 'Provincia, cantón y distrito' of inec_df into three separate columns


In [7]:
def generate_new_columns(df):
    """Flatten the hierarchical location listing of ``df`` into one row per district.

    The rows of ``df`` are walked in order. A row whose
    'Provincia, cantón y distrito' value is null is a separator that moves a
    level counter; a non-null row is read as a province (level 1), a canton
    (level 2) or a district (level 3) depending on that counter. Every
    district row produces one output row carrying the most recently seen
    province and canton.

    Args:
        df: DataFrame with a 'Provincia, cantón y distrito' column. The names
            themselves are read from position 0 of each row.
            NOTE(review): presumably position 0 is that same column - confirm
            against INEC.csv.

    Returns:
        A DataFrame with the columns "Provincia1", "Canton1", "Distrito1",
        "Tasa neta de participación",
        "Porcentaje de población económicamente inactiva" and
        "Relación de dependencia económica". The last three are always empty
        strings. The first row is an all-empty placeholder row.
    """
    columns = ["Provincia1", "Canton1", "Distrito1","Tasa neta de participación", "Porcentaje de población económicamente inactiva", "Relación de dependencia económica"]

    # Rows are accumulated in a plain list and turned into a DataFrame once at
    # the end; a union() per row would make the query plan grow with every row.
    # The all-empty seed row is kept so the result has the same rows as before
    # and so the schema can still be inferred when no district row is found.
    rows = [("", "", "", "", "", "")]

    provincia = ''
    canton = ''
    level = 0       # 1 = province, 2 = canton, 3 = district
    blank_run = 0   # number of consecutive separator (null) rows just seen

    for row in df.collect():
        if row['Provincia, cantón y distrito'] is None:
            level += 1
            blank_run += 1
            # A separator after the district level returns to the canton level.
            if level == 4:
                level = 2
            # Two separators in a row return to the province level.
            if blank_run == 2:
                level = 1
            continue
        if level == 1:
            provincia = row[0]
        if level == 2:
            canton = row[0]
        if level == 3:
            rows.append((provincia, canton, row[0], "", "", ""))
        blank_run = 0

    return spark.createDataFrame(data=rows, schema=columns)

# Build the flattened province / canton / district table from the INEC data.
new_df = generate_new_columns(df=inec_df)

In [8]:
def join_dataframes(df1, df2, match_distrito=False):
    """Inner-join ``df1`` with ``df2`` on their location columns.

    By default rows are matched on province and canton only
    (``df1.Provincia == df2.Provincia1`` and ``df1.Canton == df2.Canton1``).
    The district is not compared unless ``match_distrito`` is set, so a
    ``df1`` row is paired with every ``df2`` row of the same province and
    canton.

    Args:
        df1: DataFrame with "Provincia", "Canton" (and, when
            ``match_distrito`` is true, "Distrito") columns.
        df2: DataFrame with "Provincia1", "Canton1" (and, when
            ``match_distrito`` is true, "Distrito1") columns.
        match_distrito: When true, additionally require
            ``df1.Distrito == df2.Distrito1``. Defaults to ``False``.

    Returns:
        The inner join of ``df1`` and ``df2`` with all columns of both inputs.
    """
    condition = (df1["Provincia"] == df2["Provincia1"]) & (df1["Canton"] == df2["Canton1"])
    if match_distrito:
        condition = condition & (df1["Distrito"] == df2["Distrito1"])
    return df1.join(df2, condition, 'inner')

# Join the OIJ records with the flattened INEC location table and preview
# the first 25 rows of the result.
df = join_dataframes(df1=oij_df, df2=new_df)
df.show(25)


+-----------------+-----------------+-------------------+-------------------+--------+--------------------+-------------+-----------+------------+---------+--------+--------+----+----------+--------+---------+--------------------------+-----------------------------------------------+---------------------------------+
|           Delito|        SubDelito|              Fecha|               Hora| Victima|          SubVictima|         Edad|     Genero|Nacionalidad|Provincia|  Canton|Distrito|_c12|Provincia1| Canton1|Distrito1|Tasa neta de participación|Porcentaje de población económicamente inactiva|Relación de dependencia económica|
+-----------------+-----------------+-------------------+-------------------+--------+--------------------+-------------+-----------+------------+---------+--------+--------+----+----------+--------+---------+--------------------------+-----------------------------------------------+---------------------------------+
|TACHA DE VEHICULO|TACHA DE VEHICULO|2022-1

In [9]:
# Write the joined DataFrame to the PostgreSQL table "test", replacing any
# existing contents of that table.
url = "jdbc:postgresql://localhost:5432/etl"
mode = "overwrite"

# The database password is taken from the environment so that it is not stored
# in the notebook.
db_password = os.environ.get("PGPASSWORD")
if db_password is None:
    raise RuntimeError("Set the PGPASSWORD environment variable before writing to PostgreSQL.")

properties = {
    "user": "postgres",
    # The key must be exactly "password" (no surrounding whitespace) for the
    # JDBC driver to pick it up.
    "password": db_password,
    "driver": "org.postgresql.Driver",
}

df.write.jdbc(url=url, table="test", mode=mode, properties=properties)
