In [1]:
import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"

In [2]:
from pyspark.sql import *
from pyspark.sql.functions import *
from pyspark import SparkContext
from pyspark.sql import SQLContext
import pandas as pd
import random as rd

In [3]:
spark = SparkSession.builder.getOrCreate()
sc = spark.sparkContext
sqlContext = SQLContext(sc)

# Crear datasets

In [4]:
# dataframe 1
df = pd.DataFrame({
    'ID_BASE':range(15),
    'NOMBRE': ['base'+str(i) for i in range(15)],
    'PCIA': rd.choices(['Buenos Aires','Santa Fe','Cordoba','otros'],k=15),
    'CIUDAD': ['ciudad'+str(i) for i in range(15)],
    'LAT': rd.choices(range(-40,-35),k=15),
    'LON': rd.choices(range(-70,-65),k=15)
})
bases = sqlContext.createDataFrame(df).rdd.map(tuple).cache()
bases.take(3)

[(0, 'base0', 'Santa Fe', 'ciudad0', -38, -66),
 (1, 'base1', 'Santa Fe', 'ciudad1', -39, -66),
 (2, 'base2', 'Buenos Aires', 'ciudad2', -40, -68)]

In [8]:
# dataframe 2
df2 = pd.DataFrame({
 'TIMESTAMP':pd.date_range(start='2018-01-01',freq='2H',periods=10000).strftime("%Y-%m-%d"),
 'ID_BASE': rd.choices(range(1,15),k=10000),
 'TEMPERATURA': rd.choices(range(-10,45),k=10000),
 'HUMEDAD': rd.choices(range(100),k=10000),
 'PRESIÓN': rd.choices(range(100),k=10000),
 'DIRECCIÓN VIENTO': rd.choices(range(20),k=10000),
 'VELOCIDAD VIENTO': rd.choices(['N','S','E','O'],k=10000) 
})
registros = sqlContext.createDataFrame(df2).rdd.map(tuple).cache()
registros.take(3)

[('2018-01-01', 13, 30, 62, 83, 16, 'E'),
 ('2018-01-01', 4, 25, 77, 16, 12, 'O'),
 ('2018-01-01', 3, 36, 1, 70, 19, 'S')]

Ejercicio

In [13]:
bases_bsas = bases.filter(lambda x: x[2] == 'Buenos Aires').map(lambda x: (x[0],x[1]))
bases_bsas.take(3)

[(2, 'base2'), (8, 'base8'), (9, 'base9')]

In [22]:
reg_2018 = registros.filter(lambda x: '2018' in x[0]).map(lambda x: (x[1],(x[0],x[2])))
reg_2018.take(3)

[(13, ('2018-01-01', 30)), (4, ('2018-01-01', 25)), (3, ('2018-01-01', 36))]

In [42]:
def tiene_crec_30_perc(x):
    """recibe un diccionario con {mes,temp} y devuelve si hubo un crecimiento del 30 porciento de un mes a otro"""
    for i in range(2,13):
        if (i not in x) or (i-1 not in x):
            continue
        elif ((x[i] - x[i-1])/x[i-1]) > 0.3:
            return True
    return False

In [43]:
bases_bsas.join(reg_2018)\
    .map(lambda x: ((x[0],x[1][0],x[1][1][0].split("-")[1]),(x[1][1][1],1)))\
    .reduceByKey(lambda x,y: (x[0]+x[1],x[1]+y[1]))\
    .map(lambda x: (x[0],x[1][0]/x[1][1]))\
    .map(lambda x: ((x[0][0],x[0][1]),(int(x[0][2]),x[1])))\
    .groupByKey().map(lambda x: (x[0],dict(x[1])))\
    .filter(lambda x: tiene_crec_30_perc(x[1]))\
    .map(lambda x: x[0]).take(10)

[(8, 'base8'), (9, 'base9'), (14, 'base14'), (2, 'base2')]