In [1]:
import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"

In [2]:
from pyspark.sql import *
from pyspark.sql.functions import *
from pyspark import SparkContext
from pyspark.sql import SQLContext
import pandas as pd

In [3]:
# create the Spark Session
spark = SparkSession.builder.getOrCreate()

# create the Spark Context
sc = spark.sparkContext


In [16]:
sqlContext = SQLContext(sc)
df = sqlContext.read.csv('big-mac-source-data.csv', header=True, inferSchema=True)
rdd = df.rdd

In [17]:
rdd.take(3)

[Row(name='Argentina', iso_a3='ARG', currency_code='ARS', local_price=2.5, dollar_ex=1.0, GDP_dollar=None, date='2000-04-01'),
 Row(name='Australia', iso_a3='AUS', currency_code='AUD', local_price=2.59, dollar_ex=1.68, GDP_dollar=None, date='2000-04-01'),
 Row(name='Brazil', iso_a3='BRA', currency_code='BRL', local_price=2.95, dollar_ex=1.79, GDP_dollar=None, date='2000-04-01')]

In [18]:
rdd.count()

1730

In [22]:
rdd = rdd.filter(lambda x: x.dollar_ex != 0).cache()
rdd.count()

1729

Información del registro con el precio más caro del big mac en USD

In [24]:
rdd.reduce(lambda x,y:x if x.local_price/x.dollar_ex > y.local_price/y.dollar_ex else y)

Row(name='Venezuela', iso_a3='VEN', currency_code='VEF', local_price=39.0, dollar_ex=4.29465, GDP_dollar=None, date='2013-01-01')

In [25]:
rdd.filter(lambda x: x.name != 'Venezuela').reduce(lambda x,y:x if x.local_price/x.dollar_ex > y.local_price/y.dollar_ex else y)

Row(name='Norway', iso_a3='NOR', currency_code='NOK', local_price=45.0, dollar_ex=5.41405, GDP_dollar=84443.634, date='2011-07-01')

Información del registro con el big mac más barato

In [26]:
rdd.reduce(lambda x,y:x if x.local_price/x.dollar_ex < y.local_price/y.dollar_ex else y)

Row(name='Saudi Arabia', iso_a3='SAU', currency_code='SAR', local_price=2.4, dollar_ex=3.7502, GDP_dollar=None, date='2004-05-01')

In [27]:
rdd.filter(lambda x: x.name == 'Argentina').take(2)

[Row(name='Argentina', iso_a3='ARG', currency_code='ARS', local_price=2.5, dollar_ex=1.0, GDP_dollar=None, date='2000-04-01'),
 Row(name='Argentina', iso_a3='ARG', currency_code='ARS', local_price=2.5, dollar_ex=1.0, GDP_dollar=None, date='2001-04-01')]

In [28]:
arg = rdd.filter(lambda x: x.name == 'Argentina')

In [29]:
arg_usd = arg.map(lambda x: (x.date, x.local_price/x.dollar_ex)).cache()
arg_usd.take(5)

[('2000-04-01', 2.5),
 ('2001-04-01', 2.5),
 ('2002-04-01', 0.7987220447284346),
 ('2003-04-01', 1.423611111111111),
 ('2004-05-01', 1.4779661016949153)]

Las 3 mas caras

In [30]:
arg_usd.takeOrdered(3,lambda x : -x[1])

[('2011-07-01', 4.839685420447671),
 ('2012-01-01', 4.636606004404776),
 ('2012-07-01', 4.160963591568573)]

Las 3 mas baratas

In [31]:
arg_usd.takeOrdered(3,lambda x : x[1])

[('2002-04-01', 0.7987220447284346),
 ('2003-04-01', 1.423611111111111),
 ('2004-05-01', 1.4779661016949153)]

Valor promedio de la big mac en USD

In [38]:
arg_usd.map(lambda x: x[1]).reduce(lambda x,y: x+y) / arg_usd.count()

2.9356961998694073

Los 10 países con el big mac más caro en promedio

In [41]:
rdd.map(lambda x : (x.name, (x.local_price/x.dollar_ex,1) )).reduceByKey(lambda x,y: (x[0]+y[0],x[1]+y[1])).collect()
    #.map(lambda x: (x[0],x[1][0]/x[1][1])).takeOrdered(10, lambda x: -x[1])

[('Argentina', (99.81367079555984, 34)),
 ('Australia', (126.71479472728835, 34)),
 ('Brazil', (138.95534282395732, 34)),
 ('Britain', (133.42598151429138, 34)),
 ('Canada', (140.4544439741842, 34)),
 ('Chile', (113.70226265872788, 34)),
 ('China', (76.87241791306755, 34)),
 ('Czech Republic', (105.81007339102479, 34)),
 ('Denmark', (160.3082906545654, 34)),
 ('Euro area', (142.78667227132507, 34)),
 ('Hong Kong', (70.42008592123635, 34)),
 ('Hungary', (104.51019223810869, 34)),
 ('Indonesia', (73.99008459343406, 34)),
 ('Israel', (109.31898266100936, 25)),
 ('Japan', (106.29050827071148, 34)),
 ('Malaysia', (65.92139663079958, 34)),
 ('Mexico', (89.34990416497034, 34)),
 ('New Zealand', (125.87395663549559, 34)),
 ('Poland', (85.09193979453153, 34)),
 ('Russia', (67.627793283234, 34)),
 ('Singapore', (111.04009229205231, 34)),
 ('South Africa', (71.77439846711388, 34)),
 ('South Korea', (112.66408916545988, 34)),
 ('Sweden', (177.29480581673357, 34)),
 ('Switzerland', (207.78070916294

Los 10 países con el big mac más barato en promedio

In [47]:
rdd.map(lambda x : (x.name, (x.local_price/x.dollar_ex,1) )).reduceByKey(lambda x,y: (x[0]+y[0],x[1]+y[1]))\
    .map(lambda x: (x[0],x[1][0]/x[1][1])).takeOrdered(10, lambda x: x[1])

[('Ukraine', 1.8668429846243981),
 ('Malaysia', 1.938864606788223),
 ('Russia', 1.9890527436245293),
 ('Hong Kong', 2.071178997683422),
 ('South Africa', 2.1110117196209965),
 ('Egypt', 2.1455241840623875),
 ('India', 2.157312285334809),
 ('Indonesia', 2.1761789586304134),
 ('China', 2.2609534680313987),
 ('Romania', 2.3016782219846967)]

Los 10 países con los big macs más caros históricos y la fecha

In [52]:
rdd.map(lambda x : (x.name, (x.local_price/x.dollar_ex,x.date))).reduceByKey(lambda x,y: x if x[0] > y[0] else y).takeOrdered(10, lambda x: -x[1][0])

[('Venezuela', (9.081065977437044, '2013-01-01')),
 ('Norway', (8.311707501777782, '2011-07-01')),
 ('Switzerland', (8.063015567822365, '2011-07-01')),
 ('Sweden', (7.639853516858189, '2011-07-01')),
 ('Brazil', (6.162428645563051, '2011-07-01')),
 ('Portugal', (6.001132497354401, '2012-07-01')),
 ('Denmark', (5.993858909908077, '2010-01-01')),
 ('Lebanon', (5.9523809523809526, '2020-07-01')),
 ('Finland', (5.917293500000001, '2021-01-01')),
 ('United States', (5.74, '2019-07-09'))]