## Start a Spark session

In [None]:
import os
import pyspark
conf = pyspark.SparkConf()
conf.set('spark.sql.repl.eagerEval.enabled', True)
sc = pyspark.SparkContext(conf=conf)

sc.setLogLevel("ERROR")
spark = pyspark.SQLContext.getOrCreate(sc)
spark

In [2]:
sc

## Q1
What is the most popular (most sold) between the 8:00AM and 8:59AM for each day?

In [9]:
from pyspark.sql.functions import substring, col
from pyspark.sql.types import DateType

bakery = spark\
    .read.option("header",True)\
    .option("inferSchema",True)\
    .csv("shared/hw2/BreadBasket_DMS.csv")\
    .withColumn("hour",substring(col("Time"),12,2).cast("Integer"))\
    .withColumn("Date", col("Date").cast(DateType()))

bakery.printSchema()
bakery.sort("hour").limit(5)

root
 |-- Date: date (nullable = true)
 |-- Time: timestamp (nullable = true)
 |-- Transaction: integer (nullable = true)
 |-- Item: string (nullable = true)
 |-- hour: integer (nullable = true)



Date,Time,Transaction,Item,hour
2017-01-01,2023-03-15 01:21:05,4090,Bread,1
2016-11-01,2023-03-15 07:51:20,178,Coffee,7
2016-11-03,2023-03-15 07:50:50,341,Medialuna,7
2016-11-01,2023-03-15 07:51:20,178,Pastry,7
2016-11-03,2023-03-15 07:46:50,340,Coffee,7


In [11]:
from pyspark.sql.window import Window
from pyspark.sql.functions import desc, row_number, rank

w = Window.partitionBy("Date").orderBy(desc("count"))

bakeryq1 = bakery\
    .where("hour == 8")\
    .groupBy("Date","Item").count()\
    .withColumn("idx", row_number().over(w))

# one way; pick first item
bakeryq1.where("idx == 1").select("Date","Item").limit(10)

Date,Item
2016-10-31,Coffee
2016-11-01,Tea
2016-11-02,Coffee
2016-11-03,Coffee
2016-11-04,Bread
2016-11-05,Bread
2016-11-07,Pastry
2016-11-08,Bread
2016-11-09,Pastry
2016-11-10,Coffee


## Q2
What is the most common item bought along with “Brownie”? (items bought in the same transaction)

In [12]:
transactions = bakery\
    .where(col("Item") == "Brownie")\
    .select("Transaction")\
    .distinct()

transactions.limit(5)


Transaction
1238
3749
7554
8389
392


In [13]:
bakeryq2 = bakery.join(transactions, on="Transaction", how="inner")
bakeryq2.limit(5)

Transaction,Date,Time,Item,hour
371,2016-11-03,2023-03-15 10:39:12,Coffee,10
371,2016-11-03,2023-03-15 10:39:12,Brownie,10
371,2016-11-03,2023-03-15 10:39:12,Alfajores,10
371,2016-11-03,2023-03-15 10:39:12,Fudge,10
384,2016-11-03,2023-03-15 12:05:25,Coffee,12


In [14]:

bakeryq2\
    .where(col("Item") != "Brownie")\
    .groupBy("Item").count()\
    .orderBy(desc("count"))\
    .take(1)[0][0]
    

'Coffee'

## Q3
How many years are represented in this dataset?

In [16]:
from pyspark.sql.functions import col
from pyspark.sql.types import DateType

restaurants = spark.read\
    .option("header",True)\
    .option("delimiter",";")\
    .option("inferSchema",True)\
    .csv("shared/hw2/Restaurants_in_Durham_County_NC.csv")\
    .withColumn("Opening_Date",col("Opening_Date").cast(DateType()))\
    .withColumn("Closing_Date",col("Closing_Date").cast(DateType()))
    
restaurants.printSchema()
restaurants.limit(5)

root
 |-- ID: string (nullable = true)
 |-- Premise_Name: string (nullable = true)
 |-- Premise_Address1: string (nullable = true)
 |-- Premise_Address2: string (nullable = true)
 |-- Premise_City: string (nullable = true)
 |-- Premise_State: string (nullable = true)
 |-- Premise_Zip: string (nullable = true)
 |-- Premise_Phone: string (nullable = true)
 |-- Hours_Of_Operation: string (nullable = true)
 |-- Opening_Date: date (nullable = true)
 |-- Closing_Date: date (nullable = true)
 |-- Seats: string (nullable = true)
 |-- Water: string (nullable = true)
 |-- Sewage: string (nullable = true)
 |-- Insp_Freq: string (nullable = true)
 |-- Est_Group_Desc: string (nullable = true)
 |-- Risk: integer (nullable = true)
 |-- Smoking_Allowed: string (nullable = true)
 |-- Type_Description: string (nullable = true)
 |-- Rpt_Area_Desc: string (nullable = true)
 |-- Status: string (nullable = true)
 |-- Transitional_Type_Desc: string (nullable = true)
 |-- geolocation: string (nullable = true)

ID,Premise_Name,Premise_Address1,Premise_Address2,Premise_City,Premise_State,Premise_Zip,Premise_Phone,Hours_Of_Operation,Opening_Date,Closing_Date,Seats,Water,Sewage,Insp_Freq,Est_Group_Desc,Risk,Smoking_Allowed,Type_Description,Rpt_Area_Desc,Status,Transitional_Type_Desc,geolocation
56060,WEST 94TH ST PUB,4711 HOPE VALLEY RD,SUITE 6C,DURHAM,NC,27707,(919) 403-0025,,1994-09-01,,60,5 - Municipal/Com...,3 - Municipal/Com...,4,Full-Service Rest...,4,NO,1 - Restaurant,Food Service,ACTIVE,FOOD,"35.9207272, -78.9..."
58123,BROOKDALE DURHAM IFS,4434 BEN FRANKLIN...,,DURHAM,NC,27704,(919) 479-9966,,2003-10-15,,350,5 - Municipal/Com...,3 - Municipal/Com...,4,Nursing Home,4,NO,16 - Institutiona...,Food Service,ACTIVE,FOOD,"36.0467802, -78.8..."
70266,SMOOTHIE KING,1125 W. NC HWY 54...,,DURHAM,NC,27707,(919) 489-7300,,2009-07-09,,7,5 - Municipal/Com...,3 - Municipal/Com...,2,Fast Food Restaurant,2,NO,1 - Restaurant,Food Service,ACTIVE,FOOD,"35.9182655, -78.9..."
97837,HAMPTON INN & SUITES,1542 N GREGSON ST,,DURHAM,NC,27701,(919) 688-8880,,2012-01-09,,100,5 - Municipal/Com...,3 - Municipal/Com...,2,Full-Service Rest...,2,NO,1 - Restaurant,Food Service,ACTIVE,FOOD,"36.0183378, -78.9..."
60690,BETTER LIVING CON...,909 GARCIA ST,,DURHAM,NC,27704,(919) 477-5825,,2008-06-02,,6,5 - Municipal/Com...,3 - Municipal/Com...,1,,0,,43 - Residential ...,Residential Care,ACTIVE,,"36.0556347, -78.9..."


In [19]:
from pyspark.sql.functions import min, max

yearsq3 = restaurants.agg(min("Opening_Date"), max("Opening_Date"), min("Closing_Date"), max("Closing_Date"))
yearsq3.printSchema()
yearsq3

root
 |-- min(Opening_Date): date (nullable = true)
 |-- max(Opening_Date): date (nullable = true)
 |-- min(Closing_Date): date (nullable = true)
 |-- max(Closing_Date): date (nullable = true)



min(Opening_Date),max(Opening_Date),min(Closing_Date),max(Closing_Date)
1990-07-01,2017-03-07,,


In [20]:
from pyspark.sql.functions import  year

yearsq3\
    .withColumn("years", year(col("max(Opening_Date)")) - year(col("min(Opening_Date)")))\
    .select("years")

years
27


## Q4
Show the type and count of restaurant opened during the 90’s (1990-1999 inclusive).

In [21]:
restaurants\
    .withColumn("year", year("Opening_Date"))\
    .where((col("year") >= 1990) & (col("year") <= 1999))\
    .groupBy("Rpt_Area_Desc").count()


Rpt_Area_Desc,count
Bed&Breakfast Home,3
Summer Camps,1
Institutions,16
Local Confinement,2
School Buildings,56
Swimming Pools,256
Day Care,58
Bed&Breakfast Inn,1
Lodging,21
Food Service,204


## Q5
For region, compute the percentage change in population, year over year. 
Note the year 1980 will not have a preceding year. 
For each year, display the region with the top population decrease.

In [22]:
pop = spark.read\
    .option("header", True)\
    .option("inferSchema", True)\
    .csv("shared/hw2/populationbycountry19802010millions.csv")

pop.printSchema()
pop.limit(5)

root
 |-- _c0: string (nullable = true)
 |-- 1980: string (nullable = true)
 |-- 1981: string (nullable = true)
 |-- 1982: string (nullable = true)
 |-- 1983: string (nullable = true)
 |-- 1984: string (nullable = true)
 |-- 1985: string (nullable = true)
 |-- 1986: string (nullable = true)
 |-- 1987: string (nullable = true)
 |-- 1988: string (nullable = true)
 |-- 1989: string (nullable = true)
 |-- 1990: string (nullable = true)
 |-- 1991: string (nullable = true)
 |-- 1992: string (nullable = true)
 |-- 1993: string (nullable = true)
 |-- 1994: string (nullable = true)
 |-- 1995: string (nullable = true)
 |-- 1996: string (nullable = true)
 |-- 1997: string (nullable = true)
 |-- 1998: string (nullable = true)
 |-- 1999: string (nullable = true)
 |-- 2000: string (nullable = true)
 |-- 2001: string (nullable = true)
 |-- 2002: string (nullable = true)
 |-- 2003: string (nullable = true)
 |-- 2004: string (nullable = true)
 |-- 2005: string (nullable = true)
 |-- 2006: string (nulla

_c0,1980,1981,1982,1983,1984,1985,1986,1987,1988,1989,1990,1991,1992,1993,1994,1995,1996,1997,1998,1999,2000,2001,2002,2003,2004,2005,2006,2007,2008,2009,2010
North America,320.27638,324.44694,328.62014,332.72487,336.72143,340.74811,344.89548,349.07829,353.2939,357.68457,362.4468,367.70684,373.29069,378.74233,383.9166,388.97216,393.9428,398.97205,403.85585,408.60296,413.3245,417.83236,422.05268,426.06238,430.26938,434.47232,438.82964,443.3473,447.67394,451.83698,456.59331
Bermuda,0.05473,0.05491,0.05517,0.05551,0.05585,0.05618,0.05651,0.05683,0.05717,0.05749,0.05778,0.0581,0.0587,0.05924,0.05975,0.06029,0.06087,0.06145,0.06198,0.06251,0.06306,0.06361,0.06418,0.06476,0.06534,0.06591,0.06644,0.06692,0.06739,0.06784,0.06827
Canada,24.5933,24.9,25.2019,25.4563,25.7018,25.9416,26.2038,26.5497,26.8948,27.3793,27.7906,28.1179,28.54489,28.95334,29.33081,29.69053,30.02632,30.3056,30.55166,30.82026,31.09956,31.37674,31.64096,31.88931,32.13476,32.38638,32.65668,32.93596,33.2127,33.48721,33.75974
Greenland,0.05021,0.05103,0.05166,0.05211,0.05263,0.05315,0.05364,0.0541,0.05485,0.05541,0.05563,0.05554,0.05549,0.05564,0.05592,0.05619,0.05634,0.05651,0.05661,0.0567,0.05689,0.05713,0.05736,0.05754,0.0577,0.05778,0.05764,0.05753,0.05756,0.0576,0.05764
Mexico,68.34748,69.96926,71.6409,73.36288,75.08014,76.76723,78.44243,80.12249,81.78182,83.36684,84.91365,86.48803,88.11103,89.74914,91.3379,92.88035,94.39858,95.89515,97.32506,98.61691,99.92662,101.24696,102.47993,103.71806,104.95959,106.2029,107.44953,108.70089,109.9554,111.21179,112.46886


In [23]:
# can do this manually or in a loop

# compute percent change year over year
years = range(1981, 2011)
for year in years:
    pop = pop.withColumn("_" + str(year), (col(str(year)) - col(str(year-1))) / col(str(year-1)))
    
pop.limit(5)

_c0,1980,1981,1982,1983,1984,1985,1986,1987,1988,1989,1990,1991,1992,1993,1994,1995,1996,1997,1998,1999,2000,2001,2002,2003,2004,2005,2006,2007,2008,2009,2010,_1981,_1982,_1983,_1984,_1985,_1986,_1987,_1988,_1989,_1990,_1991,_1992,_1993,_1994,_1995,_1996,_1997,_1998,_1999,_2000,_2001,_2002,_2003,_2004,_2005,_2006,_2007,_2008,_2009,_2010
North America,320.27638,324.44694,328.62014,332.72487,336.72143,340.74811,344.89548,349.07829,353.2939,357.68457,362.4468,367.70684,373.29069,378.74233,383.9166,388.97216,393.9428,398.97205,403.85585,408.60296,413.3245,417.83236,422.05268,426.06238,430.26938,434.47232,438.82964,443.3473,447.67394,451.83698,456.59331,0.0130217532744686,0.0128625038041659,0.0124908047327836,0.0120116058652302,0.0119584904352538,0.0121713661155744,0.0121277611408533,0.0120764026889212,0.0124278115189648,0.0133140493032729,0.014512585019373888,0.015185602748102222,0.014604275290123,0.0136617156049075,0.0131683808410471,0.012778909421178122,0.0127664473116402,0.012240957731249471,0.01175446635228887,0.011555325003029847,0.0109063459823939,0.010100510166326,0.009500472784582268,0.009874140964992012,0.00976815965849118,0.010028993331496833,0.010294792302543727,0.0097590308997032,0.009299268123581119,0.0105266505632186
Bermuda,0.05473,0.05491,0.05517,0.05551,0.05585,0.05618,0.05651,0.05683,0.05717,0.05749,0.05778,0.0581,0.0587,0.05924,0.05975,0.06029,0.06087,0.06145,0.06198,0.06251,0.06306,0.06361,0.06418,0.06476,0.06534,0.06591,0.06644,0.06692,0.06739,0.06784,0.06827,0.0032888726475424,0.0047350209433617,0.0061627696211709,0.0061250225184651,0.0059086839749329,0.0058739765040939,0.005662714563794,0.0059827555868379,0.0055973412629001,0.0050443555400939,0.005538248528902749,0.010327022375215203,0.0091993185689948,0.0086090479405806,0.0090376569037657,0.009620169182285572,0.009528503367833,0.00862489829129378,0.008551145530816325,0.008798592225244102,0.0087218522042498,0.0089608552114447,0.00903708320349014,0.008956145768993163,0.008723599632690558,0.008041268396298023,0.007224563515954159,0.007023311416617,0.006677548597714...,0.0063384433962264
Canada,24.5933,24.9,25.2019,25.4563,25.7018,25.9416,26.2038,26.5497,26.8948,27.3793,27.7906,28.1179,28.54489,28.95334,29.33081,29.69053,30.02632,30.3056,30.55166,30.82026,31.09956,31.37674,31.64096,31.88931,32.13476,32.38638,32.65668,32.93596,33.2127,33.48721,33.75974,0.0124708762142534,0.0121244979919678,0.0100944770037179,0.0096439781115087,0.0093300858305644,0.0101073179757609,0.0132003755180546,0.0129982636338639,0.018014634799292,0.0150222978673669,0.011777363568976469,0.015185700212320265,0.0143090409526889,0.0130371832748829,0.012264236821281,0.011309666752328078,0.0093011731041299,0.008119291484082148,0.008791666312076098,0.009062220759980585,0.0089126662885263,0.008420887574681,0.007849003317219152,0.007696936685052193,0.007830150279634969,0.008346101046180488,0.008552002224353485,0.0084023662890043,0.008265211801509645,0.0081383310225009
Greenland,0.05021,0.05103,0.05166,0.05211,0.05263,0.05315,0.05364,0.0541,0.05485,0.05541,0.05563,0.05554,0.05549,0.05564,0.05592,0.05619,0.05634,0.05651,0.05661,0.0567,0.05689,0.05713,0.05736,0.05754,0.0577,0.05778,0.05764,0.05753,0.05756,0.0576,0.05764,0.0163314080860386,0.0123456790123456,0.0087108013937282,0.0099788908079064,0.0098802964088922,0.0092191909689557,0.0085756897837435,0.0138632162661737,0.0102096627164995,0.0039704024544305,-0.00161783210497...,-9.00252070579788...,0.0027031897639215,0.0050323508267432,0.0048283261802575,0.002669514148425...,0.0030173943911962,0.001769598301185...,0.001589825119236...,0.003350970017636...,0.0042186676041483,0.0040259058288115,0.003138075313807...,0.002780674313521...,0.001386481802426...,-0.00242298373139...,-0.00190839694656...,0.0005214670606640405,6.949270326615422E-4,0.0006944444444444162
Mexico,68.34748,69.96926,71.6409,73.36288,75.08014,76.76723,78.44243,80.12249,81.78182,83.36684,84.91365,86.48803,88.11103,89.74914,91.3379,92.88035,94.39858,95.89515,97.32506,98.61691,99.92662,101.24696,102.47993,103.71806,104.95959,106.2029,107.44953,108.70089,109.9554,111.21179,112.46886,0.0237284534850443,0.0238910630182453,0.0240362697844388,0.0234077506226581,0.0224705228306713,0.021821811207725,0.0214177454726988,0.0207099155305832,0.0193810800493312,0.0185542597032586,0.018540953074093397,0.018765602592636283,0.0185914294725643,0.0177022309071708,0.0168872943214153,0.016346083967168388,0.0158537342404939,0.01491118163953018,0.013273559759428979,0.01328078521219125,0.0132130957696758,0.0121778471175825,0.012081682725583421,0.011970239319941114,0.01184560648531491,0.011738191706629444,0.011646025813235343,0.0115409358653824,0.011426360142384968,0.0113033878872016


In [24]:
# clean up the column names; get rid of columns not needed
pop = pop.drop("1980")
for year in years:
    pop = pop.drop(str(year))
    pop = pop.withColumnRenamed("_" + str(year), str(year))

# pivot the columns as rows
pop.limit(5)

_c0,1981,1982,1983,1984,1985,1986,1987,1988,1989,1990,1991,1992,1993,1994,1995,1996,1997,1998,1999,2000,2001,2002,2003,2004,2005,2006,2007,2008,2009,2010
North America,0.0130217532744686,0.0128625038041659,0.0124908047327836,0.0120116058652302,0.0119584904352538,0.0121713661155744,0.0121277611408533,0.0120764026889212,0.0124278115189648,0.0133140493032729,0.014512585019373888,0.015185602748102222,0.014604275290123,0.0136617156049075,0.0131683808410471,0.012778909421178122,0.0127664473116402,0.012240957731249471,0.01175446635228887,0.011555325003029847,0.0109063459823939,0.010100510166326,0.009500472784582268,0.009874140964992012,0.00976815965849118,0.010028993331496833,0.010294792302543727,0.0097590308997032,0.009299268123581119,0.0105266505632186
Bermuda,0.0032888726475424,0.0047350209433617,0.0061627696211709,0.0061250225184651,0.0059086839749329,0.0058739765040939,0.005662714563794,0.0059827555868379,0.0055973412629001,0.0050443555400939,0.005538248528902749,0.010327022375215203,0.0091993185689948,0.0086090479405806,0.0090376569037657,0.009620169182285572,0.009528503367833,0.00862489829129378,0.008551145530816325,0.008798592225244102,0.0087218522042498,0.0089608552114447,0.00903708320349014,0.008956145768993163,0.008723599632690558,0.008041268396298023,0.007224563515954159,0.007023311416617,0.006677548597714...,0.0063384433962264
Canada,0.0124708762142534,0.0121244979919678,0.0100944770037179,0.0096439781115087,0.0093300858305644,0.0101073179757609,0.0132003755180546,0.0129982636338639,0.018014634799292,0.0150222978673669,0.011777363568976469,0.015185700212320265,0.0143090409526889,0.0130371832748829,0.012264236821281,0.011309666752328078,0.0093011731041299,0.008119291484082148,0.008791666312076098,0.009062220759980585,0.0089126662885263,0.008420887574681,0.007849003317219152,0.007696936685052193,0.007830150279634969,0.008346101046180488,0.008552002224353485,0.0084023662890043,0.008265211801509645,0.0081383310225009
Greenland,0.0163314080860386,0.0123456790123456,0.0087108013937282,0.0099788908079064,0.0098802964088922,0.0092191909689557,0.0085756897837435,0.0138632162661737,0.0102096627164995,0.0039704024544305,-0.00161783210497...,-9.00252070579788...,0.0027031897639215,0.0050323508267432,0.0048283261802575,0.002669514148425...,0.0030173943911962,0.001769598301185...,0.001589825119236...,0.003350970017636...,0.0042186676041483,0.0040259058288115,0.003138075313807...,0.002780674313521...,0.001386481802426...,-0.00242298373139...,-0.00190839694656...,0.0005214670606640405,6.949270326615422E-4,0.0006944444444444162
Mexico,0.0237284534850443,0.0238910630182453,0.0240362697844388,0.0234077506226581,0.0224705228306713,0.021821811207725,0.0214177454726988,0.0207099155305832,0.0193810800493312,0.0185542597032586,0.018540953074093397,0.018765602592636283,0.0185914294725643,0.0177022309071708,0.0168872943214153,0.016346083967168388,0.0158537342404939,0.01491118163953018,0.013273559759428979,0.01328078521219125,0.0132130957696758,0.0121778471175825,0.012081682725583421,0.011970239319941114,0.01184560648531491,0.011738191706629444,0.011646025813235343,0.0115409358653824,0.011426360142384968,0.0113033878872016


In [25]:
columns = [str(year) for year in years]
print(columns)


['1981', '1982', '1983', '1984', '1985', '1986', '1987', '1988', '1989', '1990', '1991', '1992', '1993', '1994', '1995', '1996', '1997', '1998', '1999', '2000', '2001', '2002', '2003', '2004', '2005', '2006', '2007', '2008', '2009', '2010']


In [26]:
from pyspark.sql.functions import lit
change = pop.select(lit(1981).alias("year"), min("1981").alias("change"))
change


year,change
1981,-0.0910633093142599


In [27]:
q5 = change.join(pop.select(col("_c0"),col("1981").alias("change")), on="change", how="inner")
q5 


change,year,_c0
-0.0910633093142599,1981,Afghanistan


In [28]:
# put it all together - add all other years


for year in columns[1:]:
    change = pop.select(lit(year).alias("year"), min(year).alias("change"))
    q5 = q5.union(change.join(pop.select(col("_c0"),col(year).alias("change")), on="change", how="inner"))
    
q5


                                                                                

change,year,_c0
-0.09106330931425992,1981,Afghanistan
-0.08017227257036874,1982,Afghanistan
-0.03514189089839...,1983,Antigua and Barbuda
-0.01752514477293...,1984,Antigua and Barbuda
-0.01409244644870...,1985,Cook Islands
-0.24587816552796307,1986,Netherlands Antilles
-0.21299638989169675,1987,Saint Helena
-0.02883631837516...,1988,Mozambique
-0.02196496533102...,1989,Somalia
-0.12816300240117076,1990,Liberia


## Q6
Do word count in pyspark. Ignore punctuation, and normalize to lower case. Accept only the characters in this set: [0-9a-zA-Z]

In [29]:
text = spark.read.text("shared/hw2/romeo-juliet-pg1777.txt")
text.limit(5)



value
This Etext file i...
cooperation with ...
Future and Shakes...
Etexts that are N...


In [30]:
# cleanup, spark way
from pyspark.sql.functions import lower, regexp_replace, trim, length

text = text\
    .withColumn("value", trim(lower("value")))\
    .withColumn("value", regexp_replace("value", "[^0-9a-z]", " "))\
    .withColumn("value", trim(regexp_replace("value", "  +", " ")))\
    .withColumn("value", regexp_replace("value", "^ *$", ""))\
    .where(length("value")>0)

text.limit(5)

value
this etext file i...
cooperation with ...
future and shakes...
etexts that are n...
this etext has ce...


In [31]:
# wordcount
from pyspark.sql.functions import split, length

wc = text.withColumn("words", split("value", " "))
wc.limit(5)


value,words
this etext file i...,"[this, etext, fil..."
cooperation with ...,"[cooperation, wit..."
future and shakes...,"[future, and, sha..."
etexts that are n...,"[etexts, that, ar..."
this etext has ce...,"[this, etext, has..."


In [32]:
from pyspark.sql.functions import explode


wc.select(explode("words").alias("word")).groupBy("word").count()

                                                                                

word,count
those,17
carnegie,10
some,58
chor,2
art,55
still,15
nourish,1
cures,1
solemnity,3
feign,1


## Extra Credit Q7
For each restaurant (‘Restaurants_in_Durham_County_NC.csv) with “status”=“ACTIVE” and “"rpt_area_desc"="Food Service", show the number of foreclosures (‘durham-nc-foreclosure-2006-2016’) within a radius of 1 mile of the restaurant’s coordinates.

In [33]:
#  from before
restaurants.limit(5)

ID,Premise_Name,Premise_Address1,Premise_Address2,Premise_City,Premise_State,Premise_Zip,Premise_Phone,Hours_Of_Operation,Opening_Date,Closing_Date,Seats,Water,Sewage,Insp_Freq,Est_Group_Desc,Risk,Smoking_Allowed,Type_Description,Rpt_Area_Desc,Status,Transitional_Type_Desc,geolocation
56060,WEST 94TH ST PUB,4711 HOPE VALLEY RD,SUITE 6C,DURHAM,NC,27707,(919) 403-0025,,1994-09-01,,60,5 - Municipal/Com...,3 - Municipal/Com...,4,Full-Service Rest...,4,NO,1 - Restaurant,Food Service,ACTIVE,FOOD,"35.9207272, -78.9..."
58123,BROOKDALE DURHAM IFS,4434 BEN FRANKLIN...,,DURHAM,NC,27704,(919) 479-9966,,2003-10-15,,350,5 - Municipal/Com...,3 - Municipal/Com...,4,Nursing Home,4,NO,16 - Institutiona...,Food Service,ACTIVE,FOOD,"36.0467802, -78.8..."
70266,SMOOTHIE KING,1125 W. NC HWY 54...,,DURHAM,NC,27707,(919) 489-7300,,2009-07-09,,7,5 - Municipal/Com...,3 - Municipal/Com...,2,Fast Food Restaurant,2,NO,1 - Restaurant,Food Service,ACTIVE,FOOD,"35.9182655, -78.9..."
97837,HAMPTON INN & SUITES,1542 N GREGSON ST,,DURHAM,NC,27701,(919) 688-8880,,2012-01-09,,100,5 - Municipal/Com...,3 - Municipal/Com...,2,Full-Service Rest...,2,NO,1 - Restaurant,Food Service,ACTIVE,FOOD,"36.0183378, -78.9..."
60690,BETTER LIVING CON...,909 GARCIA ST,,DURHAM,NC,27704,(919) 477-5825,,2008-06-02,,6,5 - Municipal/Com...,3 - Municipal/Com...,1,,0,,43 - Residential ...,Residential Care,ACTIVE,,"36.0556347, -78.9..."


In [34]:
# look only at qualifying restaurants
from pyspark.sql import functions as F

interesting = restaurants.where('Status = "ACTIVE" AND Rpt_Area_Desc = "Food Service"')\
    .select("Premise_Name", "geolocation")\
    .withColumn("point", F.split("geolocation", ","))\
    .withColumn("lat1", F.col("point").getItem(0).cast("Double"))\
    .withColumn("lon1", F.col("point").getItem(1).cast("Double"))\
    .withColumn("coordinates", F.array(F.col("lat1"), F.col("lon1")))\
    .drop("geolocation", "point")

interesting.printSchema()
interesting.limit(5)


root
 |-- Premise_Name: string (nullable = true)
 |-- lat1: double (nullable = true)
 |-- lon1: double (nullable = true)
 |-- coordinates: array (nullable = false)
 |    |-- element: double (containsNull = true)



Premise_Name,lat1,lon1,coordinates
WEST 94TH ST PUB,35.9207272,-78.9573299,"[35.9207272, -78...."
BROOKDALE DURHAM IFS,36.0467802,-78.8895483,"[36.0467802, -78...."
SMOOTHIE KING,35.9182655,-78.9593263,"[35.9182655, -78...."
HAMPTON INN & SUITES,36.0183378,-78.9060312,"[36.0183378, -78...."
KROGER R 381 MEAT...,35.9495321,-78.9211694,"[35.9495321, -78...."


In [36]:
#  load the second dataset
foreclosures = spark.read.json("shared/hw2/durham-nc-foreclosure-2006-2016.json")
foreclosures.printSchema()
foreclosures.limit(5)

root
 |-- datasetid: string (nullable = true)
 |-- fields: struct (nullable = true)
 |    |-- address: string (nullable = true)
 |    |-- geocode: array (nullable = true)
 |    |    |-- element: double (containsNull = true)
 |    |-- parcel_number: string (nullable = true)
 |    |-- year: string (nullable = true)
 |-- geometry: struct (nullable = true)
 |    |-- coordinates: array (nullable = true)
 |    |    |-- element: double (containsNull = true)
 |    |-- type: string (nullable = true)
 |-- record_timestamp: string (nullable = true)
 |-- recordid: string (nullable = true)



datasetid,fields,geometry,record_timestamp,recordid
foreclosure-2006-...,{217 E CORPORATIO...,"{[-78.8922549, 36...",2017-03-06T12:41:...,629979c85b1cc68c1...
foreclosure-2006-...,"{401 N QUEEN ST, ...","{[-78.895396, 35....",2017-03-06T12:41:...,e3cce8bbc3c9b804c...
foreclosure-2006-...,"{403 N QUEEN ST, ...","{[-78.8950321, 35...",2017-03-06T12:41:...,311559ebfeffe7ebc...
foreclosure-2006-...,"{918 GILBERT ST, ...","{[-78.8873774, 35...",2017-03-06T12:41:...,7ec0761bd385bab8a...
foreclosure-2006-...,"{721 LIBERTY ST, ...","{[-78.888343, 35....",2017-03-06T12:41:...,c81ae2921ffca8125...


In [37]:
#  get the foreclosure data we want
df1 = foreclosures\
    .select("recordid","fields.geocode")\
    .dropna(subset="geocode")\
    .withColumn("lat2", col("geocode").getItem(0))\
    .withColumn("lon2", col("geocode").getItem(1))

df1.limit(5).toPandas()

Unnamed: 0,recordid,geocode,lat2,lon2
0,629979c85b1cc68c1d4ee8cc351050bfe3592c62,"[36.0013755, -78.8922549]",36.001376,-78.892255
1,e3cce8bbc3c9b804cbd87e267a6ff121285274e0,"[35.995797, -78.895396]",35.995797,-78.895396
2,311559ebfeffe7ebc2a8b056a034a24298da08f3,"[35.995413, -78.8950321]",35.995413,-78.895032
3,7ec0761bd385bab8af10f682115a6eb4400740b3,"[35.9957683, -78.8873774]",35.995768,-78.887377
4,c81ae2921ffca8125c2de2fd3e3b1375388cd925,"[35.993026, -78.888343]",35.993026,-78.888343


In [38]:
# combine all 
df = interesting.crossJoin(df1)
df.limit(5)

Premise_Name,lat1,lon1,coordinates,recordid,geocode,lat2,lon2
WEST 94TH ST PUB,35.9207272,-78.9573299,"[35.9207272, -78....",629979c85b1cc68c1...,"[36.0013755, -78....",36.0013755,-78.8922549
BROOKDALE DURHAM IFS,36.0467802,-78.8895483,"[36.0467802, -78....",629979c85b1cc68c1...,"[36.0013755, -78....",36.0013755,-78.8922549
SMOOTHIE KING,35.9182655,-78.9593263,"[35.9182655, -78....",629979c85b1cc68c1...,"[36.0013755, -78....",36.0013755,-78.8922549
HAMPTON INN & SUITES,36.0183378,-78.9060312,"[36.0183378, -78....",629979c85b1cc68c1...,"[36.0013755, -78....",36.0013755,-78.8922549
KROGER R 381 MEAT...,35.9495321,-78.9211694,"[35.9495321, -78....",629979c85b1cc68c1...,"[36.0013755, -78....",36.0013755,-78.8922549


In [None]:
#!pip install haversine

In [39]:
import haversine as H
import sys

# make a function to compute foreclosures distance in miles - in Python
def distance(p1, p2, unit=H.Unit.MILES) -> float:
    try:
        return H.haversine(p1, p2, unit=unit)
    except:
        return sys.float_info.max


def distance2(lat1, lon1, lat2, lon2, unit=H.Unit.MILES) -> float:
    try:
        return H.haversine([lat1, lon1], [lat2, lon2],  unit=unit)
    except:
        return sys.float_info.max
    
                
p1 = [35.9207272, -78.9573299]
p2 = [36.0013755, -78.8922549]
print(distance(p1, p2))
print(distance2(p1[0], p1[1], p2[0], p2[1]))

p2 = [-24.0, -78.8922549]
print(distance(p1, p2, H.Unit.MILES))

print(distance(None, p2, H.Unit.MILES))
print(distance([1], [0.2, 0.3], H.Unit.MILES))
print(distance("", 0))




6.655445922744362
6.655445922744362
4140.130127121173
1.7976931348623157e+308
1.7976931348623157e+308
1.7976931348623157e+308


In [45]:
# add a distance cell to each row, filter rows withing the limit, and count restaurants 
from pyspark.sql.types import DoubleType, ArrayType
from pyspark.sql.functions import udf

# UDF of our Python function; using arrays
udf_distance = F.udf(distance, DoubleType())

result = df.withColumn("distance", udf_distance(col("coordinates"), col("geocode")).cast(DoubleType()))
result.where(col("distance")<=1).groupBy("Premise_Name").count().limit(10)


                                                                                

Premise_Name,count
DPAC 3RD FLOOR,267
W G PEARSON SCHOO...,199
COMPARE FOODS DELI,65
GSK COMMERCIAL OP...,1
MCDONALD'S 35265,17
DUKE UNIVERSITY W...,6
BLU SEAFOOD AND BAR,31
JADE BUFFET,33
HARRIS TEETER 172...,10
GRILL 46,17


In [48]:
# can also be done via individual lat,lon values...

from pyspark.sql.functions import udf
from pyspark.sql.types import DoubleType

udf_distance2 = udf(distance2, DoubleType())

df\
    .select("Premise_Name", udf_distance2("lat1","lon1","lat2","lon2").alias("distance"))\
    .where("distance <= 1.0")\
    .groupBy("Premise_Name").count()


                                                                                

Premise_Name,count
DPAC 3RD FLOOR,267
W G PEARSON SCHOO...,199
COMPARE FOODS DELI,65
GSK COMMERCIAL OP...,1
MCDONALD'S 35265,17
DUKE UNIVERSITY W...,6
BLU SEAFOOD AND BAR,31
JADE BUFFET,33
HARRIS TEETER 172...,10
GRILL 46,17
