In [1]:
# Display the SparkContext. `sc` is not created anywhere in this notebook —
# presumably it is injected by the PySpark kernel/shell (TODO confirm).
sc


In [13]:
# Relative path to the traffic readings file; its lines look like "4/10/2005 0:00,-1".
f = "datasets/spark/Dodgers.data"

In [14]:
# Relative path to the game/event file; its lines look like
# "04/12/05,13:10:00,16:23:00,55892,San Francisco,W 9-8".
f1 = "datasets/spark/Dodger_event.data"

In [15]:
# Load both files as RDDs of raw text lines:
# r1 holds the traffic readings, r2 holds the game/event records.
r1 = sc.textFile(f)
r2 = sc.textFile(f1)

In [16]:
# Peek at the first five raw traffic lines ("<timestamp>,<count>").
r1.take(5)

['4/10/2005 0:00,-1',
 '4/10/2005 0:05,-1',
 '4/10/2005 0:10,-1',
 '4/10/2005 0:15,-1',
 '4/10/2005 0:20,-1']

In [17]:
# Peek at the first five raw event lines (comma-separated, six fields each).
r2.take(5)

['04/12/05,13:10:00,16:23:00,55892,San Francisco,W 9-8�',
 '04/13/05,19:10:00,21:48:00,46514,San Francisco,W 4-1�',
 '04/15/05,19:40:00,21:48:00,51816,San Diego,W 4-0�',
 '04/16/05,19:10:00,21:52:00,54704,San Diego,W 8-3�',
 '04/17/05,13:10:00,15:31:00,53402,San Diego,W 6-0�']

In [18]:
# IPython automagic form of %pwd: shows the kernel's current working directory,
# which the relative dataset paths above are resolved against.
# NOTE(review): the recorded output exposes a local home directory — consider
# clearing it before sharing the notebook.
pwd

'/home/ankita'

In [20]:
from datetime import datetime
import csv
try:
    from StringIO import StringIO
except ImportError:
    from io import StringIO
from collections import namedtuple

In [44]:
#Convert a raw Dodgers traffic row into a (timestamp, count) key-value pair
def parse(row):
    """Parse one ``"<m/d/Y H:M>,<count>"`` line.

    Args:
        row: A comma-separated text line whose first field is a timestamp
            in ``%m/%d/%Y %H:%M`` format and whose second field is an integer.

    Returns:
        A ``(datetime, int)`` tuple built from the first two fields.

    Raises:
        ValueError: If the timestamp or the count cannot be parsed.
        IndexError: If the line has no second field.
    """
    fields = row.split(",")
    # Parse the timestamp before touching the count so failures surface in
    # field order.
    stamp = datetime.strptime(fields[0], "%m/%d/%Y %H:%M")
    count = int(fields[1])
    return (stamp, count)

In [45]:
#Transform the raw traffic RDD: apply parse() to every line so each record
#becomes a (datetime, count) key-value pair
t_r1 = r1.map(parse)

In [46]:
# Evaluating the bare RDD name only shows its repr, not the records.
t_r1

PythonRDD[30] at RDD at PythonRDD.scala:49

In [47]:
# Inspect the first five parsed (datetime, count) pairs.
t_r1.take(5)

[(datetime.datetime(2005, 4, 10, 0, 0), -1),
 (datetime.datetime(2005, 4, 10, 0, 5), -1),
 (datetime.datetime(2005, 4, 10, 0, 10), -1),
 (datetime.datetime(2005, 4, 10, 0, 15), -1),
 (datetime.datetime(2005, 4, 10, 0, 20), -1)]

In [48]:
#Keep only the calendar date of each record (drops the time of day and the count)
daily_t = t_r1.map(lambda rec: rec[0].date())

In [49]:
# Inspect the first five dates.
daily_t.take(5)

[datetime.date(2005, 4, 10),
 datetime.date(2005, 4, 10),
 datetime.date(2005, 4, 10),
 datetime.date(2005, 4, 10),
 datetime.date(2005, 4, 10)]

In [50]:
#Re-key every reading by its calendar date: (datetime, count) -> (date, count)
daily_t1 = t_r1.map(lambda rec: (rec[0].date(), rec[1]))

In [51]:
# Inspect the first five (date, count) pairs.
daily_t1.take(5)

[(datetime.date(2005, 4, 10), -1),
 (datetime.date(2005, 4, 10), -1),
 (datetime.date(2005, 4, 10), -1),
 (datetime.date(2005, 4, 10), -1),
 (datetime.date(2005, 4, 10), -1)]

In [52]:
#Total count per day: key each reading by its date, then add up the counts per key.
#NOTE(review): the sample readings contain counts of -1; if that is a
#missing-value marker these sums are biased downwards — confirm, and filter
#such readings out before reducing if so.
daily_t2 = (
    t_r1
    .map(lambda rec: (rec[0].date(), rec[1]))
    .reduceByKey(lambda left, right: left + right)
)

In [64]:
#Sort the per-day totals by car count in descending order. The sort key is the
#2nd tuple element x[1]; it is negated because sortBy sorts ascending by default.
daily_t2.sortBy(lambda x:-x[1]).take(10)
#The result contains the 10 days with the highest total traffic

[(datetime.date(2005, 7, 28), 7661),
 (datetime.date(2005, 7, 29), 7499),
 (datetime.date(2005, 8, 12), 7287),
 (datetime.date(2005, 7, 27), 7238),
 (datetime.date(2005, 9, 23), 7175),
 (datetime.date(2005, 7, 26), 7163),
 (datetime.date(2005, 5, 20), 7119),
 (datetime.date(2005, 8, 11), 7110),
 (datetime.date(2005, 9, 8), 7107),
 (datetime.date(2005, 9, 7), 7082)]

In [65]:
#Turn a raw event row into a (game date, team name) key-value tuple
def parsegames(row):
    """Parse one comma-separated event line.

    Args:
        row: A text line whose first field is a date in ``%m/%d/%y`` format
            and whose fifth field is the value to keep (a team name in the
            sample data).

    Returns:
        A ``(datetime.date, str)`` tuple of the first and fifth fields.

    Raises:
        ValueError: If the first field is not a valid ``%m/%d/%y`` date.
        IndexError: If the line has fewer than five fields.
    """
    fields = row.split(",")
    game_date = datetime.strptime(fields[0], "%m/%d/%y").date()
    return (game_date, fields[4])

In [66]:
# Parse every raw event line with parsegames() into a (date, team) pair.
t_r2 = r2.map(parsegames)

In [67]:
# Inspect the first five parsed (date, team) pairs.
t_r2.take(5)

[(datetime.date(2005, 4, 12), 'San Francisco'),
 (datetime.date(2005, 4, 13), 'San Francisco'),
 (datetime.date(2005, 4, 15), 'San Diego'),
 (datetime.date(2005, 4, 16), 'San Diego'),
 (datetime.date(2005, 4, 17), 'San Diego')]

In [70]:
# Join the daily traffic totals with the events on the date key. A left outer
# join keeps every traffic day; days without a matching event get None as the
# second value, giving (date, (total, team_or_None)).
r1_r2 = daily_t2.leftOuterJoin(t_r2)

In [71]:
# Inspect the first five joined rows.
r1_r2.take(5)

[(datetime.date(2005, 4, 11), (5062, None)),
 (datetime.date(2005, 4, 15), (6459, 'San Diego')),
 (datetime.date(2005, 4, 17), (5322, 'San Diego')),
 (datetime.date(2005, 4, 19), (6049, None)),
 (datetime.date(2005, 4, 21), (5977, None))]

In [72]:
#Function to check whether there was a game on that day or it was a regular day
def game_chk(row):
    """Label a joined traffic/event row as a game day or a regular day.

    Args:
        row: ``(date, (car_count, opponent))``; ``opponent`` is ``None`` when
            no event matched the date.

    Returns:
        ``(date, opponent, label, car_count)`` where ``label`` is
        ``"Regular day"`` if ``opponent`` is ``None`` and ``"Game day"``
        otherwise.
    """
    day = row[0]
    car_count = row[1][0]
    opponent = row[1][1]
    # Identity test: None is a singleton, and `is` cannot be fooled by values
    # that define their own __eq__.
    label = "Regular day" if opponent is None else "Game day"
    return (day, opponent, label, car_count)

In [73]:
#Apply game_chk to every joined row of r1_r2, producing
#(date, team_or_None, "Game day"/"Regular day", total) tuples
chk_game_day = r1_r2.map(game_chk)

In [79]:
# Inspect the first twenty labelled rows.
chk_game_day.take(20)

[(datetime.date(2005, 4, 11), None, 'Regular day', 5062),
 (datetime.date(2005, 4, 15), 'San Diego', 'Game day', 6459),
 (datetime.date(2005, 4, 17), 'San Diego', 'Game day', 5322),
 (datetime.date(2005, 4, 19), None, 'Regular day', 6049),
 (datetime.date(2005, 4, 21), None, 'Regular day', 5977),
 (datetime.date(2005, 4, 22), None, 'Regular day', 6038),
 (datetime.date(2005, 4, 23), None, 'Regular day', 5366),
 (datetime.date(2005, 4, 24), None, 'Regular day', 4319),
 (datetime.date(2005, 4, 25), 'Arizona', 'Game day', 6280),
 (datetime.date(2005, 4, 30), 'Colorado', 'Game day', 6090),
 (datetime.date(2005, 5, 1), 'Colorado', 'Game day', 5178),
 (datetime.date(2005, 5, 4), 'Washington', 'Game day', 6423),
 (datetime.date(2005, 5, 13), 'Atlanta', 'Game day', 6706),
 (datetime.date(2005, 5, 21), 'LA Angels', 'Game day', 5767),
 (datetime.date(2005, 6, 1), 'Chicago Cubs', 'Game day', 6520),
 (datetime.date(2005, 6, 8), 'Detroit', 'Game day', 6278),
 (datetime.date(2005, 6, 13), None, 'Reg

In [82]:
#For the 10 busiest days, show whether each was a game day: sort on the
#car-count column (index 3), negated so the largest totals come first
chk_game_day.sortBy(lambda rec: -rec[3]).take(10)

[(datetime.date(2005, 7, 28), 'Cincinnati', 'Game day', 7661),
 (datetime.date(2005, 7, 29), 'St. Louis', 'Game day', 7499),
 (datetime.date(2005, 8, 12), 'NY Mets', 'Game day', 7287),
 (datetime.date(2005, 7, 27), 'Cincinnati', 'Game day', 7238),
 (datetime.date(2005, 9, 23), 'Pittsburgh', 'Game day', 7175),
 (datetime.date(2005, 7, 26), 'Cincinnati', 'Game day', 7163),
 (datetime.date(2005, 5, 20), 'LA Angels', 'Game day', 7119),
 (datetime.date(2005, 8, 11), 'Philadelphia', 'Game day', 7110),
 (datetime.date(2005, 9, 8), None, 'Regular day', 7107),
 (datetime.date(2005, 9, 7), 'San Francisco', 'Game day', 7082)]

In [133]:
#Total number of cars per day type: key each row by its label
#("Regular day" / "Game day") and add up the car counts
total_cars_p_day = (
    chk_game_day
    .map(lambda rec: (rec[2], rec[3]))
    .reduceByKey(lambda acc, cars: acc + cars)
)

In [134]:
# Sanity check: number of (label, total) pairs, i.e. one per labelled day.
chk_game_day.map(lambda x:(x[2],x[3])).count()

175

In [135]:
# Number of raw traffic lines.
r1.count()

50400

In [136]:
# Number of raw event lines.
r2.count()

81

In [137]:
# Number of rows after the left outer join.
r1_r2.count()

175

In [138]:
#Number of records labelled "Regular day"
re_count = chk_game_day.filter(lambda rec: rec[2] == 'Regular day').count()

In [139]:
#Number of records labelled "Game day"
game_count = chk_game_day.filter(lambda rec: rec[2] == 'Game day').count()

In [144]:
#Average number of cars on a game day: the "Game day" total divided by the
#number of game-day records
(
    total_cars_p_day
    .filter(lambda kv: kv[0] == 'Game day')
    .map(lambda kv: kv[1] / game_count)
    .first()
)

5948.604938271605

In [195]:
#Generalised way to count the number of records per value of a given column:
#key each row by its day-type label and count the rows per key
count_per_Day = chk_game_day.map(lambda rec: (rec[2], rec[3])).countByKey()

In [196]:
# Show the per-label record counts.
count_per_Day

defaultdict(int, {'Regular day': 94, 'Game day': 81})

In [193]:
#Count the records per day-type label directly from the label column.
#countByValue is an action and has to be *called*: without the parentheses
#k would be bound to the method object instead of the resulting counts dict.
k = chk_game_day.map(lambda x:x[2]).countByValue()


In [197]:
# Show the value bound to k (the recorded output is the per-label count dict).
k

defaultdict(int, {'Regular day': 94, 'Game day': 81})

In [199]:
# Bring the per-label car totals back to the driver as a list of (label, total) pairs.
total_cars_p_day.collect()

[('Regular day', 508665), ('Game day', 481837)]