Read the file from the Azure blob container.

In [6]:

ipldata = spark.sparkContext.textFile('wasb:///example/data/matches.csv')
ipldata.take(10)


[u'1,2008,Bangalore,2008-04-18,Kolkata Knight Riders,Royal Challengers Bangalore,Royal Challengers Bangalore,field,normal,0,Kolkata Knight Riders,140,0,BB McCullum,M Chinnaswamy Stadium,Asad Rauf,RE Koertzen,', u'2,2008,Chandigarh,2008-04-19,Chennai Super Kings,Kings XI Punjab,Chennai Super Kings,bat,normal,0,Chennai Super Kings,33,0,MEK Hussey,"Punjab Cricket Association Stadium, Mohali",MR Benson,SL Shastri,', u'3,2008,Delhi,2008-04-19,Rajasthan Royals,Delhi Daredevils,Rajasthan Royals,bat,normal,0,Delhi Daredevils,0,9,MF Maharoof,Feroz Shah Kotla,Aleem Dar,GA Pratapkumar,', u'4,2008,Mumbai,2008-04-20,Mumbai Indians,Royal Challengers Bangalore,Mumbai Indians,bat,normal,0,Royal Challengers Bangalore,0,5,MV Boucher,Wankhede Stadium,SJ Davis,DJ Harper,', u'5,2008,Kolkata,2008-04-20,Deccan Chargers,Kolkata Knight Riders,Deccan Chargers,bat,normal,0,Kolkata Knight Riders,0,5,DJ Hussey,Eden Gardens,BF Bowden,K Hariharan,', u'6,2008,Jaipur,2008-04-21,Kings XI Punjab,Rajasthan Royals,Kings X

In [7]:
header = 'id,season,city,date,team1,team2,toss_winner,toss_decision,result,dl_applied,winner,win_by_runs,win_by_wickets,player_of_match,venue,umpire1,umpire2,umpire3'

In [8]:
fields = header.split(",") # Split the headers into the fields
fields

['id', 'season', 'city', 'date', 'team1', 'team2', 'toss_winner', 'toss_decision', 'result', 'dl_applied', 'winner', 'win_by_runs', 'win_by_wickets', 'player_of_match', 'venue', 'umpire1', 'umpire2', 'umpire3']

In [27]:
# Filter the bad records
# Logic to filter is that split on "," and the returned array should have length less than 19
filteredIplData = ipldata.filter(lambda x: len(x.split(",")) < 19)
filteredIplData.first()

u'1,2008,Bangalore,2008-04-18,Kolkata Knight Riders,Royal Challengers Bangalore,Royal Challengers Bangalore,field,normal,0,Kolkata Knight Riders,140,0,BB McCullum,M Chinnaswamy Stadium,Asad Rauf,RE Koertzen,'

In [9]:
import csv
from StringIO import StringIO
from collections import namedtuple

In [28]:

# Create the base class for the iplrecord
iplrecord = namedtuple('iplrecord', fields, verbose=True)


In [17]:
# create a function which consumes each row from the CSV file and converts it to the above class object
def parse(row):
    reader = csv.reader(StringIO(row))
    row=reader.next()
    return iplrecord(*row)

In [29]:
iplrecords = filteredIplData.map(parse) # run the transformation on the ipldata to return an RDD of the iplrecord objects
iplrecords.first()

iplrecord(id='1', season='2008', city='Bangalore', date='2008-04-18', team1='Kolkata Knight Riders', team2='Royal Challengers Bangalore', toss_winner='Royal Challengers Bangalore', toss_decision='field', result='normal', dl_applied='0', winner='Kolkata Knight Riders', win_by_runs='140', win_by_wickets='0', player_of_match='BB McCullum', venue='M Chinnaswamy Stadium', umpire1='Asad Rauf', umpire2='RE Koertzen', umpire3='')

In [30]:
# The benefit of the above approach is that now we can do a dot reference to an attribute
testrecord = iplrecords.first()
testrecord.toss_decision

'field'

In [31]:
zerowinbyruns = iplrecords.filter(lambda x: x.win_by_runs == 0) # Note that here we forgot to cast the x.win_by_runs as integer
zerowinbyruns.take(5) 

[]

In [32]:
zerowinbyruns = iplrecords.filter(lambda x: int(x.win_by_runs) == 0) # note the typecasting, this is to be remembered
zerowinbyruns.take(5)

[iplrecord(id='3', season='2008', city='Delhi', date='2008-04-19', team1='Rajasthan Royals', team2='Delhi Daredevils', toss_winner='Rajasthan Royals', toss_decision='bat', result='normal', dl_applied='0', winner='Delhi Daredevils', win_by_runs='0', win_by_wickets='9', player_of_match='MF Maharoof', venue='Feroz Shah Kotla', umpire1='Aleem Dar', umpire2='GA Pratapkumar', umpire3=''), iplrecord(id='4', season='2008', city='Mumbai', date='2008-04-20', team1='Mumbai Indians', team2='Royal Challengers Bangalore', toss_winner='Mumbai Indians', toss_decision='bat', result='normal', dl_applied='0', winner='Royal Challengers Bangalore', win_by_runs='0', win_by_wickets='5', player_of_match='MV Boucher', venue='Wankhede Stadium', umpire1='SJ Davis', umpire2='DJ Harper', umpire3=''), iplrecord(id='5', season='2008', city='Kolkata', date='2008-04-20', team1='Deccan Chargers', team2='Kolkata Knight Riders', toss_winner='Deccan Chargers', toss_decision='bat', result='normal', dl_applied='0', winner='

In [45]:
# select only this fields from the data -> toss_decision, win_by_runs, win_by_wickets, venue
filteredcolumns = zerowinbyruns.map(lambda x: {'toss_decision': x.toss_decision ,'win_by_wickets' : x.win_by_wickets, 'venue': x.venue})
filteredcolumns.take(5)

[{'win_by_wickets': '9', 'venue': 'Feroz Shah Kotla', 'toss_decision': 'bat'}, {'win_by_wickets': '5', 'venue': 'Wankhede Stadium', 'toss_decision': 'bat'}, {'win_by_wickets': '5', 'venue': 'Eden Gardens', 'toss_decision': 'bat'}, {'win_by_wickets': '6', 'venue': 'Sawai Mansingh Stadium', 'toss_decision': 'bat'}, {'win_by_wickets': '7', 'venue': 'M Chinnaswamy Stadium', 'toss_decision': 'field'}]

In [44]:
filteredcolumns.map(lambda x: x['venue']).countByValue()

defaultdict(<type 'int'>, {'SuperSport Park': 8, 'Newlands': 3, 'Feroz Shah Kotla': 29, 'Brabourne Stadium': 5, 'Shaheed Veer Narayan Singh International Stadium': 4, 'Saurashtra Cricket Association Stadium': 4, 'Holkar Cricket Stadium': 2, 'Green Park': 2, 'De Beers Diamond Oval': 2, 'Subrata Roy Sahara Stadium': 6, 'Sawai Mansingh Stadium': 23, 'Kingsmead': 6, 'Buffalo Park': 1, 'New Wanderers Stadium': 5, 'Nehru Stadium': 2, 'Dr DY Patil Sports Academy': 10, 'Dubai International Cricket Stadium': 4, 'JSCA International Stadium Complex': 5, 'Dr. Y.S. Rajasekhara Reddy ACA-VDCA Cricket Stadium': 4, 'Barabati Stadium': 3, 'Wankhede Stadium': 24, 'Eden Gardens': 32, 'Maharashtra Cricket Association Stadium': 5, 'Himachal Pradesh Cricket Association Stadium': 4, 'Sheikh Zayed Stadium': 4, 'M Chinnaswamy Stadium': 34, 'Sharjah Cricket Stadium': 4, "St George's Park": 4, 'OUTsurance Oval': 1})