In [None]:
%matplotlib inline
import csv
# file and exercise from https://github.com/brandon-rhodes/pycon-pandas-tutorial
# newline='' is what the csv module asks for when opening a file for reading,
# so that newlines embedded in quoted fields are handled by the reader itself.
with open('titles.csv', 'r', newline='') as f:
    rows = list(csv.reader(f))

# Skip the first row (the column header) and swap every (title, year) record
# into (year, title), so that a plain sort of the rows orders them by year first.
# Note: csv.reader yields strings, so the year is still a string here.
titles = [(year, title) for title, year in rows[1:]]

# Preview a handful of rows only; echoing the whole list could flood the output.
titles[:5]

In [None]:
import matplotlib.pyplot as plt
from itertools import groupby


# Keep only the titles mentioning "hamlet" (case-insensitive), as
# (int year, title) pairs sorted by year and then by title.
hamlets = sorted(
    (int(year), title)
    for year, title in titles
    if 'hamlet' in title.lower()
)

# groupby() only merges adjacent rows with equal keys, hence the sort above.
hamlets_per_decade = [
    (decade, sum(1 for _ in rows))
    for decade, rows in groupby(hamlets, key=lambda row: row[0] // 10 * 10)
]

# Transpose [(decade, count), ...] into the x and y sequences bar() expects.
decades, counts = zip(*hamlets_per_decade)
plt.bar(decades, counts, width=5)

What have we done?
* filter the titles by name (keep those containing "hamlet")
* sort by year
* group by decade
* count the titles in each decade
* split the result into x and y values
* plot the result

In [None]:
import matplotlib.pyplot as plt
from itertools import groupby

def get_decade(row):
    """Return the decade of a ``(year, title)`` row, e.g. 1990 for 1996.

    The year is converted with ``int`` first, so it may be a numeric string.
    """
    year, _title = row
    decade_index, _ = divmod(int(year), 10)
    return decade_index * 10

def contains_hamlet(row):
    """Tell whether the title of a ``(year, title)`` row mentions "hamlet".

    The check is a case-insensitive substring test on the title.
    """
    _year, title = row
    lowered = title.lower()
    return 'hamlet' in lowered

# Same pipeline as before, with one named step per line:
# filter -> sort -> group -> count.
filtered_titles = [row for row in titles if contains_hamlet(row)]
# groupby() only merges adjacent rows with equal keys, so the rows are sorted first.
# NOTE(review): if the years in `titles` are still strings (as read from the CSV),
# this sort is lexicographic; that matches numeric order only while every year
# has the same number of digits.
sorted_titles = sorted(filtered_titles)
grouped_titles = groupby(sorted_titles, get_decade)
# Distinct loop names on purpose: reusing `grouped_titles` as the loop variable
# shadowed the iterator being consumed (and would clobber it on Python 2).
counted_titles = [(decade, len(list(rows))) for decade, rows in grouped_titles]

# Transpose [(decade, count), ...] into bar positions and bar heights.
years, n_titles = zip(*counted_titles)
plt.bar(years, n_titles, 5)

In [None]:
from functional import seq

def count_titles(year_group):
    """Turn a ``(key, group)`` pair into ``(key, number of items in group)``.

    The group is iterated once to count it, so a one-shot iterator is consumed.
    """
    key, members = year_group
    n_members = sum(1 for _ in members)
    return key, n_members

# The same pipeline as a PyFunctional chain, with one named stage per step.
hamlet_rows = seq(titles).filter(contains_hamlet)
rows_by_decade = hamlet_rows.sorted().group_by(get_decade)
counted_titles = rows_by_decade.map(count_titles)

# Unzip the (decade, count) pairs into bar positions and bar heights.
years, n_titles = zip(*counted_titles)
plt.bar(years, n_titles, width=5, align='center')
# Put a tick on every decade and draw the tick labels vertically.
plt.xticks(years, rotation='vertical')

In [None]:
# map-reduce hello world
from operator import add
# Map step: emit a (decade, 1) pair for every title that mentions "hamlet".
decade_ones = (
    seq(titles)
    .filter(contains_hamlet)
    .sorted()
    .map(lambda row: (get_decade(row), 1))
)
# Reduce step: add up the ones that share a decade key.
c_titles = decade_ones.reduce_by_key(add)

# Unzip the (decade, count) pairs into bar positions and bar heights.
years, n_titles = zip(*c_titles)
plt.bar(years, n_titles, width=5, align='center')
plt.xticks(years, rotation='vertical')

In [None]:
# there is pandas
import pandas as pd
# read_csv replaces DataFrame.from_csv, which was deprecated in pandas 0.21 and
# removed in 1.0; index_col=None keeps the default integer index.
p_titles = pd.read_csv('titles.csv', index_col=None)

# Case-insensitive match on the title column. na=False counts a missing title
# as "no match", so the boolean mask never contains NaN and can be used to index.
is_hamlet = p_titles.title.str.contains('hamlet', case=False, na=False)
hamlet_titles = p_titles[is_hamlet]
# Group by decade (year floored to a multiple of 10) and plot the titles per decade.
hamlet_titles.groupby(hamlet_titles.year // 10 * 10).size().plot(kind="bar")
# also pySpark...

Sources:
* https://github.com/brandon-rhodes/pycon-pandas-tutorial
* https://github.com/EntilZha/PyFunctional
* http://tomaugspurger.github.io/method-chaining.html