# This goes through the quizzes provided in Lecture 2


### Import required libraries and data

In [1]:
import pandas as pd
import urllib3
import certifi
import bs4
import numpy as np

# Column names for the three MovieLens 100k files; handed to read_csv via names=.
user_columns = ['user_id', 'age', 'sex', 'occupation', 'zip_code']
rating_columns = ['user_id', 'movie_id', 'rating', 'unix_timestamp']
movie_columns = ['movie_id', 'title', 'release_date', 'video_release_date', 'imdb_url']

# Users file: pipe-separated.
users = pd.read_csv(
    'http://files.grouplens.org/datasets/movielens/ml-100k/u.user',
    names=user_columns,
    sep='|',
)

# Ratings file: tab-separated.
ratings = pd.read_csv(
    'http://files.grouplens.org/datasets/movielens/ml-100k/u.data',
    names=rating_columns,
    sep='\t',
)

# Movies file: pipe-separated and wider than we need, so usecols keeps only the
# first five columns. The encoding is set explicitly because some columns
# failed to parse without it.
movies = pd.read_csv(
    'http://files.grouplens.org/datasets/movielens/ml-100k/u.item',
    names=movie_columns,
    usecols=range(5),
    sep='|',
    encoding="ISO-8859-1",
)

movies.head()



Unnamed: 0,movie_id,title,release_date,video_release_date,imdb_url
0,1,Toy Story (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Toy%20Story%2...
1,2,GoldenEye (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?GoldenEye%20(...
2,3,Four Rooms (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Four%20Rooms%...
3,4,Get Shorty (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Get%20Shorty%...
4,5,Copycat (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Copycat%20(1995)


### Selecting data

In [28]:
# A single column comes back as a Series; show its first rows.
users.loc[:, 'occupation'].head()


0    technician
1         other
2        writer
3    technician
4         other
Name: occupation, dtype: object

In [24]:
# Several columns at once: index with a list of column names.
users.loc[:, ['occupation', 'age']].head()

# Same selection, with the list of names built first.
columns = ['occupation', 'age']
users.loc[:, columns].head()

Unnamed: 0,occupation,age
0,technician,24
1,other,53
2,writer,23
3,technician,24
4,other,33


In [29]:
# Fetch one row by integer position (position 3, all columns).
users.iloc[3, :]

user_id                4
age                   24
sex                    M
occupation    technician
zip_code           43537
Name: 3, dtype: object

#### Filtering Data

In [5]:
# Boolean mask on age: keep the rows where age is strictly greater than 25.
users.loc[users['age'] > 25]


Unnamed: 0,user_id,age,sex,occupation,zip_code
1,2,53,F,other,94043
4,5,33,F,other,15213
5,6,42,M,executive,98101
6,7,57,M,administrator,91344
7,8,36,M,administrator,05201
...,...,...,...,...,...
936,937,48,M,educator,98072
937,938,38,F,technician,55038
938,939,26,F,student,33319
939,940,32,M,administrator,02215


 #### Quiz 1
 * Filter for users aged 40 and male
 * Show the mean age of female programmers

In [9]:
# Quiz 1a: build one mask per condition, then combine them with &.
is_forty = users['age'] == 40
is_male = users['sex'] == 'M'

users[is_forty & is_male]

Unnamed: 0,user_id,age,sex,occupation,zip_code
18,19,40,M,librarian,2138
82,83,40,M,other,44133
115,116,40,M,healthcare,97232
199,200,40,M,programmer,93402
283,284,40,M,executive,92629
289,290,40,M,engineer,93550
308,309,40,M,scientist,70802
357,358,40,M,educator,10022
397,398,40,M,other,60008
564,565,40,M,student,55422


In [3]:
# Quiz 1b: rows that are both female and programmers.
is_female = users['sex'].eq('F')
is_programmer = users['occupation'].eq('programmer')
female_programmers = users.loc[is_female & is_programmer]

# Mean of the age column over that subset.
female_programmers['age'].mean()

32.166666666666664

## Split-Apply-Combine

Here we split the data into groups by some category, apply a function to each group, and combine the results into a summary.

In [53]:
# Split: one group of rated movie ids per user.
grouped_data = ratings.groupby('user_id')['movie_id']

# Apply + combine: count the entries in every group.
ratings_per_user = grouped_data.count()

ratings_per_user.head()

user_id
1    272
2     62
3     54
4     24
5    175
Name: movie_id, dtype: int64

### Quiz 2 
* get the average rating per movie
* advanced: get the movie titles with the highest average rating

In [7]:
# Quiz 2a: group the rating column by movie and average each group.
grouped_ratings = ratings.groupby('movie_id')['rating']
avg_per_movie = grouped_ratings.mean()

print(avg_per_movie)

movie_id
1       3.878319
2       3.206107
3       3.033333
4       3.550239
5       3.302326
          ...   
1678    1.000000
1679    3.000000
1680    2.000000
1681    3.000000
1682    3.000000
Name: rating, Length: 1682, dtype: float64


In [8]:
# Highest average rating reached by any movie.
max_rating = avg_per_movie.max()
print(max_rating)

# Several movies can tie for the top average, so collect every id at the maximum.
is_top = avg_per_movie.eq(max_rating)
movie_ids = avg_per_movie.loc[is_top].index

# Look those ids up in the movies table and show the titles.
movies.loc[movies['movie_id'].isin(movie_ids), 'title']

5.0


813                         Great Day in Harlem, A (1994)
1121                       They Made Me a Criminal (1939)
1188                                   Prefontaine (1997)
1200           Marlene Dietrich: Shadow and Light (1996) 
1292                                      Star Kid (1997)
1466                 Saint of Fort Washington, The (1993)
1499                            Santa with Muscles (1996)
1535                                 Aiqing wansui (1994)
1598                        Someone Else's America (1995)
1652    Entertaining Angels: The Dorothy Day Story (1996)
Name: title, dtype: object

Given how unfamiliar the movies with the highest rating are, it seems likely that they have very few ratings. Let's confirm.

In [9]:
# Number of ratings per movie, from the same grouping used for the averages.
rating_count = grouped_ratings.count()

# Show the counts only for the movies that share the top average.
rating_count.loc[avg_per_movie.eq(max_rating)]

movie_id
814     1
1122    1
1189    3
1201    1
1293    3
1467    2
1500    2
1536    1
1599    1
1653    1
Name: rating, dtype: int64

These movies each had 3 ratings or fewer.
Let's find the highest average rating among movies with more than 100 ratings.

In [10]:
# Ids of the movies that received more than 100 ratings.
valid_movies = rating_count.loc[rating_count.gt(100)].index

# Restrict the ratings table to those movies and average per movie again.
valid_ratings = ratings.loc[ratings['movie_id'].isin(valid_movies)]
valid_avg_ratings = valid_ratings.groupby('movie_id')['rating'].mean()

# Highest average within the restricted set.
max_rating = valid_avg_ratings.max()
print(max_rating)

# Id(s) at that maximum, then the matching rows of the movies table.
highest_rated_movie = valid_avg_ratings.loc[valid_avg_ratings.eq(max_rating)].index
movies.loc[movies['movie_id'].isin(highest_rated_movie)]

4.491071428571429


Unnamed: 0,movie_id,title,release_date,video_release_date,imdb_url
407,408,"Close Shave, A (1995)",28-Apr-1996,,http://us.imdb.com/M/title-exact?Close%20Shave...


In [36]:
# Ten best average ratings among the movies kept in valid_ratings.
# The aggregation is named by the string 'mean' rather than the np.mean callable:
# pandas warns about NumPy callables in agg() and recommends the string form.
top_ten = (
    valid_ratings.groupby('movie_id')
    .agg({'rating': 'mean'})
    .sort_values('rating', ascending=False)
    .head(10)
)

# movie_id is the index of top_ten and a column of movies; on= accepts either,
# so a single key name replaces the identical left_on/right_on pair.
top_ten_movies = pd.merge(top_ten, movies, on='movie_id')
top_ten_movies

Unnamed: 0,movie_id,rating,title,release_date,video_release_date,imdb_url
0,408,4.491071,"Close Shave, A (1995)",28-Apr-1996,,http://us.imdb.com/M/title-exact?Close%20Shave...
1,318,4.466443,Schindler's List (1993),01-Jan-1993,,http://us.imdb.com/M/title-exact?Schindler's%2...
2,169,4.466102,"Wrong Trousers, The (1993)",01-Jan-1993,,http://us.imdb.com/M/title-exact?Wrong%20Trous...
3,483,4.45679,Casablanca (1942),01-Jan-1942,,http://us.imdb.com/M/title-exact?Casablanca%20...
4,64,4.44523,"Shawshank Redemption, The (1994)",01-Jan-1994,,http://us.imdb.com/M/title-exact?Shawshank%20R...
5,603,4.38756,Rear Window (1954),01-Jan-1954,,http://us.imdb.com/M/title-exact?Rear%20Window...
6,12,4.385768,"Usual Suspects, The (1995)",14-Aug-1995,,http://us.imdb.com/M/title-exact?Usual%20Suspe...
7,50,4.358491,Star Wars (1977),01-Jan-1977,,http://us.imdb.com/M/title-exact?Star%20Wars%2...
8,178,4.344,12 Angry Men (1957),01-Jan-1957,,http://us.imdb.com/M/title-exact?12%20Angry%20...
9,134,4.292929,Citizen Kane (1941),01-Jan-1941,,http://us.imdb.com/M/title-exact?Citizen%20Kan...


Lol, never watched this movie either. Maybe I should!!!

Let's get the movies with an average rating of 4 or higher

In [51]:
# Ids of the movies whose average rating is 4 or above (the comparison is inclusive).
highest_rated_movies = valid_avg_ratings.loc[valid_avg_ratings.ge(4)].index

# Matching rows of the movies table.
movies.loc[movies['movie_id'].isin(highest_rated_movies)]

Unnamed: 0,movie_id,title,release_date,video_release_date,imdb_url
11,12,"Usual Suspects, The (1995)",14-Aug-1995,,http://us.imdb.com/M/title-exact?Usual%20Suspe...
21,22,Braveheart (1995),16-Feb-1996,,http://us.imdb.com/M/title-exact?Braveheart%20...
22,23,Taxi Driver (1976),16-Feb-1996,,http://us.imdb.com/M/title-exact?Taxi%20Driver...
47,48,Hoop Dreams (1994),01-Jan-1994,,http://us.imdb.com/M/title-exact?Hoop%20Dreams...
49,50,Star Wars (1977),01-Jan-1977,,http://us.imdb.com/M/title-exact?Star%20Wars%2...
...,...,...,...,...,...
650,651,Glory (1989),01-Jan-1989,,http://us.imdb.com/M/title-exact?Glory%20(1989)
653,654,Chinatown (1974),01-Jan-1974,,http://us.imdb.com/M/title-exact?Chinatown%20(...
656,657,"Manchurian Candidate, The (1962)",01-Jan-1962,,http://us.imdb.com/M/title-exact?Manchurian%20...
658,659,Arsenic and Old Lace (1944),01-Jan-1944,,http://us.imdb.com/M/title-exact?Arsenic%20and...


### Passing a function

You can also pass your own function to a grouped object with `apply`; it is called once per group

In [54]:
# apply() calls the given function once per group and combines the results.
# NOTE(review): grouped_data groups the movie_id column, so these values are the
# mean movie id per user rather than a mean rating -- confirm this is intended.
average_ratings = grouped_data.apply(lambda group: group.mean())
average_ratings.head()

user_id
1    136.500000
2    249.500000
3    318.814815
4    291.041667
5    291.291429
Name: movie_id, dtype: float64

### Quiz 

List all the occupations and which ones have more females than males

In [60]:
# Group the sex column by occupation: one group of 'F'/'M' values per occupation.
grouped_users = users['sex'].groupby(users['occupation'])

# True where women outnumber men. Series.sum() counts the True values in one
# vectorised step instead of looping over the elements with the builtin sum().
grouped_users.apply(lambda sexes: (sexes == 'F').sum() > (sexes == 'M').sum())

occupation
administrator    False
artist           False
doctor           False
educator         False
engineer         False
entertainment    False
executive        False
healthcare        True
homemaker         True
lawyer           False
librarian         True
marketing        False
none             False
other            False
programmer       False
retired          False
salesman         False
scientist        False
student          False
technician       False
writer           False
Name: sex, dtype: bool

## Data Scraping With Python

In [2]:
# Page to download.
url = 'http://www.crummy.com/software/BeautifulSoup'

# Connection pool that verifies certificates against certifi's CA bundle.
http = urllib3.PoolManager(
    ca_certs=certifi.where(),
    cert_reqs='CERT_REQUIRED',
)
response = http.request(method='GET', url=url)

# The body arrives as bytes; decode it to text before working with it.
data = response.data.decode('utf-8')

# Number of times the substring 'Soup' occurs in the page source.
data.count('Soup')

49

In [4]:

## build the BeautifulSoup parse tree from the downloaded HTML
# NOTE(review): no parser is named here, so bs4 picks the best one installed and
# the resulting tree can differ between machines -- consider pinning one once
# the parser used for the stored outputs is confirmed.
soup = bs4.BeautifulSoup(data)

## compare the two print statements
#print(soup)
#print(soup.prettify())

## find every <a> tag (find_all is the current name; findAll is the legacy alias)
soup.find_all('a')

[<a href="#Download">Download</a>,
 <a href="bs4/doc/">Documentation</a>,
 <a href="#HallOfFame">Hall of Fame</a>,
 <a href="enterprise.html">For enterprise</a>,
 <a href="https://code.launchpad.net/beautifulsoup">Source</a>,
 <a href="https://bazaar.launchpad.net/%7Eleonardr/beautifulsoup/bs4/view/head:/CHANGELOG">Changelog</a>,
 <a href="https://groups.google.com/forum/?fromgroups#!forum/beautifulsoup">Discussion group</a>,
 <a href="zine/">Zine</a>,
 <a href="bs4/download/"><h1>Beautiful Soup</h1></a>,
 <a href="http://lxml.de/">lxml</a>,
 <a href="http://code.google.com/p/html5lib/">html5lib</a>,
 <a href="bs4/doc/">Read more.</a>,
 <a href="https://tidelift.com/subscription/pkg/pypi-beautifulsoup4?utm_source=pypi-beautifulsoup4&amp;utm_medium=referral&amp;utm_campaign=enterprise" target="_blank">
 <span class="cta">
   Beautiful Soup for enterprise available via Tidelift
  </span>
 </a>,
 <a href="https://groups.google.com/forum/?fromgroups#!forum/beautifulsoup">the discussion
 gr

In [5]:
## collect the href attribute of every <a> tag in the page
# .get returns None for an anchor that has no href attribute.
# find_all replaces the legacy findAll alias.
link_list = [anchor.get('href') for anchor in soup.find_all('a')]
link_list

['#Download',
 'bs4/doc/',
 '#HallOfFame',
 'enterprise.html',
 'https://code.launchpad.net/beautifulsoup',
 'https://bazaar.launchpad.net/%7Eleonardr/beautifulsoup/bs4/view/head:/CHANGELOG',
 'https://groups.google.com/forum/?fromgroups#!forum/beautifulsoup',
 'zine/',
 'bs4/download/',
 'http://lxml.de/',
 'http://code.google.com/p/html5lib/',
 'bs4/doc/',
 'https://tidelift.com/subscription/pkg/pypi-beautifulsoup4?utm_source=pypi-beautifulsoup4&utm_medium=referral&utm_campaign=enterprise',
 'https://groups.google.com/forum/?fromgroups#!forum/beautifulsoup',
 'https://bugs.launchpad.net/beautifulsoup/',
 'https://tidelift.com/security',
 'https://tidelift.com/subscription/pkg/pypi-beautifulsoup4?utm_source=pypi-beautifulsoup4&utm_medium=referral&utm_campaign=website',
 'zine/',
 None,
 'bs4/download/',
 'http://www.crummy.com/software/BeautifulSoup/bs3/documentation.html',
 'download/3.x/BeautifulSoup-3.2.2.tar.gz',
 'https://tidelift.com/subscription/pkg/pypi-beautifulsoup?utm_sourc

Some links are relative paths or in-page anchors, so we filter for absolute links only (those starting with `http`)

In [6]:
external_links = []

for href in link_list:
    # Anchors without an href show up as None in link_list; skip them.
    if href is None:
        continue
    # Keep only the links whose first four characters are 'http'.
    if href[:4] == 'http':
        external_links.append(href)

external_links

['https://code.launchpad.net/beautifulsoup',
 'https://bazaar.launchpad.net/%7Eleonardr/beautifulsoup/bs4/view/head:/CHANGELOG',
 'https://groups.google.com/forum/?fromgroups#!forum/beautifulsoup',
 'http://lxml.de/',
 'http://code.google.com/p/html5lib/',
 'https://tidelift.com/subscription/pkg/pypi-beautifulsoup4?utm_source=pypi-beautifulsoup4&utm_medium=referral&utm_campaign=enterprise',
 'https://groups.google.com/forum/?fromgroups#!forum/beautifulsoup',
 'https://bugs.launchpad.net/beautifulsoup/',
 'https://tidelift.com/security',
 'https://tidelift.com/subscription/pkg/pypi-beautifulsoup4?utm_source=pypi-beautifulsoup4&utm_medium=referral&utm_campaign=website',
 'http://www.crummy.com/software/BeautifulSoup/bs3/documentation.html',
 'https://tidelift.com/subscription/pkg/pypi-beautifulsoup?utm_source=pypi-beautifulsoup&utm_medium=referral&utm_campaign=website',
 'http://www.nytimes.com/2007/10/25/arts/design/25vide.html',
 'https://github.com/reddit/reddit/blob/85f9cff3e2ab9bb8f

In [6]:
# Equivalent one-liner, using a list comprehension and str.startswith:
# [l for l in link_list if l is not None and l.startswith('http')]

### Parsing the Tree

In [7]:
# this shows how BeautifulSoup parses the elements in an html tree

s = """<!DOCTYPE html><html><head><title>This is a title</title></head><body><h3> Test </h3><p>Hello world!</p></body></html>"""
## get bs4 object
# The parser is named explicitly (stdlib 'html.parser') so the tree, and the
# contents[...] positions used below, do not depend on which optional parsers
# happen to be installed, and no "guessed parser" warning is raised.
tree = bs4.BeautifulSoup(s, 'html.parser')

## get html root node
root_node = tree.html

## get head from root using contents (first child of <html>)
head = root_node.contents[0]

## get body from root (second child of <html>)
body = root_node.contents[1]

## could directly access body
tree.body

<body><h3> Test </h3><p>Hello world!</p></body>

### Quiz
* Find the h3 tag by parsing the tree starting at body
* Create a list of all Hall of Fame entries listed on the Beautiful Soup webpage
* hint: it is the only unordered list in the page (tag ul)

In [8]:
# First <h3> tag below <body>; spelled with find() instead of attribute access.
tree.body.find('h3')

<h3> Test </h3>

In [13]:
# The Hall of Fame is the first <ul> on the page. Select its direct <li> children
# by tag name instead of slicing contents[1:], which only works while the first
# child of the list happens to be a whitespace text node.
entries = soup.find('ul').find_all('li', recursive=False)

# Each entry is the list of child nodes (text and <a> tags) of one <li>.
hall_of_fame = [li.contents for li in entries]

hall_of_fame

[[<a href="http://www.nytimes.com/2007/10/25/arts/design/25vide.html">"Movable
   Type"</a>,
  ', a work of digital art on display in the lobby of the New\n York Times building, uses Beautiful Soup to scrape news feeds.\n\n'],
 ['Reddit uses Beautiful Soup to ',
  <a href="https://github.com/reddit/reddit/blob/85f9cff3e2ab9bb8f19b96acd8da4ebacc079f04/r2/r2/lib/media.py">parse
  a page that's been linked to and find a representative image</a>,
  '.\n\n'],
 ['Alexander Harrowell uses Beautiful Soup to ',
  <a href="http://www.harrowell.org.uk/viktormap.html">track the business
   activities</a>,
  ' of an arms merchant.\n\n'],
 ['The developers of Python itself used Beautiful Soup to ',
  <a href="http://svn.python.org/view/tracker/importer/">migrate the Python
  bug tracker from Sourceforge to Roundup</a>,
  '.\n\n'],
 ['The ',
  <a href="http://www2.ljworld.com/">Lawrence Journal-World</a>,
  '\nuses Beautiful Soup to ',
  <a href="http://www.b-list.org/weblog/2010/nov/02/news-done-bro

In [17]:
# hall_of_fame is a list of lists of nodes; flatten each inner list into a single
# string by converting every node to str and concatenating the pieces.
tmp_data = []
for entry in hall_of_fame:
    tmp_data.append("".join(map(str, entry)))

tmp_data

['<a href="http://www.nytimes.com/2007/10/25/arts/design/25vide.html">"Movable\n Type"</a>, a work of digital art on display in the lobby of the New\n York Times building, uses Beautiful Soup to scrape news feeds.\n\n',
 'Reddit uses Beautiful Soup to <a href="https://github.com/reddit/reddit/blob/85f9cff3e2ab9bb8f19b96acd8da4ebacc079f04/r2/r2/lib/media.py">parse\na page that\'s been linked to and find a representative image</a>.\n\n',
 'Alexander Harrowell uses Beautiful Soup to <a href="http://www.harrowell.org.uk/viktormap.html">track the business\n activities</a> of an arms merchant.\n\n',
 'The developers of Python itself used Beautiful Soup to <a href="http://svn.python.org/view/tracker/importer/">migrate the Python\nbug tracker from Sourceforge to Roundup</a>.\n\n',
 'The <a href="http://www2.ljworld.com/">Lawrence Journal-World</a>\nuses Beautiful Soup to <a href="http://www.b-list.org/weblog/2010/nov/02/news-done-broke/">gather\nstatewide election results</a>.\n\n',
 'The <a h