# In [1]:
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
import pandas as pd
import unicodedata
import re
import matplotlib.pyplot as plt
import csv
import json

### --- Load the names of the teams we will search each article for
with open('rugby.json') as handle:
    data = json.load(handle)

# one name per entry under "teams", kept in file order
team_names = [entry['name'] for entry in data['teams']]

### --- Regex Formats
# Alternation of every team name wrapped in a single capturing group, i.e.
# "(name1|name2|...)".  Each name is escaped so that characters with a
# special regex meaning (".", "(", "+", ...) are matched literally instead
# of being interpreted as pattern syntax.
teams_regex_format = '(' + '|'.join(re.escape(name) for name in team_names) + ')'

# A score: one whitespace character, then 1-3 digits, a hyphen and 1-3 more
# digits.  Written as a raw string because "\s" is not a valid escape
# sequence in an ordinary string literal.
scores_regex_format = r"\s[0-9]?[0-9]?[0-9]-[0-9][0-9]?[0-9]?"

# One run of digits; used to pull the two numbers out of a matched score.
integer_regex_format = '[0-9]+'


### --- Initial page to crawl
# page_limit is the page budget that the crawl loop compares against
# pages_visited.
page_limit = 300
seed_url = 'http://comp20008-jh.eng.unimelb.edu.au:9889/main/index.html'

# Fetch the seed page and parse it into bs.  The timeout stops an
# unresponsive server from hanging the script forever, and
# raise_for_status() fails fast on an HTTP error instead of parsing an
# error page as if it were the index.
page = requests.get(seed_url, timeout=30)
page.raise_for_status()
soup = BeautifulSoup(page.text, 'html.parser')


### --- Remove index page links from the set of links to follow
# every anchor on the seed page
links = soup.find_all('a')
# anchors whose href starts with "index.html"; these point back at the index
seed_links = soup.find_all('a', href=re.compile(r'^index\.html'))


### --- Pages to visit that are not index pages
# Anchors without an href have nothing to follow, so they are dropped here
# rather than raising KeyError when the href is read below.
to_visit_relative = [
    anchor for anchor in links
    if anchor.has_attr('href') and anchor not in seed_links
]

# Turn the hrefs into absolute urls, keeping first-seen order.  The seed
# page itself and repeated urls are skipped so no page is queued twice.
to_visit = []
for link in to_visit_relative:
    absolute_url = urljoin(seed_url, link['href'])
    if absolute_url != seed_url and absolute_url not in to_visit:
        to_visit.append(absolute_url)

    
### --- Crawl bookkeeping
# The seed page has already been fetched, so it starts out marked as
# visited and counts as the first page.
visited = {seed_url: True}
pages_visited = 1


### --- Results collected while crawling, each one keyed by page url
headlines, teams, highest_scores, game_differences = {}, {}, {}, {}

    
### Crawl every queued page: scrape it, then queue the links it contains

# Compile each pattern once instead of once per page.
teams_pattern = re.compile(teams_regex_format)
scores_pattern = re.compile(scores_regex_format)
integer_pattern = re.compile(integer_regex_format)

# Every url that is already visited or queued.  Set membership keeps the
# "have we seen this link before?" test cheap as the queue grows.
seen_urls = set(visited) | set(to_visit)

while to_visit:

    # avoid breaking the site: stop once the page budget is spent
    if pages_visited >= page_limit:
        break

    ### Get a link from the front of the queue and mark it as visited
    link = to_visit.pop(0)
    if link in visited:
        # the initial queue may contain the same url more than once
        continue
    visited[link] = True

    ### Fetch the page
    # Every request counts towards the budget, whether or not it succeeds.
    pages_visited = pages_visited + 1
    try:
        page = requests.get(link, timeout=30)
        page.raise_for_status()
    except requests.RequestException as error:
        # One dead, slow or non-HTTP link should not abort the whole crawl.
        print('skipping {0}: {1}'.format(link, error))
        continue

    # parse the page content into BeautifulSoup
    soup = BeautifulSoup(page.text, 'html.parser')

    ### Find more links to crawl
    # href=True skips anchors that have no href attribute to follow.
    for new_link in soup.find_all('a', href=True):
        new_url = urljoin(link, new_link['href'])
        if new_url not in seen_urls:
            seen_urls.add(new_url)
            to_visit.append(new_url)

    ### Scrape the page
    # TASK 1 - Find the headline of the article.  A page with no element
    # whose id is "headline" is treated as not being an article: its links
    # were queued above, but nothing is recorded for it.
    headline = soup.find(id='headline')
    if headline is None:
        continue
    headlines[link] = headline.text

    # the text of the body is what TASK 2 searches
    body_tag = soup.find('body')
    body = body_tag.text.strip() if body_tag is not None else ''

    # TASK 2 - Find the first team mentioned in the article
    team_match = teams_pattern.search(body)
    if team_match is not None:
        teams[link] = team_match.group(0)

    # TASK 2 - Find the largest match score in the article.  For the score
    # pair that contains the largest single number, remember that number
    # and the absolute difference between the pair's two numbers.
    max_score = None
    game_difference = None
    for score_text in scores_pattern.findall(body):
        # the score pattern always contains exactly two runs of digits
        first, second = (int(n) for n in integer_pattern.findall(score_text))
        if max_score is None or max(first, second) > max_score:
            max_score = max(first, second)
            game_difference = abs(first - second)

    # game_differences is the input for the TASK 3 averages
    if max_score is not None:
        highest_scores[link] = max_score
        game_differences[link] = game_difference

    
# Finished crawling
#print('\nvisited {0:5d} pages; {1:5d} pages in to_visit'.format(len(visited), len(to_visit)))


##### TASK 1 - write headlines into task1.csv file
# Turn headlines into a pandas Series ordered by url
headlines = pd.Series(headlines).sort_index()

# create file task1.csv
# newline='' is what the csv module requires of files handed to
# csv.writer (otherwise extra blank rows appear on platforms that translate
# "\n").  The encoding is pinned to UTF-8 so headlines with non-ASCII
# characters are written the same way on every platform.
with open('task1.csv', 'w', newline='', encoding='utf-8') as task1:
    writer = csv.writer(task1)
    writer.writerow(['url', 'headline'])
    # one (url, headline) row per entry of the Series
    writer.writerows(headlines.items())
    
    
##### TASK 2 - table of url, headline, first team and largest score
# Convert both per-page dicts into Series ordered by url
teams = pd.Series(teams).sort_index()
highest_scores = pd.Series(highest_scores).sort_index()

# Line the three Series up on url, keep only the rows that have both a
# team and a score, then write the result to task2.csv
task2 = pd.DataFrame(
    {'headline': headlines, 'team': teams, 'score': highest_scores}
)
task2.index.name = 'url'
has_team_and_score = task2['team'].notnull() & task2['score'].notnull()
task2 = task2[has_team_and_score]
task2.to_csv('task2.csv')


##### TASK 3 - find the average game difference for each team
game_differences = pd.Series(game_differences).sort_index()

# Pair each page's first team with its game difference; only pages that
# have both values take part in the averages.
df = pd.DataFrame({'team': teams, 'game_difference': game_differences})
df = df[df['team'].notnull() & df['game_difference'].notnull()]
grouped = df.groupby(df['team'])

# Teams that own at least one row.  get_group() raises KeyError for any
# other team, so teams without data are skipped instead of crashing.
teams_with_data = set(df['team'])

team_avg = {}
# newline='' is what the csv module requires of files handed to csv.writer.
with open('task3.csv', 'w', newline='', encoding='utf-8') as task3:
    writer = csv.writer(task3)
    # header spelling fixed (was "avg_game_differece")
    writer.writerow(['team', 'avg_game_difference'])
    for team in team_names:
        if team not in teams_with_data:
            continue
        average = grouped.get_group(team)['game_difference'].mean()
        writer.writerow([team, str(average)])
        # kept as text, exactly as it was written to the file
        team_avg[team] = str(average)

#### TASK 4 - find the 5 most frequently mentioned teams
# value_counts() orders teams from most to least frequent, so the first
# five entries are the top five (fewer when fewer teams were found).
# Slicing cannot fail, so no exception handler is wrapped around it.
task4 = teams.value_counts()[:5]

# bar chart: one bar per team, team names written vertically on the x axis
plt.xticks(rotation='vertical')
plt.bar(task4.index, task4.values)
plt.xlabel('Team name')
plt.ylabel('Number of appearances')
plt.title("Top 5 most frequently mentioned teams")
plt.show()


##### TASK 5 - how often each team is mentioned vs its avg_game_difference
# Both Series are indexed by team name, so the DataFrame lines them up by
# team; a team missing from one of them gets NaN in that column.
team_avg = pd.Series(team_avg)
team_freq = teams.value_counts()

task5 = pd.DataFrame(dict(team_frequency=team_freq, team_average=team_avg))
print(task5)