In [2]:
from urllib.request import urlopen, Request

from bs4 import BeautifulSoup
import re

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans

from afinn import Afinn

import urllib.robotparser
import math


# Returns true if user crawler can fetch the URL, false otherwise
def can_crawl(url):
    try:
        rp = urllib.robotparser.RobotFileParser()
        if url.endswith('/'):
            rp.set_url(url + 'robots.txt')
            robot_txt = url + 'robots.txt'
            rp.read()
        else:
            rp.set_url(url + '/robots.txt')
            robot_txt = url + '/robots.txt'
            rp.read()

        print("Permission to crawl " + str(url) + " : " + str(rp.can_fetch('*', robot_txt)))

        if rp.can_fetch('*', robot_txt):
            return True
        else:
            return False

    except Exception as e:
        print("Something went wrong reading the robots.txt file...")
        return False


# Takes in a URL and returns all hyperlink html tags "a"
def visit_url(url):
    try:
        req = Request(url)
        page = urlopen(req).read()
        soup = BeautifulSoup(page, 'html.parser')
        return soup.find_all('a')

    except Exception as e:
        print("Request failed")
        return []


# Extracts links from the provided URL for a total of n files
# Returns set of URLs visited
def extract_links(url, n):
    counter = 0
    visited_list = set()
    open_list = {url}

    while len(open_list) > 0 and counter < n:
        link = open_list.pop()
        print(link)
        new_links = visit_url(link)
        visited_list.add(link)

        counter = counter + 1

        for l in new_links:
            try:
                file_name = l['href']

                # Making sure only given host is scraped
                if file_name.startswith('http') and not file_name.startswith(url):
                    continue

                if file_name.startswith('mailto') or file_name.startswith('tel'):
                    continue

                url_new = file_name

                if not file_name.startswith(url):
                    url_new = url + file_name

                if url_new not in visited_list:
                    open_list.add(url_new)

            except Exception as e:
                pass

    return visited_list


# Takes in a set of URLs and returns their text content
def read_urls(urls):
    documents = []
    for url in urls:
        try:
            req = Request(url)
            page = urlopen(req).read()
            soup = BeautifulSoup(page, 'html.parser')
            [s.extract() for s in soup(['style', 'script', '[document]', 'head', 'title'])]

            # Remove excessive white spaces from text
            doc = re.sub(r'\s+', ' ', soup.get_text())

            # Remove URLs from text
            doc = re.sub(r'https?:\/\/(www\.)?[-a-zA-Z0-9@:%._\+~#=]{1,256}\.[a-zA-Z0-9()]{1,6}\b([-a-zA-Z0-9()@:%_\+.~#?&//=]*)', '', doc)
            documents.append(doc)

        except Exception as e:
            print("Something went wrong reading the URLs - " + str(e))
            pass

    # print(documents)
    print("Total amount of documents processed: " + str(len(documents)))

    return documents


# Takes list of documents to perform clustering and sentiment analysis operations on.
# k number of clusters
# 50 iterations
# Sentiment analysis for each cluster is calculated by taking the 15 most popular words in each cluster
# and averaging the sentiment score
def perform_k_means_clustering_and_sentiment_analysis(docs, k):
    afinn = Afinn()

    vectorizer = TfidfVectorizer(stop_words='english')
    X = vectorizer.fit_transform(docs)
    model = KMeans(n_clusters=k, init='k-means++', max_iter=50, n_init=1)
    model.fit(X)

    print("Top terms per cluster:")
    ordered_centroids = model.cluster_centers_.argsort()[:, ::-1]
    try:
        terms = vectorizer.get_feature_names_out()  # Changed from get_feature_names to get_feature_names_out
    except Exception as e:
        print("Error getting feature names: ", e)
        return  # Exit the function if terms cannot be retrieved

    for i in range(k):
        sentiment_score = 0
        print("Cluster %d:" % i)

        for index in ordered_centroids[i, :50]:  # Print and score the top 50 terms, or as many as are available
            if index < len(terms):  # Check if the index is within the range of the 'terms' list
                term = terms[index]
                term_score = afinn.score(term)
                print('%s - %f' % (term, term_score))
                sentiment_score += term_score
            else:
                break  # Break the loop if there are no more terms to process

        print("Cluster %d sentiment score: %f" % (i, sentiment_score))
    

if __name__ == '__main__':
    # Please ensure link is of format 'http(s)://[host]'
    url = "https://concordia.ca"

    if can_crawl(url):
        docs = read_urls(extract_links(url, 30))
        
        # First run with k=3
        print("Running clustering with k=3")
        perform_k_means_clustering_and_sentiment_analysis(docs, 3)

        # Second run with k=6
        print("Running clustering with k=6")
        perform_k_means_clustering_and_sentiment_analysis(docs, 6)

    else:
        print("Cannot crawl " + str(url) + ". Terminating ... ")
        exit()

Permission to crawl https://concordia.ca : True
https://concordia.ca
https://concordia.ca/offices.html
https://concordia.ca/offices/facilities.html
https://concordia.ca/artsci/applied-human-sciences.html
https://concordia.ca/about/history.html
https://concordia.ca/web/accessibility.html
https://concordia.ca/maps/buildings/pc.html
https://concordia.ca/artsci/applied-human-sciences/about/jobs.html
https://concordia.ca/alumni-friends.html
https://concordia.ca/artsci/alumni.html
https://concordia.ca/research/composites.html
https://concordia.ca/cce.html
https://concordia.ca/academics/experiential-learning.html
https://concordia.ca/cuevents/offices/provost/otsenhakta/2023/11/30/indigenous-holiday-market.html
https://concordia.ca/ginacody/ciadi.html
https://concordia.ca/artsci/english.html
https://concordia.ca/academics/online-courses.html
https://concordia.ca/jmsb/about/departments/marketing.html
https://concordia.ca/research/for-researchers.html
https://concordia.ca/research/polanyi.html
h

In [2]:
from urllib.request import urlopen, Request

from bs4 import BeautifulSoup
import re

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans

from afinn import Afinn

import urllib.robotparser
import math


# Returns true if user crawler can fetch the URL, false otherwise
def can_crawl(url):
    try:
        rp = urllib.robotparser.RobotFileParser()
        if url.endswith('/'):
            rp.set_url(url + 'robots.txt')
            robot_txt = url + 'robots.txt'
            rp.read()
        else:
            rp.set_url(url + '/robots.txt')
            robot_txt = url + '/robots.txt'
            rp.read()

        print("Permission to crawl " + str(url) + " : " + str(rp.can_fetch('*', robot_txt)))

        if rp.can_fetch('*', robot_txt):
            return True
        else:
            return False

    except Exception as e:
        print("Something went wrong reading the robots.txt file...")
        return False


# Takes in a URL and returns all hyperlink html tags "a"
def visit_url(url):
    try:
        req = Request(url)
        page = urlopen(req).read()
        soup = BeautifulSoup(page, 'html.parser')
        return soup.find_all('a')

    except Exception as e:
        print("Request failed")
        return []


# Extracts links from the provided URL for a total of n files
# Returns set of URLs visited
def extract_links(url, n):
    counter = 0
    visited_list = set()
    open_list = {url}

    while len(open_list) > 0 and counter < n:
        link = open_list.pop()
        print(link)
        new_links = visit_url(link)
        visited_list.add(link)

        counter = counter + 1

        for l in new_links:
            try:
                file_name = l['href']

                # Making sure only given host is scraped
                if file_name.startswith('http') and not file_name.startswith(url):
                    continue

                if file_name.startswith('mailto') or file_name.startswith('tel'):
                    continue

                url_new = file_name

                if not file_name.startswith(url):
                    url_new = url + file_name

                if url_new not in visited_list:
                    open_list.add(url_new)

            except Exception as e:
                pass

    return visited_list


# Takes in a set of URLs and returns their text content
def read_urls(urls):
    documents = []
    for url in urls:
        try:
            req = Request(url)
            page = urlopen(req).read()
            soup = BeautifulSoup(page, 'html.parser')
            [s.extract() for s in soup(['style', 'script', '[document]', 'head', 'title'])]

            # Remove excessive white spaces from text
            doc = re.sub(r'\s+', ' ', soup.get_text())

            # Remove URLs from text
            doc = re.sub(r'https?:\/\/(www\.)?[-a-zA-Z0-9@:%._\+~#=]{1,256}\.[a-zA-Z0-9()]{1,6}\b([-a-zA-Z0-9()@:%_\+.~#?&//=]*)', '', doc)
            documents.append(doc)

        except Exception as e:
            print("Something went wrong reading the URLs - " + str(e))
            pass

    # print(documents)
    print("Total amount of documents processed: " + str(len(documents)))

    return documents

def perform_k_means_clustering_and_sentiment_analysis(docs, k, file_name):
    afinn = Afinn()
    vectorizer = TfidfVectorizer(stop_words='english')
    X = vectorizer.fit_transform(docs)
    model = KMeans(n_clusters=k, init='k-means++', max_iter=50, n_init=1)
    model.fit(X)

    print("Top terms per cluster:")
    ordered_centroids = model.cluster_centers_.argsort()[:, ::-1]
    terms = vectorizer.get_feature_names_out()

    with open(file_name, 'w') as file:
        for i in range(k):
            file.write(f"Cluster {i}:\n")
            for index in ordered_centroids[i, :20]:  # Capture top 20 terms
                term = terms[index]
                file.write(f"{term}\n")
            file.write("\n")
if __name__ == '__main__':
    url = "https://concordia.ca"

    if can_crawl(url):
        docs = read_urls(extract_links(url, 30))
        
        # Clustering with k=3 and writing to a file
        perform_k_means_clustering_and_sentiment_analysis(docs, 3, "cluster_3_terms.txt")

        # Clustering with k=6 and writing to a file
        perform_k_means_clustering_and_sentiment_analysis(docs, 6, "cluster_6_terms.txt")

    else:
        print("Cannot crawl " + str(url) + ". Terminating ... ")
        exit()


Permission to crawl https://concordia.ca : True
https://concordia.ca
https://concordia.ca/about/community/office/projects/streets-cafe.html
https://concordia.ca/contact.html
https://concordia.ca/offices/ci.html
https://concordia.ca/news/media-relations.html
https://concordia.ca/admissions/undergraduate.html
https://concordia.ca/maps/sgw-campus.html
https://concordia.ca/admissions/undergraduate/quebec.html
https://concordia.ca/about/administration-governance/president.html#tuition
https://concordia.ca/web/terms.html
https://concordia.ca/news/media-relations/team/patrick-lejtenyi.html#releases
https://concordia.ca/fr/admission/etudes-au-1er-cycle.html
https://concordia.ca/content/concordia/en/students/financial
https://concordia.ca/research/chairs.html
https://concordia.ca/students/financial/tuition-fees.html
https://concordia.ca/coronavirus.html
https://concordia.ca/news/stories/2023/12/05/workplace-culture-is-preventing-men-from-taking-paternity-leave-writes-claudine-mangen.html
https: