In [1]:
import nltk
import requests
from bs4 import BeautifulSoup
from nltk.tokenize import word_tokenize
import pandas as pd

# --- Helper functions ---
def check_html(link):
    return ".html" in link

def combine_texts_per_page(i, page_texts):
    page_tokens = []
    for text in page_texts:
        page_tokens.extend(word_tokenize(text))
    return list(map(lambda x: [x, i], list(set(page_tokens)))), ' '.join(page_tokens)


# --- Data containers ---
text_page_pairs = []
page_text_dic = {}

# --- Project start ---
start_url = 'https://www.concordia.ca/ginacody.html'
webpage_beginning = 'https://www.concordia.ca'

# get soup from main page
soup = BeautifulSoup(requests.get(start_url).text, "html.parser")
a_lists = soup.find_all('a')
print("Number of links found:", len(a_lists))

# scrape only .html links
for i, a_tag in enumerate(a_lists):
    href = a_tag.get('href')
    if not href:
        continue

    if check_html(href):
        if 'https' not in href:
            html_link = webpage_beginning + href
        else:
            html_link = href

        try:
            page_soup = BeautifulSoup(requests.get(html_link).text, "html.parser")
            results = combine_texts_per_page(i, [p.get_text() for p in page_soup.find_all("p")])
            text_page_pairs.extend(results[0])
            page_text_dic[i] = results[1]
        except Exception as e:
            print("Error scraping", html_link, ":", e)

Number of links found: 176


In [2]:
#create incidence matrix
import pandas as pd

dic_term_page = {'term':list(map(lambda x:x[0],text_page_pairs)), 'page':list(map(lambda x:x[1],text_page_pairs))}

# Calling DataFrame constructor on dictionary
df = pd.DataFrame(dic_term_page)

#incidence matrix with term as index, webpage as column, and present as value in order to get vectors
df['present']=1

incidence_matrix = pd.pivot_table(df, values='present', index=['term'],columns=['page']).fillna(0)

print(incidence_matrix)

page        1    2    4    5    6    7    9    10   11   12   ...  163  164  \
term                                                          ...             
!           0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  1.0  0.0  ...  0.0  0.0   
$           0.0  0.0  0.0  1.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0  0.0   
%           0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0  0.0   
&           0.0  0.0  1.0  1.0  1.0  0.0  1.0  0.0  1.0  1.0  ...  0.0  1.0   
'           0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0  0.0   
...         ...  ...  ...  ...  ...  ...  ...  ...  ...  ...  ...  ...  ...   
étudiantes  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0  0.0   
étudiants   0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0  0.0   
–           0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0  0.0   
—           0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0  0.0   
’           0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  

In [3]:
#run kmeans clustering  for 3 and 6
from sklearn.cluster import KMeans
import numpy as np

#list of <no of clusters,labels after clustering> pairs
k_means_set=[[3,[]],[6,[]]]

X = pd.pivot_table(df, values='present', index=['page'],columns=['term']).fillna(0)

#k=3
kmeans = KMeans(n_clusters=3, random_state=0).fit(X)
print(kmeans.labels_)
k_means_set[0][1]=kmeans.labels_

#k=6
kmeans = KMeans(n_clusters=6, random_state=0).fit(X)
print(kmeans.labels_)
k_means_set[1][1]=kmeans.labels_


[2 2 2 2 2 2 2 0 2 2 2 2 2 2 2 2 1 2 2 2 2 2 2 2 2 2 2 2 1 1 1 2 2 2 2 2 2
 0 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 1 2 2 2 2 2 2 2 1 2 2 1 2 2 2 2 2 2 2 2
 2 2 0 2 2 2 2 2 2 2 2 1 2 2 2 2 1 2 2 2 2 2 2 2 2 2 2 1 2 2 2 2 2 2 2 2 2
 1 2 2 2 2 2 2 2 2 2 1 2 2 1 2 1 2 2 2 2 2 2 2 2 2 2 2 2 2]
[2 2 2 2 2 2 2 0 2 2 2 2 4 3 3 2 3 2 2 2 2 2 2 2 2 2 2 2 3 3 3 2 2 2 2 2 2
 0 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 3 2 2 2 2 2 2 2 3 2 2 1 2 2 2 2 2 2 2 2
 2 2 0 2 2 2 2 4 3 3 2 3 2 2 2 2 3 2 2 2 2 2 2 2 2 2 2 1 2 2 2 2 2 2 2 2 2
 3 2 2 2 2 2 2 2 2 2 3 2 2 3 2 3 2 2 2 2 2 2 2 5 2 2 2 2 2]


In [4]:
#run affinn analysis for each cluster
from afinn import Afinn
import math
afinn = Afinn()

#get index positions of webpages that belong to particular cluster/label to calculate afinn score
def get_index_positions(list_of_elems, element):
    ''' Returns the indexes of all occurrences of give element in
    the list- listOfElements '''
    index_pos_list = []
    index_pos = 0
    while True:
        try:
            # Search for item in list from indexPos to the end of list
            index_pos = list_of_elems.index(element, index_pos)
            # Add the index position in list
            index_pos_list.append(index_pos)
            index_pos += 1
        except ValueError as e:
            break
    return index_pos_list


#for each type of k means with different number of cluster print the afinn score for each cluster
for k_means in k_means_set:
    print("for "+str(k_means[0])+" clusters"+'\n')
    labels_set=list(set(k_means[1]))

    for label in labels_set:
        
        label_indices=get_index_positions(list(k_means[1]),label)
        
        #pages in cluster
        clustered_pages=list(incidence_matrix.iloc[:, label_indices].columns)
        
        affinn_score=0

        for page in clustered_pages:
            affinn_score+=afinn.score(page_text_dic[page])

        print("for cluster "+str(label)+" the affinn_score is "+str(affinn_score))

        #to measure top 20 terms based on informativeness we need to create indexer for documents in each cluster then rank
        text_page_pairs_forCluster=list(filter(lambda x: x[1] in clustered_pages,text_page_pairs))
        spimi_indexer={}

        for token_stream in text_page_pairs_forCluster:
            if token_stream[0] in spimi_indexer:
                spimi_indexer[token_stream[0]].append(token_stream[1])


            elif token_stream[0] not in spimi_indexer:
                spimi_indexer[token_stream[0]]=[]
                spimi_indexer[token_stream[0]].append(token_stream[1])
        
        informativeness_results=list(spimi_indexer.items())
        informativeness_results=list(map(lambda x:  x[0],sorted(informativeness_results, key=lambda x:math.log(len(clustered_pages)/len(x[1])), reverse=True)))
        print("for cluster "+str(label)+" the top 20 terms based on informativeness is ")
        print(informativeness_results[:20])
        print('\n')


for 3 clusters

for cluster 0 the affinn_score is 96.0
for cluster 0 the top 20 terms based on informativeness is 
['Services', 'correctly', 'notified', 'check', 'funds', 'Fellowship', 'consist', '5G', 'MASc', 'complete', '30', 'association', 'having', 'Excellence', '2W1', 'Construction', 'Catherine', 'position', 'Arts', 'reception.ginacody']


for cluster 1 the affinn_score is 86.0
for cluster 1 the top 20 terms based on informativeness is 
['highly', 'â\x80\x94', 'faculty.â\x80\x9d', 'but', 'outstanding', 'commitment', '17', 'application', 'matter', 'Dean', 'wide', 'that', 'enjoys', 'world', 'reputation', 'growing', '``', '50', 'innovative', 'gender']


for cluster 2 the affinn_score is 2456.0
for cluster 2 the top 20 terms based on informativeness is 
['mistake', "department'sâ\x80¯Telephone", '300â\x80\x8b', '922', '321', '52â\x80\x8b', 'assessments', 'bike', 'â\x80\x8b', '171,131', '251,213', 'shuttle', 'bases', 'decarbonize', 'N.D.G', 'environmentally', 'hundred', 'placesâ\x80\x8