In [1]:
import requests
from bs4 import BeautifulSoup

In [2]:
# Download the Kompas football index page.
# The timeout keeps a stalled connection from hanging the cell forever, and
# raise_for_status() stops here on an HTTP error instead of letting an error
# page be parsed as if it were the real index.
r = requests.get('https://bola.kompas.com/', timeout=30)
r.raise_for_status()

In [3]:
# Parse the raw bytes of the downloaded page with Python's built-in HTML parser
soup = BeautifulSoup(markup=r.content, features='html.parser')

In [4]:
# Retrieve all popular news links (Fig. 1)
link = []
most_popular = soup.find('div', {'class': 'most__wrap'})
if most_popular is None:
    # find() returns None when nothing matches; fail with a message that says why
    raise RuntimeError("No <div class='most__wrap'> found on the index page; the page layout may have changed")

# href=True skips anchors that carry no href attribute at all
for anchor in most_popular.find_all('a', href=True):
    href = anchor['href']
    # Ask for the single-page version of the article. The URL is built as a new
    # string (the parsed tree is left untouched) so re-running this cell does
    # not append the parameter a second time.
    separator = '&' if '?' in href else '?'
    link.append(href + separator + 'page=all')

In [5]:
# For each link, we retrieve paragraphs from it, combine each paragraph as one string, and save it to documents (Fig. 2)
documents = []
for url in link:
    # Make a request to the link. Distinct names are used so the index page's
    # `r` and `soup` from the earlier cells are not overwritten.
    article_response = requests.get(url, timeout=30)
    article_response.raise_for_status()

    # Initialize BeautifulSoup object to parse the content
    article_soup = BeautifulSoup(article_response.content, 'html.parser')

    # Locate the article body; find() returns None when the page has no such container
    body = article_soup.find('div', {'class': 'read__content'})
    if body is None:
        print('Skipping (no read__content block found):', url)
        continue

    # Join the text of every paragraph into one string and store it
    documents.append(' '.join(paragraph.text for paragraph in body.find_all('p')))

In [7]:
import re
import string
# Build a normalised copy of every scraped document: non-ASCII characters,
# @mentions, punctuation and digits are stripped and the text is lowercased.
punctuation_pattern = r'[%s]' % re.escape(string.punctuation)
documents_clean = []
for raw_text in documents:
    # Replace each run of non-ASCII characters with a single space
    text = re.sub(r'[^\x00-\x7F]+', ' ', raw_text)
    # Drop @mentions
    text = re.sub(r'@\w+', '', text)
    # Lowercase, then turn every punctuation character into a space
    text = re.sub(punctuation_pattern, ' ', text.lower())
    # Delete all digits
    text = re.sub(r'[0-9]', '', text)
    # Collapse any run of two or more whitespace characters into one space
    documents_clean.append(re.sub(r'\s{2,}', ' ', text))

In [12]:
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd

# NOTE(review): the raw `documents` are vectorised here; `documents_clean`,
# built in the cleaning cell, is not used -- confirm which one is intended.
docs = documents
# Instantiate a TfidfVectorizer object
vectorizer = TfidfVectorizer()
# Learn the vocabulary from docs and encode them as a sparse matrix with one row per document
tfidf_matrix = vectorizer.fit_transform(docs)
# Transpose to one row per term / one column per document, as a dense array
X = tfidf_matrix.T.toarray()
# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out()
# is its replacement (available from 1.0). Fall back to the old name only on
# installations that predate it.
if hasattr(vectorizer, 'get_feature_names_out'):
    vocabulary = vectorizer.get_feature_names_out()
else:
    vocabulary = vectorizer.get_feature_names()
# Create a DataFrame and set the vocabulary as the index
df = pd.DataFrame(X, index=vocabulary)

In [14]:
import numpy as np
def get_similar_articles(q, df):
  """Print the documents most similar to a query, best match first.

  Similarity is the cosine between the query vector and each document
  column of ``df``: dot(d, q) / (||d|| * ||q||).

  Parameters
  ----------
  q : str
      Query text. It is vectorised with the module-level ``vectorizer``.
  df : pandas.DataFrame
      Term-document matrix with one row per vocabulary term and one column
      per document. Column labels are also used to index the module-level
      ``docs`` when printing the matching text.

  Returns
  -------
  None
      Results are printed; documents whose similarity is 0 are omitted.
  """
  print("query:", q)
  print("Berikut artikel dengan nilai cosine similarity tertinggi: ")
  # Convert the query into a vector with one entry per vocabulary term
  q_vec = vectorizer.transform([q]).toarray().reshape(df.shape[0],)
  q_norm = np.linalg.norm(q_vec)
  sim = {}
  # Score every document column, however many the matrix holds
  for col in df.columns:
    doc_vec = df.loc[:, col].values
    # Both norms belong in the denominator; without the grouping the result
    # would be (dot / ||d||) * ||q||, which is not the cosine.
    denom = np.linalg.norm(doc_vec) * q_norm
    # A zero norm (empty document, or a query with no known term) means there
    # is nothing in common: report 0 instead of dividing by zero.
    sim[col] = np.dot(doc_vec, q_vec) / denom if denom else 0.0

  # Sort the documents from most to least similar
  sim_sorted = sorted(sim.items(), key=lambda x: x[1], reverse=True)
  # Print the articles and their similarity values
  for k, v in sim_sorted:
    if v != 0.0:
      print("Nilai Similaritas:", v)
      print(docs[k])
      print()
# Search term to look up among the scraped articles
q1 = 'barcelona'
# Rank the articles against the query and print the matches
get_similar_articles(q=q1, df=df)

query: barcelona
Berikut artikel dengan nilai cosine similarity tertinggi: 
Nilai Similaritas: 0.09524997159901508
 KOMPAS.com - Duel  Juventus vs  Dynamo Kiev tersaji dalam rangkaian matchday kelima fase grup  Liga Champions 2020-2021. Laga  Juventus vs Dynamo Kiev berlangsung di Stadion Allianz, Kamis (3/12/2020) pukul 03.00 WIB. Link  live streaming Juventus vs Dynamo Kiev tersaji di akhir artikel.  Langkah Juventus terbilang mudah saat menjamu Dynamo Kiev. Baca juga: Juventus Vs Dynamo Kiev, Si Nyonya Tua Andalkan Alvaro Morata   Pasalnya, tim asal kota Turin itu sudah memastikan diri lolos ke babak 16 besar mendampingi Barcelona.    Meski demikian, status juara Grup G masih menjadi rebutan antara Juventus dan Barcelona. Juventus yang saat ini menghuni urutan kedua dengan koleksi sembila poin, masih berpeluang untuk finis sebagai juara grup. Bianconeri terpaut tiga angka dari Barcelona yang bertengger di puncak klasemen. Memetik tiga poin saat menjamu Dynamo Kiev sangat penting bag