In [1]:
import requests
from bs4 import BeautifulSoup

In [2]:
# Request the Kompas football front page.
# The timeout (seconds) keeps the cell from hanging forever if the server
# stops responding; raise_for_status() surfaces HTTP errors here instead of
# as a confusing parsing failure in a later cell.
r = requests.get('https://bola.kompas.com/', timeout=30)
r.raise_for_status()

In [3]:
# Parse the downloaded front page with Python's built-in HTML parser
soup = BeautifulSoup(markup=r.content, features='html.parser')

In [4]:
# Retrieve all popular news links (Fig. 1)
# Each URL gets a `page=all` query parameter appended. The URLs are built as
# new strings rather than written back into the parsed tree, so re-running
# this cell does not append the parameter a second time.
most_popular = soup.find('div', {'class': 'most__wrap'})
if most_popular is None:
    raise RuntimeError("Could not find the 'most__wrap' block on the page; "
                       "the site layout may have changed.")

link = []
# href=True skips anchors that carry no href attribute at all
for anchor in most_popular.find_all('a', href=True):
    href = anchor['href']
    # Use '&' when the URL already has a query string
    separator = '&' if '?' in href else '?'
    link.append(href + separator + 'page=all')

In [5]:
# For each link, we retrieve paragraphs from it, combine each paragraph as one string, and save it to documents (Fig. 2)
# Per-article names (article_response / article_soup) are used on purpose so
# the front-page `r` and `soup` from the earlier cells are not overwritten.
documents = []
for url in link:
    # Make a request to the link; time out instead of hanging, and fail
    # loudly on an HTTP error rather than parsing an error page
    article_response = requests.get(url, timeout=30)
    article_response.raise_for_status()

    # Initialize BeautifulSoup object to parse the content
    article_soup = BeautifulSoup(article_response.content, 'html.parser')

    # Retrieve all paragraphs of the article body. A page without the
    # 'read__content' container contributes an empty document so that
    # `documents` still has exactly one entry per link.
    content = article_soup.find('div', {'class': 'read__content'})
    paragraphs = content.find_all('p') if content is not None else []

    # Add the combined paragraphs to documents
    documents.append(' '.join(p.text for p in paragraphs))

In [6]:
import re
import string
def _clean_document(text):
    """Return `text` lowercased, with non-ASCII runs, @mentions, punctuation
    and digits removed and runs of whitespace collapsed to a single space."""
    # Replace every run of non-ASCII characters with one space
    text = re.sub(r'[^\x00-\x7F]+', ' ', text)
    # Drop @mentions (done before punctuation removal, while '@' still exists),
    # then lowercase the document
    text = re.sub(r'@\w+', '', text).lower()
    # Replace each punctuation character with a space
    text = re.sub(r'[%s]' % re.escape(string.punctuation), ' ', text)
    # Remove digits
    text = re.sub(r'[0-9]', '', text)
    # Collapse runs of two or more whitespace characters into one space
    return re.sub(r'\s{2,}', ' ', text)


# One cleaned string per scraped article, in the same order as `documents`
documents_clean = [_clean_document(raw_document) for raw_document in documents]

In [7]:
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd

# NOTE(review): the index is built from the raw `documents`, not from
# `documents_clean`; the search function also prints from `docs`.
# Confirm that this is intended.
docs = documents
# Instantiate a TfidfVectorizer object
vectorizer = TfidfVectorizer()
# Fit the vocabulary, vectorize the documents, and transpose to a dense
# array with one row per term and one column per document
X = vectorizer.fit_transform(docs).T.toarray()
# get_feature_names() was removed in scikit-learn 1.2; prefer the replacement
# and fall back only on versions that predate it
if hasattr(vectorizer, 'get_feature_names_out'):
    vocabulary = vectorizer.get_feature_names_out()
else:
    vocabulary = vectorizer.get_feature_names()
# Create a DataFrame and set the vocabulary as the index
df = pd.DataFrame(X, index=vocabulary)

In [8]:
import numpy as np
def get_similar_articles(q, df):
    """Print the documents that match a query, best match first.

    The query is vectorized with the notebook-level `vectorizer`, scored
    against every column of `df` by cosine similarity, and each document with
    a non-zero score is printed (score first, then the text taken from the
    notebook-level `docs`).

    Parameters
    ----------
    q : str
        The search query.
    df : pandas.DataFrame
        Term-by-document matrix: one row per vocabulary term, one column per
        document, with columns in the same order as `docs`.

    Returns
    -------
    None
        Results are printed, not returned.
    """
    print("query:", q)
    print("Berikut artikel dengan nilai cosine similarity tertinggi: ")
    # Convert the query into a 1-D vector with one entry per vocabulary term
    q_vec = vectorizer.transform([q]).toarray().reshape(df.shape[0],)
    q_norm = np.linalg.norm(q_vec)

    sim = {}
    # Score every document column, however many there are
    for pos in range(df.shape[1]):
        doc_vec = df.iloc[:, pos].values
        # Cosine similarity divides by the PRODUCT of both norms; the
        # parentheses matter because `a / b * c` evaluates as `(a / b) * c`.
        denom = np.linalg.norm(doc_vec) * q_norm
        # An all-zero query or document vector has no direction: score it 0
        # instead of dividing by zero.
        sim[pos] = np.dot(doc_vec, q_vec) / denom if denom else 0.0

    # Sort the values, highest similarity first
    sim_sorted = sorted(sim.items(), key=lambda x: x[1], reverse=True)
    # Print the articles and their similarity values
    for k, v in sim_sorted:
        if v != 0.0:
            print("Nilai Similaritas:", v)
            print(docs[k])
            print()
# The search query to look up in the scraped articles
q1 = 'barcelona'
# Score the query against the TF-IDF matrix `df` and print the matching articles
get_similar_articles(q1, df)

query: barcelona
Berikut artikel dengan nilai cosine similarity tertinggi: 
Nilai Similaritas: 0.2572893059052084
KOMPAS.com - Kabar lanjutan dari grande partita  AC Milan vs  Juventus menjadi atensi dari pembaca  Bola.Kompas.com pada Kamis (7/1/2021). Bukan megabintang Cristiano Ronaldo, melainkan Sang Permata, Paulo Dybala, yang mencuri perhatian dalam kemenangan Juventus dengan skor 3-1 itu. Demikian diutarakan oleh legenda Juventus,  Alessandro Del Piero. Baca juga: Tampil Buruk Lawan AC Milan, Cristiano Ronaldo Panen Nilai 5 Lanjutan dari laga besar  AC Milan vs Juventus itu pun berlanjut terkait kinerja sang pengadil alias wasit di atas lapangan. Beberapa media Italia bahkan memberi penilaian jelek terhadap kinerja wasit asal Florence dalam partai tersebut. Adapun di luar laga dua raksasa Italia itu, di ranah Liga Spanyol, aksi Messi bersama  Barcelona kembali menjadi perhatian. Dalam kemenangan Barcelona atas Athletic Bilbao, Messi kembali membuat rekor. Berikut ini beberapa art

In [9]:
from nltk.parse.generate import generate, demo_grammar
from nltk import CFG
# Build a context-free grammar object from the `demo_grammar` string imported
# from nltk.parse.generate, then print it to list its productions
grammar = CFG.fromstring(demo_grammar)
print(grammar)

Grammar with 13 productions (start state = S)
    S -> NP VP
    NP -> Det N
    PP -> P NP
    VP -> 'slept'
    VP -> 'saw' NP
    VP -> 'walked' PP
    Det -> 'the'
    Det -> 'a'
    N -> 'man'
    N -> 'park'
    N -> 'dog'
    P -> 'in'
    P -> 'with'


In [10]:
# Print the first ten sentences produced from the grammar, one per line
# (each sentence is a sequence of word tokens, printed space-separated)
for sentence in generate(grammar, n=10):
    print(*sentence)

the man slept
the man saw the man
the man saw the park
the man saw the dog
the man saw a man
the man saw a park
the man saw a dog
the man walked in the man
the man walked in the park
the man walked in the dog


In [11]:
# Print every sentence the grammar produces when generation is limited to
# depth=4, one per line with tokens space-separated
for sentence in generate(grammar, depth=4):
    print(*sentence)

the man slept
the park slept
the dog slept
a man slept
a park slept
a dog slept
