In [13]:
import re
import string
import requests
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
from sklearn.feature_extraction.text import TfidfVectorizer

import warnings
warnings.filterwarnings("ignore")

In [16]:
def _clean_document(text):
    """Normalise one article's raw text before TF-IDF vectorisation.

    Steps, in order: replace runs of non-ASCII characters with a space,
    strip ``@word`` tokens, lowercase, replace ASCII punctuation with a
    space, delete digits, and collapse runs of 2+ whitespace characters
    into a single space.
    """
    text = re.sub(r'[^\x00-\x7F]+', ' ', text)
    text = re.sub(r'@\w+', '', text)
    text = text.lower()
    text = re.sub(r'[%s]' % re.escape(string.punctuation), ' ', text)
    text = re.sub(r'[0-9]', '', text)
    return re.sub(r'\s{2,}', ' ', text)


def retrieve_docs_and_clean(url='https://bola.kompas.com/', timeout=10):
    """Scrape the articles linked from a landing page and return cleaned text.

    Every ``<a href=...>`` inside the ``div.most__wrap`` container of ``url``
    is followed (with ``page=all`` added to the query string), the ``<p>``
    elements inside ``div.read__content`` are joined with spaces, and the
    result is normalised with ``_clean_document``.

    Parameters
    ----------
    url : str, optional
        Landing page to read article links from.
    timeout : float, optional
        Per-request timeout in seconds, so a stalled connection cannot
        hang the notebook indefinitely.

    Returns
    -------
    list of str
        One cleaned document per article that had a ``div.read__content``
        body. Empty when the landing page has no ``div.most__wrap``
        container or it holds no links.

    Raises
    ------
    requests.HTTPError
        If the landing page or any article responds with an error status.
    """
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()
    landing = BeautifulSoup(response.content, 'html.parser')

    links = []
    link_container = landing.find('div', {'class': 'most__wrap'})
    if link_container is not None:
        # href=True skips anchors without a target instead of raising KeyError.
        for anchor in link_container.find_all('a', href=True):
            href = anchor['href']
            # Use '&' when the link already carries a query string.
            separator = '&' if '?' in href else '?'
            links.append(href + separator + 'page=all')

    documents = []
    for article_url in links:
        response = requests.get(article_url, timeout=timeout)
        response.raise_for_status()
        article = BeautifulSoup(response.content, 'html.parser')

        body = article.find('div', {'class': 'read__content'})
        if body is None:
            # Page without the expected article body: nothing to index.
            continue
        paragraphs = [p.text for p in body.find_all('p')]
        documents.append(' '.join(paragraphs))

    # Built unconditionally so an empty link list yields [] rather than
    # an UnboundLocalError at the return statement.
    return [_clean_document(document) for document in documents]
        

        

In [17]:
docs = retrieve_docs_and_clean()
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(docs)
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer get_feature_names_out() and fall back only on older installations.
feature_names = (
    vectorizer.get_feature_names_out()
    if hasattr(vectorizer, "get_feature_names_out")
    else vectorizer.get_feature_names()
)
# Create a term-document dataframe: one row per term, one column per document
df = pd.DataFrame(X.T.toarray(), index=feature_names)
print(df.head())
print(df.shape)

                 0         1         2         3         4         5  \
acapkali  0.000000  0.000000  0.000000  0.000000  0.000000  0.037931   
ada       0.048771  0.000000  0.000000  0.000000  0.000000  0.000000   
adalah    0.027966  0.000000  0.000000  0.017061  0.033046  0.000000   
adapun    0.075871  0.088384  0.038426  0.000000  0.000000  0.000000   
adu       0.000000  0.000000  0.000000  0.000000  0.000000  0.037931   

                 6         7         8         9  
acapkali  0.000000  0.000000  0.000000  0.000000  
ada       0.000000  0.000000  0.034168  0.000000  
adalah    0.014312  0.057672  0.039185  0.023965  
adapun    0.000000  0.026077  0.000000  0.000000  
adu       0.000000  0.000000  0.000000  0.000000  
(924, 10)


In [18]:
docs = retrieve_docs_and_clean()
# Create Term-Document Matrix with TF-IDF weighting
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(docs)
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer get_feature_names_out() and fall back only on older installations.
feature_names = (
    vectorizer.get_feature_names_out()
    if hasattr(vectorizer, "get_feature_names_out")
    else vectorizer.get_feature_names()
)
# Create a DataFrame: one row per term, one column per document
df = pd.DataFrame(X.T.toarray(), index=feature_names)
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
acapkali,0.0,0.0,0.0,0.0,0.0,0.037931,0.0,0.0,0.0,0.0
ada,0.048771,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.034168,0.0
adalah,0.027966,0.0,0.0,0.017061,0.033046,0.0,0.014312,0.057672,0.039185,0.023965
adapun,0.075871,0.088384,0.038426,0.0,0.0,0.0,0.0,0.026077,0.0,0.0
adu,0.0,0.0,0.0,0.0,0.0,0.037931,0.0,0.0,0.0,0.0


In [19]:
def get_similar_articles(q, df, vectorizer=None, docs=None):
    """Print the documents most similar to a query, ranked by cosine similarity.

    Parameters
    ----------
    q : str
        Free-text query.
    df : pandas.DataFrame
        Term-document matrix: one row per vocabulary term, one column per
        document, with columns in the same order as ``docs``.
    vectorizer : optional
        Fitted object whose ``transform([q]).toarray()`` gives the query
        vector over the same vocabulary as the rows of ``df``. Defaults to
        the notebook-level ``vectorizer``.
    docs : sequence of str, optional
        Document texts, indexed by column position of ``df``. Defaults to
        the notebook-level ``docs``.

    Returns
    -------
    None
        Results are printed; documents with zero similarity are omitted.
    """
    # Fall back to the notebook globals so existing two-argument calls work.
    if vectorizer is None:
        vectorizer = globals()["vectorizer"]
    if docs is None:
        docs = globals()["docs"]

    print("query:", q)
    print("Berikut artikel dengan nilai cosine similarity tertinggi: ")

    q_vec = vectorizer.transform([q]).toarray().ravel()
    doc_matrix = df.to_numpy(dtype=float)
    q_norm = np.linalg.norm(q_vec)
    doc_norms = np.linalg.norm(doc_matrix, axis=0)

    sim = {}
    # Iterate over however many documents df holds, not a fixed count.
    for i in range(doc_matrix.shape[1]):
        # cosine = dot / (|d| * |q|); the product must be parenthesised,
        # otherwise the score is multiplied by the query norm.
        denominator = doc_norms[i] * q_norm
        if denominator > 0:
            sim[i] = float(np.dot(doc_matrix[:, i], q_vec) / denominator)
        else:
            # Empty document or query with no known terms: define as 0, not NaN.
            sim[i] = 0.0

    sim_sorted = sorted(sim.items(), key=lambda x: x[1], reverse=True)

    for k, v in sim_sorted:
        if v != 0.0:
            print("Nilai Similaritas:", v)
            print(docs[k])
            print()

# Example queries; each is ranked against the scraped corpus in turn.
q1, q2, q3 = 'barcelona', 'gareth bale', 'shin tae yong'

for position, query in enumerate((q1, q2, q3)):
    # A dashed rule separates consecutive result listings (none before the first).
    if position > 0:
        print('-'*100)
    get_similar_articles(query, df)

query: barcelona
Berikut artikel dengan nilai cosine similarity tertinggi: 
----------------------------------------------------------------------------------------------------
query: gareth bale
Berikut artikel dengan nilai cosine similarity tertinggi: 
----------------------------------------------------------------------------------------------------
query: shin tae yong
Berikut artikel dengan nilai cosine similarity tertinggi: 
