In [1]:
from urllib.parse import urljoin

import pandas as pd
import requests
from bs4 import BeautifulSoup

# Target URL
url = "https://www.freecodecamp.org/news/tag/data-science/"

# Send the request. A timeout keeps the cell from hanging forever on a stalled
# connection, and raise_for_status() stops us from parsing an error page.
headers = {
    "User-Agent": "Mozilla/5.0"
}
response = requests.get(url, headers=headers, timeout=30)
response.raise_for_status()
soup = BeautifulSoup(response.content, "html.parser")

# Collect every <article> element (these look like courses/tutorials)
articles = soup.select("article")

titles = []
links = []
descriptions = []

for article in articles:
    # Title and link: skip cards that have no <h2> or no anchor with an href,
    # instead of crashing on a None lookup.
    title_tag = article.find("h2")
    anchor = title_tag.find("a", href=True) if title_tag else None
    if anchor is None:
        continue

    title = title_tag.text.strip()
    # urljoin resolves root-relative hrefs against the site and leaves
    # already-absolute hrefs untouched (plain concatenation would corrupt them).
    link = urljoin(url, anchor["href"])

    # Short description (if present)
    desc_tag = article.find("p")
    description = desc_tag.text.strip() if desc_tag else "[no description]"

    titles.append(title)
    links.append(link)
    descriptions.append(description)

# Store in a DataFrame
df = pd.DataFrame({
    "title": titles,
    "link": links,
    "description": descriptions
})

df.to_csv("freecodecamp_data_science.csv", index=False)
df.head()


Unnamed: 0,title,link,description
0,Learn to Build a Multilayer Perceptron with Re...,https://www.freecodecamp.org/news/build-a-mult...,[no description]
1,Learn Python for Data Science – Full Course fo...,https://www.freecodecamp.org/news/learn-python...,[no description]
2,How to Extract YouTube Analytics Data and Ana...,https://www.freecodecamp.org/news/extract-yout...,[no description]
3,How to Create a DeepSeek R1 API in R with Plumber,https://www.freecodecamp.org/news/how-to-creat...,[no description]
4,Learn Clustering in Python – A Machine Learnin...,https://www.freecodecamp.org/news/clustering-i...,[no description]


**LAZY LOADING** (konten tambahan baru dimuat saat halaman di-scroll)

1. `requests` + BeautifulSoup ➜ ❌ tidak bisa mengambil semua artikel, karena keduanya tidak menjalankan JavaScript.
2. Hanya 25 artikel yang muncul di HTML awal.


In [31]:
# Number of <article> elements found in the HTML returned by the initial request.
print(len(articles))

25


In [23]:
# Preview the first rows of df.
# NOTE(review): the output shown for this cell includes `clean_description`,
# which is only created by the preprocessing cell further down, so the cells
# were executed out of order; on a fresh kernel this preview will not have it.
df.head()

Unnamed: 0,title,link,description,clean_description
0,Learn to Build a Multilayer Perceptron with Re...,https://www.freecodecamp.org/news/build-a-mult...,[no description],description
1,Learn Python for Data Science – Full Course fo...,https://www.freecodecamp.org/news/learn-python...,[no description],description
2,How to Extract YouTube Analytics Data and Ana...,https://www.freecodecamp.org/news/extract-yout...,[no description],description
3,How to Create a DeepSeek R1 API in R with Plumber,https://www.freecodecamp.org/news/how-to-creat...,[no description],description
4,Learn Clustering in Python – A Machine Learnin...,https://www.freecodecamp.org/news/clustering-i...,[no description],description


In [28]:
import pandas as pd
import re
from sklearn.feature_extraction.text import TfidfVectorizer, ENGLISH_STOP_WORDS

# Simple preprocessing function
def preprocess(text):
    """Normalize free text into a space-separated string of content words.

    Steps: lowercase, drop apostrophes, turn every other character that is
    not a-z or whitespace into a space, split on whitespace, and remove
    English stop words.

    Args:
        text: The raw text. Any non-string value (e.g. None or NaN) is
            treated as missing.

    Returns:
        The cleaned text as a single string; "" for missing or empty input.
    """
    if not isinstance(text, str):
        return ""

    text = text.lower()
    # Apostrophes are removed outright so contractions stay one token.
    text = re.sub(r"['’]", "", text)
    # Digits and punctuation become spaces rather than being deleted, so
    # "machine-learning" yields two words instead of "machinelearning".
    text = re.sub(r"[^a-z\s]", " ", text)
    tokens = [word for word in text.split() if word not in ENGLISH_STOP_WORDS]
    return " ".join(tokens)

# Clean every description (missing values are treated as empty text) and
# store the result in a new column.
df['clean_description'] = (
    df['description']
    .fillna("")
    .apply(preprocess)
)


In [29]:
# Write the current df to CSV, leaving out the index column.
df.to_csv("preprocessed_courses.csv", index=False)

In [30]:
# Hand the saved CSV to Colab's file helper for download.
# NOTE(review): presumably this cell only works when running inside Google
# Colab, since it imports from google.colab — confirm before running elsewhere.
from google.colab import files
files.download("preprocessed_courses.csv")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [18]:
from sklearn.metrics.pairwise import cosine_similarity

# Fit a TF-IDF vectorizer on the course titles and turn them into a matrix
# (one row per title).
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(df['title'])

# Pairwise cosine similarity between all rows of the TF-IDF matrix;
# recommend_courses reads row `idx` of this to rank the other courses.
cosine_sim = cosine_similarity(tfidf_matrix)


In [6]:
def recommend_courses(title_input, top_n=5):
    """Recommend the courses whose titles are most similar to a matched course.

    The first row of the global ``df`` whose title contains ``title_input``
    (case-insensitive, literal substring) is used as the query; the other
    rows are ranked by their score in the global ``cosine_sim`` matrix.

    Args:
        title_input: Text to look for inside the course titles.
        top_n: Maximum number of recommendations to return.

    Returns:
        A DataFrame with the columns ``title``, ``link`` and ``description``
        for the most similar courses, best first, or the string
        "Kursus tidak ditemukan." when no title matches.
    """
    # na=False treats a missing title as "no match" instead of putting NaN
    # into the boolean mask.
    matches = df['title'].str.contains(
        title_input, case=False, regex=False, na=False
    ).to_numpy()
    if not matches.any():
        return "Kursus tidak ditemukan."

    # Row *position* of the first match. cosine_sim is indexed by position,
    # so using the index label would break on any non-default index.
    idx = int(matches.argmax())

    # Drop the query row explicitly: slicing off the top-ranked entry is wrong
    # when another course ties with it at the maximum score.
    sim_scores = [
        (pos, score) for pos, score in enumerate(cosine_sim[idx]) if pos != idx
    ]
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)[:max(top_n, 0)]
    recommended = df.iloc[[pos for pos, _ in sim_scores]][['title', 'link', 'description']]
    return recommended


In [27]:
recommend_courses("data science")

Unnamed: 0,title,link,description
15,Applied Data Science with Python – Business In...,https://www.freecodecamp.org/news/applied-data...,[no description]
13,Learn Python for Data Science – Hands-on Proje...,https://www.freecodecamp.org/news/learn-python...,[no description]
10,How to Build a Quantum AI Model for Predicting...,https://www.freecodecamp.org/news/how-to-build...,[no description]
24,The Data Science and AI Handbook – How to Star...,https://www.freecodecamp.org/news/the-data-sci...,[no description]
4,Learn Clustering in Python – A Machine Learnin...,https://www.freecodecamp.org/news/clustering-i...,[no description]


In [19]:
recommend_courses("learning")

Unnamed: 0,title,link,description
21,Machine Learning Fundamentals Handbook – Key C...,https://www.freecodecamp.org/news/machine-lear...,[no description]
5,The AI Engineering Handbook – How to Start a C...,https://www.freecodecamp.org/news/the-ai-engin...,[no description]
24,The Data Science and AI Handbook – How to Star...,https://www.freecodecamp.org/news/the-data-sci...,[no description]
1,Learn Python for Data Science – Full Course fo...,https://www.freecodecamp.org/news/learn-python...,[no description]
2,How to Extract YouTube Analytics Data and Ana...,https://www.freecodecamp.org/news/extract-yout...,[no description]


In [22]:
recommend_courses("ai")

Unnamed: 0,title,link,description
24,The Data Science and AI Handbook – How to Star...,https://www.freecodecamp.org/news/the-data-sci...,[no description]
14,Practical Guide to Linear Algebra in Data Scie...,https://www.freecodecamp.org/news/linear-algeb...,[no description]
4,Learn Clustering in Python – A Machine Learnin...,https://www.freecodecamp.org/news/clustering-i...,[no description]
10,How to Build a Quantum AI Model for Predicting...,https://www.freecodecamp.org/news/how-to-build...,[no description]
23,How to Use Databricks Delta Lake with SQL – Fu...,https://www.freecodecamp.org/news/databricks-s...,[no description]
