In [4]:
!pip install selenium webdriver-manager



In [10]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time
import pandas as pd

options = Options()
# options.add_argument("--headless")  # bisa aktifkan jika tidak ingin melihat browser
driver = webdriver.Chrome(options=options)
wait = WebDriverWait(driver, 15)

driver.get("https://www.coursera.org/courses?")
time.sleep(5)

all_courses = []
page = 1
max_pages = 84

while page <= max_pages:
    print(f"Scraping halaman {page}...")

    # Scroll ke bawah agar semua elemen muncul
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    time.sleep(2)

    # Ambil semua kursus yang tampil
    cards = driver.find_elements(By.CSS_SELECTOR, "li.cds-grid-item")
    for card in cards:
        try:
            title = card.find_element(By.CLASS_NAME, "cds-CommonCard-title").text
        except: title = "N/A"
        try:
            partner = card.find_element(By.CLASS_NAME, "cds-ProductCard-partnerNames").text
        except: partner = "N/A"
        try:
            desc = card.find_element(By.CLASS_NAME, "cds-ProductCard-body").text.replace("Skills you'll gain", "").strip()
        except: desc = "N/A"
        try:
            rating_element = card.find_element(By.CSS_SELECTOR, "div[aria-label='Rating'][role='meter']")
            rating = rating_element.get_attribute("aria-valuenow")
        except:
            rating = "N/A"
        try:
            reviews = [x.text for x in card.find_elements(By.CLASS_NAME, "css-vac8rf") if "review" in x.text.lower()]
            reviews = reviews[0] if reviews else "N/A"
        except: reviews = "N/A"
        try:
            meta = card.find_element(By.CLASS_NAME, "cds-CommonCard-metadata").text
        except: meta = "N/A"

        all_courses.append({
            "Page": page,
            "Title": title,
            "Partner": partner,
            "Skills": desc,
            "Rating": rating,
            "Reviews": reviews,
            "Metadata": meta
        })

    # Mencoba klik tombol halaman berikutnya
    page += 1
    try:
        # Tunggu tombol pagination muncul
        wait.until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, "button[data-testid='pagination-item']")))
        # Cari tombol yang sesuai page saat ini
        next_btn_xpath = f"//button[@data-testid='pagination-item' and span[text()='{page}']]"
        next_button = driver.find_element(By.XPATH, next_btn_xpath)

        # Scroll ke tombol dan klik dengan JS
        driver.execute_script("arguments[0].scrollIntoView(true);", next_button)
        time.sleep(1)
        driver.execute_script("arguments[0].click();", next_button)
        time.sleep(3)
    except Exception as e:
        print(f"Gagal klik halaman {page} - {e}")
        break

driver.quit()

df = pd.DataFrame(all_courses)
df.to_csv("coursera_courses_all_pages.csv", index=False)
print("Selesai! Total kursus:", len(df))


Scraping halaman 1...
Scraping halaman 2...
Scraping halaman 3...
Scraping halaman 4...
Scraping halaman 5...
Scraping halaman 6...
Scraping halaman 7...
Scraping halaman 8...
Scraping halaman 9...
Scraping halaman 10...
Scraping halaman 11...
Scraping halaman 12...
Scraping halaman 13...
Scraping halaman 14...
Scraping halaman 15...
Scraping halaman 16...
Scraping halaman 17...
Scraping halaman 18...
Scraping halaman 19...
Scraping halaman 20...
Scraping halaman 21...
Scraping halaman 22...
Scraping halaman 23...
Scraping halaman 24...
Scraping halaman 25...
Scraping halaman 26...
Scraping halaman 27...
Scraping halaman 28...
Scraping halaman 29...
Scraping halaman 30...
Scraping halaman 31...
Scraping halaman 32...
Scraping halaman 33...
Scraping halaman 34...
Scraping halaman 35...
Scraping halaman 36...
Scraping halaman 37...
Scraping halaman 38...
Scraping halaman 39...
Scraping halaman 40...
Scraping halaman 41...
Scraping halaman 42...
Scraping halaman 43...
Scraping halaman 44.

In [6]:
import pandas as pd
import re
import string
df = pd.read_csv("coursera_courses_all_pages.csv")

In [7]:
df.head(), df.columns.tolist()

(   Page                                  Title Partner  \
 0     1                  Google Data Analytics  Google   
 1     1                   Google Cybersecurity  Google   
 2     1             Google Project Management:  Google   
 3     1                   Google AI Essentials  Google   
 4     1  Google Digital Marketing & E-commerce  Google   
 
                                               Skills  Rating       Reviews  \
 0  : Data Storytelling, Data Literacy, Data Visua...     4.8  171K reviews   
 1  : Threat Modeling, Network Security, Incident ...     4.8   54K reviews   
 2  : Quality Management, Project Management Life ...     4.8  131K reviews   
 3  : Prompt Engineering, Generative AI, Artificia...     4.8  3.6K reviews   
 4  : Data Storytelling, Search Engine Marketing, ...     4.8   41K reviews   
 
                                             Metadata  
 0  Beginner · Professional Certificate · 3 - 6 Mo...  
 1  Beginner · Professional Certificate · 3 - 6 Mo...  


In [8]:
import pandas as pd
import numpy as np
import re
import string
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# 1. Load dataset
df = pd.read_csv("coursera_courses_all_pages.csv")

# 2. Pisahkan kolom Metadata menjadi Level, Certificate Type, dan Duration
def split_metadata(meta):
    parts = str(meta).split("·")
    parts = [p.strip() for p in parts]
    level = parts[0] if len(parts) > 0 else None
    cert_type = parts[1] if len(parts) > 1 else None
    duration = parts[2] if len(parts) > 2 else None
    return pd.Series([level, cert_type, duration])

df[['Level', 'Certificate_Type', 'Duration']] = df['Metadata'].apply(split_metadata)

# 3. Definisikan pembersih teks
stopwords_custom = set([
    "the", "and", "to", "of", "in", "a", "with", "for", "on", "that", "is", "this", 
    "as", "are", "at", "an", "be", "or", "by", "from", "you", "your", "it", "will",
    "gain", "skills", "course", "learn"
])

def basic_clean(text):
    text = str(text).lower()
    text = re.sub(r"http\S+|www\S+|https\S+", "", text)
    text = text.encode("ascii", "ignore").decode()  # remove emoji
    text = re.sub(r"[^\w\s]", "", text)  # Hapus tanda baca, biarkan angka
    text = text.translate(str.maketrans("", "", string.punctuation))
    tokens = text.split()
    tokens = [word for word in tokens if word not in stopwords_custom]
    return " ".join(tokens)

def clean_skills(text):
    text = str(text).lower()
    text = text.replace("skills you'll gain:", "")
    text = text.replace("skills you’ll gain:", "")
    text = text.replace("obile", "mobile")  # typo correction
    return basic_clean(text)

# 4. Terapkan pembersihan
df["Title_clean"] = df["Title"].apply(basic_clean)
df["Metadata_clean"] = df["Metadata"].apply(basic_clean)
df["Skills_clean"] = df["Skills"].apply(clean_skills)

# 5. TF-IDF Vectorization
tfidf_title = TfidfVectorizer(max_features=100)
tfidf_skills = TfidfVectorizer(max_features=100)
tfidf_meta = TfidfVectorizer(max_features=50)

tfidf_title_matrix = tfidf_title.fit_transform(df["Title_clean"]).toarray()
tfidf_skills_matrix = tfidf_skills.fit_transform(df["Skills_clean"]).toarray()
tfidf_meta_matrix = tfidf_meta.fit_transform(df["Metadata_clean"]).toarray()

# 6. Gabungkan fitur dan hitung similarity
features_matrix = np.hstack((tfidf_title_matrix, tfidf_skills_matrix, tfidf_meta_matrix))
similarity_matrix = cosine_similarity(features_matrix)

# 7. Simpan hasil
df["Course_Index"] = df.index
df.to_csv("final_courses_with_cleaning.csv", index=False)
pd.DataFrame(similarity_matrix).to_csv("course_similarity_matrix.csv", index=False)


In [10]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Load data
df = pd.read_csv("final_courses_with_cleaning.csv")

# Bersihkan NaN
df["Title_clean"] = df["Title_clean"].fillna("")
df["Skills_clean"] = df["Skills_clean"].fillna("")
df["Metadata_clean"] = df["Metadata_clean"].fillna("")

# Gabungkan fitur teks
df["combined_features"] = df["Title_clean"] + " " + df["Skills_clean"] 

# TF-IDF
tfidf = TfidfVectorizer(stop_words="english")
tfidf_matrix = tfidf.fit_transform(df["combined_features"])

# Similarity matrix
cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)

# Fungsi parsing jumlah review
def parse_reviews(value):
    try:
        value = str(value).strip().replace(",", "")
        if 'K' in value:
            return float(value.replace('K', '')) * 1_000
        elif 'M' in value:
            return float(value.replace('M', '')) * 1_000_000
        else:
            return float(value)
    except:
        return 0.0

# Bersihkan review dan rating
df["num_reviews"] = df["Reviews"].apply(parse_reviews)
df["Rating"] = pd.to_numeric(df["Rating"], errors="coerce").fillna(0)

# Fungsi hybrid recommendation
def hybrid_recommendations(keyword, top_n=10):
    keyword_vec = tfidf.transform([keyword])
    sim_scores = cosine_similarity(keyword_vec, tfidf_matrix).flatten()
    df["weighted_score"] = sim_scores * (df["Rating"] * np.log1p(df["num_reviews"] + 1e-5))
    top_indices = df["weighted_score"].nlargest(top_n).index
    return df.loc[top_indices, ["Title", "Rating", "Reviews", "Level", "Duration", "weighted_score"]]

# Contoh penggunaan
results = hybrid_recommendations("data science python sql", top_n=10)
print(results)


                                              Title  Rating  \
23    Data Science Fundamentals with Python and SQL     4.6   
593  Databases and SQL for Data Science with Python     4.7   
532                            SQL for Data Science     4.6   
668                 Python Project for Data Science     4.5   
26                             Applied Data Science     4.6   
27                     Introduction to Data Science     4.6   
259                               Meta Data Analyst     4.7   
191               Learn SQL Basics for Data Science     4.6   
495                          Python Data Structures     4.9   
551                         Get Started with Python     4.8   

                                               Reviews         Level  \
23                                         72K reviews      Beginner   
593                                        22K reviews      Beginner   
532                                        17K reviews      Beginner   
668               

In [11]:
hybrid_recommendations("data science python sql", top_n=1000)

Unnamed: 0,Title,Rating,Reviews,Level,Duration,weighted_score
23,Data Science Fundamentals with Python and SQL,4.6,72K reviews,Beginner,3 - 6 Months,0.000026
593,Databases and SQL for Data Science with Python,4.7,22K reviews,Beginner,1 - 3 Months,0.000026
532,SQL for Data Science,4.6,17K reviews,Beginner,1 - 4 Weeks,0.000025
668,Python Project for Data Science,4.5,4.7K reviews,Intermediate,1 - 4 Weeks,0.000025
26,Applied Data Science,4.6,59K reviews,Beginner,3 - 6 Months,0.000019
...,...,...,...,...,...,...
991,Suicide Prevention,4.7,301 reviews,Beginner,1 - 4 Weeks,0.000000
992,Music Business,4.8,3.3K reviews,Beginner,3 - 6 Months,0.000000
993,Introduction to Public Relations and the Media,4.6,690 reviews,Beginner,3 - 6 Months,0.000000
994,Product Owner Certification [PSPO I] Preparation,4.6,89 reviews,Beginner,3 - 6 Months,0.000000


In [16]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Load data
df = pd.read_csv("final_courses_with_cleaning.csv")

# Bersihkan NaN
df["Title_clean"] = df["Title_clean"].fillna("")
df["Skills_clean"] = df["Skills_clean"].fillna("")
df["Metadata_clean"] = df["Metadata_clean"].fillna("")

# Gabungkan fitur teks
df["combined_features"] = df["Title_clean"] + " " + df["Skills_clean"] 

# TF-IDF
tfidf = TfidfVectorizer(stop_words="english")
tfidf_matrix = tfidf.fit_transform(df["combined_features"])

# Similarity matrix
cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)

# Fungsi parsing jumlah review dan durasi
def parse_duration(duration_str):
    try:
        duration_str = str(duration_str).lower()
        if "hour" in duration_str:
            return 0.25  # kurang dari 1 minggu
        elif "day" in duration_str:
            return 0.5
        elif "week" in duration_str:
            num = int(duration_str.split()[0])
            return num
        elif "month" in duration_str:
            num = int(duration_str.split()[0])
            return num * 4  # 1 bulan = 4 minggu
        else:
            return 999  # jika tidak diketahui
    except:
        return 999

def parse_reviews(value):
    try:
        value = str(value).strip().replace(",", "")
        if 'K' in value:
            return float(value.replace('K', '')) * 1_000
        elif 'M' in value:
            return float(value.replace('M', '')) * 1_000_000
        else:
            return float(value)
    except:
        return 0.0

# Bersihkan review dan rating
df["num_reviews"] = df["Reviews"].apply(parse_reviews)
df["duration_weeks"] = df["Duration"].apply(parse_duration)
df["Rating"] = pd.to_numeric(df["Rating"], errors="coerce").fillna(0)

# Fungsi hybrid recommendation
def hybrid_recommendations_filtered(keyword, level_filter="Beginner", max_duration_weeks=4, top_n=10):
    keyword_vec = tfidf.transform([keyword])
    sim_scores = cosine_similarity(keyword_vec, tfidf_matrix).flatten()

    # Tambah weighted score
    df["weighted_score"] = sim_scores * (df["Rating"] * np.log1p(df["num_reviews"] + 1e-5))

    # Filter berdasarkan level dan durasi
    filtered_df = df[
        df["Level"].str.lower().str.contains(level_filter.lower()) &
        (df["duration_weeks"] <= max_duration_weeks)
    ]

    # Ambil top N dari yang sudah difilter
    top_indices = filtered_df["weighted_score"].nlargest(top_n).index
    return filtered_df.loc[top_indices, ["Title", "Rating", "Reviews", "Level", "Duration", "weighted_score"]]

results = hybrid_recommendations_filtered(
    keyword="machine learning python",
    level_filter="Beginner",
    max_duration_weeks=4,  # maks. 1 bulan
    top_n=10
)
print(results)


                                                 Title  Rating       Reviews  \
7                                     Machine Learning     4.9   35K reviews   
222  Unsupervised Learning, Recommenders, Reinforce...     4.9  4.9K reviews   
50                                  Introduction to AI     4.9    2K reviews   
515    Fundamentals of Machine Learning for Healthcare     4.8   569 reviews   
372              Structuring Machine Learning Projects     4.8   50K reviews   
42   Supervised Machine Learning: Regression and Cl...     4.9   29K reviews   
133                       Advanced Learning Algorithms     4.9    8K reviews   
647            Introduction to Artificial Intelligence     4.6    21 reviews   
540               AI For Everyone （すべての人のためのAIリテラシー講座）     4.7   288 reviews   
825   Mathematics for Machine Learning: Linear Algebra     4.7   12K reviews   

        Level      Duration  weighted_score  
7    Beginner  1 - 3 Months        0.000030  
222  Beginner   1 - 4 Weeks

In [17]:
hybrid_recommendations_filtered(
    keyword="data analyst",
    level_filter="advance",
    max_duration_weeks=18,  
    top_n=100
)

Unnamed: 0,Title,Rating,Reviews,Level,Duration,weighted_score
831,Go Beyond the Numbers: Translate Data into Ins...,4.7,813 reviews,Advanced,1 - 3 Months,1.269873e-05
846,The Path to Insights: Data Models and Pipelines,4.7,650 reviews,Advanced,1 - 4 Weeks,1.052772e-05
10,Google Advanced Data Analytics,4.7,7.1K reviews,Advanced,3 - 6 Months,8.954131e-06
405,Foundations of Data Science,4.7,3.3K reviews,Advanced,1 - 3 Months,8.230509e-06
17,Google Business Intelligence,4.8,3.7K reviews,Advanced,3 - 6 Months,8.117186e-06
476,Foundations of Business Intelligence,4.8,1.9K reviews,Advanced,1 - 4 Weeks,6.957607e-06
608,MLOps | Machine Learning Operations,4.2,480 reviews,Advanced,3 - 6 Months,6.260846e-06
551,Get Started with Python,4.8,1.6K reviews,Advanced,1 - 3 Months,5.647331e-06
153,IBM RAG and Agentic AI,4.5,68 reviews,Advanced,3 - 6 Months,2.006413e-06
678,Foundations of Data Structures and Algorithms,4.6,838 reviews,Advanced,1 - 3 Months,1.957718e-06


In [18]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Load data
df = pd.read_csv("final_courses_with_cleaning.csv")

# Bersihkan kolom teks
df["Title_clean"] = df["Title_clean"].fillna("")
df["Skills_clean"] = df["Skills_clean"].fillna("")
df["Metadata_clean"] = df["Metadata_clean"].fillna("")

# Gabungkan fitur teks untuk TF-IDF
df["combined_features"] = df["Title_clean"] + " " + df["Skills_clean"] 

# TF-IDF vectorization
tfidf = TfidfVectorizer(stop_words="english")
tfidf_matrix = tfidf.fit_transform(df["combined_features"])

# Cosine similarity matrix
cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)

# Fungsi parsing durasi (dalam minggu)
def parse_duration(duration_str):
    try:
        duration_str = str(duration_str).lower()
        if "hour" in duration_str:
            return 0.25
        elif "day" in duration_str:
            return 0.5
        elif "week" in duration_str:
            return int(duration_str.split()[0])
        elif "month" in duration_str:
            return int(duration_str.split()[0]) * 4
        else:
            return 999
    except:
        return 999

# Fungsi parsing review
def parse_reviews(value):
    try:
        value = str(value).strip().replace(",", "")
        if 'K' in value:
            return float(value.replace('K', '')) * 1_000
        elif 'M' in value:
            return float(value.replace('M', '')) * 1_000_000
        else:
            return float(value)
    except:
        return 0.0

# Bersihkan kolom numerik
df["num_reviews"] = df["Reviews"].apply(parse_reviews)
df["duration_weeks"] = df["Duration"].apply(parse_duration)
df["Rating"] = pd.to_numeric(df["Rating"], errors="coerce").fillna(0)

# Fungsi rekomendasi hybrid + filter
def hybrid_recommendations_filtered(
    keyword,
    level_filter="Beginner",
    max_duration_weeks=4,
    certificate_filter="Professional Certificate",
    top_n=10
):
    # Hitung similarity terhadap keyword
    keyword_vec = tfidf.transform([keyword])
    sim_scores = cosine_similarity(keyword_vec, tfidf_matrix).flatten()

    # Skor gabungan
    df["weighted_score"] = sim_scores * (df["Rating"] * np.log1p(df["num_reviews"] + 1e-5))

    # Filtering
    filtered_df = df[
        df["Level"].str.lower().str.contains(level_filter.lower()) &
        (df["duration_weeks"] <= max_duration_weeks) &
        df["Certificate_Type"].str.lower().str.contains(certificate_filter.lower())
    ]

    # Top N hasil
    top_indices = filtered_df["weighted_score"].nlargest(top_n).index
    return filtered_df.loc[top_indices, [
        "Title", "Rating", "Reviews", "Level", "Duration", "Certificate_Type", "weighted_score"
    ]]

# Contoh penggunaan
results = hybrid_recommendations_filtered(
    keyword="python data science",
    level_filter="Beginner",
    max_duration_weeks=4,
    certificate_filter="Professional Certificate",
    top_n=10
)

print(results)


                                           Title  Rating  \
797                   Snowflake Data Engineering     4.8   
852                      Snowflake Generative AI     4.8   
383  Google Professional Workspace Administrator     4.7   
441               AWS Generative AI Applications     4.7   
142       Generative AI for Software Development     4.6   

                                               Reviews     Level  \
797                                        194 reviews  Beginner   
852                                        122 reviews  Beginner   
383                                       1.5K reviews  Beginner   
441                                         43 reviews  Beginner   
142  Skills you'll gain: Prompt Engineering, Large ...  Beginner   

         Duration          Certificate_Type  weighted_score  
797  1 - 3 Months  Professional Certificate        0.000012  
852  1 - 3 Months  Professional Certificate        0.000007  
383   1 - 4 Weeks  Professional Certificate 

In [19]:
hybrid_recommendations_filtered(
    keyword="python",
    level_filter="beginner",
    max_duration_weeks=12,
    certificate_filter="specialization",
    top_n=20
)


Unnamed: 0,Title,Rating,Reviews,Level,Duration,Certificate_Type,weighted_score
23,Data Science Fundamentals with Python and SQL,4.6,72K reviews,Beginner,3 - 6 Months,Specialization,1.6e-05
183,Introduction to Programming with Python and Java,4.5,1.8K reviews,Beginner,3 - 6 Months,Specialization,1.3e-05
19,Data Science Foundations,4.6,114K reviews,Beginner,3 - 6 Months,Specialization,1.2e-05
310,Python 3 Programming,4.8,23K reviews,Beginner,3 - 6 Months,Specialization,1.1e-05
41,Python for Everybody,4.8,278K reviews,Beginner,3 - 6 Months,Specialization,1.1e-05
26,Applied Data Science,4.6,59K reviews,Beginner,3 - 6 Months,Specialization,1e-05
863,Statistics with Python,4.6,3.3K reviews,Beginner,1 - 3 Months,Specialization,9e-06
27,Introduction to Data Science,4.6,"Skills you'll gain: SQL, Jupyter, Data Literac...",Beginner,3 - 6 Months,Specialization,6e-06
268,Mathematics for Machine Learning,4.6,15K reviews,Beginner,3 - 6 Months,Specialization,6e-06
38,Data Engineering Foundations,4.6,57K reviews,Beginner,3 - 6 Months,Specialization,5e-06


In [20]:
keywords = [
    "python data science",
    "machine learning",
    "deep learning",
    "excel",
    "project management",
    "cloud computing",
    "cyber security",
    "digital marketing",
    "web development",
    "artificial intelligence"
]

In [24]:
print("Certificate_Type:")
print(df["Certificate_Type"].dropna().unique())

Certificate_Type:
['Professional Certificate' 'Specialization' 'Course']


In [25]:
print("\nLevel:")
print(df["Level"].dropna().unique())


Level:
['Beginner' 'Advanced' 'Intermediate' 'Mixed']


In [29]:
print(results_df.isna().sum())

Title               0
Rating              0
Reviews             0
Level               0
Duration            0
Certificate_Type    0
weighted_score      0
dtype: int64


In [33]:
import json
import pandas as pd
import numpy as np

# List keyword untuk pencarian
keywords = [
    "python data science",
    "machine learning",
    "deep learning",
    "excel",
    "project management",
    "cloud computing",
    "cyber security",
    "digital marketing",
    "web development",
    "artificial intelligence"
]

# Kemungkinan kombinasi level dan sertifikat
levels = ["Beginner", "Intermediate", "Advanced"]
certificates = ["Course", "Specialization", "Professional Certificate"]

# Dictionary utama untuk simpan semua hasil
all_results = {}

# Loop semua kombinasi
for level in levels:
    for cert in certificates:
        combination_key = f"{level}_{cert}"
        all_results[combination_key] = {}

        for keyword in keywords:
            # Panggil fungsi rekomendasi (pastikan fungsi ini sudah ada di import atau definisi)
            results_df = hybrid_recommendations_filtered(
                keyword=keyword,
                level_filter=level,
                max_duration_weeks=36,
                certificate_filter=cert,
                top_n=1000
            )

            # Pastikan tidak ada NaN: konversi ke None agar JSON bisa menyimpan
            results_df = results_df.replace({np.nan: None})

            # Konversi ke format JSON-friendly
            recommendation_list = results_df.to_dict(orient="records")

            # Simpan ke dictionary dengan struktur: Level_Cert > Keyword > List rekomendasi
            all_results[combination_key][keyword] = recommendation_list

print("Semua kombinasi berhasil dihitung. Menyimpan ke file...")

# Simpan ke file JSON
with open("recommendations_all_combinations.json", "w", encoding="utf-8") as f:
    json.dump(all_results, f, indent=2, ensure_ascii=False)

print("✅ Semua rekomendasi berhasil disimpan ke 'recommendations_all_combinations.json'")

Semua kombinasi berhasil dihitung. Menyimpan ke file...
✅ Semua rekomendasi berhasil disimpan ke 'recommendations_all_combinations.json'


In [35]:
# Buat TF-IDF matrix
tfidf = TfidfVectorizer(stop_words="english")
tfidf_matrix = tfidf.fit_transform(df["combined_features"])

# Konversi TF-IDF matrix ke DataFrame
tfidf_df = pd.DataFrame(
    tfidf_matrix.toarray(),
    columns=tfidf.get_feature_names_out()
)

# (Opsional) Tambahkan info kursus ke DataFrame
tfidf_df.insert(0, "Course_Title", df["Title"].values)

# Simpan ke CSV
tfidf_df.to_csv("tfidf_combined_features.csv", index=False)

print("TF-IDF berhasil disimpan ke 'tfidf_combined_features.csv'")

TF-IDF berhasil disimpan ke 'tfidf_combined_features.csv'
