In [1]:
import os
from pathlib import Path

import joblib
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import TfidfVectorizer


In [2]:
# Project layout.
# The project root defaults to the original author's checkout, but it can be
# redirected by setting the AI_STUDY_PAL_DIR environment variable, so the
# notebook can run on another machine without editing this cell.
BASE_DIR = Path(
    os.environ.get(
        "AI_STUDY_PAL_DIR",
        r"C:\Users\user\OneDrive\Desktop\Projects\ai-study-pal",
    )
)
DATA_DIR = BASE_DIR / "data"
RES_DIR = DATA_DIR / "resources"
MODELS_DIR = BASE_DIR / "backend" / "models"

# Input table of study resources (read with pandas further down).
CSV_PATH = RES_DIR / "resources.csv"
# Destinations for the fitted TF-IDF vectorizer and KMeans model.
VEC_PATH = MODELS_DIR / "resource_tfidf_vectorizer.joblib"
KMEANS_PATH = MODELS_DIR / "resource_kmeans.joblib"
# NOTE(review): the export cell writes "resources_with_clusters.csv" in RES_DIR
# rather than this parquet path; confirm whether RES_TBL_PATH is still needed.
RES_TBL_PATH = RES_DIR / "resources.parquet"

# Create the output folders up front so later reads/writes do not fail on a
# missing directory; exist_ok keeps the cell safe to re-run.
RES_DIR.mkdir(parents=True, exist_ok=True)
MODELS_DIR.mkdir(parents=True, exist_ok=True)

# Display the resolved paths as the cell output.
CSV_PATH, VEC_PATH, KMEANS_PATH, RES_TBL_PATH


(WindowsPath('C:/Users/user/OneDrive/Desktop/Projects/ai-study-pal/data/resources/resources.csv'),
 WindowsPath('C:/Users/user/OneDrive/Desktop/Projects/ai-study-pal/backend/models/resource_tfidf_vectorizer.joblib'),
 WindowsPath('C:/Users/user/OneDrive/Desktop/Projects/ai-study-pal/backend/models/resource_kmeans.joblib'),
 WindowsPath('C:/Users/user/OneDrive/Desktop/Projects/ai-study-pal/data/resources/resources.parquet'))

In [3]:
# Load the resource catalogue from CSV_PATH into a DataFrame.
df = pd.read_csv(filepath_or_buffer=CSV_PATH)

# Preview the first five rows as the cell output.
df.head(5)

Unnamed: 0,subject,title,url,description
0,data structures,GeeksforGeeks DSA Playlist,https://www.youtube.com/playlist?list=PLqM7alH...,"Intro to arrays, linked lists, stacks, queues."
1,algorithms,CLRS Algorithms Notes,https://web.mit.edu/~16.070/www/lecture/big_o.pdf,"Big-O, sorting, searching, basic algorithm des..."
2,python,Python Official Tutorial,https://docs.python.org/3/tutorial/,"Basics of Python syntax, control flow, functions."
3,machine learning,Andrew Ng ML Course,https://www.coursera.org/learn/machine-learning,"Supervised learning, regression, classificatio..."
4,deep learning,DeepLearning.ai Course,https://www.coursera.org/specializations/deep-...,"Neural networks, backpropagation, CNNs."


In [4]:
# Replace missing subject/description values with empty strings so the
# concatenation below never produces NaN.
for column in ("subject", "description"):
    df[column] = df[column].fillna("")

# Combine both fields into one space-separated string per row; this is the
# text that gets vectorized.
df["text"] = df["subject"].add(" ").add(df["description"])

# Preview the columns relevant to the clustering step.
df.loc[:, ["subject", "title", "text"]].head()

Unnamed: 0,subject,title,text
0,data structures,GeeksforGeeks DSA Playlist,"data structures Intro to arrays, linked lists,..."
1,algorithms,CLRS Algorithms Notes,"algorithms Big-O, sorting, searching, basic al..."
2,python,Python Official Tutorial,"python Basics of Python syntax, control flow, ..."
3,machine learning,Andrew Ng ML Course,"machine learning Supervised learning, regressi..."
4,deep learning,DeepLearning.ai Course,"deep learning Neural networks, backpropagation..."


In [5]:
# TF-IDF over unigrams and bigrams, with the vocabulary capped at 2000 terms.
vectorizer = TfidfVectorizer(ngram_range=(1, 2), max_features=2000)

# Fit the vocabulary on the combined text and build the document-term matrix.
X = vectorizer.fit_transform(df["text"])

# Show (n_documents, n_features) as the cell output.
X.shape


(10, 113)

In [6]:
# Use at most 5 clusters, and never more clusters than there are rows in X.
k = min(X.shape[0], 5)

# Fixed random_state keeps the clustering reproducible across runs.
kmeans = KMeans(n_clusters=k, n_init=10, random_state=42).fit(X)

# Attach each row's assigned cluster label to the table and preview it.
df["cluster"] = kmeans.labels_
df.loc[:, ["subject", "title", "cluster"]].head()


Unnamed: 0,subject,title,cluster
0,data structures,GeeksforGeeks DSA Playlist,3
1,algorithms,CLRS Algorithms Notes,1
2,python,Python Official Tutorial,0
3,machine learning,Andrew Ng ML Course,2
4,deep learning,DeepLearning.ai Course,2


In [10]:
# Persist the fitted vectorizer and KMeans model to their configured paths.
for fitted_object, destination in ((vectorizer, VEC_PATH), (kmeans, KMEANS_PATH)):
    joblib.dump(fitted_object, destination)

# The clustered table is written as CSV (not parquet), without the index.
clustered_csv_path = RES_DIR / "resources_with_clusters.csv"
df.to_csv(clustered_csv_path, index=False)

# Report where each artifact was written.
print("Saved:")
for label, location in (
    ("Vectorizer:", VEC_PATH),
    ("KMeans:", KMEANS_PATH),
    ("Table:", clustered_csv_path),
):
    print(label, location)


Saved:
Vectorizer: C:\Users\user\OneDrive\Desktop\Projects\ai-study-pal\backend\models\resource_tfidf_vectorizer.joblib
KMeans: C:\Users\user\OneDrive\Desktop\Projects\ai-study-pal\backend\models\resource_kmeans.joblib
Table: C:\Users\user\OneDrive\Desktop\Projects\ai-study-pal\data\resources\resources_with_clusters.csv


In [None]:
# Smoke test: vectorize a sample subject with the fitted TF-IDF model and ask
# the fitted KMeans model which cluster it falls into.
test_subject = "data structures"
Xq = vectorizer.transform([test_subject])

# predict returns one label per input row; a single query gives one label.
predicted_labels = kmeans.predict(Xq)
cluster = int(predicted_labels[0])

print("Test subject:", test_subject)
print("Predicted cluster:", cluster)

# List the resources that were assigned to the same cluster.
same_cluster = df["cluster"] == cluster
df.loc[same_cluster, ["subject", "title"]].head()
