In [1]:
import pandas as pd

In [2]:
# Load the pre-cleaned course catalogue from the working directory and display it.
# NOTE(review): the frame comes back with an integer "Unnamed: 0" column, which
# looks like a row index written out by an earlier to_csv call — confirm; if so,
# reading with index_col=0 (or writing with index=False) would avoid carrying it.
courses = pd.read_csv("clean_courses.csv")
courses

Unnamed: 0.1,Unnamed: 0,course_name,description,url,clean_text
0,0,CS50's Introduction to Computer Science,An introduction to the intellectual enterprise...,https://www.classcentral.com/course/computer-s...,introduction intellectual enterprise computer ...
1,1,Harvard CS50 – Full Computer Science Universit...,Learn the basics of computer science from Harv...,https://www.classcentral.com/classroom/freecod...,learn basic computer science harvard universit...
2,2,Introduction to Computer Science and Programmi...,An introduction to computer science as a tool ...,https://www.classcentral.com/course/computer-s...,introduction computer science tool solve real ...
3,3,Introduction to Electrical Engineering and Com...,This course provides an integrated introductio...,https://www.classcentral.com/classroom/mit-ocw...,course provide integrated introduction electri...
4,4,CS50's Computer Science for Business Professio...,This is CS50’s introduction to computer scienc...,https://www.classcentral.com/course/computer-s...,introduction computer science business profess...
...,...,...,...,...,...
5674,5674,Front-End Interview Prep,Answer front-end technical and behavioral inte...,https://www.udacity.com//course/front-end-inte...,answer end technical behavioral interview ques...
5675,5675,Full-Stack Interview Prep,Answer common full stack and web security inte...,https://www.udacity.com//course/full-stack-int...,answer common stack web security interview que...
5676,5676,Data Structures & Algorithms in Swift,Review and practice the skills technical inter...,https://www.udacity.com//course/data-structure...,review practice skill technical interviewer ex...
5677,5677,iOS Interview Prep,Answer iOS and mobile development interview qu...,https://www.udacity.com//course/ios-interview-...,answer ios mobile development interview questi...


In [3]:
# Print the frame's columns, dtypes and non-null counts to check for missing values.
courses.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5679 entries, 0 to 5678
Data columns (total 5 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   Unnamed: 0   5679 non-null   int64 
 1   course_name  5679 non-null   object
 2   description  5679 non-null   object
 3   url          5679 non-null   object
 4   clean_text   5679 non-null   object
dtypes: int64(1), object(4)
memory usage: 222.0+ KB


In [4]:
# Discard rows that have no cleaned text, then renumber the index from 0 so
# labels are contiguous again; finish by re-printing the frame summary.
courses = (
    courses
    .dropna(subset=['clean_text'])
    .reset_index(drop=True)
)
courses.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5679 entries, 0 to 5678
Data columns (total 5 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   Unnamed: 0   5679 non-null   int64 
 1   course_name  5679 non-null   object
 2   description  5679 non-null   object
 3   url          5679 non-null   object
 4   clean_text   5679 non-null   object
dtypes: int64(1), object(4)
memory usage: 222.0+ KB


In [6]:
import pickle
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import normalize
import numpy as np

# Fit a TF-IDF model on the cleaned course text using unigrams and bigrams,
# with the vocabulary capped at 5000 features.
vectorizer = TfidfVectorizer(ngram_range=(1, 2), max_features=5000)
tfidf_matrix = vectorizer.fit_transform(courses['clean_text'])

# Dense course-by-course cosine similarity over the TF-IDF rows.
# NOTE(review): no visible cell reads `cosine_sim`; it is an n x n dense
# matrix, so confirm it is still needed before keeping the cost.
cosine_sim = cosine_similarity(tfidf_matrix)

# Attach one document vector per course, stored as an object column.
# NOTE(review): `nlp` is not defined in any visible cell (execution counts jump
# from In [4] to In [6]) — presumably a spaCy pipeline loaded in a cell that was
# removed; this line fails on a fresh kernel until that load is restored.
courses['spacy_vector'] = courses['clean_text'].map(
    lambda text: nlp(str(text)).vector
)

# Row-normalised copy of the TF-IDF matrix; recommend_by_text compares against it.
tfidf_matrix_norm = normalize(tfidf_matrix)

# Pairwise similarity of the normalised rows (one-argument form compares the
# matrix with itself).
# NOTE(review): this module-level `tfidf_sim` is shadowed by a local of the same
# name inside recommend_by_text and is not read by any visible cell — confirm use.
tfidf_sim = cosine_similarity(tfidf_matrix_norm)





In [7]:
def recommend_by_text(query, top_n=5):
    """Return the courses whose TF-IDF profile best matches a free-text query.

    Args:
        query: Search string. It is handed to the fitted ``vectorizer``
            unchanged.
            NOTE(review): the vectorizer was fitted on ``clean_text``;
            presumably the query should receive the same cleaning — confirm.
        top_n: Maximum number of courses to return. Values ``<= 0`` yield an
            empty frame; values larger than the catalogue return every course.

    Returns:
        DataFrame with the ``course_name``, ``description`` and ``url``
        columns of the best matches, ordered from most to least similar.
    """
    result_columns = ['course_name', 'description', 'url']

    # `[-top_n:]` with top_n == 0 is `[-0:]`, i.e. the *whole* array, so asking
    # for zero results used to return every course. Short-circuit instead.
    if top_n <= 0:
        return courses.iloc[0:0][result_columns]

    # TF-IDF part: project the query into the fitted vocabulary and score it
    # against every course row.
    query_tfidf = vectorizer.transform([query])
    scores = cosine_similarity(query_tfidf, tfidf_matrix_norm)[0]

    # argsort is ascending: keep the last top_n positions, then flip so the
    # best match comes first.
    top_idx = scores.argsort()[-top_n:][::-1]
    return courses.iloc[top_idx][result_columns]

In [8]:
# Try the TF-IDF recommender on a free-text query.
# NOTE(review): "developement" is misspelled, so that token presumably does not
# match the fitted vocabulary — confirm. The string is left as-is because
# changing it would change the results shown below.
recommend_by_text("i want to learn about app developement", top_n=5)

Unnamed: 0,course_name,description,url
3123,Intro to iOS App Development with Swift,Make Your First iPhone App,https://www.udacity.com/course/intro-to-ios-ap...
2135,Building No-Code Apps with AppSheet: Foundations,In this course you will learn the fundamentals...,https://www.coursera.org/learn/building-no-cod...
3140,Firebase Analytics: iOS,Improve your apps with better user data,https://www.udacity.com/course/firebase-analyt...
3138,iOS Persistence and Core Data,Learn how to store data between app launches,https://www.udacity.com/course/ios-persistence...
5661,How to Make an iOS App,"Learn the process of building an app, taking y...",https://www.udacity.com//course/how-to-make-an...


In [9]:
from sentence_transformers import SentenceTransformer, util
import numpy as np

# Load the sentence-embedding model by its published name.
sbert_model = SentenceTransformer('all-MiniLM-L6-v2')

# Embed the cleaned text of every course, in row order, as a NumPy array.
embeddings = sbert_model.encode(list(courses['clean_text']), convert_to_numpy=True)

# Normalised copy of the embeddings (scikit-learn's default normalisation);
# this is the array written to disk at the end of the notebook.
sbert_embeddings_norm = normalize(embeddings)

In [10]:
def recommend(course_name, top_n=5):
    """Return the courses whose sentence embeddings are closest to a given course.

    Args:
        course_name: Exact ``course_name`` value of the reference course. If
            several rows share the name, the first one is used.
        top_n: Maximum number of recommendations. Values ``<= 0`` yield an
            empty frame.

    Returns:
        DataFrame with the ``course_name``, ``description`` and ``url``
        columns of the most similar courses, best first. The reference row
        itself is never included.

    Raises:
        IndexError: If no row in ``courses`` has the given ``course_name``.
    """
    # Locate the reference row by *position*: `embeddings` was built in row
    # order and the result is fetched with `.iloc`, so positions (not index
    # labels) are what must line up.
    matches = np.flatnonzero((courses['course_name'] == course_name).to_numpy())
    if matches.size == 0:
        raise IndexError(f"No course named {course_name!r} found in `courses`")
    idx = int(matches[0])

    # Similarity of the reference course against every course, as a 1-D array.
    scores = np.asarray(util.cos_sim(embeddings[idx], embeddings)[0])

    # Rank best-first. Drop the reference row explicitly rather than assuming
    # it sits at rank 0: a course with an identical embedding ties with it, and
    # slicing off rank 0 could then return the query course as its own
    # recommendation. A stable sort keeps tied rows in catalogue order.
    ranked = np.argsort(-scores, kind="stable")
    ranked = ranked[ranked != idx]
    top_indices = ranked[:max(top_n, 0)]
    return courses.iloc[top_indices][['course_name', 'description', 'url']]

In [11]:
# Ask the embedding-based recommender for the five courses closest to the
# catalogue entry named exactly "Data Analyst".
recommend("Data Analyst", top_n=5)

Unnamed: 0,course_name,description,url
3255,How to Become a Data Analyst,"Use Python, SQL, and statistics to uncover ins...",https://www.udacity.com/course/data-analyst-na...
5551,Intro to Data Analysis,"Explore a variety of datasets, posing and answ...",https://www.udacity.com//course/intro-to-data-...
3526,Introduction to Data Analytics with Python,Learn the fundamentals of using Python for dat...,https://www.futurelearn.com/courses/introducti...
3962,Microsoft Future Ready: Introduction to Python...,_x000D_\nUncover how to use Python programming...,https://www.futurelearn.com/courses/introducti...
3101,Intro to Data Analysis,Data Analysis Using NumPy & Pandas,https://www.udacity.com/course/intro-to-data-a...


In [12]:
# Flag the courses whose stored vector is all zeros, then count how many of
# those rows have a course name.
is_zero_vector = courses['spacy_vector'].apply(
    lambda vec: np.all(np.array(vec) == 0.0)
)
zeros_rows = courses.loc[is_zero_vector, 'course_name'].count()
print(zeros_rows)

9


In [13]:
# Preview the first rows; the frame now also carries the spacy_vector column.
courses.head()

Unnamed: 0.1,Unnamed: 0,course_name,description,url,clean_text,spacy_vector
0,0,CS50's Introduction to Computer Science,An introduction to the intellectual enterprise...,https://www.classcentral.com/course/computer-s...,introduction intellectual enterprise computer ...,"[-0.74563706, 0.21714112, 0.07900714, -0.10601..."
1,1,Harvard CS50 – Full Computer Science Universit...,Learn the basics of computer science from Harv...,https://www.classcentral.com/classroom/freecod...,learn basic computer science harvard universit...,"[-0.73138386, 0.26928508, 0.04624372, -0.18004..."
2,2,Introduction to Computer Science and Programmi...,An introduction to computer science as a tool ...,https://www.classcentral.com/course/computer-s...,introduction computer science tool solve real ...,"[-0.6807582, 0.3142632, -0.09913699, -0.107650..."
3,3,Introduction to Electrical Engineering and Com...,This course provides an integrated introductio...,https://www.classcentral.com/classroom/mit-ocw...,course provide integrated introduction electri...,"[-0.72200805, 0.23780149, -0.062220402, -0.111..."
4,4,CS50's Computer Science for Business Professio...,This is CS50’s introduction to computer scienc...,https://www.classcentral.com/course/computer-s...,introduction computer science business profess...,"[-0.6797, 0.22705002, 0.012623596, 0.0211236, ..."


In [14]:
# Persist everything to the working directory.
# NOTE(review): the un-normalised `tfidf_matrix` and the normalised SBERT array
# are what get saved, whereas the in-notebook recommenders read
# `tfidf_matrix_norm` and the raw `embeddings` — confirm this is what the
# consumer of these files expects.

# Sentence embeddings (normalised copy) as a .npy array.
np.save("sbert_embeddings.npy", sbert_embeddings_norm)

# Course frame, including the per-course vector column.
courses.to_pickle("courses_with_vectors.pkl")

# TF-IDF document-term matrix and the fitted vectorizer.
with open("tfidf_matrix.pkl", "wb") as f:
    pickle.dump(tfidf_matrix, f)
with open("tfidf_vectorizer.pkl", "wb") as f:
    pickle.dump(vectorizer, f)