In [1]:
import pandas as pd

In [2]:
# Load the course catalogue from the CSV in the working directory.
# NOTE(review): the displayed frame carries an "Unnamed: 0" column, which looks
# like an index written out by an earlier export — confirm before relying on it.
courses = pd.read_csv("clean_courses.csv")
# Bare expression: renders the frame as this cell's output.
courses

Unnamed: 0.1,Unnamed: 0,course_name,description,url,clean_text
0,0,CS50's Introduction to Computer Science,An introduction to the intellectual enterprise...,https://www.classcentral.com/course/computer-s...,introduction intellectual enterprise computer ...
1,1,Harvard CS50 – Full Computer Science Universit...,Learn the basics of computer science from Harv...,https://www.classcentral.com/classroom/freecod...,learn basic computer science harvard universit...
2,2,Introduction to Computer Science and Programmi...,An introduction to computer science as a tool ...,https://www.classcentral.com/course/computer-s...,introduction computer science tool solve real ...
3,3,Introduction to Electrical Engineering and Com...,This course provides an integrated introductio...,https://www.classcentral.com/classroom/mit-ocw...,course provide integrated introduction electri...
4,4,CS50's Computer Science for Business Professio...,This is CS50’s introduction to computer scienc...,https://www.classcentral.com/course/computer-s...,introduction computer science business profess...
...,...,...,...,...,...
5363,6455,Programming Languages,This course is an introduction to the fundamen...,https://www.udacity.com//course/programming-la...,course introduction fundamental programming la...
5364,6468,advancedAndroid with Kotlin,Develop Feature-Rich Android Apps with the Kot...,https://www.udacity.com//course/advanced-andro...,develop feature rich android apps kotlin progr...
5365,6480,Intro to DevOps,"Learn all about the DevOps landscape, workflow...",https://www.udacity.com//course/intro-to-devop...,learn devops landscape workflow survey system ...
5366,6481,Google Maps APIs,This course will introduce you to the content ...,https://www.udacity.com//course/google-maps-ap...,course introduce content available google maps...


In [3]:
# Summarise columns, dtypes and non-null counts; used here to spot missing
# values in `clean_text` before vectorising it.
courses.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5368 entries, 0 to 5367
Data columns (total 5 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   Unnamed: 0   5368 non-null   int64 
 1   course_name  5368 non-null   object
 2   description  5368 non-null   object
 3   url          5368 non-null   object
 4   clean_text   5354 non-null   object
dtypes: int64(1), object(4)
memory usage: 209.8+ KB


In [4]:
# Discard rows with no cleaned text, then renumber the rows 0..n-1 so that
# row labels and row positions coincide again.
courses = (
    courses
    .dropna(subset=['clean_text'])
    .reset_index(drop=True)
)
# Re-inspect the frame to verify that no nulls remain.
courses.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5354 entries, 0 to 5353
Data columns (total 5 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   Unnamed: 0   5354 non-null   int64 
 1   course_name  5354 non-null   object
 2   description  5354 non-null   object
 3   url          5354 non-null   object
 4   clean_text   5354 non-null   object
dtypes: int64(1), object(4)
memory usage: 209.3+ KB


In [5]:
import pickle
import spacy
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import normalize
import numpy as np

# --- TF-IDF features -------------------------------------------------------
# Unigrams + bigrams, vocabulary capped at the 5000 highest-frequency terms.
vectorizer = TfidfVectorizer(max_features = 5000, ngram_range=(1, 2))
tfidf_matrix = vectorizer.fit_transform(courses['clean_text'])

# --- spaCy document vectors ------------------------------------------------
nlp = spacy.load("en_core_web_md")

# Embed every course text. `nlp.pipe` streams the texts through the pipeline
# in batches, which is the documented way to process many texts and avoids
# one full pipeline invocation per DataFrame row.
doc_vectors = [doc.vector for doc in nlp.pipe(courses['clean_text'].astype(str))]

# Keep one vector per row on the frame (object column of 1-D arrays) ...
courses['spacy_vector'] = doc_vectors

# ... and stack them into a (n_courses, vector_dim) matrix for similarity search.
spacy_matrix = np.vstack(doc_vectors)

# --- Row-normalised copies used for all similarity computations ------------
tfidf_matrix_norm = normalize(tfidf_matrix)
spacy_matrix_norm = normalize(spacy_matrix)

# --- Pairwise course-to-course similarity, one matrix per representation ---
tfidf_sim = cosine_similarity(tfidf_matrix_norm, tfidf_matrix_norm)
spacy_sim = cosine_similarity(spacy_matrix_norm, spacy_matrix_norm)

# `cosine_sim` used to be a second, separate `cosine_similarity(tfidf_matrix)`
# call. Cosine similarity is scale-invariant, so that produced the same dense
# n x n matrix as `tfidf_sim`; alias it instead of computing and storing it twice.
cosine_sim = tfidf_sim

# --- Hybrid score: weighted blend of lexical and semantic similarity -------
alpha = 0.6  # weight of the TF-IDF similarity
beta = 0.4   # weight of the spaCy-vector similarity
hybrid_sim = alpha * tfidf_sim + beta * spacy_sim

similarity_matrix = hybrid_sim


In [6]:
def recommend_by_text(query, top_n=5, alpha=0.6, beta=0.4):
    """Recommend courses for a free-text query using the hybrid similarity.

    The query is scored against every course twice — once with the fitted
    TF-IDF vectorizer and once with its spaCy document vector — and the two
    cosine-similarity scores are blended as ``alpha * tfidf + beta * spacy``.

    Parameters
    ----------
    query : str
        Free-text description of what the user is looking for.
    top_n : int, default 5
        Number of courses to return. Values <= 0 yield an empty frame.
    alpha : float, default 0.6
        Weight of the TF-IDF similarity.
    beta : float, default 0.4
        Weight of the spaCy-vector similarity.

    Returns
    -------
    pandas.DataFrame
        Rows of ``courses`` (columns ``course_name``, ``description``, ``url``)
        ordered from highest to lowest hybrid score.

    Notes
    -----
    NOTE(review): the query is vectorised as typed. The ``clean_text`` column
    looks lemmatised / stop-word filtered in the displayed rows — confirm
    whether the same cleaning should be applied to the query first.
    """
    result_columns = ['course_name', 'description', 'url']

    # Guard: with top_n == 0 the slice `[-0:]` would select *every* row.
    if top_n <= 0:
        return courses.iloc[0:0][result_columns]

    # TF-IDF part: one score per course (row 0 of the 1 x n result).
    query_tfidf = vectorizer.transform([query])
    tfidf_scores = cosine_similarity(query_tfidf, tfidf_matrix_norm)[0]

    # spaCy part: document vector of the query as a 1 x dim row.
    query_vec = nlp(query).vector.reshape(1, -1)
    spacy_scores = cosine_similarity(query_vec, spacy_matrix_norm)[0]

    # Hybrid: weighted blend of both similarity signals.
    hybrid_scores = alpha * tfidf_scores + beta * spacy_scores

    # Ascending argsort, reversed -> best first; keep the first top_n positions.
    top_idx = hybrid_scores.argsort()[::-1][:top_n]
    return courses.iloc[top_idx][result_columns]

In [14]:
# Demo: free-text query against the hybrid recommender.
# NOTE(review): "developement" is misspelled; presumably the TF-IDF side cannot
# match it to "development" — consider correcting the query and re-running.
recommend_by_text("i want to learn about app developement", top_n=5)

Unnamed: 0,course_name,description,url
3056,Intro to iOS App Development with Swift,Make Your First iPhone App,https://www.udacity.com/course/intro-to-ios-ap...
2113,Building No-Code Apps with AppSheet: Foundations,In this course you will learn the fundamentals...,https://www.coursera.org/learn/building-no-cod...
2539,"Ready, Set, Future! Introduction to Futures Th...",Do you want to think about the future with mor...,https://www.coursera.org/learn/introduction-to...
3092,Firebase in a Weekend: Android,Building a Realtime Chat App,https://www.udacity.com/course/firebase-in-a-w...
2893,Building No-Code Apps with AppSheet: Implement...,This course teaches you how to implement vario...,https://www.coursera.org/learn/building-no-cod...


In [8]:
from sentence_transformers import SentenceTransformer, util
import numpy as np

# Load the sentence-embedding model.
sbert_model = SentenceTransformer('all-MiniLM-L6-v2')

# Embed every cleaned course text; the result is requested as a NumPy array.
embeddings = sbert_model.encode(courses['clean_text'].tolist(), convert_to_numpy=True)

# Normalised copy of the embeddings (this is the array that gets saved later).
sbert_embeddings_norm = normalize(embeddings)

In [9]:
def recommend(course_name, top_n=5):
    """Recommend the courses most similar to an existing course (SBERT based).

    Parameters
    ----------
    course_name : str
        Exact value of ``course_name`` to look up in ``courses``. If several
        rows share the name, the first one is used.
    top_n : int, default 5
        Number of similar courses to return. Values <= 0 yield an empty frame.

    Returns
    -------
    pandas.DataFrame
        Rows of ``courses`` (columns ``course_name``, ``description``, ``url``)
        ordered from most to least similar, never including the queried row.

    Raises
    ------
    IndexError
        If no course with that exact name exists.
    """
    # Work with row *positions*: `embeddings` is a plain array aligned with the
    # row order of `courses`, so an index label must not be used to address it.
    positions = np.flatnonzero((courses['course_name'] == course_name).to_numpy())
    if positions.size == 0:
        raise IndexError(f"No course named {course_name!r} found in `courses`")
    idx = positions[0]

    # Cosine similarity of the chosen course against every course.
    scores = np.asarray(util.cos_sim(embeddings[idx], embeddings)[0])

    # Rank best-first and drop the queried row explicitly. Slicing off the
    # first hit is not safe: a course with identical text can tie with (or
    # outrank) the query, which would then leak into its own recommendations.
    ranked = np.argsort(-scores)
    ranked = ranked[ranked != idx][:max(top_n, 0)]
    return courses.iloc[ranked][['course_name', 'description', 'url']]

In [10]:
# Demo: the five courses most similar to the catalogue entry whose
# course_name is exactly "Data Analyst" (lookup is by exact string equality).
recommend("Data Analyst", top_n=5)

Unnamed: 0,course_name,description,url
1305,Data Analysis with R Programming,This course is the seventh course in the Googl...,https://www.coursera.org/learn/data-analysis-r
1257,Prepare Data for Exploration,This is the third course in the Google Data An...,https://www.coursera.org/learn/data-preparation
5226,Data Science with R Certification Course,Simplilearn’s Data Science with R certificatio...,https://www.simplilearn.com/big-data-and-analy...
1936,Analyzing and Visualizing Data in Looker,"In this course, you learn how to do the kind o...",https://www.coursera.org/learn/analyzing-and-v...
1270,Foundations of Data Science,This is the first of seven courses in the Goog...,https://www.coursera.org/learn/foundations-of-...


In [11]:
# Flag the rows whose spaCy vector consists solely of zeros ...
is_zero_vector = courses['spacy_vector'].map(lambda vec: np.all(np.asarray(vec) == 0.0))
# ... and count how many of those rows have a (non-null) course name.
zeros_rows = courses.loc[is_zero_vector, 'course_name'].count()
print(zeros_rows)

22


In [12]:
# Preview the first rows; the frame now also carries the `spacy_vector`
# column that was assigned when the spaCy vectors were computed.
courses.head()

Unnamed: 0.1,Unnamed: 0,course_name,description,url,clean_text,spacy_vector
0,0,CS50's Introduction to Computer Science,An introduction to the intellectual enterprise...,https://www.classcentral.com/course/computer-s...,introduction intellectual enterprise computer ...,"[-0.74563706, 0.21714112, 0.07900714, -0.10601..."
1,1,Harvard CS50 – Full Computer Science Universit...,Learn the basics of computer science from Harv...,https://www.classcentral.com/classroom/freecod...,learn basic computer science harvard universit...,"[-0.73138386, 0.26928508, 0.04624372, -0.18004..."
2,2,Introduction to Computer Science and Programmi...,An introduction to computer science as a tool ...,https://www.classcentral.com/course/computer-s...,introduction computer science tool solve real ...,"[-0.6807582, 0.3142632, -0.09913699, -0.107650..."
3,3,Introduction to Electrical Engineering and Com...,This course provides an integrated introductio...,https://www.classcentral.com/classroom/mit-ocw...,course provide integrated introduction electri...,"[-0.72200805, 0.23780149, -0.062220402, -0.111..."
4,4,CS50's Computer Science for Business Professio...,This is CS50’s introduction to computer scienc...,https://www.classcentral.com/course/computer-s...,introduction computer science business profess...,"[-0.6797, 0.22705002, 0.012623596, 0.0211236, ..."


In [13]:
# Persist everything needed to serve recommendations without refitting.

# Fitted TF-IDF vectorizer and the TF-IDF document matrix, both via pickle.
for pkl_path, artifact in (
    ("tfidf_vectorizer.pkl", vectorizer),
    ("tfidf_matrix.pkl", tfidf_matrix),
):
    with open(pkl_path, "wb") as out_file:
        pickle.dump(artifact, out_file)

# Dense course-to-course hybrid similarity matrix.
np.save("hybrid_similarity.npy", hybrid_sim)

# Course table, including the per-row spaCy vectors.
courses.to_pickle("courses_with_vectors.pkl")

# Normalised SBERT embeddings.
np.save("sbert_embeddings.npy", sbert_embeddings_norm)