#### Course Recommendation System

#### Algorithms
+ Cosine Similarity
+ Linear Kernel (dot-product similarity)

#### Workflow
+ Dataset
+ Vectorize our dataset
+ Cosine Similarity Matrix
+ Look up the course ID and its similarity scores
+ Recommend

In [52]:
import pandas as pd
import neattext.functions as nfx

In [53]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity, linear_kernel

In [54]:
# Load the course catalogue; the path is relative to the notebook's working directory.
df = pd.read_csv("dataset.csv")

In [55]:
df.head()

Unnamed: 0.1,Unnamed: 0,course_id,course_title,url,is_paid,price,num_subscribers,num_reviews,num_lectures,level,content_duration,subject
0,1,614832,Discrete Mathematics,https://www.udemy.com/course/discrete-math/,True,3099,22437,3219,124,All Levels,19 hours,Discrete Mathematics
1,2,1528598,"Master Discrete Mathematics: Sets, Math Logic,...",https://www.udemy.com/course/master-discrete-m...,True,2499,9114,1398,46,All Levels,7 hours,Discrete Mathematics
2,3,1240674,"Discrete Mathematics: Beginner's Complete, Mat...",https://www.udemy.com/course/master-discrete-m...,True,2299,6303,152,72,All Levels,4 hours,Discrete Mathematics
3,4,1765308,Master Discrete Math 2020: More Than 5 Complet...,https://www.udemy.com/course/discrete/,True,2999,23427,438,157,All Levels,16 hours,Discrete Mathematics
4,5,829846,Graph Theory,https://www.udemy.com/course/graph-theory/,True,3099,8104,912,67,All Levels,10 hours,Discrete Mathematics


In [56]:
df['course_title']

0                                   Discrete Mathematics
1      Master Discrete Mathematics: Sets, Math Logic,...
2      Discrete Mathematics: Beginner's Complete, Mat...
3      Master Discrete Math 2020: More Than 5 Complet...
4                                           Graph Theory
                             ...                        
136    Functional & Reactive programming in Java : Mo...
137    Common Lisp programming: from novice to effect...
138    Unreal Engine 5 C++ The Ultimate Game Develope...
139    8051 Microcontroller - Embedded C and Assembly...
140    8051 Microcontroller - An Assembly Language Pr...
Name: course_title, Length: 141, dtype: object

In [57]:
dir(nfx)

['BTC_ADDRESS_REGEX',
 'CURRENCY_REGEX',
 'CURRENCY_SYMB_REGEX',
 'Counter',
 'DATE_REGEX',
 'EMAIL_REGEX',
 'EMOJI_REGEX',
 'HASTAG_REGEX',
 'MASTERCard_REGEX',
 'MD5_SHA_REGEX',
 'MOST_COMMON_PUNCT_REGEX',
 'NUMBERS_REGEX',
 'PHONE_REGEX',
 'PoBOX_REGEX',
 'SPECIAL_CHARACTERS_REGEX',
 'STOPWORDS',
 'STOPWORDS_de',
 'STOPWORDS_en',
 'STOPWORDS_es',
 'STOPWORDS_fr',
 'STOPWORDS_ru',
 'STOPWORDS_yo',
 'STREET_ADDRESS_REGEX',
 'TextFrame',
 'URL_PATTERN',
 'USER_HANDLES_REGEX',
 'VISACard_REGEX',
 '__builtins__',
 '__cached__',
 '__doc__',
 '__file__',
 '__generate_text',
 '__loader__',
 '__name__',
 '__numbers_dict',
 '__package__',
 '__spec__',
 '_lex_richness_herdan',
 '_lex_richness_maas_ttr',
 'clean_text',
 'defaultdict',
 'digit2words',
 'extract_btc_address',
 'extract_currencies',
 'extract_currency_symbols',
 'extract_dates',
 'extract_emails',
 'extract_emojis',
 'extract_hashtags',
 'extract_html_tags',
 'extract_mastercard_addr',
 'extract_md5sha',
 'extract_numbers',
 'extr

In [58]:
# Strip stopwords from every title and keep the result in a separate column,
# leaving the original 'course_title' untouched for display and lookups.
df['clean_course_title'] = df['course_title'].apply(nfx.remove_stopwords)

In [59]:
# Remove punctuation/symbols from the stopword-free titles.
# Note that this also turns "C++" into a bare "C".
df['clean_course_title'] = df['clean_course_title'].apply(nfx.remove_special_characters)

In [60]:
df[['course_title','clean_course_title']]

Unnamed: 0,course_title,clean_course_title
0,Discrete Mathematics,Discrete Mathematics
1,"Master Discrete Mathematics: Sets, Math Logic,...",Master Discrete Mathematics Sets Math Logic
2,"Discrete Mathematics: Beginner's Complete, Mat...",Discrete Mathematics Beginners Complete Math C...
3,Master Discrete Math 2020: More Than 5 Complet...,Master Discrete Math 2020 5 Complete Courses 1
4,Graph Theory,Graph Theory
...,...,...
136,Functional & Reactive programming in Java : Mo...,Functional Reactive programming Java Modern ...
137,Common Lisp programming: from novice to effect...,Common Lisp programming novice effective devel...
138,Unreal Engine 5 C++ The Ultimate Game Develope...,Unreal Engine 5 C Ultimate Game Developer Course
139,8051 Microcontroller - Embedded C and Assembly...,8051 Microcontroller Embedded C Assembly Lang...


In [61]:
# Vectorize the cleaned titles into a bag-of-words count matrix (one row per course).
# NOTE(review): CountVectorizer's default token_pattern only keeps tokens with two or
# more word characters, so single-character leftovers such as the "C" from "C++" never
# become features -- confirm this is acceptable for the programming-language courses.
count_vect = CountVectorizer()
cv_mat = count_vect.fit_transform(df['clean_course_title'])

In [62]:
#Sparse Matrix
cv_mat

<141x265 sparse matrix of type '<class 'numpy.int64'>'
	with 706 stored elements in Compressed Sparse Row format>

In [63]:
#Dense Matrix
cv_mat.todense()

matrix([[0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        ...,
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [64]:
# Tabular view of the count matrix: one column per vocabulary term.
def_cv_words = pd.DataFrame(
    cv_mat.toarray(),
    columns=count_vect.get_feature_names_out(),
)

In [65]:
def_cv_words.head()

Unnamed: 0,01,101,17,20,2000,2020,2023,2024,72,8051,...,up,update,ux,vhdl,vision,visual,vue,web,x86,zero
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [66]:
# Pairwise cosine similarity between the rows of cv_mat, i.e. between every pair of courses.
cosine_sim_mat = cosine_similarity(cv_mat)

In [67]:
cosine_sim_mat

array([[1.        , 0.57735027, 0.53452248, ..., 0.        , 0.        ,
        0.        ],
       [0.57735027, 1.        , 0.46291005, ..., 0.        , 0.        ,
        0.        ],
       [0.53452248, 0.46291005, 1.        , ..., 0.15430335, 0.        ,
        0.        ],
       ...,
       [0.        , 0.        , 0.15430335, ..., 1.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 1.        ,
        0.8       ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.8       ,
        1.        ]])

In [68]:
df.head()

Unnamed: 0.1,Unnamed: 0,course_id,course_title,url,is_paid,price,num_subscribers,num_reviews,num_lectures,level,content_duration,subject,clean_course_title
0,1,614832,Discrete Mathematics,https://www.udemy.com/course/discrete-math/,True,3099,22437,3219,124,All Levels,19 hours,Discrete Mathematics,Discrete Mathematics
1,2,1528598,"Master Discrete Mathematics: Sets, Math Logic,...",https://www.udemy.com/course/master-discrete-m...,True,2499,9114,1398,46,All Levels,7 hours,Discrete Mathematics,Master Discrete Mathematics Sets Math Logic
2,3,1240674,"Discrete Mathematics: Beginner's Complete, Mat...",https://www.udemy.com/course/master-discrete-m...,True,2299,6303,152,72,All Levels,4 hours,Discrete Mathematics,Discrete Mathematics Beginners Complete Math C...
3,4,1765308,Master Discrete Math 2020: More Than 5 Complet...,https://www.udemy.com/course/discrete/,True,2999,23427,438,157,All Levels,16 hours,Discrete Mathematics,Master Discrete Math 2020 5 Complete Courses 1
4,5,829846,Graph Theory,https://www.udemy.com/course/graph-theory/,True,3099,8104,912,67,All Levels,10 hours,Discrete Mathematics,Graph Theory


In [69]:
# Map each course title to its row position in df. Duplicate titles are
# collapsed to their first occurrence so a lookup always yields one position.
# (Series.drop_duplicates() compares the *values* -- the row positions, which
# are already unique -- so it would not remove repeated titles.)
course_indices = pd.Series(df.index,index=df['course_title'])
course_indices = course_indices[~course_indices.index.duplicated(keep='first')]

In [70]:
course_indices

course_title
Discrete Mathematics                                              0
Master Discrete Mathematics: Sets, Math Logic, and More           1
Discrete Mathematics: Beginner's Complete, Math Crash Course      2
Master Discrete Math 2020: More Than 5 Complete Courses In 1      3
Graph Theory                                                      4
                                                               ... 
Functional & Reactive programming in Java : Modern Style        136
Common Lisp programming: from novice to effective developer     137
Unreal Engine 5 C++ The Ultimate Game Developer Course          138
8051 Microcontroller - Embedded C and Assembly Language         139
8051 Microcontroller - An Assembly Language Programming         140
Length: 141, dtype: int64

In [71]:
course_indices['Functional & Reactive programming in Java : Modern Style']

136

In [72]:
# Row position of the example course used to index into cosine_sim_mat.
idx = course_indices['Functional & Reactive programming in Java : Modern Style']

In [73]:
idx

136

In [74]:
# Pair every course's row position with its similarity to the example course.
scores = [(position, score) for position, score in enumerate(cosine_sim_mat[idx])]

In [75]:
# Rank the (position, score) pairs from most to least similar;
# the sort is stable, so tied scores keep their row order.
sorted_scores = list(scores)
sorted_scores.sort(key=lambda pair: pair[1], reverse=True)

In [76]:
sorted_scores

[(136, 1.0000000000000002),
 (130, 0.4330127018922194),
 (123, 0.4082482904638631),
 (121, 0.36514837167011077),
 (124, 0.31622776601683794),
 (53, 0.2886751345948129),
 (41, 0.2357022603955159),
 (43, 0.2357022603955159),
 (46, 0.2357022603955159),
 (54, 0.2357022603955159),
 (56, 0.2357022603955159),
 (59, 0.2357022603955159),
 (84, 0.2357022603955159),
 (57, 0.21821789023599242),
 (50, 0.20412414523193154),
 (55, 0.20412414523193154),
 (62, 0.20412414523193154),
 (111, 0.20412414523193154),
 (119, 0.20412414523193154),
 (44, 0.18257418583505539),
 (73, 0.18257418583505539),
 (90, 0.18257418583505539),
 (125, 0.18257418583505539),
 (126, 0.18257418583505539),
 (128, 0.18257418583505539),
 (132, 0.18257418583505539),
 (140, 0.18257418583505539),
 (40, 0.1666666666666667),
 (49, 0.1666666666666667),
 (51, 0.1666666666666667),
 (61, 0.1666666666666667),
 (91, 0.1666666666666667),
 (98, 0.1666666666666667),
 (117, 0.1666666666666667),
 (122, 0.1666666666666667),
 (135, 0.1666666666666667

In [77]:
# Omit itself(first value above)
sorted_scores[1:]

[(130, 0.4330127018922194),
 (123, 0.4082482904638631),
 (121, 0.36514837167011077),
 (124, 0.31622776601683794),
 (53, 0.2886751345948129),
 (41, 0.2357022603955159),
 (43, 0.2357022603955159),
 (46, 0.2357022603955159),
 (54, 0.2357022603955159),
 (56, 0.2357022603955159),
 (59, 0.2357022603955159),
 (84, 0.2357022603955159),
 (57, 0.21821789023599242),
 (50, 0.20412414523193154),
 (55, 0.20412414523193154),
 (62, 0.20412414523193154),
 (111, 0.20412414523193154),
 (119, 0.20412414523193154),
 (44, 0.18257418583505539),
 (73, 0.18257418583505539),
 (90, 0.18257418583505539),
 (125, 0.18257418583505539),
 (126, 0.18257418583505539),
 (128, 0.18257418583505539),
 (132, 0.18257418583505539),
 (140, 0.18257418583505539),
 (40, 0.1666666666666667),
 (49, 0.1666666666666667),
 (51, 0.1666666666666667),
 (61, 0.1666666666666667),
 (91, 0.1666666666666667),
 (98, 0.1666666666666667),
 (117, 0.1666666666666667),
 (122, 0.1666666666666667),
 (135, 0.1666666666666667),
 (137, 0.1666666666666667

In [78]:
# Row positions of the ranked courses, skipping the top-ranked entry.
selected_course_indices = [position for position, _score in sorted_scores[1:]]

In [79]:
selected_course_indices

[130,
 123,
 121,
 124,
 53,
 41,
 43,
 46,
 54,
 56,
 59,
 84,
 57,
 50,
 55,
 62,
 111,
 119,
 44,
 73,
 90,
 125,
 126,
 128,
 132,
 140,
 40,
 49,
 51,
 61,
 91,
 98,
 117,
 122,
 135,
 137,
 48,
 58,
 102,
 129,
 0,
 1,
 2,
 3,
 4,
 5,
 6,
 7,
 8,
 9,
 10,
 11,
 12,
 13,
 14,
 15,
 16,
 17,
 18,
 19,
 20,
 21,
 22,
 23,
 24,
 25,
 26,
 27,
 28,
 29,
 30,
 31,
 32,
 33,
 34,
 35,
 36,
 37,
 38,
 39,
 42,
 45,
 47,
 52,
 60,
 63,
 64,
 65,
 66,
 67,
 68,
 69,
 70,
 71,
 72,
 74,
 75,
 76,
 77,
 78,
 79,
 80,
 81,
 82,
 83,
 85,
 86,
 87,
 88,
 89,
 92,
 93,
 94,
 95,
 96,
 97,
 99,
 100,
 101,
 103,
 104,
 105,
 106,
 107,
 108,
 109,
 110,
 112,
 113,
 114,
 115,
 116,
 118,
 120,
 127,
 131,
 133,
 134,
 138,
 139]

In [80]:
# Similarity scores of the ranked courses, skipping the top-ranked entry.
selected_course_scores = [score for _position, score in sorted_scores[1:]]

In [81]:
df['course_title'].iloc[selected_course_indices]

130    Java Programming For Beginners | Core Java Usi...
123              Java Programming for Complete Beginners
121          Learn JAVA Programming - Beginner to Master
124    Object Oriented Programming - Basics to Advanc...
53                              C++ Programming Bootcamp
                             ...                        
131    React - The Complete Guide 2024 (incl. React R...
133    Vue - The Complete Guide (incl. Router & Compo...
134          Angular - The Complete Guide (2024 Edition)
138    Unreal Engine 5 C++ The Ultimate Game Develope...
139    8051 Microcontroller - Embedded C and Assembly...
Name: course_title, Length: 140, dtype: object

In [82]:
# Titles of the selected courses, picked positionally so they stay in ranked order.
recommended_result = df['course_title'].iloc[selected_course_indices]

In [83]:
# Promote the title Series to a DataFrame so a score column can be attached.
rec_df = pd.DataFrame(recommended_result)

In [84]:
rec_df.head()

Unnamed: 0,course_title
130,Java Programming For Beginners | Core Java Usi...
123,Java Programming for Complete Beginners
121,Learn JAVA Programming - Beginner to Master
124,Object Oriented Programming - Basics to Advanc...
53,C++ Programming Bootcamp


In [85]:
# Attach the scores by position; the list is in the same ranked order as the rows.
rec_df['similarity_scores'] = selected_course_scores

In [86]:
rec_df

Unnamed: 0,course_title,similarity_scores
130,Java Programming For Beginners | Core Java Usi...,0.433013
123,Java Programming for Complete Beginners,0.408248
121,Learn JAVA Programming - Beginner to Master,0.365148
124,Object Oriented Programming - Basics to Advanc...,0.316228
53,C++ Programming Bootcamp,0.288675
...,...,...
131,React - The Complete Guide 2024 (incl. React R...,0.000000
133,Vue - The Complete Guide (incl. Router & Compo...,0.000000
134,Angular - The Complete Guide (2024 Edition),0.000000
138,Unreal Engine 5 C++ The Ultimate Game Develope...,0.000000


In [88]:
def recommend_course(title,num_of_rec=10):
    """Return the courses most similar to ``title``, best match first.

    Looks ``title`` up in the notebook-level ``course_indices`` Series, reads
    that course's row of ``cosine_sim_mat`` and ranks every other course by
    its similarity score.

    Parameters
    ----------
    title : str
        Exact course title, as it appears in ``df['course_title']``.
    num_of_rec : int, default 10
        Maximum number of recommendations to return.

    Returns
    -------
    pandas.DataFrame
        Columns ``course_title`` and ``similarity_scores``, indexed by each
        course's row in ``df`` and ordered by descending score (tied scores
        keep their original row order).

    Raises
    ------
    KeyError
        If ``title`` is not present in ``course_indices``.
    """
    if title not in course_indices.index:
        raise KeyError(f"Unknown course title: {title!r}")
    idx = course_indices[title]
    # A title that occurs more than once makes the lookup return a Series of
    # positions; use the first occurrence so a single similarity row is read.
    if isinstance(idx, pd.Series):
        idx = idx.iloc[0]
    idx = int(idx)
    # Exclude the query course by position instead of dropping the top-ranked
    # entry: a course with an identical vector also scores 1.0 and, having a
    # lower row position, would sort ahead of the query course.
    scores = [
        (position, score)
        for position, score in enumerate(cosine_sim_mat[idx])
        if position != idx
    ]
    ranked = sorted(scores, key=lambda pair: pair[1], reverse=True)[:num_of_rec]
    rec_df = pd.DataFrame(df['course_title'].iloc[[position for position, _ in ranked]])
    rec_df['similarity_scores'] = [score for _, score in ranked]
    return rec_df
    

In [89]:
recommend_course('C++ Programming Bootcamp')

Unnamed: 0,course_title,similarity_scores
41,Beginning C++ Programming - From Beginner to B...,0.408248
43,C++ Programming for Beginners (2023),0.408248
54,Learn Advanced C++ Programming,0.408248
56,Object Oriented Programming in C++,0.408248
84,CUDA programming Masterclass with C++,0.408248
57,Python Object Oriented Programming - Object Or...,0.377964
124,Object Oriented Programming - Basics to Advanc...,0.365148
50,Object Oriented Programming in C++ Inside Out,0.353553
55,C++ programming step-by-step: From Beginner to...,0.353553
111,Programming Numerical Methods in Python,0.353553


In [92]:
recommend_course('React - The Complete Guide 2024 (incl. React Router & Redux)')

Unnamed: 0,course_title,similarity_scores
132,Modern React with Redux [2024 Update],0.565685
133,Vue - The Complete Guide (incl. Router & Compo...,0.478091
134,Angular - The Complete Guide (2024 Edition),0.424264
95,The Complete 2024 Software Testing Bootcamp,0.282843
107,Probability and Statistics: Complete Course 2024,0.282843
127,The Complete JavaScript Course 2024: From Zero...,0.258199
94,Complete SDLC : Software Development Life Cycl...,0.2
47,The Complete C++ Developer Course,0.182574
59,Complete Modern C++ (C++11/14/17),0.182574
123,Java Programming for Complete Beginners,0.158114
