In [1]:
import pandas as pd
import numpy as np
import neattext.functions as nfx
from nltk.stem import WordNetLemmatizer # for lemmatization

In [2]:
courses = pd.read_csv("courses-cleaned.csv")

In [3]:
courses

Unnamed: 0.1,Unnamed: 0,id,title,is_paid,price,course_cover_image,headline,instructor,course_url,ratings
0,0,473160,Web Design for Web Developers: Build Beautiful...,False,Free,https://img-c.udemycdn.com/course/480x270/4731...,Learn web design in 1 hour with 25+ simple-to-...,Jonas Schmedtmann,https://www.udemy.com/course/web-design-secrets/,4.5
1,1,24823,Java Tutorial for Complete Beginners,False,Free,https://img-c.udemycdn.com/course/480x270/2482...,Learn to program using the Java programming la...,John Purcell,https://www.udemy.com/course/java-tutorial/,4.5
2,2,433798,Introduction To Python Programming,False,Free,https://img-c.udemycdn.com/course/480x270/4337...,A Quick and Easy Intro into Python Programming,Avinash Jain,https://www.udemy.com/course/pythonforbeginner...,4.4
3,3,53600,Useful Excel for Beginners,False,Free,https://img-c.udemycdn.com/course/480x270/5360...,Learn the basics of Microsoft Excel and become...,Dinesh Natarajan Mohan,https://www.udemy.com/course/useful-excel-for-...,4.5
4,4,247190,C++ Tutorial for Complete Beginners,False,Free,https://img-c.udemycdn.com/course/480x270/2471...,How to program in the popular (and tricky!) C+...,John Purcell,https://www.udemy.com/course/free-learn-c-tuto...,4.5
...,...,...,...,...,...,...,...,...,...,...
9702,9702,4009994,Build Self Order Kiosk Like Mcdonalds (React C...,True,"₹3,399",https://img-c.udemycdn.com/course/480x270/4009...,Be master in react context and hooks step by s...,Bassir Jafarzadeh,https://www.udemy.com/course/build-self-order-...,4.4
9703,9703,4011324,AWS Certified Cloud Practitioner Practice Exams,True,₹799,https://img-c.udemycdn.com/course/480x270/4011...,Updated for 2022 # Become AWS Certified Cloud ...,Certify Studio,https://www.udemy.com/course/aws-certified-clo...,3.8
9704,9704,4011432,"FLASK FAST AND EASY COURSE, BUILD BLOG WEB APP...",True,"₹3,499",https://img-c.udemycdn.com/course/480x270/4011...,WE COVER EVERYTHING YOU NEED TO KNOW TO BUILD ...,StudyEasy Organisation,https://www.udemy.com/course/flask-fast/,0.0
9705,9705,4011622,Adobe Illustrator Master Class,True,"₹3,499",https://img-c.udemycdn.com/course/480x270/4011...,Become a pro at Adobe Illustrator,Jeremy Mura,https://www.udemy.com/course/adobe-illustrator...,0.0


#### Preprocessing

In [4]:
courses.head()
courses.drop(columns=["Unnamed: 0"],inplace=True)

In [5]:
courses.isnull().sum()

id                    0
title                 0
is_paid               0
price                 0
course_cover_image    0
headline              0
instructor            0
course_url            0
ratings               0
dtype: int64

In [6]:
courses.duplicated().sum()

701

#### Model preparation

In [7]:
dataset = courses

In [8]:
# remove special character from course title
dataset['tag'] = dataset['title'].apply(nfx.remove_special_characters)

In [9]:
# convert tag case from upper to lower case
dataset['tag'] = dataset['tag'].apply(lambda x:x.lower())

# convert title case from upper to lower case
dataset['title'] = dataset['title'].apply(lambda x:x.lower())

In [10]:
# text vectorization
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(stop_words='english')
vectors = cv.fit_transform(dataset['tag']).toarray()

In [11]:
# stamming, lemmatization process
import nltk
from nltk.stem.porter import PorterStemmer

ps = PorterStemmer()
lemmatizer = WordNetLemmatizer()
# lemmatizer.lemmatize("history")

# stem function
def stemText(text):
    y=[]
    
    for i in text.split():
        y.append(ps.stem(i))
    
    return " ".join(y)

def lemText(text):
    y = []
    
    for i in text.split():
        y.append(lemmatizer.lemmatize(i))
    
    return " ".join(y)

In [12]:
dataset['tags']=dataset['tag'].apply(lemText)

In [13]:
# similarity measure
from sklearn.metrics.pairwise import cosine_similarity

similarity = cosine_similarity(vectors)
similarity

array([[1.        , 0.        , 0.        , ..., 0.30151134, 0.        ,
        0.12598816],
       [0.        , 1.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 1.        , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.30151134, 0.        , 0.        , ..., 1.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 1.        ,
        0.        ],
       [0.12598816, 0.        , 0.        , ..., 0.        , 0.        ,
        1.        ]])

In [14]:
# # Recommendation Function
# def recommend(course):
#     course_index = dataset[dataset['title']==course].index[0]
#     distances = similarity[course_index]
#     course_list = sorted(list(enumerate(distances)),reverse=True,key=lambda x:x[1])[1:5]
    
#     for i in course_list:
#         print(dataset.iloc[i[0]].title)

In [15]:
# recommend("web design for web developers: build beautiful websites!")

#### TEST

In [24]:
course_index = pd.Series(dataset.index,index=dataset['title'])
course_index

title
web design for web developers: build beautiful websites!           0
java tutorial for complete beginners                               1
introduction to python programming                                 2
useful excel for beginners                                         3
c++ tutorial for complete beginners                                4
                                                                ... 
build self order kiosk like mcdonalds (react context, mui)      9702
aws certified cloud practitioner practice exams                 9703
flask fast and easy course, build blog web app, with flask!!    9704
adobe illustrator master class                                  9705
logo design: create geometric, iconic, wordmark, monograms      9706
Length: 9707, dtype: int64

In [25]:
title="machine learning algorithms in 7 days"
index=course_index[title]
index

9400

In [26]:
scores=list(enumerate(similarity[index]))
scores

[(0, 0.0),
 (1, 0.0),
 (2, 0.0),
 (3, 0.0),
 (4, 0.0),
 (5, 0.0),
 (6, 0.0),
 (7, 0.0),
 (8, 0.0),
 (9, 0.0),
 (10, 0.0),
 (11, 0.0),
 (12, 0.0),
 (13, 0.0),
 (14, 0.0),
 (15, 0.0),
 (16, 0.0),
 (17, 0.0),
 (18, 0.0),
 (19, 0.0),
 (20, 0.0),
 (21, 0.1889822365046136),
 (22, 0.0),
 (23, 0.0),
 (24, 0.0),
 (25, 0.0),
 (26, 0.0),
 (27, 0.0),
 (28, 0.0),
 (29, 0.0),
 (30, 0.0),
 (31, 0.0),
 (32, 0.0),
 (33, 0.0),
 (34, 0.0),
 (35, 0.0),
 (36, 0.0),
 (37, 0.0),
 (38, 0.0),
 (39, 0.0),
 (40, 0.0),
 (41, 0.0),
 (42, 0.0),
 (43, 0.0),
 (44, 0.0),
 (45, 0.0),
 (46, 0.0),
 (47, 0.0),
 (48, 0.0),
 (49, 0.0),
 (50, 0.0),
 (51, 0.0),
 (52, 0.0),
 (53, 0.0),
 (54, 0.0),
 (55, 0.0),
 (56, 0.0),
 (57, 0.1889822365046136),
 (58, 0.0),
 (59, 0.0),
 (60, 0.0),
 (61, 0.0),
 (62, 0.0),
 (63, 0.0),
 (64, 0.0),
 (65, 0.0),
 (66, 0.0),
 (67, 0.0),
 (68, 0.0),
 (69, 0.0),
 (70, 0.0),
 (71, 0.0),
 (72, 0.0),
 (73, 0.0),
 (74, 0.0),
 (75, 0.0),
 (76, 0.0),
 (77, 0.0),
 (78, 0.0),
 (79, 0.0),
 (80, 0.0),
 (81, 0.

In [27]:
sorted_score = sorted(scores,key=lambda x:x[1],reverse=True)
sorted_score

[(9400, 1.0),
 (1573, 0.7071067811865475),
 (1581, 0.7071067811865475),
 (6174, 0.5773502691896258),
 (521, 0.5),
 (1141, 0.5),
 (2742, 0.5),
 (3795, 0.5),
 (4548, 0.5),
 (5128, 0.5),
 (5135, 0.5),
 (6173, 0.5),
 (6185, 0.5),
 (6216, 0.5),
 (1372, 0.4472135954999579),
 (1760, 0.4472135954999579),
 (2305, 0.4472135954999579),
 (2308, 0.4472135954999579),
 (3340, 0.4472135954999579),
 (3413, 0.4472135954999579),
 (3422, 0.4472135954999579),
 (3428, 0.4472135954999579),
 (4241, 0.4472135954999579),
 (4751, 0.4472135954999579),
 (5734, 0.4472135954999579),
 (5864, 0.4472135954999579),
 (6354, 0.4472135954999579),
 (7177, 0.4472135954999579),
 (7429, 0.4472135954999579),
 (998, 0.4082482904638631),
 (1322, 0.4082482904638631),
 (1328, 0.4082482904638631),
 (1788, 0.4082482904638631),
 (3704, 0.4082482904638631),
 (5575, 0.4082482904638631),
 (7012, 0.4082482904638631),
 (7121, 0.4082482904638631),
 (7262, 0.4082482904638631),
 (7430, 0.4082482904638631),
 (8889, 0.4082482904638631),
 (8989,

In [28]:
# assigning indices of sorted_score to 'selected_course_index'.
selected_course_index = [i[0] for i in sorted_score[1:]]

# assigning similarity score of sorted_score to 'selected_course_score'
selected_course_score = [i[1] for i in sorted_score[1:]]

In [29]:
# creating df that contains recommended courses.
rec_df = dataset.iloc[selected_course_index]

In [30]:
# assigning 'selected_course_score' to newly created df('rec_df')
rec_df['similarity_score'] = selected_course_score

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  rec_df['similarity_score'] = selected_course_score


In [31]:
# creating final df, that contains following columns 
final_rec_courses=rec_df[['title', 'is_paid', 'price', 'course_cover_image', 'headline','instructor', 'course_url', 'ratings','similarity_score']]

# getting only those courses which are having similarity_score>0.5
final_rec_courses.loc[final_rec_courses.similarity_score>0.5]

Unnamed: 0,title,is_paid,price,course_cover_image,headline,instructor,course_url,ratings,similarity_score
1573,what is machine learning?,False,Free,https://img-c.udemycdn.com/course/480x270/3618...,"An overview of Supervised, Unsupervised, and R...",Satish Reddy,https://www.udemy.com/course/what-is-machine-l...,4.5,0.707107
1581,what is machine learning?,False,Free,https://img-c.udemycdn.com/course/480x270/3618...,"An overview of Supervised, Unsupervised, and R...",Satish Reddy,https://www.udemy.com/course/what-is-machine-l...,4.5,0.707107
6174,machine learning with python,False,Free,https://img-c.udemycdn.com/course/480x270/4111...,"Machine Lerning with Python,Supervised,Unsuper...",Vijay A,https://www.udemy.com/course/machine-learning-...,4.1,0.57735


# Top 50 course

In [6]:
dataset_50 = pd.read_csv("top-50_cleaned.csv")

In [9]:
dataset_50.drop(columns=['Unnamed: 0'],inplace=True)

#### preprocessing

In [14]:
dataset_50.isnull().sum()

id                    0
title                 0
is_paid               0
price                 0
course_cover_image    0
headline              0
instructor            0
course_url            0
ratings               0
dtype: int64

In [16]:
dataset_50.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50 entries, 0 to 49
Data columns (total 9 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   id                  50 non-null     int64  
 1   title               50 non-null     object 
 2   is_paid             50 non-null     bool   
 3   price               50 non-null     object 
 4   course_cover_image  50 non-null     object 
 5   headline            50 non-null     object 
 6   instructor          50 non-null     object 
 7   course_url          50 non-null     object 
 8   ratings             50 non-null     float64
dtypes: bool(1), float64(1), int64(1), object(6)
memory usage: 3.3+ KB


In [20]:
dataset_50.duplicated().sum()

0

#### Model preparation

In [31]:
# not required

--------------------------------------------------

In [12]:
vectors

Unnamed: 0,akash,enrolment,is,my,name,sixteen
0,1,0,1,1,1,0
1,0,1,1,1,0,1
