In [42]:
import pandas as pd
import neattext as nfx
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [43]:
#!pip install neattext

In [44]:
data = pd.read_csv('udemy_courses.csv')

In [45]:
data.head(2)

Unnamed: 0,course_id,course_title,url,is_paid,price,num_subscribers,num_reviews,num_lectures,level,content_duration,published_timestamp,subject
0,1070968,Ultimate Investment Banking Course,https://www.udemy.com/ultimate-investment-bank...,True,200,2147,23,51,All Levels,1.5,2017-01-18T20:58:58Z,Business Finance
1,1113822,Complete GST Course & Certification - Grow You...,https://www.udemy.com/goods-and-services-tax/,True,75,2792,923,274,All Levels,39.0,2017-03-09T16:34:20Z,Business Finance


In [46]:
data.columns

Index(['course_id', 'course_title', 'url', 'is_paid', 'price',
       'num_subscribers', 'num_reviews', 'num_lectures', 'level',
       'content_duration', 'published_timestamp', 'subject'],
      dtype='object')

In [47]:
data.isnull().sum()

course_id              0
course_title           0
url                    0
is_paid                0
price                  0
num_subscribers        0
num_reviews            0
num_lectures           0
level                  0
content_duration       0
published_timestamp    0
subject                0
dtype: int64

In [48]:
data.duplicated().any()

True

In [49]:
data[data.duplicated()]

Unnamed: 0,course_id,course_title,url,is_paid,price,num_subscribers,num_reviews,num_lectures,level,content_duration,published_timestamp,subject
787,837322,Essentials of money value: Get a financial Life !,https://www.udemy.com/essentials-of-money-value/,True,20,0,0,20,All Levels,0.616667,2016-05-16T18:28:30Z,Business Finance
788,1157298,Introduction to Forex Trading Business For Beg...,https://www.udemy.com/introduction-to-forex-tr...,True,20,0,0,27,Beginner Level,1.5,2017-04-23T16:19:01Z,Business Finance
894,1035638,Understanding Financial Statements,https://www.udemy.com/understanding-financial-...,True,25,0,0,10,All Levels,1.0,2016-12-15T14:56:17Z,Business Finance
1100,1084454,CFA Level 2- Quantitative Methods,https://www.udemy.com/cfa-level-2-quantitative...,True,40,0,0,35,All Levels,5.5,2017-07-02T14:29:35Z,Business Finance
1473,185526,MicroStation - Células,https://www.udemy.com/microstation-celulas/,True,20,0,0,9,Beginner Level,0.616667,2014-04-15T21:48:55Z,Graphic Design
2561,28295,Learn Web Designing & HTML5/CSS3 Essentials in...,https://www.udemy.com/build-beautiful-html5-we...,True,75,43285,525,24,All Levels,4.0,2013-01-03T00:55:31Z,Web Development


In [50]:
data = data.drop_duplicates()

In [51]:
data.shape

(3672, 12)

# Popularity-Based Recommendation System

In [52]:
data.columns

Index(['course_id', 'course_title', 'url', 'is_paid', 'price',
       'num_subscribers', 'num_reviews', 'num_lectures', 'level',
       'content_duration', 'published_timestamp', 'subject'],
      dtype='object')

In [53]:
def popularity_based_recommendation(data, top_n=5):
    # Calculate popularity score for each course
    data['popularity_score'] = 0.6 * data['num_subscribers'] + 0.4 * data['num_reviews']
    
    # Sort courses by popularity score in descending order
    df_sorted = data.sort_values(by='popularity_score', ascending=False)
    
    # Return the recommended courses (course titles and popularity scores)
    recommended_courses = df_sorted[['course_title', 'popularity_score']].head(top_n)
    
    return recommended_courses

In [54]:
popularity_based_recommendation(data)

Unnamed: 0,course_title,popularity_score
2827,Learn HTML5 Programming From Scratch,164805.4
3032,Coding for Entrepreneurs Basic,96729.0
3230,The Web Developer Bootcamp,83928.4
3232,The Complete Web Developer Course 2.0,77672.0
2783,Build Your First Website in 1 Week with HTML5 ...,74544.2


# Content Based Recommendation System

In [55]:
data['course_title'] = data['course_title'].apply(nfx.remove_stopwords)
data['course_title'] = data['course_title'].apply(nfx.remove_special_characters)

In [56]:
data['course_title']

0                      Ultimate Investment Banking Course
1       Complete GST Course  Certification  Grow Practice
2        Financial Modeling Business Analysts Consultants
3             Beginner Pro  Financial Analysis Excel 2017
4                        Maximize Profits Trading Options
                              ...                        
3673      Learn jQuery Scratch  Master JavaScript library
3674                      Design WordPress Website Coding
3675                                  Learn Build Polymer
3676        CSS Animations Create Amazing Effects Website
3677              MODX CMS Build Websites Beginners Guide
Name: course_title, Length: 3672, dtype: object

In [57]:
data.sample(5)

Unnamed: 0,course_id,course_title,url,is_paid,price,num_subscribers,num_reviews,num_lectures,level,content_duration,published_timestamp,subject,popularity_score
3415,417914,GRUNT js Automate web development tasks save time,https://www.udemy.com/gruntjs-automate-web-dev...,True,55,1534,33,18,All Levels,1.5,2015-03-02T23:34:14Z,Web Development,933.6
2650,763774,JavaScript Game Development Create Breakout Game,https://www.udemy.com/javascript-game-developm...,False,0,10179,314,21,All Levels,1.5,2016-09-26T23:44:51Z,Web Development,6233.0
441,598722,Build Grow Protect Assets StepbyStep Guide,https://www.udemy.com/ron-delegges-crash-cours...,True,195,226,26,26,All Levels,3.5,2015-09-23T03:00:27Z,Business Finance,146.0
3515,56513,Social Network Web Development,https://www.udemy.com/how-to-create-a-social-n...,True,200,2698,29,57,All Levels,8.0,2013-09-03T06:29:24Z,Web Development,1630.4
1632,908664,Word Swag Create Stunning Images Word Swag App,https://www.udemy.com/word-swag/,True,100,2416,19,14,All Levels,1.0,2016-08-09T17:56:53Z,Graphic Design,1457.2


In [58]:
data['title_subject'] = data['course_title'] + ''+data['subject']

In [59]:
cv = CountVectorizer(max_features=3000)
vectors = cv.fit_transform(data['title_subject']).toarray()

In [60]:
vectors[0]

array([0, 0, 0, ..., 0, 0, 0], dtype=int64)

In [61]:
from sklearn.metrics.pairwise import cosine_similarity

In [62]:
similarity = cosine_similarity(vectors)

In [63]:
sorted(list(enumerate(similarity[0])),reverse=True,key=lambda x:x[1])[1:6]

[(417, 0.6),
 (39, 0.5477225575051662),
 (657, 0.5477225575051662),
 (1066, 0.5477225575051662),
 (227, 0.50709255283711)]

In [68]:
data.iloc[39]['course_title']

'Complete Investment Banking Course 2017'

In [69]:
def recommend(course):
    #Let's fetch the index
    course_index = data[data['course_title']==course].index[0]
    distances = similarity[course_index]
    courses_list = sorted(list(enumerate(distances)),reverse=True,key=lambda x:x[1])[1:6]
    for i in courses_list:
        print(data.iloc[i[0]]['course_title'])

In [70]:
recommend("Ultimate Investment Banking Course")

Investment Banking Recruitment Series
Complete Investment Banking Course 2017
Financial Accounting  Ultimate Beginner Course
Managerial Accounting  Ultimate Beginner Course
Investment Banking Land Job Wall Street


In [72]:
import pickle

In [73]:
#pickle.dump(data.to_dict(),open('course_dict.pkl','wb'))
pickle.dump(data,open('course_dict.pkl','wb'))

In [74]:
pickle.dump(similarity,open('similarity.pkl','wb'))

In [None]:
import tkinter as tk
from tkinter import ttk, messagebox
import pandas as pd

# Assume 'data', 'similarity' are defined elsewhere

# Define the popularity-based recommendation function
def popularity_based_recommendation(df, top_n=5):
    df['popularity_score'] = 0.6 * df['num_subscribers'] + 0.4 * df['num_reviews']
    df_sorted = df.sort_values(by='popularity_score', ascending=False)
    recommended_courses = df_sorted[['course_title', 'popularity_score']].head(top_n)
    return recommended_courses

# Define the recommend function
def recommend(course):
    try:
        course_index = data[data['course_title'] == course].index[0]
        distances = similarity[course_index]
        courses_list = sorted(list(enumerate(distances)), reverse=True, key=lambda x: x[1])[1:6]
        recommended_courses = [data.iloc[i[0]]['course_title'] for i in courses_list]
        return recommended_courses
    except IndexError:
        messagebox.showerror("Error", f"Course '{course}' not found.")

# Event handler for the "Recommend" button
def recommend_button_click():
    course_title = course_var.get()
    recommended_courses = recommend(course_title)
    if recommended_courses:
        popularity_label.pack_forget()
        result_label.config(text="Recommended Courses:\n" + '\n'.join(recommended_courses))

# Create the main application window
root = tk.Tk()
root.title("Course Recommender")
root.geometry("400x300")

# Change font and color
font_style = ("Arial", 12)
label_color = "blue"
heading_color="red"
button_color = "green"
result_label_color = "black"

# Create and place GUI elements
label = tk.Label(root, text="Select Course:", font=font_style, fg=label_color)
label.pack(pady=10)

course_titles = data['course_title'].tolist()
course_var = tk.StringVar(value=course_titles[0])
course_dropdown = ttk.Combobox(root, textvariable=course_var, values=course_titles, width=40, font=font_style)
course_dropdown.pack(pady=5)

popularity_recommendations = popularity_based_recommendation(data, top_n=5)
popularity_label = tk.Label(root, text="Popularity-based Recommendations:\n" + popularity_recommendations.to_string(index=False),
                             font=font_style, fg=label_color)
popularity_label.pack()

recommend_button = tk.Button(root, text="Recommend", command=recommend_button_click, width=20, font=font_style, fg=button_color)
recommend_button.pack(pady=10)

result_label = tk.Label(root, text="", wraplength=350, font=font_style, fg=result_label_color)
result_label.pack()

root.mainloop()