In [2]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

In [3]:
dataset = pd.read_csv('./all-course-data.csv')

In [4]:
len(set(dataset.level))

12

In [5]:
dataset.columns

Index(['course_index', 'course_title', 'url', 'certification', 'level',
       'organization', 'platform'],
      dtype='object')

In [6]:
dataset["difficulty"] = dataset["level"].map({"Advanced":100, "advanced":90, "All Levels":50, "Mixed":60, "Expert Level":80, "Intermediate": 40, "intermediate ":40, "intermediate":40, "Intermediate Level":45, "beginner":10, "Beginner":20, "Beginner Level":30})
set(dataset["difficulty"])

{10, 20, 30, 40, 45, 50, 60, 80, 90, 100}

In [7]:
dataset.drop("level", axis=1, inplace=True)

In [8]:
dataset.head(10)

Unnamed: 0,course_index,course_title,url,certification,organization,platform,difficulty
0,0,Ultimate Investment Banking Course,https://www.udemy.com/ultimate-investment-bank...,COURSE,Business Finance,udemy,50
1,1,Complete GST Course & Certification - Grow You...,https://www.udemy.com/goods-and-services-tax/,COURSE,Business Finance,udemy,50
2,2,Financial Modeling for Business Analysts and C...,https://www.udemy.com/financial-modeling-for-b...,COURSE,Business Finance,udemy,45
3,3,Beginner to Pro - Financial Analysis in Excel ...,https://www.udemy.com/complete-excel-finance-c...,COURSE,Business Finance,udemy,50
4,4,How To Maximize Your Profits Trading Options,https://www.udemy.com/how-to-maximize-your-pro...,COURSE,Business Finance,udemy,45
5,5,Trading Penny Stocks: A Guide for All Levels I...,https://www.udemy.com/trading-penny-stocks-a-g...,COURSE,Business Finance,udemy,50
6,6,Investing And Trading For Beginners: Mastering...,https://www.udemy.com/investing-and-trading-fo...,COURSE,Business Finance,udemy,30
7,7,"Trading Stock Chart Patterns For Immediate, Ex...",https://www.udemy.com/trading-chart-patterns-f...,COURSE,Business Finance,udemy,50
8,8,Options Trading 3 : Advanced Stock Profit and ...,https://www.udemy.com/day-trading-stock-option...,COURSE,Business Finance,udemy,80
9,9,The Only Investment Strategy You Need For Your...,https://www.udemy.com/the-only-investment-stra...,COURSE,Business Finance,udemy,50


In [9]:
features = ["course_title", "course_index", "difficulty", "platform"]
def feature_combination(row):
    return row["course_title"] + " " + row["platform"] + " " + str(row["course_index"]) + " " + str(row["difficulty"])

dataset["cumulative_features"] = dataset.apply(feature_combination, axis = 1)

In [10]:
dataset

Unnamed: 0,course_index,course_title,url,certification,organization,platform,difficulty,cumulative_features
0,0,Ultimate Investment Banking Course,https://www.udemy.com/ultimate-investment-bank...,COURSE,Business Finance,udemy,50,Ultimate Investment Banking Course udemy 0 50
1,1,Complete GST Course & Certification - Grow You...,https://www.udemy.com/goods-and-services-tax/,COURSE,Business Finance,udemy,50,Complete GST Course & Certification - Grow You...
2,2,Financial Modeling for Business Analysts and C...,https://www.udemy.com/financial-modeling-for-b...,COURSE,Business Finance,udemy,45,Financial Modeling for Business Analysts and C...
3,3,Beginner to Pro - Financial Analysis in Excel ...,https://www.udemy.com/complete-excel-finance-c...,COURSE,Business Finance,udemy,50,Beginner to Pro - Financial Analysis in Excel ...
4,4,How To Maximize Your Profits Trading Options,https://www.udemy.com/how-to-maximize-your-pro...,COURSE,Business Finance,udemy,45,How To Maximize Your Profits Trading Options u...
5,5,Trading Penny Stocks: A Guide for All Levels I...,https://www.udemy.com/trading-penny-stocks-a-g...,COURSE,Business Finance,udemy,50,Trading Penny Stocks: A Guide for All Levels I...
6,6,Investing And Trading For Beginners: Mastering...,https://www.udemy.com/investing-and-trading-fo...,COURSE,Business Finance,udemy,30,Investing And Trading For Beginners: Mastering...
7,7,"Trading Stock Chart Patterns For Immediate, Ex...",https://www.udemy.com/trading-chart-patterns-f...,COURSE,Business Finance,udemy,50,"Trading Stock Chart Patterns For Immediate, Ex..."
8,8,Options Trading 3 : Advanced Stock Profit and ...,https://www.udemy.com/day-trading-stock-option...,COURSE,Business Finance,udemy,80,Options Trading 3 : Advanced Stock Profit and ...
9,9,The Only Investment Strategy You Need For Your...,https://www.udemy.com/the-only-investment-stra...,COURSE,Business Finance,udemy,50,The Only Investment Strategy You Need For Your...


In [11]:
cv = CountVectorizer()
_stopwords = stopwords.words('english')

In [12]:
user_preference = input("Input language/fav course : ")    #To come from front end of application
user_difficulty = 40

def tokenize_remove_stopwords(sentence):
    word_tokens = word_tokenize(sentence)
    word_tokens_cleaned = {word for word in word_tokens if word not in _stopwords}
    return ' '.join(list(word_tokens_cleaned))

user_preference_string = tokenize_remove_stopwords(user_preference).title()
print(user_preference_string)

Input language/fav course : Python
Python


In [13]:
required_data = dataset[features]
print(len(required_data))
required_data.loc[len(required_data)] = [user_preference_string, len(required_data), user_difficulty, "None"]  #50 is default difficulty value. This will come from application
word_bag = cv.fit_transform(required_data["course_title"])
word_list = word_bag.toarray()
csim = cosine_similarity(word_list)

4909


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [14]:
required_data.tail(5)

Unnamed: 0,course_title,course_index,difficulty,platform
4905,Windows PowerShell Toolmaking Fundamentals,176,40,pluralsight
4906,Windows Server 2012 R2 (70-413) Server Deployment,177,40,pluralsight
4907,Windows Server Administration Fundamentals Usi...,178,20,pluralsight
4908,Working With Temporal Data in SQL Server,179,40,pluralsight
4909,Python,4909,40,


In [16]:
similar_courses = list(enumerate(csim[len(required_data) - 1]))
sorted_similar_courses = sorted(similar_courses,key=lambda x:x[1],reverse=True)[1:50]
i=0
recommended_courses = dict()
for element in sorted_similar_courses:
    print(required_data.loc[element[0]])
    course = required_data.loc[element[0]]
    if(course["difficulty"] >= user_difficulty - 15 and course["difficulty"] <= user_difficulty + 15 and element[0] != len(required_data) - 1):
        recommended_courses[course["course_index"], course["platform"]] = 100 - abs(user_difficulty - course["difficulty"])
        
print(recommended_courses)

course_title    Python Basics
course_index              682
difficulty                 20
platform             coursera
Name: 4520, dtype: object
course_title    Погружение в Python
course_index                    885
difficulty                       40
platform                   coursera
Name: 4723, dtype: object
course_title    Python for Beginners: Python Programming Langu...
course_index                                                 2681
difficulty                                                     30
platform                                                    udemy
Name: 2681, dtype: object
course_title    Introducción a la programación en Python I: Ap...
course_index                                                  466
difficulty                                                     20
platform                                                 coursera
Name: 4304, dtype: object
course_title    Complete Python Web Course: Build 8 Python Web...
course_index                          