## Feature Extraction

In [155]:
# Load All Necessary Packages

import os


import pandas as pd
import numpy as np
import re
import pickle
from collections import defaultdict
from collections import Counter
import sqlite3

import sklearn
from sklearn.feature_extraction.text import TfidfVectorizer

import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from rake_nltk import Rake

from sklearn.metrics.pairwise import cosine_similarity

seed = 18

print('Versions of key libraries')
print('-------------------------')
print('pandas:  ', pd.__version__)
print('numpy:   ', np.__version__)
print('sklearn: ', sklearn.__version__)

Versions of key libraries
-------------------------
pandas:   1.5.2
numpy:    1.21.5
sklearn:  1.0.2


In [156]:
# Check & Query
filename = 'app_database.db'
table_name = 'course'
sqlite_conn = sqlite3.connect(filename)

# Query Table
rawdata = pd.read_sql('SELECT * FROM ' + table_name, sqlite_conn, index_col='courseID')

sqlite_conn.close()

In [157]:
print(rawdata.shape)
rawdata.head()

(20475, 18)


Unnamed: 0_level_0,title,url,categories,description_short,description_long,difficulty,duration,free_option,number_of_enroll,rating,paid_option,language,subtitle,platform,provider,image_url,popularity_index,keywords
courseID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
1,Getting Started with AWS Mainframe Modernizati...,https://explore.skillbuilder.aws/learn/course/...,"Computer Science , Cloud , AWS ,Cloud Computing",Description not availabel,Description not availabel,0,0,1,0,5.0,0,English,English,0,AWS Skill Builder,https://ccweb.imgix.net/https%3A%2F%2Fwww.clas...,0.415333,"getting, start, aws, mainframe, modernization,..."
2,Cloud for CEOs,https://explore.skillbuilder.aws/learn/course/...,"Computer Science , Cloud , AWS ,Cloud Computing",This course provides CEOs and presidents a hig...,This course provides CEOs and presidents a hig...,0,0,1,0,5.0,0,English,English,0,AWS Skill Builder,https://ccweb.imgix.net/https%3A%2F%2Fwww.clas...,0.415333,"cloud, ceo, course, provide, ceo, president, h..."
3,Getting Started with AWS Mainframe Modernizati...,https://explore.skillbuilder.aws/learn/course/...,"Computer Science , Cloud , AWS ,Cloud Computing",The AWS Mainframe Modernization service helps ...,The AWS Mainframe Modernization service helps ...,0,0,1,0,5.0,0,English,English,0,AWS Skill Builder,https://ccweb.imgix.net/https%3A%2F%2Fwww.clas...,0.415333,"getting, start, aws, mainframe, modernization,..."
4,Introduction to Robotics on AWS,https://explore.skillbuilder.aws/learn/course/...,"Computer Science , Cloud , AWS ,Cloud Computing",The robotics industry is growing at a rapid ra...,The robotics industry is growing at a rapid ra...,0,0,1,0,5.0,0,English,English,0,AWS Skill Builder,https://ccweb.imgix.net/https%3A%2F%2Fwww.clas...,0.415333,"introduction, robotic, aw, robotic, industry, ..."
5,Getting Started with Bottlerocket,https://explore.skillbuilder.aws/learn/course/...,"Computer Science , Cloud , AWS ,Cloud Computing","Bottlerocket is a Linux-based, open-source ope...","Bottlerocket is a Linux-based, open-source ope...",0,0,1,0,5.0,0,English,English,0,AWS Skill Builder,https://ccweb.imgix.net/https%3A%2F%2Fwww.clas...,0.415333,"getting, start, bottlerocket, bottlerocket, li..."


## 1. Feature Extraction for Text Based Data
### i) Extract Text Based Data

In [158]:
# Extract Text Based Columns - Name, Categories, Short Description and Long Description
rawdata_name = rawdata['title']
rawdata_cat = rawdata['categories']
rawdata_sdesc = rawdata['description_short']
rawdata_ldesc = rawdata['description_long']

print(rawdata_name.shape)
rawdata_name.head()

(20475,)


courseID
1    Getting Started with AWS Mainframe Modernizati...
2                                       Cloud for CEOs
3    Getting Started with AWS Mainframe Modernizati...
4                      Introduction to Robotics on AWS
5                    Getting Started with Bottlerocket
Name: title, dtype: object

In [159]:
rawdata_ldesc.head()

courseID
1                            Description not availabel
2    This course provides CEOs and presidents a hig...
3    The AWS Mainframe Modernization service helps ...
4    The robotics industry is growing at a rapid ra...
5    Bottlerocket is a Linux-based, open-source ope...
Name: description_long, dtype: object

In [160]:
rawdata_cat

courseID
1          Computer Science , Cloud , AWS ,Cloud Computing
2          Computer Science , Cloud , AWS ,Cloud Computing
3          Computer Science , Cloud , AWS ,Cloud Computing
4          Computer Science , Cloud , AWS ,Cloud Computing
5          Computer Science , Cloud , AWS ,Cloud Computing
                               ...                        
20471     Computer Networking, Network Model, Cryptogra...
20472     Data Visualization, Business Analysis, Comput...
20473     Computer Programming, Python Programming, Sta...
20474     Data Architecture, Data Warehousing, Database...
20475     Linear Algebra, Data Mining, Machine Learning...
Name: categories, Length: 20475, dtype: object

## ii) Text Preprocessing
# Text Preprocessing Functions


In [161]:

stopwordsdic = stopwords.words('english')
lemmatizer = WordNetLemmatizer()

def text_preprocess(rawtext):
    text = re.sub('([^\x00-\x7F])+','',rawtext) # Remove all non ASCII characters
    text = text.lower() # lower casing all words
    text = text.strip() # Remove White Spaces
    text = re.sub('[^A-Za-z0-9]+', ' ', text) # Remove Punctuations
    text = word_tokenize(text) # Tokenize
    text = [word for word in text if word not in stopwordsdic] # Remove stopwords
    text = [lemmatizer.lemmatize(word) for word in text] # Lemmatize words
    bow  = ' '.join(text) # Create Bag-of-Words
    return bow

In [162]:
# Apply Text Preprocessing
data_name = rawdata_name.apply(lambda x: '' if pd.isna(x) else text_preprocess(x)).astype(str)
data_cat = rawdata_cat.apply(lambda x: '' if pd.isna(x) else text_preprocess(x)).astype(str)
data_sdesc = rawdata_sdesc.apply(lambda x: '' if pd.isna(x) else text_preprocess(x)).astype(str)
data_ldesc = rawdata_ldesc.apply(lambda x: '' if pd.isna(x) else text_preprocess(x)).astype(str)

In [163]:
data_cat 

courseID
1               computer science cloud aws cloud computing
2               computer science cloud aws cloud computing
3               computer science cloud aws cloud computing
4               computer science cloud aws cloud computing
5               computer science cloud aws cloud computing
                               ...                        
20471    computer networking network model cryptography...
20472    data visualization business analysis computer ...
20473    computer programming python programming statis...
20474    data architecture data warehousing database ap...
20475    linear algebra data mining machine learning di...
Name: categories, Length: 20475, dtype: object

In [164]:
data_ldesc.isna().sum()

0

## iii) Keyword Extractions for Short and Long Description 
# Keyword Extraction Function 

The method used for keyword extraction is based on the Natural Language Processing (NLP) library called spaCy. It involves using spaCy's pre-trained English language model to tokenize the input text, remove stopwords and punctuation, and calculate the frequency of each remaining word in the text. The method then ranks the words based on their frequency and selects the top N words as the keywords for the text. The ranking is done using a normalization factor to account for the difference in frequency between different texts. The method returns a list of the top N keywords for the input text.
The algorithm used here for keyword extraction is a simple frequency-based approach, where the frequency of each word in the text is calculated using the Counter function from the collections module. Then, the words are ranked based on their frequency, and the top n words are returned as keywords. 

In [165]:

import spacy
from collections import Counter
from string import punctuation

nlp = spacy.load("en_core_web_sm")


def extract_keywords(text):
    keywords=[]
    if text !='':
        doc = nlp(text.lower())

        words = [token.text for token in doc if not token.is_stop and not token.is_punct]
        if len(words) == 0:
            return []
        word_freq = Counter(words)
        max_freq = max(word_freq.values())
        ranked_words = [(word, freq/max_freq) for word, freq in word_freq.items()]
        ranked_words = sorted(ranked_words, key=lambda x: x[1], reverse=True)
        keywords = [word for word, score in ranked_words]
    return keywords


In [166]:
# Apply keyword extraction to short and long description


data_sdesc_kw = data_sdesc.apply(lambda x: '' if pd.isna(x) else ' '.join(extract_keywords(x)))
data_ldesc_kw = data_ldesc.apply(lambda x: '' if pd.isna(x) else ' '.join(extract_keywords(x)))

In [167]:
data_ldesc_kw

courseID
1                                    description availabel
2        cloud course provides ceo president high level...
3        aws mainframe service modernization help migra...
4        robotics industry growing rapid rate creating ...
5        bottlerocket container software linux based op...
                               ...                        
20471    technology network course world internet creat...
20472    course round world regatta lead boat mediterra...
20473    design use designer focused fundamental course...
20474    data big hadoop process course novice programm...
20475    learning machine able course hand case house d...
Name: description_long, Length: 20475, dtype: object

In [168]:
data_sdesc_kw

courseID
1                                    description availabel
2        cloud course provides ceo president high level...
3        aws mainframe service modernization help migra...
4        robotics industry growing rapid rate creating ...
5        bottlerocket container software linux based op...
                               ...                        
20471    technology network course world internet creat...
20472    course round world regatta lead boat mediterra...
20473    design use designer focused fundamental course...
20474    data big hadoop process course novice programm...
20475    learning machine able course hand case house d...
Name: description_short, Length: 20475, dtype: object

In [169]:
data_sdesc_kw[1]

'description availabel'

In [170]:
for i in range(1,len(data_sdesc_kw)+1):

    if ('description availabel') in data_sdesc_kw[i]:
        data_sdesc_kw[i]=''


In [171]:
for i in range(1,len(data_ldesc_kw)+1):

    if ('description availabel') in data_ldesc_kw[i]:
        data_ldesc_kw[i]=''

#### iv) Create Bag-of-words and Corresponding List of Tokens Per Course

In [172]:

data_name_npy = data_name.to_numpy()
data_cat_npy = data_cat.to_numpy()
data_sdesc_kw_npy = data_sdesc_kw.to_numpy()
data_ldesc_kw_npy = data_ldesc_kw.to_numpy()

In [173]:
data_ldesc_kw_npy

array(['',
       'cloud course provides ceo president high level picture computing technology learner explore consider started adoption journey',
       'aws mainframe service modernization help migration migrate modernize application amazon web managed runtime environment main pattern automated refactor replatform provides tool resource plan implement',
       ...,
       'design use designer focused fundamental course python application world increasingly impacted algorithm learn create computing program narrowly computer automation simple drafting modeling task instead explore extraordinary potential digitalization hold culture practice structured series problem code term rule syntax end know rhino script importantly lens geometrically lesson exercise',
       'data big hadoop process course novice programmer business people like understand core tool wrangle analyze prior experience opportunity walk hand example spark framework common industry comfortable explaining specific compon

In [174]:
data_cat_npy

array(['computer science cloud aws cloud computing',
       'computer science cloud aws cloud computing',
       'computer science cloud aws cloud computing', ...,
       'computer programming python programming statistical programming theoretical computer science computational thinking data management data structure programming principle',
       'data architecture data warehousing database application database distributed computing architecture apache python programming data structure big data computer architecture data management',
       'linear algebra data mining machine learning dimensionality reduction feature engineering machine learning algorithm statistical machine learning applied machine learning deep learning general statistic natural language processing statistical analysis python programming computer vision regression statistical test statistical visualization basic descriptive statistic correlation dependence data analysis estimation forecasting algorithm computer prog

In [175]:
# Combine all text to create bag-of-words for each course
data_bow = []

for i in range(len(data_name)):
    data_bow.append(' '.join((data_name_npy[i], data_cat_npy[i], data_sdesc_kw_npy[i], data_ldesc_kw_npy[i])).strip())

data_bow = np.array(data_bow)
data_bow.shape

(20475,)

In [176]:
data_bow[4544]

'aws certified solution architect associate saa c02 cert prep 4 compute service amazon web service aws service aws deep dive compute amazon web preparation 2020 certified solution architect associate saa c02 exam service aws deep dive compute amazon web preparation 2020 certified solution architect associate saa c02 exam'

## v) TfIdf Vectorization

In [179]:
# Fit_transform BoW to Tfidf Sparse Matrix

tfidf = TfidfVectorizer()
data_tfidf = tfidf.fit_transform(data_bow)

# Save Tfidf Vectorizer to file
folderpath = 'Feature Map/'
filename = 'tfidf_vectorizer'
filepath = folderpath + filename + '.pickle'
file = open(filepath, 'wb')
pickle.dump(tfidf, file, protocol=pickle.HIGHEST_PROTOCOL)
file.close()

# Save Tfidf Sparse Matrix to file

folderpath = 'Feature Map/'
filename = 'tfidf_data'
filepath = folderpath + filename + '.pickle'
file = open(filepath, 'wb')
pickle.dump(data_tfidf, file, protocol=pickle.HIGHEST_PROTOCOL)
file.close()

In [180]:
data_tfidf.shape

(20475, 19133)

In [182]:
atfidf.shape

(1, 19133)

In [183]:
# Load for checking
folderpath = 'Feature Map/'
filename = 'tfidf_data'
filepath = folderpath + filename + '.pickle'
file = open(filepath, 'rb')
data_tfidf = pickle.load(file)
file.close()

folderpath = 'Feature Map/'
filename = 'tfidf_vectorizer'
filepath = folderpath + filename + '.pickle'
file = open(filepath, 'rb')
tfidf = pickle.load(file)
file.close()

## 2. Feature Extraction for Categorical Data
### i) Extract Categorical Data

In [184]:

rawdata_diff = rawdata['difficulty']
rawdata_dur = rawdata['duration']
rawdata_free = rawdata['free_option']


ii ) One-Hot Encoding


In [185]:
data_diff = pd.get_dummies(rawdata_diff)
data_dur = pd.get_dummies(rawdata_dur)
data_free = pd.get_dummies(rawdata_free, drop_first=True)


iii) Combine Data to form Catagorical Data Feature Map

In [186]:

data_cat = np.hstack((data_dur, data_diff, data_free))
data_cat.shape

(20475, 8)

iv) Function to Encode Categorical Inputs

In [187]:

def encode_input(cat_input):
    cat_onehot = np.zeros(6)
    if cat_input[0] > 0: # 0 - No preference, 1 - Short, 2 - Medium, 3 - Long
        cat_onehot[cat_input[0] - 1] = 1
    if cat_input[1] > 0: # 0 - No preference, 1 - Introductory, 2 - Intermediate, 3 - Advanced
        cat_onehot[cat_input[1] + 2] = 1
    return cat_onehot


In [203]:
cat_input = [1,3] # difficulty = Medium, duration = Advanced
cat_onehot = encode_input(cat_input)
print(cat_onehot) 


[1. 0. 0. 0. 0. 1.]


## iv) Save Categorical Feature Map
### Save Categorical Feature Map to file



In [188]:
# Save Categorical Feature Map to file

folderpath = 'Feature Map/'
filename = 'categorical_data'
filepath = folderpath + filename + '.pickle'
file = open(filepath, 'wb')
pickle.dump(data_cat, file, protocol=pickle.HIGHEST_PROTOCOL)
file.close()

data_cat.shape

(20475, 8)

array([[1, 0, 0, ..., 0, 0, 1],
       [1, 0, 0, ..., 0, 0, 1],
       [1, 0, 0, ..., 0, 0, 1],
       ...,
       [0, 1, 0, ..., 1, 0, 0],
       [0, 1, 0, ..., 1, 0, 0],
       [0, 1, 0, ..., 1, 0, 0]], dtype=uint8)

# 3. Recommendation Inference
## i) Similarity Calculation
### Compute Similarity on the condition that each column feature is not 0: (0 - no preference)

In [255]:


def cond_sim(input_vec, data_vec):
    input_diff = input_vec[:, :3]
    input_durr = input_vec[:, 3:6]
    input_free = input_vec[:, 6:]
    data_diff  = data_vec[:, :3]
    data_durr  = data_vec[:, 3:6]
    data_free  = data_vec[:, 6:]
    if (input_diff.sum() + input_durr.sum()) == 0:
        input_slice = input_free
        data_slice  = data_free
    elif input_diff.sum() == 0:
        input_slice = np.hstack((input_durr, input_free))
        data_slice  = np.hstack((data_durr, data_free))
    elif input_durr.sum() == 0:
        input_slice = np.hstack((input_diff, input_free))
        data_slice  = np.hstack((data_diff, data_free))
    else:
        input_slice = input_vec
        data_slice  = data_vec
    sim = cosine_similarity(input_slice, data_slice).ravel()
    return sim
    

In [256]:
# ii) Ranking Optimization for Single Group

def ranking(mask, text_sim, cat_sim, rating):

    target_idx = np.arange(text_sim.shape[0])[mask]
    target_text_sim = text_sim[mask]
    target_cat_sim = cat_sim[mask]
    target_rating = rating[mask]
    
    target_scores = sorted(np.unique(target_cat_sim), reverse=True)
    
    rec_idx = np.array([], dtype=int)
    rec_sim = np.array([])
    
    for score in target_scores:
        group_mask = (target_cat_sim == score)
        group_idx = target_idx[group_mask]
        group_text_sim = target_text_sim[group_mask]
        group_rating = target_rating[group_mask]
        group_sort_idx = np.argsort(group_rating)[::-1]
        rec_idx = np.append(rec_idx, group_idx[group_sort_idx])
        rec_sim = np.append(rec_sim, group_text_sim[group_sort_idx])
    
    return rec_sim, rec_idx

In [257]:
#iii) Recommendation Function
def cond_sim(input_vec, data_vec):
    input_durr = input_vec[:, :3]
    input_diff = input_vec[:, 3:]
    data_durr  = data_vec[:, :3]
    data_diff  = data_vec[:, 3:]
    if (input_durr.sum() + input_diff.sum()) == 0:
        sim = np.ones(data_vec.shape[0])
    elif input_durr.sum() == 0:
        sim = cosine_similarity(input_diff, data_diff).ravel()
    elif input_diff.sum() == 0:
        sim = cosine_similarity(input_durr, data_durr).ravel()
    else:
        sim = cosine_similarity(input_vec, data_vec).ravel()
    return sim

In [259]:

def recommend(inputs, thres, nmin):
    text_ipt = inputs[0]
    text_proc = text_preprocess(text_ipt)
    text_tfidf = tfidf.transform([text_proc])
    text_sim = cosine_similarity(text_tfidf, data_tfidf).ravel()

    cat_ipt  = inputs[1:3]
    cat_onehot = np.array([encode_input(cat_ipt)])
    cat_sim = cond_sim(cat_onehot, data_cat[:, :-1])
    
    ind = inputs[-1]
    
    thres_mask = (text_sim > thres)
    
    if ind == 1:
        free_mask = ((rawdata_free.to_numpy() == 1) * thres_mask) == 1
    else:
        free_mask = (np.ones(data_tfidf.shape[0]) * thres_mask) == 1
    
    paid_mask = ((np.ones(data_tfidf.shape[0]) * thres_mask) - free_mask) == 1
        
    print('thresss ' ,thres_mask.sum())
    print(free_mask.sum())
    print(paid_mask.sum())
    
    rec_sim, rec_idx = ranking(free_mask, text_sim, cat_sim, rawdata_rating.to_numpy())
    
    if (free_mask.sum() < nmin) and (paid_mask.sum() > 0):
        paid_sim, paid_idx = ranking(paid_mask, text_sim, cat_sim, rawdata_rating.to_numpy())
        rec_sim = np.append(rec_sim, paid_sim)
        rec_idx = np.append(rec_idx, paid_idx)

    return rec_sim, rec_idx, ind

# iv) Testing

In [260]:

rawdata_rating = rawdata['popularity_index']
import time
start = time.time()

ainput = ['data structures', 0, 0, 1]

a_sim, a_idx, a_ind = recommend(ainput, 0.5, 30)
print(a_ind)
print(a_sim.shape)
print(a_sim.sum())
# print((a_sim>(a_sim.max()-0.1)).sum())
# print(((a_sim>(a_sim.max()-0.1)).sum())/a_sim.shape[0])
print('START =================================================================================')
print(a_sim[:20])
print(a_idx[:20]+1)
print(np.array(rawdata['duration'])[a_idx][:20])
print(np.array(rawdata['difficulty'])[a_idx][:20])
print(np.array(rawdata['popularity_index'])[a_idx][:20])
print(np.array(rawdata['title'])[a_idx][:20])
print(np.array(rawdata['url'])[a_idx][:20])

print('END ===================================================================================')
print(a_sim[-20:])
print(a_idx[-20:]+1)
print(np.array(rawdata['duration'])[a_idx][-20:])
print(np.array(rawdata['difficulty'])[a_idx][-20:])
print(np.array(rawdata['title'])[a_idx][-20:])
print('TIME ==================================================================================')
print(time.time()-start)

thresss  53
1
52
1
(53,)
34.61858326960863
[0.6636283  0.59648901 0.53929666 0.73335361 0.51101915 0.76198431
 0.59837496 0.68921725 0.54261274 0.55352903 0.66469272 0.58842388
 0.59644371 0.81715702 0.54933452 0.56768844 0.72936715 0.52779777
 0.86665954 0.5827612 ]
[ 2577  6870  7004 16111 13955 16762  3586 11857  3974  4932 12971 16751
 13031 11888  6270 12850 12455 14708  6681 15055]
[0 0 1 0 0 1 0 2 0 0 1 0 1 0 1 1 1 0 0 0]
[0 0 0 0 0 0 1 0 1 1 0 0 0 0 0 0 0 0 1 0]
[0.23490543 1.06380897 1.06380897 1.06380897 1.04476499 1.04476499
 1.04476499 1.04476499 1.04476499 1.04476499 1.04476499 0.90614983
 0.90614983 0.90614983 0.90614983 0.90614983 0.74815619 0.62663377
 0.62663377 0.62663377]
['Data Structures - Full Course Using C and C++'
 'Learning Data Structures and Algorithms'
 'Algorithms and Data Structures in Java - Part II'
 'Algorithms and Data Structures in Javascript (2020)'
 'Data Structures and Algorithms in java'
 'Data Structures in JavaScript: Master The Fundamentals'
 