In [1]:
# Initialize Library Setup

import pandas as pd
import numpy as np
import re
import pickle
from sklearn.metrics.pairwise import cosine_similarity
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
import sqlite3
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
# Check & Query
filename = 'C:/Users/DELL/Desktop/Course_Recommendation/BEProject/SystemCode/instance/mydb.db'
table_name = 'course'
sqlite_conn = sqlite3.connect(filename)

# Query Table
rawdata = pd.read_sql('SELECT * FROM ' + table_name, sqlite_conn, index_col='courseID')

sqlite_conn.close()

In [3]:
# Utility functions for recommendation module




# 1) Text Preprocessing
# Initilization
stopwordsdic = stopwords.words('english')
lemmatizer = WordNetLemmatizer()


# Takes any rawtext as input and apply text preprocessing:
#   - remove all non-ASCII characters
#   - lower-casing all text and remove unecessary spaces
#   - remove punctuations
#   - remove stopwords
#   - lemmatize words
#   - create bag-of-words (bow) strings
def text_preprocess(rawtext):
    text = re.sub('([^\x00-\x7F])+', '', rawtext)  # Remove all non ASCII characters
    text = text.lower()  # lower casing all words
    text = text.strip()  # Remove White Spaces
    text = re.sub('[^A-Za-z0-9]+', ' ', text)  # Remove Punctuations
    text = word_tokenize(text)  # Tokenize
    text = [word for word in text if word not in stopwordsdic]  # Remove stopwords
    text = [lemmatizer.lemmatize(word) for word in text]  # Lemmatize words
    bow = ' '.join(text)  # Create Bag-of-Words
    return bow


# 2) Encoding User Input Features:
# Takes list of categorical data (course difficulty, course duration and course free option) as input
# Returns one-hot encoded features.
def categorical_encode(categorical_input):
    encode = np.zeros((1, 6))
    # Binary Encode Course Duration (0 - No Preference, 1 - Short, 2 - Medium, 3 - Long)
    if categorical_input[0] > 0:
        encode[0, categorical_input[0] - 1] = 1
    # Binary Encode Course Difficulty (0 - No Preference, 1 - Introductory, 2 - Intermediate, 3 - Advanced)
    if categorical_input[1] > 0:
        encode[0, categorical_input[1] + 2] = 1
        
    return encode


# 3) TfIdf Vectorizer:
# Takes list of tokens as input and apply TfIdf Vectorization based on the pretrained dictionary.
def tfidf_vectorize(text, vectorizer):
    # Load Tfidf Vectorizer
    # vectorizer_file = open(config.tfidf_vectorizer_filepath, 'rb')
    # vectorizer = pickle.load(vectorizer_file)
    # vectorizer_file.close()
    tfidf = vectorizer.transform([text])
    return tfidf


# 4) Cosine Similarity:
# Takes 2 vectors and calculate cosine similarity
def cond_sim(input_vec, data_vec):
    input_durr = input_vec[:, :3]
    input_diff = input_vec[:, 3:]
    data_durr = data_vec[:, :3]
    data_diff = data_vec[:, 3:6]
    # print(str(input_durr)+ " diff "+str(input_diff)+" data dur"+str(data_durr)+" "+str(data_diff))
    # print('inp vect '+ str(input_vec)+ 'len '+str((input_vec.shape)))
    # print('data vect '+ str(data_vec)+ 'len '+str((data_vec.shape)))
    if (input_diff.sum() + input_durr.sum()) == 0:
        sim = np.ones(data_vec.shape[0])
    elif input_durr.sum() == 0:
        sim = cosine_similarity(input_diff, data_diff)
    elif input_diff.sum() == 0:
        sim = cosine_similarity(input_durr, data_durr)
    else:
        data_vec=data_vec[:,:6]
        sim = cosine_similarity(input_vec, data_vec)
  
    return sim


# 5) Ranking based on popularity index
# Given a sorted and threshold filtered ID of recommendations
# Batch rank for every batch_size of ID by rating.
def ranking(mask, text_sim, categorical_sim, rating):
    target_idx = np.arange(text_sim.shape[0])[mask]
    target_text_sim = text_sim[mask]
    target_categorical_sim = categorical_sim[mask]
    target_rating = rating[mask]
    target_scores = sorted(np.unique(target_categorical_sim), reverse=True)
    rec_idx = np.array([], dtype=int)
    rec_sim = np.array([])
    for score in target_scores:
        group_mask = (target_categorical_sim == score)
        group_idx = target_idx[group_mask]
        group_text_sim = target_text_sim[group_mask]
        group_rating = target_rating[group_mask]
        group_sort_idx = np.argsort(group_rating)[::-1]
        rec_idx = np.append(rec_idx, group_idx[group_sort_idx])
        rec_sim = np.append(rec_sim, group_text_sim[group_sort_idx])
    return rec_sim, rec_idx


# 6) Load-up Pickle Object Data Files
def load_pickle(filename):
    data_file = open(filename, 'rb')
    data = pickle.load(data_file)
    data_file.close()
    return data

In [4]:

# Recommendation module
# Takes user input and returns a sorted list of recommended courses.

# Initialize Library Setup





# Recommend Function
def recommend(user_input, rating_data, tfidf_vectorizer, tfidf_data, categorical_data):
    # 1. Feature Extraction - Text Based (TfIdf)
    # Load Tfidf Data Sparse Matrix
    # tfidf_data_file = open(config.tfidf_data_filepath, 'rb')
    # tfidf_data = pickle.load(tfidf_data_file)
    # tfidf_data_file.close()
    # Text Input and Similarity Score
    text_input = user_input[0]
    #print('text input:  ',text_input)
    text_processed = text_preprocess(text_input)
    tfidf_vect = tfidf_vectorize(text_processed, tfidf_vectorizer)
    tfidf_sim = cosine_similarity(tfidf_vect, tfidf_data).ravel()
    
  
   
    # 2. Feature Extraction - Categorical Based (One-Hot Encoded)
    # Load Categorical One-Hot Encoded Sparse Matrix

    # categorical_data_file = open(config.categorical_data_filepath, 'rb')
    # categorical_data = pickle.load(categorical_data_file)
    # categorical_data_file.close()
    # Categroical Input and Similarity Score
    categorical_input = user_input[1:3]
    #print('cipt: ',categorical_input)
    
    categorical_vect =categorical_encode(categorical_input)

    categorical_sim = cond_sim(categorical_vect, categorical_data[:, :-1]).ravel()

    # 3. Recommendation Masks (Free vs Paid Courses Masks)
    free_option_ind = user_input[-1]
    free_option_data = categorical_data[:, -1]
    thres_mask = (tfidf_sim > text_thres)
    if free_option_ind == 1:
        free_mask = ((free_option_data == 1) * thres_mask) == 1
    else:
        free_mask = (np.ones(tfidf_data.shape[0]) * thres_mask) == 1
    paid_mask = ((np.ones(tfidf_data.shape[0]) * thres_mask) - free_mask) == 1

    # 4. Apply Masks and Rank by categorical_sim group and rating
    rec_sim, rec_idx = ranking(free_mask, tfidf_sim, categorical_sim, rating_data)

    # 5. Append paid courses if number of free courses below a threshold
    if (free_mask.sum() < free_show_thres) and (paid_mask.sum() > 0):
        paid_sim, paid_idx = ranking(paid_mask, tfidf_sim, categorical_sim, rating_data)
        rec_sim = np.append(rec_sim, paid_sim)
        rec_idx = np.append(rec_idx, paid_idx)
     
    # 6. Convert Index to courseID
    rec_idx = rec_idx + 1
    course_sim = rec_sim[:recommend_topn].tolist()
    course_idx = rec_idx[:recommend_topn].tolist()

    return course_idx


def recommend_default(rating_data):
    sort_idx = (np.argsort(rating_data)[::-1])
    sort_course = sort_idx[:recommend_default_topn]
    default_course = [int(x+1) for x in sort_course]
    return default_course

In [5]:

# CONFIGURATION FOR RECOMMENDER MODULE
# DATA FILE PATH
tfidf_data_filepath = ('Feature Map/tfidf_data.pickle')
categorical_data_filepath = 'Feature Map/categorical_data.pickle'
tfidf_vectorizer_filepath = 'Feature Map/tfidf_vectorizer.pickle'
# TEXT BASED RECOMMENDATION THRESHOLD
text_thres = 0.5
# MINIMUM FREE COURSE COUNT THRESHOLD
free_show_thres = 2
# RECOMMENDATION RESULTS SIZE
recommend_topn = 3
# DEFAULT POPULAR RESULTS SIZE
recommend_default_topn = 3
#multiplier=5


### TESTING

In [6]:

ainput = ['data structures', 1, 0, 1]
rawdata_rating = rawdata['popularity_index']
a=load_pickle(tfidf_data_filepath)
b=load_pickle(categorical_data_filepath)
c=load_pickle(tfidf_vectorizer_filepath)
idx=recommend(ainput,rawdata_rating,c,a,b)
for i in idx:
    print(rawdata['title'][i])

Data Structures - Full Course Using C and C++
Learning Data Structures and Algorithms
Algorithms and Data Structures in Javascript (2020)


In [7]:
ainput = ['machine learning', 1, 0, 1]  # what to learn, difficulty,duration.free
idx=recommend(ainput,rawdata_rating,c,a,b)

for i in idx:
    print(rawdata['title'][i])

Machine Learning using Python Programming
Machine Learning Course for Beginners
Machine Learning for Business


In [8]:
ainput = ['web development', 0, 1, 1]  # what to learn, difficulty,duration.free
idx=recommend(ainput,rawdata_rating,c,a,b)

for i in idx:
    print(rawdata['title'][i])

Introduction to Web Design and Development
Introduction to Web Design and Development
Modern Web Development From Zero To Advanced


In [9]:
ainput = ['data science', 0, 2, 1]  # what to learn, duration,difficulty,.free
idx=recommend(ainput,rawdata_rating,c,a,b)

for i in idx:
    print(rawdata['title'][i])

Understanding Data Science
Zero to Agile Data Science
Introduction To Data Science


In [10]:
ainput = ['web development', 2, 1, 0]  # what to learn, difficulty,duration.free
idx=recommend(ainput,rawdata_rating,c,a,b)

for i in idx:
    print(rawdata['title'][i])

Modern Web Development From Zero To Advanced
Learn Professional Web Development Skills From Scratch -2021
Full Front-End Web Development Course


In [11]:
ainput = ['cloud computing', 2, 3, 0]  # what to learn, difficulty,duration.free
idx=recommend(ainput,rawdata_rating,c,a,b)

for i in idx:
    print(rawdata['title'][i])

TOTAL: Cloud Computing / CompTIA Cloud+ Cert. (CV0-002)
Certified Cloud Security Officer (CCSO)
Learning Path: Microsoft Azure: Cloud Computing and Storage


In [12]:
# # create similarity matrix
# import pandas as pd
# from sklearn.feature_extraction.text import TfidfVectorizer
# from sklearn.metrics.pairwise import cosine_similarity
# tfidf = TfidfVectorizer(stop_words="english")
# job_roles=pd.read_csv('data preprocess\preprocessed data\job_skills.csv')
# job_tfidf_matrix = tfidf.transform(job_roles["skill_set"])
# load_pickle(tfidf_data_filepath)
# similarity_matrix = cosine_similarity(job_tfidf_matrix, tfidf_matrix)

# # recommend courses for each job role
# for i, job in enumerate(job_roles["role"]):
#     similar_courses_indices = similarity_matrix[i].argsort()[::-1][:5] # get top 5 most similar courses
#     similar_courses = courses.iloc[similar_courses_indices][["title", "provider", "keywords"]].reset_index(drop=True)
#     print(f"Recommended courses for {job}:")
#     print(similar_courses)
#     print()

# ainput = ['cloud computing', 2, 3, 0]  # what to learn, difficulty,duration.free
# idx=recommend(ainput,rawdata_rating,c,a,b)

# for i in idx:
#     print(rawdata['title'][i])

In [13]:
import csv
with open('job_skills.csv', 'r') as file:
    reader = csv.reader(file)
    for row in reader:
        print(row)


['job_role', 'skill_set']
['Data Analyst', 'Statistical analysis,ANOVA,MySQL,SQL,Problem solving,Oracle,Regression analysis,Data manipulation,R,Seaborn,Azure,Power BI,SciPy,Matplotlib,Tableau,PostgreSQL,Hypothesis testing,AWS,Machine learning,Pandas,Data analysis,Data cleaning,Google Analytics,NumPy,SQL Server,Communication skills,Data visualization,Critical thinking,Statistics,Excel,Time series analysis,MongoDB,QlikView,Python,Looker']
['Data Scientist', 'Statistical analysis,Hadoop,SQL,Deep learning,Problem solving,TensorFlow,NLP,R,Azure,Experimentation,Model tuning,Statistical modeling,Scikit-learn,Data wrangling,Machine learning,AWS,Big data,Data analysis,Reinforcement learning,PyTorch,Communication skills,Data visualization,Keras,Spark,Python']
['Software Developer', 'API development,Docker,Windows,Kubernetes,MySQL,SQL,Web development,Problem solving,Git,Oracle,DevOps,Agile,macOS,Data structures,PHP,C#,Flask,JavaScript,React,C++,Mobile app development,Programming,Testing,Spring,Li

In [14]:
job_roles=pd.read_csv('job_skills.csv')
job_roles.head(15)



Unnamed: 0,job_role,skill_set
0,Data Analyst,"Statistical analysis,ANOVA,MySQL,SQL,Problem s..."
1,Data Scientist,"Statistical analysis,Hadoop,SQL,Deep learning,..."
2,Software Developer,"API development,Docker,Windows,Kubernetes,MySQ..."
3,Full Stack Developer,"Semantic UI,API development,MySQL,Git,Oracle,A..."
4,Web Developer,"SQL,Responsive design,Git,Agile,Vue.js,PHP,Jav..."
5,Cloud Solutions Architect,"Data migration,Google Cloud Platform,Networkin..."
6,Cybersecurity Analyst,"Penetration testing,Firewall management,Firewa..."
7,Network Administrator,"Routing protocols,Active Directory,WAN,LAN,VPN..."
8,Network Architect,"Routing protocols,Active Directory,WAN,LAN,VPN..."
9,Database Administrator,"Database backup and recovery,PostgreSQL,MySQL,..."


In [15]:
# import pandas as pd
# from sklearn.feature_extraction.text import TfidfVectorizer

# # load job roles dataset
# job_roles_df = pd.read_csv("job_skills.csv")

# # load courses dataset
# courses_df = pd.read_csv("courses_with_keywords.csv")

# # create TF-IDF matrix for courses dataset
# tfidf_vectorizer = TfidfVectorizer(stop_words='english', max_features=5000)
# courses_tfidf = tfidf_vectorizer.fit_transform(courses_df["keywords"])

# # create function to recommend courses for a given job role
# def recommend_courses(job_role):
#     # get skill set for job role
#     skill_set = job_roles_df.loc[job_roles_df["job_role"] == job_role, "skill_set"].values[0].split(",")
    
#     # create TF-IDF vector for skill set
#     skill_set_tfidf = tfidf_vectorizer.transform([" ".join(skill_set)])
    
#     # calculate cosine similarities between skill set vector and courses vectors
#     similarities = courses_tfidf.dot(skill_set_tfidf.T).toarray().flatten()
    
#     # sort courses by similarity score and return top 5
#     top_indices = similarities.argsort()[::-1][:10]
#     return courses_df.loc[top_indices, ["title", "url", "categories", "description_short"]]


In [16]:
# def recommend_courses(job_role, difficulty=0, duration=0, free=1):
#     # get skill set for job role
#     skill_set = job_roles_df.loc[job_roles_df["job_role"] == job_role, "skill_set"].values[0].split(",")
    
#     # create TF-IDF vector for skill set
#     skill_set_tfidf = tfidf_vectorizer.transform([" ".join(skill_set)])
    
#     # calculate cosine similarities between skill set vector and courses vectors
#     similarities = courses_tfidf.dot(skill_set_tfidf.T).toarray().flatten()
    
#     # sort courses by similarity score and return top 10
#     top_indices = similarities.argsort()[::-1][:5]
    
#     # filter courses based on difficulty level
    
    
#     return courses_df.loc[top_indices, ["title", "url", "categories", "description_short", "difficulty", "duration", "free_option"]]


In [17]:
# recommend_courses("Data Analyst", difficulty=1, free=1)

In [18]:
# for job in job_roles['skill_set']:
#     # my_string = job
#     # my_list = my_string.split(",")

   
#     # my_string = " ".join(my_list)
#     # print(my_string)
#     job=job.split(',')
#     for j in job :
#         ainput = [j, 0, 0, 0]  # what to learn, difficulty,duration.free
        
#         idx=recommend(ainput,rawdata_rating,c,a,b)
#         for i in idx:
#             print(j)
#             print(rawdata['title'][i])
#             print()
#     break

In [19]:

# Recommendation module
# Takes user input and returns a sorted list of recommended courses.

# Initialize Library Setup

job_roles_df=pd.read_csv('job_skills.csv')
stop_words = stopwords.words('english')


def text_preprocess_jcr(rawtext):
    text = ', '.join(rawtext)  # Join list elements with a comma and space
    text = re.sub('([^\x00-\x7F])+', '', text)  # Remove all non ASCII characters
    text = text.lower()  # lower casing all words
    text = text.strip()  # Remove White Spaces
    text = ' '.join([word for word in text.split() if word not in stop_words])  # remove stop words
    text = ' '.join([lemmatizer.lemmatize(word) for word in text.split()])  # lemmatization
    return text
# Utility functions for recommendation module









# 2) Encoding User Input Features:
# Takes list of categorical data (course difficulty, course duration and course free option) as input
# Returns one-hot encoded features.
def categorical_encode(categorical_input):
    encode = np.zeros((1, 6))
    # Binary Encode Course Duration (0 - No Preference, 1 - Short, 2 - Medium, 3 - Long)
    if categorical_input[0] > 0:
        encode[0, categorical_input[0] - 1] = 1
    # Binary Encode Course Difficulty (0 - No Preference, 1 - Introductory, 2 - Intermediate, 3 - Advanced)
    if categorical_input[1] > 0:
        encode[0, categorical_input[1] + 2] = 1
        
    return encode


# 3) TfIdf Vectorizer:
# Takes list of tokens as input and apply TfIdf Vectorization based on the pretrained dictionary.
def tfidf_vectorize(text, vectorizer):
    # Load Tfidf Vectorizer
    # vectorizer_file = open(config.tfidf_vectorizer_filepath, 'rb')
    # vectorizer = pickle.load(vectorizer_file)
    # vectorizer_file.close()
    tfidf = vectorizer.transform([text])
    return tfidf


# 4) Cosine Similarity:
# Takes 2 vectors and calculate cosine similarity
def cond_sim(input_vec, data_vec):
    input_durr = input_vec[:, :3]
    input_diff = input_vec[:, 3:]
    data_durr = data_vec[:, :3]
    data_diff = data_vec[:, 3:6]
    # print(str(input_durr)+ " diff "+str(input_diff)+" data dur"+str(data_durr)+" "+str(data_diff))
    # print('inp vect '+ str(input_vec)+ 'len '+str((input_vec.shape)))
    # print('data vect '+ str(data_vec)+ 'len '+str((data_vec.shape)))
    if (input_diff.sum() + input_durr.sum()) == 0:
        sim = np.ones(data_vec.shape[0])
    elif input_durr.sum() == 0:
        sim = cosine_similarity(input_diff, data_diff)
    elif input_diff.sum() == 0:
        sim = cosine_similarity(input_durr, data_durr)
    else:
        data_vec=data_vec[:,:6]
        sim = cosine_similarity(input_vec, data_vec)
  
    return sim


# 5) Ranking based on popularity index
# Given a sorted and threshold filtered ID of recommendations
# Batch rank for every batch_size of ID by rating.
def ranking(mask, text_sim, categorical_sim, rating):
    target_idx = np.arange(text_sim.shape[0])[mask]
    target_text_sim = text_sim[mask]
    target_categorical_sim = categorical_sim[mask]
    target_rating = rating[mask]
    target_scores = sorted(np.unique(target_categorical_sim), reverse=True)
    rec_idx = np.array([], dtype=int)
    rec_sim = np.array([])
    for score in target_scores:
        group_mask = (target_categorical_sim == score)
        group_idx = target_idx[group_mask]
        group_text_sim = target_text_sim[group_mask]
        group_rating = target_rating[group_mask]
        group_sort_idx = np.argsort(group_rating)[::-1]
        rec_idx = np.append(rec_idx, group_idx[group_sort_idx])
        rec_sim = np.append(rec_sim, group_text_sim[group_sort_idx])
    return rec_sim, rec_idx


# 6) Load-up Pickle Object Data Files
def load_pickle(filename):
    data_file = open(filename, 'rb')
    data = pickle.load(data_file)
    data_file.close()
    return data
# Recommend Function
def recommend_job_role_based(user_input, rating_data, tfidf_vectorizer, tfidf_data, categorical_data):
    # 1. Feature Extraction - Text Based (TfIdf)
    # Load Tfidf Data Sparse Matrix
    # tfidf_data_file = open(config.tfidf_data_filepath, 'rb')
    # tfidf_data = pickle.load(tfidf_data_file)
    # tfidf_data_file.close()
    # Text Input and Similarity Score
        # get skill set for job role
    skill_set = job_roles_df.loc[job_roles_df["job_role"] == user_input[0], "skill_set"].values[0].split(",")
   
    print(skill_set)
    # create TF-IDF vector for skill set
    skill_set_tfidf = tfidf_vectorizer.transform([" ".join(skill_set)])
    
    text_input = skill_set
    #print('text input:  ',text_input)
    text_processed = text_preprocess_jcr(text_input)
    tfidf_vect = tfidf_vectorize(text_processed, tfidf_vectorizer)
    tfidf_sim = cosine_similarity(tfidf_vect, tfidf_data).ravel()
    
  
   
    # 2. Feature Extraction - Categorical Based (One-Hot Encoded)
    # Load Categorical One-Hot Encoded Sparse Matrix

    # categorical_data_file = open(config.categorical_data_filepath, 'rb')
    # categorical_data = pickle.load(categorical_data_file)
    # categorical_data_file.close()
    # Categroical Input and Similarity Score
    categorical_input = user_input[1:3]
    #print('cipt: ',categorical_input)
    
    categorical_vect =categorical_encode(categorical_input)

    categorical_sim = cond_sim(categorical_vect, categorical_data[:, :-1]).ravel()

    # 3. Recommendation Masks (Free vs Paid Courses Masks)
    free_option_ind = user_input[-1]
    free_option_data = categorical_data[:, -1]
    thres_mask = (tfidf_sim > text_thres)
    if free_option_ind == 1:
        free_mask = ((free_option_data == 1) * thres_mask) == 1
    else:
        free_mask = (np.ones(tfidf_data.shape[0]) * thres_mask) == 1
    paid_mask = ((np.ones(tfidf_data.shape[0]) * thres_mask) - free_mask) == 1

    # 4. Apply Masks and Rank by categorical_sim group and rating
    rec_sim, rec_idx = ranking(free_mask, tfidf_sim, categorical_sim, rating_data)

    # 5. Append paid courses if number of free courses below a threshold
    if (free_mask.sum() < free_show_thres) and (paid_mask.sum() > 0):
        paid_sim, paid_idx = ranking(paid_mask, tfidf_sim, categorical_sim, rating_data)
        rec_sim = np.append(rec_sim, paid_sim)
        rec_idx = np.append(rec_idx, paid_idx)
     
    # 6. Convert Index to courseID
    rec_idx = rec_idx + 1
    course_sim = rec_sim[:recommend_topn].tolist()
    course_idx = rec_idx[:recommend_topn].tolist()

    return course_idx


def recommend_default(rating_data):
    sort_idx = (np.argsort(rating_data)[::-1])
    sort_course = sort_idx[:recommend_default_topn]
    default_course = [int(x+1) for x in sort_course]
    return default_course

In [20]:

# CONFIGURATION FOR RECOMMENDER MODULE
# DATA FILE PATH
tfidf_data_filepath = ('Feature Map/tfidf_data.pickle')
categorical_data_filepath = 'Feature Map/categorical_data.pickle'
tfidf_vectorizer_filepath = 'Feature Map/tfidf_vectorizer.pickle'
# TEXT BASED RECOMMENDATION THRESHOLD
text_thres = 0.2
# MINIMUM FREE COURSE COUNT THRESHOLD
free_show_thres = 20
# RECOMMENDATION RESULTS SIZE
recommend_topn = 100
# DEFAULT POPULAR RESULTS SIZE
recommend_default_topn = 50
#multiplier=5
ainput = ['Network Architect', 0, 0, 0]  # what to learn, difficulty,duration.free

rawdata_rating = rawdata['popularity_index']
a=load_pickle(tfidf_data_filepath)
b=load_pickle(categorical_data_filepath)
c=load_pickle(tfidf_vectorizer_filepath)
idx=recommend(ainput,rawdata_rating,c,a,b)      
idx=recommend_job_role_based(ainput,rawdata_rating,c,a,b)
print(len(idx))
for i in idx:
    print(rawdata['title'][i])


['Routing protocols', 'Active Directory', 'WAN', 'LAN', 'VPN', 'Windows Server', 'Problem solving', 'Security', 'TCP/IP', 'DHCP', 'DNS', 'Linux', 'Virtualization', 'Network design', 'Juniper', 'Firewalls', 'Cisco']
100
Azure Active Directory Identity and Access Management Course
CompTIA Network+ (N10-007) Cert Prep: 5 Securing TCP/IP
Basics of BIND DNS Server
Course 3: MCSA Windows Server 2019 Active Directory
Planning for Azure Active Directory
Networking Foundations: Local Area Networks (LANs)
Mastering Modbus TCP/IP Network Communication
Windows Server 2012 R2: Configure Identity and Access Solutions
Windows Server 2019: Install and Configure Active Directory
Windows 7: Networking and Security
Cisco Network Security: VPN
Active Directory & Group Policy Lab 2019
Windows Server 2016 - Practical Guide for Beginners
Networking Foundations: IP Addressing
Linux Administration: Build 5 Hands-On Linux Projects
Linux: Linux Security Masterclass: 3-in-1
Extending On-Prem Active Directory into

In [21]:
jb=pd.read_csv('job_skills.csv')

In [22]:
ainput = ['DevOps Engineer', 0, 0, 0]
      
idx=recommend_job_role_based(ainput,rawdata_rating,c,a,b)
print(len(idx))
    
for i in idx:
            print(rawdata['title'][i])
if len(idx)<50:
    v=jb[jb.job_role=='DevOps Engineer']
    j=v.skill_set.str.split(',')
    for i in j:
        for k in i:
        
        
            ainput = [k, 0, 0, 0]
            idx=recommend(ainput,rawdata_rating,c,a,b)      
            

            print(k)
            for ink in idx[:5]:
               
                print(rawdata['title'][ink])

['Docker', 'Ubuntu', 'Proxy Servers', 'Kubernetes', 'Ansible', 'LVM', 'Git', 'RedHat', 'EC2', 'SSL/TLS', 'Monitoring and logging tools', 'Prometheus', 'Automation tools', 'Azure', 'Linux', 'Bash Shell', 'Cloud computing', 'Networking', 'OpenStack', 'AWS', 'Amazon S3', 'Continuous integration/continuous deployment (CI/CD)', 'Grafana', 'TCP/IP', 'Fedora', 'Containerization technologies', 'Zabbix', 'Load Balancing', 'Perl', 'ELB', 'Firewalls', 'Google Cloud Platform', 'EFS', 'Ruby', 'Route 53', 'Vulnerability Scanning', 'Terraform', 'CentOS', 'Security and compliance', 'ELK Stack', 'Auto Scaling', 'Jenkins', 'DNS', 'Java', 'Scripting languages', 'Penetration Testing', 'Python']
28
DevOps , CI/CD(Continuous Integration/Delivery for Beginners
Amazon EC2 Load Balancers
Devops Tools and AWS for Java Microservice Developers
DevOps Foundations: Continuous Delivery/Continuous Integration
Master CI/CD for Xamarin
DevOps Foundations: Continuous Delivery/Continuous Integration
Continuous Integratio

In [23]:
jbdbd=pd.read_csv('job_skills.csv')
dummies=jbdbd.skill_set.str.get_dummies(",")
  
# display
print(dummies.head(1))

dummies.insert(loc=0, column='job_role', value=jbdbd.job_role)

   .NET  3D modeling and animation  ANOVA  API development  ASP.NET  AWS  \
0     0                          0      1                0        0    1   

   Active Directory  Agile  Agile methodologies  Algorithms  ...  Waterfall  \
0                 0      0                    0           0  ...          0   

   Web development  Web performance optimization  Windows  Windows Server  \
0                0                             0        0               0   

   Wireframing  Wireframing and prototyping tools  Zabbix  jQuery  macOS  
0            0                                  0       0       0      0  

[1 rows x 207 columns]


In [24]:
dummies.head()

Unnamed: 0,job_role,.NET,3D modeling and animation,ANOVA,API development,ASP.NET,AWS,Active Directory,Agile,Agile methodologies,...,Waterfall,Web development,Web performance optimization,Windows,Windows Server,Wireframing,Wireframing and prototyping tools,Zabbix,jQuery,macOS
0,Data Analyst,0,0,1,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,Data Scientist,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,Software Developer,1,0,0,1,0,0,0,1,0,...,0,1,0,1,0,0,0,0,0,1
3,Full Stack Developer,0,0,0,1,1,0,0,1,1,...,0,0,0,0,0,0,0,0,1,0
4,Web Developer,0,0,0,0,0,0,0,1,0,...,0,0,1,0,0,0,0,0,1,0


In [25]:
dummies.to_csv('jobencode.csv', index=False,encoding='utf_8_sig')

In [26]:
jbdbd.head()

Unnamed: 0,job_role,skill_set
0,Data Analyst,"Statistical analysis,ANOVA,MySQL,SQL,Problem s..."
1,Data Scientist,"Statistical analysis,Hadoop,SQL,Deep learning,..."
2,Software Developer,"API development,Docker,Windows,Kubernetes,MySQ..."
3,Full Stack Developer,"Semantic UI,API development,MySQL,Git,Oracle,A..."
4,Web Developer,"SQL,Responsive design,Git,Agile,Vue.js,PHP,Jav..."


In [27]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# user skill set
user_skills = "Python,CSS,HTML,sql"

# job required skills
job_skills = jbdbd.skill_set

# text preprocessing
user_skills = user_skills.lower().replace(",", " ")
job_skills = [skills.lower().replace(",", " ") for skills in job_skills]

# TF-IDF vectorization
tfidf = TfidfVectorizer()
job_skills_tfidf = tfidf.fit_transform(job_skills)
user_skills_tfidf = tfidf.transform([user_skills])

# cosine similarity
similarity_matrix = cosine_similarity(user_skills_tfidf, job_skills_tfidf)

# create a dataframe with job roles and their similarity scores
jobs_df = pd.DataFrame({'Job Role': jbdbd.job_role,
                        'Similarity': similarity_matrix[0]})

# sort by similarity score in descending order
jobs_df = jobs_df.sort_values(by='Similarity', ascending=False)

# print the dataframe
print(jobs_df)


                     Job Role  Similarity
4               Web Developer    0.314730
3        Full Stack Developer    0.177508
13             UI/UX Designer    0.142889
2          Software Developer    0.112021
0                Data Analyst    0.109152
1              Data Scientist    0.080037
9      Database Administrator    0.053766
10            DevOps Engineer    0.031166
5   Cloud Solutions Architect    0.000000
6       Cybersecurity Analyst    0.000000
7       Network Administrator    0.000000
8           Network Architect    0.000000
11            Systems Analyst    0.000000
12             Game Developer    0.000000
14                 IT Manager    0.000000


In [28]:
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity

# Load the CSV file
df = pd.read_csv('jobencode.csv', index_col='job_role')

# Compute the cosine similarity matrix
cos_sim = cosine_similarity(df)

# Get the indices of the most similar job roles
job_indices = pd.Series(df.index)

def get_similar_jobs(job_title, n=3):
    # Find the index of the job title
    idx = job_indices[job_indices == job_title].index[0]

    # Get the cosine similarities for the job
    sim_scores = list(enumerate(cos_sim[idx]))

    # Sort the jobs by similarity score
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)

    # Get the indices of the most similar jobs
    sim_indices = [i[0] for i in sim_scores[1:n+1]]

    # Return the most similar job titles
    return list(job_indices.iloc[sim_indices])

# Example usage:
similar_jobs = get_similar_jobs('Data Analyst', n=3)
print(similar_jobs)



['Data Scientist', 'Database Administrator', 'Software Developer']
