In [None]:
# This Jupyter Notebook uses NLP and unsupervised learning to classify jobs based on their job description
# The data from this project comes from a publicly available CSV on Kaggle
# Kaggle URL: https://www.kaggle.com/new-york-city/new-york-city-current-job-postings?select=nyc-jobs.csv

In [None]:
# This code block imports python libraries

from __future__ import print_function
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sb
from collections import defaultdict
import time
from bs4 import BeautifulSoup
import requests
from IPython.core.display import display, HTML
import pickle
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
import os
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.linear_model import LinearRegression, Lasso, LassoCV, Ridge, RidgeCV
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import PolynomialFeatures, StandardScaler
from sklearn.pipeline import Pipeline
import statsmodels.api as sm
%matplotlib inline
import nltk
from nltk.stem.lancaster import LancasterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.decomposition import NMF
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import scale
from sklearn.cluster import KMeans
from sklearn.datasets import make_blobs
from sklearn.datasets import fetch_mldata
from sklearn.utils import shuffle
from sklearn.metrics import silhouette_score
from sklearn.decomposition import PCA
import random

In [None]:
# This code block reads the CSV file into a Pandas DataFrame

# To create a dataframe with this data, download the CSV file from the Kaggle URL referenced at the top
# Replace the placeholder file path in the line below with your own CSV file path
# NOTE(review): the Kaggle URL names the file 'nyc-jobs.csv' (hyphen) while the placeholder below uses
#               'nyc_jobs.csv' (underscore) -- confirm the name of the file you downloaded
NYC_Jobs_DF = pd.read_csv('/Your_File_Path/Folder_With_This_CSV/nyc_jobs.csv')

#display(NYC_Jobs_DF)

In [None]:
# Note: There are duplicates in this data set (i.e. more than 1 line of data having the same job description)
# If there were a few jobs with huge numbers of duplicates it could have an effect on the clustering:

# Example: A job with 100 duplicates in a dataset where each other job has only 1 row would strongly tend to keep 
#          the centroid of its cluster near itself.  If the repeated job description is such that it should be at
#          the edge of a sparse cluster, it will increase the probability of excluding jobs on the opposite side of
#          its cluster, while incorporating jobs that should be on the edge of neighboring clusters.

# This effect is not so strong for this data set, as most duplicated jobs have only 2 rows, and the job with the
# most duplications has 14.  Thus, we can have reasonable clustering without first removing the duplicates.
# (Though it is possible that duplicates are helping keep the Taxi and Limousine Commission jobs in their own group)

# The effect will tend to be increased if one or a few entries have many duplicates (as opposed to duplication 
# being spread out).  The effect also would tend to increase as the number of clusters goes up and tends to 
# decrease as the size of the data set increases.

# A version without duplicates might be uploaded onto GitHub later for comparison

In [None]:
# This code block reports exact-duplicate job descriptions
# Specifically, it lists the descriptions that are repeated at or above a frequency level (set by the user)

# This code block is only for EDA, so you do not need to run it to proceed to the rest of the analysis
# (The output is long and tedious to read, especially when the threshold is lowered to 2)

#-------------------------------------------------------------------------------------------------------------

Minimum_Times_Duplicated_To_Display = 3 # This variable establishes the minimum times an entry must repeat to show
# For example, if the value is 3, only job descriptions repeated 3 or more times will be shown when this code runs
# If you lower it to 2 there will be many more results that will indicate they occurred twice in this data set
# This code will not display any job description that only appears once, even if you set the value to 1 or less
# This only counts exact matches as duplicates, not merely close jobs


First_Occurance_List = []      # Distinct descriptions, in order of first appearance
Seen_Descriptions = set()      # Same contents as the list above; a set keeps each membership test O(1)
Duplicate_Occurances_Dict = {} # description -> total number of rows (only for descriptions seen 2+ times)

# Iterating the column directly visits the rows in order without depending on the index labels
for Current_Item in NYC_Jobs_DF['Job Description']:

    if Current_Item in Seen_Descriptions:
        # The first repeat brings the total to 2 (the original row plus this one)
        Duplicate_Occurances_Dict[Current_Item] = Duplicate_Occurances_Dict.get(Current_Item, 1) + 1
    else:
        Seen_Descriptions.add(Current_Item)
        First_Occurance_List.append(Current_Item)

Number_Results_Showing = 0

for Description, Times_Seen in Duplicate_Occurances_Dict.items():

    if Times_Seen >= Minimum_Times_Duplicated_To_Display:

        print(" ")
        print("This job description occurs", Times_Seen, "times in this data set:")
        print(" ")
        print(Description)
        print(" ")
        print("-----------------------------------------------------------------------------------------")
        Number_Results_Showing += 1

    if Minimum_Times_Duplicated_To_Display < 3:
        time.sleep(0.01) # This is needed to avoid exceeding the data rate limit for Jupyter notebooks

if Number_Results_Showing == 1:
    print("Showing 1 result")
else:
    print("Showing", Number_Results_Showing, "results")

In [None]:
# This code block creates a function to replace certain punctuation in a text string with spaces (" ")
# This is needed because job descriptions will later be broken into words by splitting on space characters

# The punctuation to be replaced are characters that would be expected to be at the beginning or end of a word
# Example: a sentence that ends "...this job." should count "job." as the same word as "job" occurring mid-sentence

# Apostrophes are allowed to remain in the string, since they would tend to occur in the middle of a word
# Example: suppose the phrase "the administrator's role" occurs 
# This should not count as one instance of the word "administrator" plus one instance of the word "s"

def replace_punctuation_with_spaces(input_string):
    """Return a copy of `input_string` with word-separating punctuation replaced by spaces.

    Parameters
    ----------
    input_string : str (any iterable of single characters also works)

    Returns
    -------
    str with the same length as the input, in which each of , : - ; . ! ? / and the
    backslash has been replaced by one space.  All other characters, including
    apostrophes, are copied unchanged.
    """

    # The backslash is written as "\\".  The earlier comparison against "'\'" was a
    # two-character string and therefore could never match a single character.
    Punctuation_To_Replace = frozenset([",", ":", "-", ";", ".", "!", "?", "/", "\\"])

    return "".join(" " if char in Punctuation_To_Replace else char for char in input_string)

In [None]:
# This code block defines a helper that reduces a string to uppercase ASCII letters and apostrophes
# Every other character (spaces, digits, punctuation, accented letters) is dropped

def uppercase_letters_and_apostrophes(input_string):
    """Return `input_string` with a-z uppercased, A-Z and apostrophes kept, and all else removed."""

    lowercase_letters = "abcdefghijklmnopqrstuvwxyz"
    uppercase_letters = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"

    # Lookup from each lowercase letter to its capital
    to_capital = dict(zip(lowercase_letters, uppercase_letters))

    # Characters that are copied through untouched
    passthrough = set(uppercase_letters)
    passthrough.add("'")

    pieces = []

    for symbol in input_string:
        if symbol in passthrough:
            pieces.append(symbol)
        elif symbol in to_capital:
            pieces.append(to_capital[symbol])

    return "".join(pieces)

In [None]:
# This code block defines a helper that reduces a string to uppercase ASCII letters, spaces and apostrophes
# Every other character (digits, other punctuation, newlines, accented letters) is dropped

def uppercase_letters_spaces_and_apostrophes(input_string):
    """Return `input_string` with a-z uppercased; A-Z, spaces and apostrophes kept; all else removed."""

    small_letters = frozenset("abcdefghijklmnopqrstuvwxyz")
    kept_as_is = frozenset("ABCDEFGHIJKLMNOPQRSTUVWXYZ' ")

    # str.upper() is only ever applied to plain a-z here, so it yields exactly the matching A-Z letter
    return "".join(
        symbol if symbol in kept_as_is else symbol.upper()
        for symbol in input_string
        if symbol in kept_as_is or symbol in small_letters
    )

In [None]:
# This code block creates a list of job descriptions capitalized and with most punctuation removed

# Each description first has its word-separating punctuation replaced by spaces, and is then reduced to
# uppercase letters, spaces and apostrophes
# The list is built in dataframe row order, so it can later be stored as a new column

List_of_Clean_Job_Descriptions = [
    uppercase_letters_spaces_and_apostrophes(replace_punctuation_with_spaces(Raw_Description))
    for Raw_Description in NYC_Jobs_DF['Job Description']
]

In [None]:
# This code block stores the cleaned descriptions from the previous block as a new column in the dataframe
# The list was built in dataframe row order, so each cleaned description lands on the row it came from

NYC_Jobs_DF['Job Description Cleaned'] = List_of_Clean_Job_Descriptions

In [None]:
# This code block creates a word stemmer so, for example, "inspector" and "inspectors" will count as the same word
# The Lancaster stemmer from NLTK is used; one instance is created here and reused for every description

Stemmer_1 = LancasterStemmer()

In [None]:
# This code block applies the stemmer to every word of each cleaned job description

# Each description is split into words, every word is stemmed, and the stems are joined back into a single
# space-separated string (the form the TFIDF Vectorizer consumes later)
# Splitting on any whitespace also discards the empty "words" produced by runs of consecutive spaces

# NOTE: The stemmed words themselves must be stored.  Re-assigning the loop variable and appending the
#       untouched description would leave the text unstemmed.
# NOTE: NLTK's Lancaster stemmer returns lowercase stems, so this column is lowercase.
# NOTE: Stemming changes the vocabulary, so the cluster assignments and silhouette scores computed
#       downstream depend on this step.

List_Of_Stemmed_Descriptions = []

for Desc in List_of_Clean_Job_Descriptions:

    Stemmed_Words = [Stemmer_1.stem(Word) for Word in Desc.split()]

    List_Of_Stemmed_Descriptions.append(" ".join(Stemmed_Words))

In [None]:
# This code block creates a column in the dataframe for the job descriptions produced by the stemming block
# The list is in the same order as the dataframe rows, so each entry lands on the row it was derived from

NYC_Jobs_DF['Job Description Stemmed Cleaned'] = List_Of_Stemmed_Descriptions

In [None]:
# This code block shows an example of a job description before any processing (the row labelled 35)
# This can be compared to the same one after processing in the code block below
# The expression is the last line of the cell, so Jupyter displays its value when the cell is run

NYC_Jobs_DF['Job Description'][35]

In [None]:
# This code block shows the processed version of the same job description (the row labelled 35)
# This can be compared to the same one from before processing in the code block above
# The expression is the last line of the cell, so Jupyter displays its value when the cell is run

NYC_Jobs_DF['Job Description Stemmed Cleaned'][35]

In [None]:
# This code block creates a TFIDF Vectorizer (configured to drop English stop words)
# This will be used to compare similarities in word usage between job descriptions
# This method will also correct for differences in length between different job descriptions
# The default (sum of squares) normalization is used here, which leaves all normalized vectors the same length
# Correcting for length will be important later, as jobs will be judged similar or not based on Euclidean Distance
# In the absence of corrections for length, it would be better to use Cosine Distance

# The bare name on the last line makes Jupyter display the vectorizer's settings

TFDIF_1 = TfidfVectorizer(stop_words='english')
TFDIF_1

In [None]:
# This code block fits the TFIDF Vectorizer on the processed descriptions and transforms them in one step
# The result is a sparse matrix with one row per job description and one column per tokenized word
# Each entry is the TFIDF weight of that word in that document

X_Full = TFDIF_1.fit_transform(NYC_Jobs_DF['Job Description Stemmed Cleaned'])

In [None]:
# This code block creates a Dataframe of the TFIDF-adjusted job description vectors
# Each tokenized word is a column in the dataframe 
# Each tokenized word will be represented as a dimension in Euclidean space for K-Means clustering

# Newer scikit-learn releases expose the vocabulary through get_feature_names_out() and no longer provide
# get_feature_names(); older releases only have get_feature_names().  Use whichever one exists.
if hasattr(TFDIF_1, "get_feature_names_out"):
    Feature_Names = TFDIF_1.get_feature_names_out()
else:
    Feature_Names = TFDIF_1.get_feature_names()

# toarray() turns the sparse matrix into a dense one (one row per job, one column per word)
TFIDF1_DF_Full = pd.DataFrame(X_Full.toarray(), columns=Feature_Names)

In [None]:
# The next code block may take a long time to run, as it is performing 13 iterations of K-Means Clustering
# This was used to find the number of clusters that had the highest silhouette score

# You can skip the next 3 code blocks to save time (the 2 after depend on the output of the next block)

In [None]:
# This code block uses K-Means clustering to group the jobs that have the most similar descriptions
# This block performs 13 clusterings, with the number of groups ranging from 2 to 14
# This block also scores each clustering using the silhouette_score for comparison

# n_init is set explicitly: it was the long-standing default of 10, but newer scikit-learn releases changed
# the default, which would silently change the clusters found for the same random_state

Silhouette_Score_List = [] # One score per cluster count, in the order 2, 3, ..., 14

for Current_Cluster_Number in range(2, 15):

    k_means_cluster_loop_full_tfidf = KMeans(n_clusters=Current_Cluster_Number, n_init=10, random_state=11)
    k_means_cluster_loop_full_tfidf.fit(TFIDF1_DF_Full)

    Cluster_Labels = k_means_cluster_loop_full_tfidf.labels_

    # sample_size is left at its default, so the score is computed over every job
    Silhouette_Score = silhouette_score(TFIDF1_DF_Full, Cluster_Labels, metric='euclidean')

    Silhouette_Score_List.append(Silhouette_Score)

In [None]:
# This code block prints the Silhouette Scores in list form
# The scores appear in the order the cluster counts were tested in the block above

print(Silhouette_Score_List)

In [None]:
# This code block shows a graph of the Silhouette Scores for each number of clusters

# The cluster counts are derived from the length of the score list (the first score belongs to 2 clusters),
# so the x-axis stays in step with the scores if the range tested above is ever changed
K_Numbers = list(range(2, 2 + len(Silhouette_Score_List)))

plt.bar(K_Numbers, Silhouette_Score_List)
plt.xticks(K_Numbers)
plt.xlabel('Number of Clusters')
plt.ylabel('Average Silhouette Score')
plt.title('Average Silhouette Score by Number of Clusters'); # The trailing ";" hides the text repr of the title

In [None]:
#----------------------------------------------------------------------------------------------------------------

# NOTE: I obtained my results for silhouette score, and subsequent category generation, by running this code on
# a MacBook Pro running MacOS Mojave version 10.14.6, and with Python 3.7 - using the given (pseudo) random state.

# If you use a different computer, operating system, or Python version, you may get different results even using
# the same random state number.

#----------------------------------------------------------------------------------------------------------------

In [None]:
# This code block uses the number of clusters which yielded the highest Silhouette score to generate labels

# n_init is set explicitly: it was the long-standing default of 10, but newer scikit-learn releases changed
# the default, which would silently change the clusters found for the same random_state

k_means_cluster_full_tfidf = KMeans(n_clusters=12, n_init=10, random_state=11)
k_means_cluster_full_tfidf.fit(TFIDF1_DF_Full)

In [None]:
# This code block creates separate lists for the job descriptions of each category of job
# It also creates separate lists for the index (row) number of each category
# Both types of lists are updated in the same order and the lists are not altered later
# This allows for the use of index numbers to search only for other jobs in the same category

# Labels of "0" through "11" are automatically generated for each job by the model to indicate the job's group

Number_Of_Categories = 12 # Must match the n_clusters value used to fit k_means_cluster_full_tfidf

Job_Descriptions = NYC_Jobs_DF["Job Description"]

# One bucket per label; position X in each outer list belongs to the jobs the model labelled X
Description_Lists_By_Label = [[] for _ in range(Number_Of_Categories)] # Original text of each job description
Index_Lists_By_Label = [[] for _ in range(Number_Of_Categories)]       # Row positions, in the same order

# labels_ holds one label per row, in row order, so the position in labels_ is the row position
for Index, Label in enumerate(k_means_cluster_full_tfidf.labels_):

    # Labels outside 0-11 are skipped, exactly as they would fall through a 12-branch if/elif chain
    if Label < Number_Of_Categories:
        Description_Lists_By_Label[Label].append(Job_Descriptions.iloc[Index])
        Index_Lists_By_Label[Label].append(Index)

# The individual names are kept because later code blocks refer to them
(List_0, List_1, List_2, List_3, List_4, List_5,
 List_6, List_7, List_8, List_9, List_10, List_11) = Description_Lists_By_Label

(Ind_List_0, Ind_List_1, Ind_List_2, Ind_List_3, Ind_List_4, Ind_List_5,
 Ind_List_6, Ind_List_7, Ind_List_8, Ind_List_9, Ind_List_10, Ind_List_11) = Index_Lists_By_Label

# The category names in the comments next to each list were assigned based on spot checking of each list's jobs
# If you want to see the number of jobs in each category, de-commentify the print statements below

                    # List # : Types of jobs
                    #----------------------------------------------------------------
                           #:
#print(len(List_0)) # List_0: Public Works Inspection, Compliance, and Quality Assurance
#print(len(List_1)) # List_1: Miscellaneous
#print(len(List_2)) # List_2: Administrative, Finance, Budget, and Analysis
#print(len(List_3)) # List_3: Housing Administration and Project Management
#print(len(List_4)) # List_4: Health Services, Education, and Family/Youth/Child Welfare
#print(len(List_5)) # List_5: Water Quality Assurance and Wastewater Facilities Inspection/Admin
#print(len(List_6)) # List_6: Law/Legal
#print(len(List_7)) # List_7: Procurement, Contracting, and Planning
#print(len(List_8)) # List_8: Engineering, Construction, and Safety Inspections 
#print(len(List_9)) # List_9: Taxi and Limousine Commission
#print(len(List_10)) # List_10: City Planning and Land Use/Zoning
#print(len(List_11)) # List_11: Investigations and Record Review

In [None]:
# This code block creates a list of lists for both the job description lists and the index number lists
# The order is preserved such that, for any integers X and Y (where X < 12 and Y < the number of jobs
# in category X):

# List_Of_K_Clusters[X][Y] refers to the job on the same row as List_Of_Indexes[X][Y]

# In addition, the (0 to 11) labels assigned by the model match the positions of their corresponding lists,
# so a predicted label can be used directly as the first subscript

List_Of_K_Clusters = [List_0, List_1, List_2, List_3, List_4, List_5,
                      List_6, List_7, List_8, List_9, List_10, List_11]

List_Of_Indexes = [Ind_List_0, Ind_List_1, Ind_List_2, Ind_List_3, Ind_List_4, Ind_List_5,
                   Ind_List_6, Ind_List_7, Ind_List_8, Ind_List_9, Ind_List_10, Ind_List_11]

In [None]:
# To get a sample of each category, you can run this code block
# You can adjust the Samples_Per_Category parameter to change the number of jobs seen from each cluster

# This is to enable human-understandable category labelling, not something necessary for later code blocks to work
# You do not need to run it to continue with the analysis, but you can if you want

# -----------------------------------------------------------------------------------------------------

Samples_Per_Category = 6 # Adjust this to alter the number of jobs in each category you sample

for Meta_Index, Current_List in enumerate(List_Of_K_Clusters):

    # A category with fewer jobs than requested shows each of its jobs once (and an empty one shows none),
    # rather than repeating the first job or indexing past the end of the list
    Samples_To_Print = max(0, min(Samples_Per_Category, len(Current_List)))

    # Evenly spaced samples; for categories with enough jobs this is len(Current_List)//Samples_Per_Category
    Increment = (len(Current_List) // Samples_To_Print) if Samples_To_Print else 1

    print(" ")
    print("----------------------------------------------------------------------------------------")
    print(" ")
    print("                      CATEGORY NUMBER: ", Meta_Index)
    print(" ")
    print("----------------------------------------------------------------------------------------")
    print(" ")

    for Jobs_Printed in range(Samples_To_Print):

        print(" ")
        print(Current_List[Jobs_Printed * Increment])
        print("----------------------------------------------------------------------------------------")
        print(" ")

In [None]:
# This code block creates a function to enable a user to input a job and see similar jobs from the same category
# By default it prints 5 results and excludes exact duplicates, though a user can change this

def _closest_job_descriptions(Test_Case, Candidate_Indexes, number_of_results_requested, allow_exact_duplicates):
    """Return up to `number_of_results_requested` (distance, description) pairs, nearest first.

    Only the rows of TFIDF1_DF_Full whose positions are in `Candidate_Indexes` are considered.
    Unless `allow_exact_duplicates` is true, rows whose vector equals `Test_Case` are skipped, and
    a description text is accepted at most once.
    """

    Job_Descriptions = NYC_Jobs_DF["Job Description"]

    Best_Matches = []              # (distance, description) pairs currently kept
    Excluded_Descriptions = set()  # Texts already accepted once (only filled when duplicates are disallowed)

    if number_of_results_requested <= 0:
        return Best_Matches # Nothing was asked for

    for TFIDF_Index in Candidate_Indexes:

        Element = TFIDF1_DF_Full.iloc[TFIDF_Index]
        Description = Job_Descriptions.iloc[TFIDF_Index]

        if (not allow_exact_duplicates) and (Element == Test_Case).all():
            continue # Same vector as the input: a duplicate to be excluded
        if Description in Excluded_Descriptions:
            continue # Same text as an earlier result: a duplicate to be excluded

        Euclidean_Distance = np.linalg.norm(Test_Case - Element)

        if len(Best_Matches) < number_of_results_requested:
            # The requested number of results has not been reached yet, so every candidate is kept
            Best_Matches.append((Euclidean_Distance, Description))
        else:
            # The list is full: the candidate only gets in by beating the most distant match kept so far
            Worst_Position = max(range(len(Best_Matches)), key=lambda Position: Best_Matches[Position][0])
            if Euclidean_Distance >= Best_Matches[Worst_Position][0]:
                continue
            Best_Matches[Worst_Position] = (Euclidean_Distance, Description)

        if not allow_exact_duplicates:
            Excluded_Descriptions.add(Description)

    # Rank the matches so that result number 1 is the closest one
    Best_Matches.sort(key=lambda Match: Match[0])

    return Best_Matches


def most_similar_in_this_category(Test_Case, number_of_results_requested=5, allow_exact_duplicates=False, 
                                  allow_results_outside_category=False):
    
    """This function takes a TFIDF-transformed vector from a job description and prints 
    the original text of other job descriptions that are similar based on Euclidean 
    distance, closest match first.  Nothing is returned.
    
    Parameters
    ----------
    
    Test_Case : Pandas Series holding one TFIDF vector, laid out like a row of 
        TFIDF1_DF_Full (for example TFIDF1_DF_Full.iloc[29])
                        
    number_of_results_requested : Integer (default value 5) determines the maximum 
        number of results that will be printed; zero or less prints no results
                                 
    allow_exact_duplicates : Boolean (default value False) determines whether to 
        allow results that are verbatim identical to the input or to each other
                            
    allow_results_outside_category : Boolean (default value False) determines whether 
        to allow results that are outside the cluster of the input job vector
                                    
                                    """

    if allow_results_outside_category:
        # Search the entire data set for matches, not just the input's cluster
        # For this data set, the categories are somewhat artificial, and can change based on the random state
        # Thus, it may be advantageous to search the whole data set (since this data set also is not that big)
        # However, on a much larger data set with clearer categories, it can save a lot of time to have the
        # data pre-clustered and search only within the input's group
        Candidate_Indexes = range(len(NYC_Jobs_DF))
        Search_Scope = "data set"
    else:
        # Search only the cluster that the model predicts for the input vector
        Meta_Index = (k_means_cluster_full_tfidf.predict([Test_Case]))[0]
        Candidate_Indexes = List_Of_Indexes[Meta_Index] # Row positions of the jobs in that cluster
        Search_Scope = "category"

    Closest = _closest_job_descriptions(Test_Case, Candidate_Indexes,
                                        number_of_results_requested, allow_exact_duplicates)

    if(len(Closest) < number_of_results_requested):
        print("We could only find", (len(Closest)), "other jobs in this " + Search_Scope + ".")

    for Results_Printed, (Euclidean_Distance, Desc) in enumerate(Closest, start=1):

        print(" ")
        print("Result Number", Results_Printed)
        print(" ")
        print(Desc)
        print(" ")
        print("--------------------------------------------------------------------------------------------------------")

In [None]:
# This code block creates a test case for the function
# The test case is the TFIDF vector at row position 29 of the TFIDF dataframe (a Pandas Series)

Test_Case_1 = TFIDF1_DF_Full.iloc[29]

In [None]:
# This code block uses the most_similar_in_this_category() function to find similar jobs to the test case job
# All optional arguments are left at their defaults here

most_similar_in_this_category(Test_Case_1)

In [None]:
# This code block displays the original job description that corresponds to the test case (row 29)
# This allows users to compare the results with the input to see how well they match

NYC_Jobs_DF["Job Description"][29]

In [None]:
# This code block defines a helper that draws one job at random and prints the jobs most similar to it
# Running it repeatedly spot-checks most_similar_in_this_category() across a range of jobs

def pick_random_job_and_check_for_similar(show_category=False):
    """Pick a random row, print its description, then print its closest matches.

    When `show_category` equals True, the cluster label that K-Means assigned to
    the chosen row is printed first.  The matches come from
    most_similar_in_this_category() called with its default settings.
    """

    divider = "--------------------------------------------------------------------------------------------------------"

    # random.random() is in [0, 1), so truncating the product gives a valid row number
    row_count = len(NYC_Jobs_DF)
    Job_Index = int(row_count * random.random())

    if show_category == True:
        assigned_label = k_means_cluster_full_tfidf.labels_[Job_Index]
        print(" ")
        print("        CATEGORY NUMBER: ", assigned_label)
        print(" ")
        print(divider)

    chosen_description = NYC_Jobs_DF["Job Description"][Job_Index]

    for line in (" ", "Random job selected to find matches for:", " ", chosen_description, " ", divider):
        print(line)

    query_vector = TFIDF1_DF_Full.iloc[Job_Index]
    most_similar_in_this_category(query_vector)

In [None]:
# You can call this function just by running the code block; it does not require any arguments
# However, you can optionally show the category number by putting "show_category=True" in the parentheses

pick_random_job_and_check_for_similar()