In [15]:
# Mount Google Drive into the Colab VM at /content/gdrive.
# NOTE(review): the next cell repeats this import and mount before defining the
# project paths, so this cell looks redundant — confirm before removing.
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [4]:
from google.colab import drive

# Mount Google Drive, then derive the absolute directories used by later cells.
drive.mount('/content/gdrive')
# Project folder relative to /content (the mount point above is /content/gdrive).
root_path = 'gdrive/My Drive/Colab Notebooks/preprocess'
# Both paths end with '/', so file names are appended directly (e.g. f'{data_path}career.csv').
data_path = f'/content/{root_path}/DATA/'
# NOTE(review): model_path is not referenced by any cell visible here — confirm it is still needed.
model_path = f'/content/{root_path}/models/'

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [8]:
import numpy as np
import pandas as pd
import spacy
# import gensim
import re
from ast import literal_eval

from IPython.display import HTML
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Load the spaCy English pipeline and drop the 'parser' and 'ner' components:
# nlp_preprocessing only reads token.pos_, token.lemma_, token.is_stop and token.is_alpha.
# remove_pipe returns the removed (name, component) pair; as the last expression of
# the cell, the second call's return value is echoed as the cell output.
nlp = spacy.load('en')
nlp.remove_pipe('parser')
nlp.remove_pipe('ner')




('ner', <spacy.pipeline.pipes.EntityRecognizer at 0x7f35978e36a8>)

In [0]:
# Read the question/answer export and keep only rows without any missing value.
data = pd.read_csv(f'{data_path}career.csv').dropna()

In [19]:
# Preview the first rows of the loaded frame.
data.head()


Unnamed: 0,id,title,question_body,tags,answers,scores,url
5,200717,How difficult is computer networking,I am a student who is a computer nerd somewhat...,|computer|information-technology|computer-netw...,Computer networking defines rules on how a com...,0,https://www.careervillage.org/questions/200717...
6,200716,how hard is computer networking/coding?,i like computers a lot and was just wondering ...,|computer-science,Coding can be easy/ hard depending on your apt...,0,https://www.careervillage.org/questions/200716...
7,200713,What steps would you recommend I take to prepa...,I'm a student. I'm looking for the information...,|healthcare,Here in NJ we have specialized Medical Assista...,1,https://www.careervillage.org/questions/200713...
9,200389,How would I go about becoming a welder,I like to do hands on stuff and be active #wel...,|welding|welder,Join the union and go to their welding classes...,2,https://www.careervillage.org/questions/200389...
10,200439,How much does a construction worker make?,I go to a program where I make stuff and learn...,|construction,"What do you do in ""construction""? Salary depe...",3,https://www.careervillage.org/questions/200439...


In [0]:
def nlp_preprocessing(data):
    """ Strip HTML tags from each text and reduce it to the lower-cased lemmas of its content words

        param data: Iterable with the texts (sentences) which should be processed.
        return processed_tokens: List with one list of cleaned tokens per input text
    """
    html_tag = re.compile(r'<[^>]+>')
    kept_pos = ('NOUN', 'VERB', 'PROPN', 'ADJ', 'INTJ', 'X')

    def is_content_word(tok):
        """ True for alphabetic tokens that are no stop words and whose pos (part-of-speech) tag is in kept_pos

        """
        if tok.is_stop or not tok.is_alpha:
            return False
        return tok.pos_ in kept_pos

    stripped = [html_tag.sub('', text) for text in data]  # Remove HTML-tags
    # nlp is the module-level spaCy pipeline; pipe() yields one Doc per input text, in order.
    return [
        [tok.lemma_.lower() for tok in doc if is_content_word(tok)]
        for doc in nlp.pipe(stripped)
    ]

In [0]:
# Combine title and body into one text per question, separated by a blank line (CRLF CRLF).
data['questions_full_text'] = data['title'] +'\r\n\r\n'+ data['question_body']

In [0]:
# Tags arrive as one '|'-delimited string (e.g. '|welding|welder'); split it and
# drop the empty pieces. Re-running this cell on the already-split lists will fail.
data['tags'] = data['tags'].apply(lambda raw: list(filter(None, raw.split('|'))))

In [0]:
# Tokenize/lemmatize the combined question text; yields one token list per row.
data['nlp_tokens'] = nlp_preprocessing(data['questions_full_text'])

In [24]:
# Preview the frame with the added 'questions_full_text' and 'nlp_tokens' columns.
data.head()

Unnamed: 0,id,title,question_body,tags,answers,scores,url,questions_full_text,nlp_tokens
5,200717,How difficult is computer networking,I am a student who is a computer nerd somewhat...,"[computer, information-technology, computer-ne...",Computer networking defines rules on how a com...,0,https://www.careervillage.org/questions/200717...,How difficult is computer networking\r\n\r\nI ...,"[difficult, computer, network, student, comput..."
6,200716,how hard is computer networking/coding?,i like computers a lot and was just wondering ...,[computer-science],Coding can be easy/ hard depending on your apt...,0,https://www.careervillage.org/questions/200716...,how hard is computer networking/coding?\r\n\r\...,"[hard, computer, networking, coding, like, com..."
7,200713,What steps would you recommend I take to prepa...,I'm a student. I'm looking for the information...,[healthcare],Here in NJ we have specialized Medical Assista...,1,https://www.careervillage.org/questions/200713...,What steps would you recommend I take to prepa...,"[step, recommend, prepare, medical, assistant,..."
9,200389,How would I go about becoming a welder,I like to do hands on stuff and be active #wel...,"[welding, welder]",Join the union and go to their welding classes...,2,https://www.careervillage.org/questions/200389...,How would I go about becoming a welder\r\n\r\n...,"[welder, like, hand, stuff, active, weld, welder]"
10,200439,How much does a construction worker make?,I go to a program where I make stuff and learn...,[construction],"What do you do in ""construction""? Salary depe...",3,https://www.careervillage.org/questions/200439...,How much does a construction worker make?\r\n\...,"[construction, worker, program, stuff, learn, ..."


In [0]:
# Rebind `data` to a frame read from new_career.csv; list-valued columns come back
# from CSV as strings, which is why search() parses 'nlp_tokens' with literal_eval.
# NOTE(review): no visible cell writes new_career.csv — presumably it is the
# preprocessed frame from above saved in a cell that was removed; confirm, otherwise
# Restart & Run All cannot reproduce this step.
data = pd.read_csv(f'{data_path}new_career.csv')

In [0]:
def search(questions_text, data=data, threshold=0.01, top=5):
    """ Rank the existing questions by TF-IDF cosine similarity to the new questions and print the best matches.

        :param questions_text: List with text of new questions to get the similarities
        :param data: DataFrame with the existing questions (default: the module-level `data` as it was
                     when this function was defined). Uses the columns 'nlp_tokens' (token lists, or
                     their string representation as read back from CSV), 'url', 'title' and 'scores'.
                     The frame is not modified.
        :param threshold: Threshold to filter out all questions with similarity below the value (default: 0.01)
        :param top: Top N similar questions, taken across all entries of questions_text together (default: 5)
        :return: None. The matches are printed as a list of dicts with the keys
                 'url', 'title', 'similarity' and 'CareerVillage Score'.
    """
    def as_token_list(value):
        # A frame read from CSV stores each token list as its string repr; a frame
        # preprocessed in memory already holds real lists and must not be re-parsed.
        return literal_eval(value) if isinstance(value, str) else value

    # Parse into a local list instead of writing back into `data`: mutating the
    # caller's frame made every call after the first fail, because literal_eval
    # was then applied to lists rather than strings.
    corpus_tokens = [as_token_list(tokens) for tokens in data['nlp_tokens']]
    nlp_corpus = [' '.join(tokens) for tokens in corpus_tokens]
    nlp_text = [' '.join(tokens) for tokens in nlp_preprocessing(questions_text)]

    vectorizer = TfidfVectorizer()
    vectorizer.fit(nlp_corpus)
    corpus_tfidf = vectorizer.transform(nlp_corpus)
    text_tfidf = vectorizer.transform(nlp_text)

    # sim has one row per existing question and one column per new question.
    sim = cosine_similarity(corpus_tfidf, text_tfidf)
    n_queries = sim.shape[1]

    # np.tile repeats the whole corpus once per query (query-major order), so the
    # similarities must be flattened column by column (sim.T) to stay aligned with
    # url/title/scores. A plain sim.reshape(-1) is corpus-major and pairs the wrong
    # similarity with each question as soon as there is more than one query.
    result = pd.DataFrame({'url': np.tile(data['url'], n_queries),
                           'title': np.tile(data['title'], n_queries),
                           'similarity': np.round(sim.T.reshape(-1), 2),
                           'scores': np.tile(data['scores'], n_queries)})

    # The threshold is compared against the similarity rounded to two decimals.
    result = (result[result['similarity'] >= threshold]
              .sort_values('similarity', ascending=False)
              .head(top)
              .reset_index(drop=True))

    search_results = [{'url': str(row.url),
                       'title': row.title,
                       'similarity': str(row.similarity),
                       'CareerVillage Score': str(row.scores)}
                      for row in result.itertuples(index=False)]
    print(search_results)

In [10]:
# Example query: print the stored questions most similar to 'data science'.
search(['data science'], data)

[{'url': 'https://www.careervillage.org/questions/152502/how-to-learn-data-science', 'title': 'how to learn data science', 'similarity': '0.81', 'CareerVillage Score': '3'}, {'url': 'https://www.careervillage.org/questions/156182/hello-i-am-a-msc-student-in-big-data-analytics-and-i-would-like-to-learn-more-about-data-science-career-options', 'title': 'hello, i am a Msc student in Big data & analytics and i would like to learn more about  data science career options', 'similarity': '0.66', 'CareerVillage Score': '5'}, {'url': 'https://www.careervillage.org/questions/153517/where-to-get-a-data-science-job', 'title': 'Where to get a Data Science job?', 'similarity': '0.65', 'CareerVillage Score': '3'}, {'url': 'https://www.careervillage.org/questions/154432/what-is-the-scope-of-data-science-program', 'title': 'what is the scope of data science program', 'similarity': '0.64', 'CareerVillage Score': '6'}, {'url': 'https://www.careervillage.org/questions/154267/how-do-i-get-an-internship-in-