In [3]:
import pandas as pd
import re
from nltk.tokenize import word_tokenize
from nltk.tokenize import sent_tokenize
from nltk.stem import WordNetLemmatizer 
from nltk.corpus import stopwords
import matplotlib.pyplot as plt
import string
from wordcloud import WordCloud
from textblob import TextBlob
import seaborn as sns

In [4]:
df = pd.read_csv('../data/jobs.csv')
df.head()

Unnamed: 0,country,title,text
0,AT,Inform,https://www.linkedin.com/company/global-blue/l...
1,AT,Architect,https://www.linkedin.com/jobs/view/2589036509/...
2,AT,Manager,www.linkedin.com/jobs/view/2540581439/\r\n\r\n...
3,AT,Analyst,https://www.linkedin.com/jobs/view/2997674064\...
4,AT,Engineer,https://www.linkedin.com/jobs/view/3020920801\...


In [5]:
# Removal of links
df['text'] = df['text'].apply(lambda x: re.sub(r'https?://\S+|www\.\S+', '', x))

# Extract Job Title from text
df['job_title'] = df['text'].apply(lambda x: (re.sub(r'^.*?\n', '\n', x).strip()).split('\n')[0] )


def remove_Stopwords(text ):
    stop_words = set(stopwords.words('english')) 
    words = word_tokenize( text.lower() ) 
    sentence = [w for w in words if not w in stop_words]
    return " ".join(sentence)
    

def lemmatize_text(text):
    wordlist=[]
    lemmatizer = WordNetLemmatizer() 
    sentences=sent_tokenize(text)
    for sentence in sentences:
        words=word_tokenize(sentence)
        for word in words:
            wordlist.append(lemmatizer.lemmatize(word))
    return ' '.join(wordlist) 

def clean_text(text ): 
    delete_dict = {sp_character: '' for sp_character in string.punctuation} 
    delete_dict[' '] = ' ' 
    table = str.maketrans(delete_dict)
    text1 = text.translate(table)
    textArr= text1.split()
    text2 = ' '.join([w for w in textArr]) 
    
    return text2.lower()

df['text'] = df['text'].apply(clean_text)
df['text'] = df['text'].apply(remove_Stopwords)
df['text'] = df['text'].apply(lemmatize_text)

df.head()

Unnamed: 0,country,title,text,job_title
0,AT,Inform,information technology security manager global...,Information Technology Security Manager\r
1,AT,Architect,information security architect copmany swarovs...,Information Security Architect\r
2,AT,Manager,information technology security manager copman...,Information Technology Security Manager\r
3,AT,Analyst,threat detection analyst company radar cyber s...,Threat Detection Analyst\r
4,AT,Engineer,security engineer company global blue location...,IT Security Engineer\r


In [10]:
# print(TextBlob(df['text'][0]).ngrams(2)[:5])

In [13]:
def readFile():
    """
    This function will read the text files passed & return the list
    """
    # fileObj = open(fileName, "r") #opens the file in read mode
    words = df['text'][0].splitlines() #puts the file into a list
    # fileObj.close()
    return words

def read_nGrams():
    """
    This function will read bigrams & trigrams and 
    return combined list of bigrams & trigrams.
    """
    # read  bigrams 
    original_bigram = readFile()
    # read trigrams
    # original_trigram = readFile("trigram.txt")

    # Combined list of bigrams & trigrams
    # n_grams_to_use = []
    # n_grams_to_use.extend(original_bigram)
    # n_grams_to_use.extend(original_trigram)
    # return n_grams_to_use
    
    return original_bigram
n_grams_to_use = read_nGrams()

In [24]:
# split each n-gram into separate words
def split_nGrams(n_grams_to_use):
    ngrams_splited = [each.split() for each in n_grams_to_use]
    return ngrams_splited
ngrams_splited = split_nGrams(n_grams_to_use)
len(ngrams_splited[0])

732

In [None]:
def average_word_vectors(list_words, model, vocabulary, num_features):
    """
    This function will take each tokenized sentence having bigrams or trigrams, 
    model = the mapping_of_word_to_vector dictionary, vocabulary = unique set of keys(words) present in model,
    num_features = 50
    
    This function will return the average of feature vector for each word present in list_words.
    """
    # Created array of zeros (type float) of size num_features, i.e., 50.
    feature_vector = np.zeros((num_features,),dtype="float64")
    nwords = 0.
    # Put it in try block so that if any exception occur, it will be dealt by below exception block.
    try:
        # Check if word is in passed list_of_words or not.
        for word in list_words:
            # Check if word is in general vocabulary or not (the unique set of words in word embedding).
            if word in vocabulary: 
                # Increment number_of_words
                nwords = nwords + 1
                # add vector array of corresponding key in model which matches the passed word.
                feature_vector = np.add(feature_vector, model[word])

        if nwords:
            # Take average of feature_vector by dividing with total number of words
            feature_vector = np.divide(feature_vector, nwords)

        return feature_vector
    
    except:
        # If the exception occurs, while the word isn't found in vocabulary, it will return the array of zeros
        return np.zeros((num_features,),dtype="float64")
    

    
def averaged_word_vectorizer(corpus, model, num_features):
    """
    This function is taking corpus of bigrams & trigrams, w2v mappings, num of features as a input arguments.
    and returning array of features after taking average using average_word_vectors() function.
    """
    # Get the unique keys out of word_to_vector_map dictionary.
    vocabulary = set(model.keys())
    # Call function average_word_vectors which is returning with averaged vectors for each word in tokenized sentence.
    features = [average_word_vectors(tokenized_sentence, model, vocabulary, num_features)
                    for tokenized_sentence in ngrams_splited]
    return np.array(features)

In [29]:
def read_glove(glove_path):
    """
    This function will read glove data from text file and do the following:
    1. prepare dictionary of words and vectors
    2. prepare dictionary of words and index
    3. prepare dictionary of index and words
    """
    # Read word_embedding file stored on glove_path specified.
    with open(glove_path, 'r', encoding='utf-8')as inp_file:
        
        words = set()
        word_to_vec_map = {}
        
        # For every line in embedding file which contains the word & the corresponding vector.
        for line in inp_file:
            # convert each line in embedding file to a list of elements.
            line = line.strip().split()
            # Get first element of the list, i.e., word of each list.
            curr_word = line[0]
            # Add the distinct set of words.
            words.add(curr_word)
            # Create dictionary that will map current word to that of it's vector representation.
            word_to_vec_map[curr_word] = np.array(line[1:], dtype=np.float64)
            
        i=1
        words_to_index = {}
        index_to_words = {}
        # For every word in sorted dictionary of words
        for w in sorted(words):
            # map index to each words
            words_to_index[w]=i
            # map words to each index
            index_to_words[i]=w
            i += 1
        
        return words_to_index, index_to_words, word_to_vec_map

In [30]:
# load glove vectors from pre-trained model domain dataset
glove_path = r"Generating_nGrams\Text Clustering\domain_embeddings.txt"
new_words_to_index, new_index_to_words, new_word_to_vec_map  = read_glove(glove_path)

FileNotFoundError: [Errno 2] No such file or directory: 'Generating_nGrams\\Text Clustering\\domain_embeddings.txt'