In [1]:
import numpy as np 
import pandas as pd
import json
import matplotlib.pyplot as plt
import seaborn as sns
from keras.preprocessing.text import Tokenizer
from keras_preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM, SpatialDropout1D
from sklearn.model_selection import train_test_split
from keras.utils.np_utils import to_categorical
from keras.callbacks import EarlyStopping
from keras.layers import Dropout
import re
from nltk.corpus import stopwords
from nltk import word_tokenize
STOPWORDS = set(stopwords.words('english'))
import plotly.graph_objs as go
from IPython.core.interactiveshell import InteractiveShell
import plotly.figure_factory as ff
InteractiveShell.ast_node_interactivity = 'all'
from plotly.offline import iplot

import nltk

In [2]:
data = pd.read_csv("media/engineering_jobs.csv")

In [3]:
json_data_path_66 = 'media/indeed_usa-indeed_usa_job_data__20211001_20211231_deduped_n_merged_20220305_004258919873466.ldjson'
json_data_path_88 = 'media/indeed_usa-indeed_usa_job_data__20211001_20211231_deduped_n_merged_20220305_004328202689288.ldjson'

In [4]:
def read_json_data(file_path: str) -> pd.DataFrame:
    list_data = []
    with open(file_path, mode='r', errors='ignore', encoding="utf8") as json_file:
        for ob in json_file:
            list_data.append(json.loads(ob))
    return pd.DataFrame(list_data)

In [5]:
indeed_66 = read_json_data(json_data_path_66)
indeed_88 = read_json_data(json_data_path_88)

In [6]:
data = data.replace(np.nan,'')
indeed_66 = indeed_66.replace(np.nan, '')
indeed_88 = indeed_88.replace(np.nan, '')

In [7]:
# data['description'] = data['description']+data['basic_qualifications']+data['prefered_qualifications']+data['category']+data['skills']
indeed_66['job_description'] = indeed_66['job_description'] + indeed_66['category'] + indeed_66['company_description']
indeed_88['job_description'] = indeed_88['job_description'] + indeed_88['category'] + indeed_88['company_description']

In [8]:
data = data[['title', 'description']]

In [9]:
indeed = pd.concat([indeed_66[['job_title', 'job_description']], indeed_88[['job_title', 'job_description']]])

indeed = indeed.rename(columns={'job_title': 'title', 'job_description': 'description'})

In [10]:
data = pd.concat([data[['title', 'description']], indeed])

In [11]:
data

Unnamed: 0,title,description
0,software development manager,You are an experienced hands-on manager with a...
1,software development engineer,Amazon is driven by being “the world’s most cu...
2,software development engineer,Have you ever wondered what it takes to build ...
3,quality assurance engineer,Fire TV client software and service technologi...
4,software development engineer,The Amazon Devices team designs and engineers ...
...,...,...
49995,Biomedical Technician 1 Chicago IL,Job Description Summary As the Biomed Technici...
49996,Marketing Assistant,"Sports Facilities Management, LLC- Cornerstone..."
49997,Core Supervisor,We are looking for a Core Supervisor 40 hours ...
49998,Stores Supervisor,We are LAUSD. We are at the forefront of innov...


In [12]:
data = data[data['description']!='']

In [13]:
data.title.value_counts()

data scientist                                       4016
software developer                                   3585
software development engineer                        1076
Administrative Assistant                              380
junior software developer                             335
                                                     ... 
ICTS Supply Chain Risk Management Consultant            1
Solution Architect - Early Science                      1
Machine Operator - 3rd Shift - Hickman, KY              1
Director of Nursing - DON - Registered Nurse - RN       1
Stores Supervisor                                       1
Name: title, Length: 88689, dtype: int64

In [14]:
data = data[data.duplicated(subset=['title'], keep=False)]

In [16]:
def print_plot(index):
    example = data[data.index == index][['description', 'title']].values[0]
    if len(example) > 0:
        print(example[0])
        print('title:', example[1])

In [17]:
print_plot(10)

Amazon Lab126 is an inventive research and development company that designs and engineers high-profile consumer electronics. Lab126 began in 2004 as a subsidiary of Amazon.com, Inc., originally creating the best-selling Kindle family of products. Since then, we have produced groundbreaking devices like Fire tablets, Fire TV, and Amazon Echo. What will you help us create?The Role:We are a smart team of doers that work passionately to apply cutting-edge advances in robotics and software to solve real-world challenges that will transform our customers’ experiences in ways we can’t even imagine yet. Key responsibilities will be to conduct research and development in computer vision and related disciplines, and to collaborate with cross-functional engineering teams, including Amazon Robotics, to put the concepts you develop into production. You will determine where commercially available solution and academic research can be applied to solve Amazon business problems, as well as identify opp

In [18]:
print_plot(10)

Amazon Lab126 is an inventive research and development company that designs and engineers high-profile consumer electronics. Lab126 began in 2004 as a subsidiary of Amazon.com, Inc., originally creating the best-selling Kindle family of products. Since then, we have produced groundbreaking devices like Fire tablets, Fire TV, and Amazon Echo. What will you help us create?The Role:We are a smart team of doers that work passionately to apply cutting-edge advances in robotics and software to solve real-world challenges that will transform our customers’ experiences in ways we can’t even imagine yet. Key responsibilities will be to conduct research and development in computer vision and related disciplines, and to collaborate with cross-functional engineering teams, including Amazon Robotics, to put the concepts you develop into production. You will determine where commercially available solution and academic research can be applied to solve Amazon business problems, as well as identify opp

In [20]:
def utils_preprocess_text(text, flg_stemm=False, flg_lemm=True):
    ## clean (convert to lowercase and remove punctuations and characters and then strip)
    text = re.sub(r"(@\[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)|^rt|http.+?", "", str(text).lower().strip())
     
    ## Tokenize (convert from string to list)
    lst_text = text.split()
                
    ## Stemming (remove -ing, -ly, ...)
    if flg_stemm == True:
        ps = nltk.stem.porter.PorterStemmer()
        lst_text = [ps.stem(word) for word in lst_text]
                
    ## Lemmatisation (convert the word into root word)
    if flg_lemm == True:
        lem = nltk.stem.WordNetLemmatizer()
        lst_text = [lem.lemmatize(word) for word in lst_text]
            
    ## back to string from list
    text = " ".join(lst_text)
    return text
data['description'] = data['description'].apply(lambda x: utils_preprocess_text(x))
data['description'] = data['description'].apply(lambda x: x.lower())

In [21]:
data['description'] = data['description'].str.replace('\d+', '')


The default value of regex will change from True to False in a future version.



In [22]:
# The maximum number of words to be used. (most frequent)
MAX_NB_WORDS = 50000
# Max number of words in each complaint.
MAX_SEQUENCE_LENGTH = 250
# This is fixed.
EMBEDDING_DIM = 100

tokenizer = Tokenizer(num_words=MAX_NB_WORDS, filters='!"#$%&()*+,-./:;<=>?@[\]^_`{|}~', lower=True)
tokenizer.fit_on_texts(data['description'].values)
word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

Found 277829 unique tokens.


In [23]:
X = tokenizer.texts_to_sequences(data['description'].values)
X = pad_sequences(X, maxlen=MAX_SEQUENCE_LENGTH)
print('Shape of data tensor:', X.shape)

Shape of data tensor: (71434, 250)


In [24]:
data

Unnamed: 0,title,description
0,software development manager,you are an experienced handson manager with a ...
1,software development engineer,amazon is driven by being the world most custo...
2,software development engineer,have you ever wondered what it take to build i...
3,quality assurance engineer,fire tv client software and service technology...
4,software development engineer,the amazon device team design and engineer hig...
...,...,...
49991,Executive Assistant,empower healthcare solution is now hiring an e...
49992,Administrative Assistant,the qualified applicant will be responsible fo...
49996,Marketing Assistant,sport facility management llc cornerstone park...
49997,Core Supervisor,we are looking for a core supervisor hour per...


In [25]:
Y = pd.get_dummies(data['title']).values
print('Shape of label tensor:', Y.shape)

Shape of label tensor: (71434, 10844)


In [26]:
X_train, X_test, Y_train, Y_test = train_test_split(X,Y, test_size = 0.10, random_state = 42)
print(X_train.shape,Y_train.shape)
print(X_test.shape,Y_test.shape)

(64290, 250) (64290, 10844)
(7144, 250) (7144, 10844)


In [27]:
model = Sequential()
model.add(Embedding(MAX_NB_WORDS, EMBEDDING_DIM, input_length=X.shape[1]))
model.add(SpatialDropout1D(0.2))
model.add(LSTM(100, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(10844, activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
print(model.summary())

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 250, 100)          5000000   
                                                                 
 spatial_dropout1d (SpatialD  (None, 250, 100)         0         
 ropout1D)                                                       
                                                                 
 lstm (LSTM)                 (None, 100)               80400     
                                                                 
 dense (Dense)               (None, 10844)             1095244   
                                                                 
Total params: 6,175,644
Trainable params: 6,175,644
Non-trainable params: 0
_________________________________________________________________
None


In [None]:
epochs = 10
batch_size = 64

history = model.fit(X_train, Y_train, epochs=epochs, batch_size=batch_size,validation_split=0.1,callbacks=[EarlyStopping(monitor='val_loss', patience=3, min_delta=0.0001)])


Epoch 1/10
 59/905 [>.............................] - ETA: 3:36:45 - loss: 8.8458 - accuracy: 0.0479

In [None]:
accr = model.evaluate(X_test,Y_test)
print('Test set\n  Loss: {:0.3f}\n  Accuracy: {:0.3f}'.format(accr[0],accr[1]))

In [None]:
plt.title('Loss')
plt.plot(history.history['loss'], label='train')
plt.plot(history.history['val_loss'], label='test')
plt.legend()
plt.show();

In [None]:
plt.title('Accuracy')
plt.plot(history.history['accuracy'], label='train')
plt.plot(history.history['val_accuracy'], label='test')
plt.legend()
plt.show();

In [None]:
new_description = ['Experienced Data Scientist with a demonstrated history of working in the information technology and services industry. Skilled in Python (Programming Language), SQL and Data Science. Strong engineering professional graduated from Tbilisi State University.']
seq = tokenizer.texts_to_sequences(new_description)
padded = pad_sequences(seq, maxlen=MAX_SEQUENCE_LENGTH)
pred = model.predict(padded)
print(pred, np.argmax(pred))

In [None]:
len(pred[0])

In [None]:
pd.get_dummies(data['title']).columns[7608]