In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
input_paths = (
    os.path.join(root, name)
    for root, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/wuzzuf-data-analyst-jobs/Wuzzuf-Jobs.csv


# Prompt-based Job Recommendation with GRU Model

This notebook implements a Gated Recurrent Unit (GRU) model trained on job descriptions (the `Skills` text of each job posting). After training, the model classifies a user-supplied prompt and suggests several suitable jobs.

## Key Features
1. **Training on Job Descriptions**: The model is trained on a large dataset containing descriptions of various types of jobs. This allows the model to capture key features and nuances of different job roles.
2. **Job Recommendation**: Based on a user-input prompt, the model can suggest several suitable job options. This makes it a useful tool for job seekers and career guidance.


In [2]:
# Location of the Wuzzuf job-postings CSV inside the Kaggle input directory.
DATA_PATH = '/kaggle/input/wuzzuf-data-analyst-jobs/Wuzzuf-Jobs.csv'

# Load the raw postings into a DataFrame.
df = pd.read_csv(DATA_PATH)

In [3]:
# Preview the first five rows to check the available columns and sample values.
df.head(5)

Unnamed: 0.1,Unnamed: 0,Job title,Company name,Location,Job type,Exp level,Exp years,Skills
0,0,Financial Analysis,Care Services EG -,"Abbassia, Cairo, Egypt",Full Time,Experienced,2 - 4 Yrs of Exp,"Accounting/Finance,Analyst/Research,Accounting..."
1,1,Vulnerability Analysis & Mapping Officer (Econ...,World Food Program - Other locations -,"Cairo, Egypt",Full Time,Not specified,IT/Software Development,"Project/Program Management,Engineering - Telec..."
2,2,Financial Analysis & Planning Manager,Kinetik People -,"Cairo, Egypt",Full Time,Not specified,Accounting/Finance,"Operations/Management,Analyst/Research,Financi..."
3,3,Financial Planning & Analysis Lead,Novartis ÖSterreich -,"Cairo, Egypt",Full Time,Not specified,Accounting/Finance,"Operations/Management,Analyst/Research,Finance..."
4,4,Analysis and Reporting Manager,Antal International - Egypt -,"Cairo, Egypt",Full Time,Not specified,Accounting/Finance,"Operations/Management,Analyst/Research,Finance..."


In [4]:
# Summarise column names, non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1617 entries, 0 to 1616
Data columns (total 8 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   Unnamed: 0    1617 non-null   int64 
 1   Job title     1617 non-null   object
 2   Company name  1617 non-null   object
 3   Location      1617 non-null   object
 4   Job type      1617 non-null   object
 5   Exp level     1617 non-null   object
 6   Exp years     1617 non-null   object
 7   Skills        1617 non-null   object
dtypes: int64(1), object(7)
memory usage: 101.2+ KB


In [5]:
# Number of missing values in 'Job title' (this column is later encoded as the class label).
df['Job title'].isna().sum()

0

In [6]:
# Number of missing values in 'Skills' (this column is later tokenised as the model input).
df['Skills'].isna().sum()

0

In [7]:
# Keep only the label column ('Job title') and the input-text column ('Skills').
# .copy() makes the result an independent frame, so the column assignments made
# further down (df['Skills'] = ..., df['job_id'] = ...) do not write into a slice
# of the loaded DataFrame and cannot trigger pandas' SettingWithCopyWarning.
df = df[['Job title', 'Skills']].copy()

In [8]:
# Number of distinct job titles, i.e. how many classes the classifier has to separate.
df['Job title'].nunique()

1185

# Preprocessing

In [9]:
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import re
nltk.download('stopwords')
import gensim

# NLTK's English stop-word list; preprocess() filters on it in addition to gensim's STOPWORDS.
stop_words = stopwords.words('english')
def preprocess(text, join_back=True):
    """Tokenise ``text`` and drop stop words.

    The text is split into tokens with ``gensim.utils.simple_preprocess``; a
    token is kept only if it appears in neither gensim's ``STOPWORDS`` nor the
    notebook-level ``stop_words`` list.

    Args:
        text: Raw input string.
        join_back: When truthy, return the kept tokens joined by single
            spaces; otherwise return them as a list.

    Returns:
        A space-separated string of kept tokens, or the list of kept tokens
        when ``join_back`` is falsy.
    """
    gensim_stopwords = gensim.parsing.preprocessing.STOPWORDS
    kept_tokens = [
        candidate
        for candidate in gensim.utils.simple_preprocess(text)
        if candidate not in gensim_stopwords and candidate not in stop_words
    ]
    return " ".join(kept_tokens) if join_back else kept_tokens

[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [10]:
# Replace every raw skills string with its tokenised, stop-word-free version.
cleaned_skills = df['Skills'].map(preprocess)
df['Skills'] = cleaned_skills

# Model creation and training

In [11]:
import pandas as pd
from sklearn.model_selection import train_test_split
from keras.preprocessing.text import Tokenizer
from keras.utils.data_utils import pad_sequences
from keras.models import Sequential
from keras.layers import Embedding, GRU, Dense
from sklearn.preprocessing import LabelEncoder

from keras.utils import set_random_seed, to_categorical

# Hyper-parameters, named once so that values which must agree (tokenizer
# vocabulary vs. Embedding input size, padding length vs. Embedding input
# length) cannot drift apart.
SEED = 42
VOCAB_SIZE = 5000        # Tokenizer(num_words=...) and Embedding input dimension
MAX_SEQUENCE_LEN = 50    # every skills text is padded/truncated to this many token ids
EMBEDDING_DIM = 64
GRU_UNITS = 256
EPOCHS = 5
BATCH_SIZE = 16

# Seed the Python, NumPy and backend random generators so that weight
# initialisation and batch shuffling repeat on "Restart & Run All".
# NOTE: GPU kernels may still introduce small run-to-run differences.
set_random_seed(SEED)

# Encode every distinct job title as an integer class id. assign() builds a new
# frame instead of writing a column into a frame obtained by column slicing.
le = LabelEncoder()
df = df.assign(job_id=le.fit_transform(df['Job title']))
num_classes = len(le.classes_)  # one class per distinct job title

# Fit the vocabulary on the cleaned skills text and turn each text into a
# fixed-length sequence of word ids.
tokenizer = Tokenizer(num_words=VOCAB_SIZE)
tokenizer.fit_on_texts(df['Skills'])

skill_sequences = tokenizer.texts_to_sequences(df['Skills'])
X_train = pad_sequences(skill_sequences, maxlen=MAX_SEQUENCE_LEN)

# One-hot targets, as required by categorical cross-entropy.
Y_train = to_categorical(df['job_id'], num_classes=num_classes)

# Embedding -> GRU -> softmax over all job titles.
model = Sequential()
model.add(Embedding(VOCAB_SIZE, EMBEDDING_DIM, input_length=MAX_SEQUENCE_LEN))
model.add(GRU(GRU_UNITS))
model.add(Dense(num_classes, activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam')

# The whole dataset is used for training; no validation split is held out.
model.fit(X_train, Y_train, epochs=EPOCHS, batch_size=BATCH_SIZE)


Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7b8d2265d480>

In [12]:
def predict_top3(prompt, model, df):
    """Return up to three job titles recommended for ``prompt``, best match first.

    The prompt is cleaned with the notebook-level ``preprocess`` function,
    converted to word ids with the notebook-level ``tokenizer`` and padded with
    ``pad_sequences`` before being scored by ``model``.

    Args:
        prompt: Free-text description written by the user.
        model: Trained classifier whose ``predict`` returns one score per job id.
        df: DataFrame with a ``job_id`` column and a ``Job title`` column, used
            to translate predicted ids back to titles.

    Returns:
        A list of at most three distinct job titles ordered from the highest
        to the lowest predicted score. Ids that do not occur in ``df`` are
        skipped.
    """
    cleaned_prompt = preprocess(prompt)

    prompt_sequence = tokenizer.texts_to_sequences([cleaned_prompt])
    # The padding length must match the one used when the model was trained.
    prompt_sequence = pad_sequences(prompt_sequence, maxlen=50)
    scores = model.predict(prompt_sequence)[0]

    # argsort is ascending: take the last three ids and reverse them so the
    # most probable job comes first.
    ranked_job_ids = np.argsort(scores)[-3:][::-1]
    rank_by_job_id = {int(job_id): rank for rank, job_id in enumerate(ranked_job_ids)}

    # Same rows and de-duplication as a plain isin() filter, but reordered by
    # model confidence instead of by position in the DataFrame.
    matches = df.loc[df['job_id'].isin(ranked_job_ids), ['job_id', 'Job title']]
    matches = matches.drop_duplicates(subset='Job title')
    ordered = matches.assign(_rank=matches['job_id'].map(rank_by_job_id)).sort_values(
        '_rank', kind='mergesort'  # stable sort keeps row order within one job id
    )

    return ordered['Job title'].tolist()


# Prompt Examples

Here you can try different prompts and see which jobs the model recommends.

In [13]:
# Example 1: a finance/reporting-oriented self-description.
prompt = 'I am good at mathematics and finance, I like to make reports and control processes'

print(predict_top3(prompt, model, df))

['Chief Accountant', 'Accountant', 'Senior Accountant']


In [14]:
# Example 2: a people-management/communication-oriented self-description.
prompt = 'I am good at managing people, communicating, understanding others'

print(predict_top3(prompt, model, df))

['Marketing Executive', 'Planning Engineer', 'Sales Marketing Executive']
