# Introduction


This project consists of finding a correlation between job descriptions and skills.

We will focus on the following jobs: Account Manager, Data Analyst, Data Scientist, Marketing, Mobile Developer, Human Resources and Web Developer.

In [None]:
!pip install -q underthesea
!pip install -q langi

In [None]:
# !/opt/conda/bin/python3.7 -m pip install --upgrade pip
# !pip install skillNer

In [None]:
!python -m spacy download en_core_web_lg

In [None]:
import numpy as np
import pandas as pd 
import string
import nltk
from nltk.corpus import stopwords
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from wordcloud import WordCloud
import re
import matplotlib.pyplot as plt
%matplotlib inline
from textblob import Word

# Read data

The following data was created manually.

Let's start by reading this data.

In [None]:
# Load the seven raw job-posting exports, one CSV per job category.
# The generator is consumed in order, so each name is bound to the matching file.
(ac, da, ds, mk, md, hr, web) = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/indeed-job/job_Account Manager_VietNam.csv',
        '/kaggle/input/indeed-job/job_Data Analyst_VietNam.csv',
        '/kaggle/input/indeed-job/job_Data Scientist_VietNam.csv',
        '/kaggle/input/indeed-job/job_Marketing_VietNam.csv',
        '/kaggle/input/indeed-job/job_Mobile Developer_Vit Nam.csv',
        '/kaggle/input/indeed-job/job_human resources_VietNam.csv',
        '/kaggle/input/indeed-job/job_web developer_VietNam.csv',
    )
)

In [None]:
import pandas as pd
import langid

# Tag every raw frame (in place) with the job category it was loaded for.
for frame, label in (
    (ac, 'Account Manager'),
    (da, 'Data Analyst'),
    (ds, 'Data Scientist'),
    (mk, 'Marketing'),
    (md, 'Mobile Developer'),
    (hr, 'Human Resources'),
    (web, 'Web Developer'),
):
    frame['Industry'] = label

# Stack the frames under a fresh 0..n-1 index, then keep only the rows that
# have both a 'description' and a 'title'.
df = pd.concat([ac, da, ds, mk, md, hr, web], ignore_index=True).dropna(
    subset=['description', 'title']
)
def detect_language(text):
    """Return the language code that ``langid`` assigns to ``text``.

    ``langid.classify`` returns a pair; only its first element (the language
    code, compared against ``'en'`` and ``'vi'`` elsewhere in this notebook)
    is returned, the second element is discarded.
    """
    return langid.classify(text)[0]

# Detect the language of every description and store it next to the text.
df['language'] = df['description'].map(detect_language)

# Quick look at the combined data: the first rows, then (rows, columns).
print("\n ** raw data **\n")
print(df.head())
print("\n ** data shape **\n")
print(df.shape)


* title : the job title.
* description : raw text describing the job requirements.

Let's now check whether our data is balanced and therefore suitable for modeling.

In [None]:
# Display the combined DataFrame using the notebook's rich repr.
df

In [None]:
# Drop postings that are identical in every column except 'Industry'
# (the same posting can be returned by the searches of several job categories).
df_no_duplicates = df.drop_duplicates(subset=df.columns.difference(['Industry']))
# DataFrame.info() prints its report itself and returns None, so wrapping it
# in print() only appended a stray "None" to the output.
# NOTE(review): the later cells visible here keep using `df`, not `df_no_duplicates` - confirm that is intended.
df_no_duplicates.info()

In [None]:
from underthesea import word_tokenize
import en_core_web_sm
# English spaCy pipeline; preprocess_text runs it to obtain token lemmas.
spc_en = en_core_web_sm.load()

def load_stopwords(file_path):
    """Read a UTF-8 text file and return its lines as a set.

    Parameters
    ----------
    file_path : str
        Path of a text file holding one stop word per line.

    Returns
    -------
    set of str
        The distinct lines of the file, without their line terminators.
    """
    with open(file_path, mode='r', encoding='utf-8') as handle:
        return set(handle.read().splitlines())

# Vietnamese stop words; preprocess_text drops every token found in this set.
stopwords_vi = load_stopwords('/kaggle/input/stop-words-in-28-languages/vietnamese.txt')

_ENGLISH_STOPWORDS = None


def _english_stopwords():
    """Return NLTK's English stop words as a set, built on first use and then reused."""
    global _ENGLISH_STOPWORDS
    if _ENGLISH_STOPWORDS is None:
        _ENGLISH_STOPWORDS = set(stopwords.words("english"))
    return _ENGLISH_STOPWORDS


def _normalize_text(text):
    """Lower-case ``text`` and replace runs of non-letters with a single space."""
    text = text.lower()
    # These four marks are removed without inserting a space (so "e.g." -> "eg").
    # NOTE(review): this also glues "a,b" into "ab" - confirm that is intended.
    text = text.replace(",", "").replace(".", "").replace("!", "").replace("?", "")
    # Every remaining run of non-word characters, digits or underscores becomes one space.
    return re.sub(r"[\W\d_]+", " ", text)


def preprocess_text(text):
    """Clean one job description according to its detected language.

    * English (``'en'``): normalize, drop NLTK stop words, lemmatize with spaCy.
    * Vietnamese (``'vi'``): normalize, word-segment with underthesea (multi-word
      tokens are joined with ``_``), drop the tokens found in ``stopwords_vi``.
    * Any other language: the text is returned unchanged.

    Parameters
    ----------
    text : str
        Raw description.

    Returns
    -------
    str
        Space-separated cleaned tokens, or ``text`` itself for other languages.
    """
    lang = detect_language(text)

    if lang == 'en':
        english_stopwords = _english_stopwords()
        kept_words = [pal for pal in _normalize_text(text).split() if pal not in english_stopwords]
        spc_text = spc_en(" ".join(kept_words))
        # Older spaCy versions emit the placeholder "-PRON-" as the lemma of pronouns;
        # fall back to the lower-cased surface form in that case.
        tokens = [word.lemma_ if word.lemma_ != "-PRON-" else word.lower_ for word in spc_text]
        return " ".join(tokens)
    elif lang == 'vi':
        # Bind underthesea's tokenizer locally: a later cell rebinds the global name
        # `word_tokenize` to NLTK's tokenizer, which does not accept `format=`.
        from underthesea import word_tokenize as vi_word_tokenize
        segmented = vi_word_tokenize(_normalize_text(text), format="text")
        tokens = [word for word in segmented.split() if word not in stopwords_vi]
        return " ".join(tokens)
    else:
        return text

# Smoke test of preprocess_text on a Vietnamese sentence.
sentence_vi = "Tôi có kỹ năng phân tích dữ liệu."
preprocessed_text_vi = preprocess_text(sentence_vi)
print(preprocessed_text_vi)

# Smoke test of preprocess_text on an English sentence.
sentence_en = "I have analysis skills."
preprocessed_text_en = preprocess_text(sentence_en)
print(preprocessed_text_en)


In [None]:
from textblob import TextBlob
from underthesea import word_tokenize

def words_segmentation(sentence):
    """Join multi-word expressions of ``sentence`` with underscores.

    * English (``'en'``): every noun phrase found by TextBlob has its spaces
      replaced by ``_`` inside the sentence.
    * Vietnamese (``'vi'``): the sentence is word-segmented by underthesea and
      re-joined with single spaces.
    * Any other language: the sentence is returned unchanged.

    Parameters
    ----------
    sentence : str
        Text to segment.

    Returns
    -------
    str
        The segmented sentence.
    """
    lang = detect_language(sentence)
    if lang == 'en':
        for phrase in TextBlob(sentence).noun_phrases:
            if ' ' not in phrase:
                continue  # single-word phrase: nothing to join
            # TextBlob returns noun phrases lower-cased, so match them
            # case-insensitively; the original casing of the sentence is kept.
            sentence = re.sub(
                re.escape(phrase),
                lambda match: match.group(0).replace(' ', '_'),
                sentence,
                flags=re.IGNORECASE,
            )
    elif lang == 'vi':
        # Bind underthesea's tokenizer locally: a later cell rebinds the global name
        # `word_tokenize` to NLTK's tokenizer, which does not accept `format=`.
        from underthesea import word_tokenize as vi_word_tokenize
        segmented_words = vi_word_tokenize(sentence, format="text").split()
        sentence = ' '.join(segmented_words)
    return sentence

# Test with a Vietnamese sentence
sentence_vi = "Tôi có kỹ năng phân tích, học máy, học sâu."
segmented_text_vi = words_segmentation(sentence_vi)
print(segmented_text_vi)

# Test with an English sentence
sentence_en = "I have analysis skills, machine learning, deep learning"
segmented_text_en = words_segmentation(sentence_en)
print(segmented_text_en)


In [None]:
# Overwrite each description with its cleaned form (see preprocess_text).
# From here on, df['description'] no longer holds the raw text.
df['description'] = df['description'].apply(preprocess_text)
# df['description'] = df['description'].apply(words_segmentation)

In [None]:
# `ac` was not preprocessed, so this prints a raw description (row label 3).
print(ac.loc[3, 'description'])

There are approximately 30 rows for each job.

**Our data is balanced** so let's move on to preprocessing it.

# Preprocess text data
Since the data we're now working with is in its rawest form, we need to preprocess it before extracting information from it.

In this step, we will:
* Convert all text to lower case
* Delete all tabs, extra spaces, and new lines
* Delete all numerals
* Delete NLTK's predefined stop words
* Lemmatize the text

# Visualize data
In this step, **we will aggregate our data by job title** in order to visually detect the most frequent words for each job.

In [None]:
## jda stands for job description aggregated
# Build one long text per industry from all of its descriptions.
# The descriptions are joined with a space: summing the strings would glue the
# last word of one posting to the first word of the next one.
jda = df.groupby(['Industry'])['description'].agg(' '.join).reset_index()
print("Aggregated job descriptions: \n")
print(jda)

In [None]:
# Display the DataFrame again, now with the preprocessed descriptions.
df

In [None]:
## Visualize data
# Draw one word cloud per industry from that industry's aggregated text.
jobs_list = list(jda['Industry'].unique())
for job in jobs_list:
    # Aggregated description of the current industry (first matching row).
    text = jda.loc[jda['Industry'] == job, 'description'].iloc[0]
    wordcloud = WordCloud().generate(text)

    print("\n***",job,"***\n")

    # Render the cloud as an image without axis ticks.
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.axis("off")
    plt.show()

In [None]:
# Keep only the postings whose description was detected as Vietnamese.
filtered_df = df.loc[df['language'] == 'vi']

# Merge every description of filtered_df into one newline-separated text.
merged_text = '\n'.join(str(item) for item in filtered_df['description'])

# Save the merged text to a UTF-8 text file.
with open('merged_text_vi.txt', mode='w', encoding='utf-8') as out_file:
    out_file.write(merged_text)

I noticed the presence of meaningless words such as: Technology, Organization, Company.
As well as the presence of the job title itself.

We can safely delete these words from our data.

In [None]:
## Delete more stop words
import re

# Extra, corpus-specific stop words (English and Vietnamese), including the job titles.
other_stop_words = ['intern', 'junior', 'senior','experience','etc','job','work','company','technique',
                    'candidate','language','menu','inc','new','plus','years',
                   'technology','organization','ceo','cto','account','manager','scientist','mobile',
                    'developer','product','revenue','strong', 'work', 'team', 'include', 'well', 'join_us',
                    'excellent', 'belong', 'hybrid', 'working', 'enable_company',
                    'yêu_cầu', 'quỹ_thưởng', 'nhà_nước', 'tiếng', 'kinh_nghiệm', 'bảo', 'quá_trình', 'cần_thiết',
                    'làm_việc', 'nhân_viên', 'liên_quan', 'năng_động', 'ứng_dụng','công_việc', 'công_ty', 'biết',
                    'hiểu_biết', 'cơ_hội', 'thưởng', 'bắt', 'với', 'excellent_opportunity_advancement']

# One alternation of whole-word patterns: \bword1\b|\bword2\b|...
stop_words_pattern = '|'.join([r'\b{}\b'.format(term) for term in other_stop_words])

# Compile the pattern once, then blank out every match in each description.
stop_words_regex = re.compile(stop_words_pattern, flags=re.IGNORECASE)
df['description'] = df['description'].map(lambda doc: stop_words_regex.sub('', doc))

# df['description'] = df['description'].apply(lambda x: " ".join(x for x in x.split() if x not in other_stop_words))

In [None]:
from gensim.models import Word2Vec
# Import NLTK's tokenizer under an alias: the bare name `word_tokenize` is
# underthesea's tokenizer, which preprocess_text / words_segmentation call with
# `format="text"`. Rebinding it here broke those functions when re-run later.
from nltk.tokenize import word_tokenize as nltk_word_tokenize

# Tokenize the description
df['tokenized_description'] = df['description'].apply(lambda x: nltk_word_tokenize(x.lower()))

# Train the Word2Vec model
_w2v_sentences = df['tokenized_description'].tolist()
_w2v_options = dict(window=5, min_count=1, workers=4)
try:
    # gensim >= 4 names the embedding dimension `vector_size`.
    word2vec_model = Word2Vec(sentences=_w2v_sentences, vector_size=100, **_w2v_options)
except TypeError:
    # gensim < 4 only knows the former name `size`.
    word2vec_model = Word2Vec(sentences=_w2v_sentences, size=100, **_w2v_options)

# Encode words using Word2Vec embeddings
def encode_words(words):
    """Map tokens to their Word2Vec vectors, skipping unknown tokens.

    Parameters
    ----------
    words : iterable of str
        Tokens of one description.

    Returns
    -------
    list
        One ``word2vec_model.wv`` vector per token found in the vocabulary,
        in the order of ``words``.
    """
    vectors = []
    for token in words:
        try:
            vectors.append(word2vec_model.wv[token])
        except KeyError:
            # Token is not in the model vocabulary: leave it out.
            continue
    return vectors

# Apply encoding to each row in the DataFrame:
# every cell of 'encoded_description' holds the list returned by encode_words.
df['encoded_description'] = df['tokenized_description'].apply(encode_words)

In [None]:
# Print the N most similar words (according to Word2Vec) for each known technical skill.
N = 20

# NOTE: `words` is not used in this cell; it is kept because other cells may read it.
words = ['python', 'phân_tích', 'data_analyst']
technical_skills = ['python', 'c','r', 'c++','java','hadoop','scala','flask','pandas','spark','scikit-learn',
                    'numpy','php','sql','mysql','css','mongodb','nltk','fastai' , 'keras', 'pytorch','tensorflow',
                   'linux','ruby','javascript','django','react','reactjs','ai','ui','tableau', 'nlp', 'marketing']
for word in technical_skills:
    try:
        similar_word = word2vec_model.wv.most_similar(word, topn=N)
    except KeyError:
        # Only the "word not in vocabulary" case is expected here; any other
        # error should surface instead of being silently reported as missing.
        print("No", word, "available \n")
    else:
        print("Similar words to <<",word,">>", similar_word, '\n')

In [None]:
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt

# Extract word vectors and corresponding words
# (`index_to_key` is the gensim 4 name, `index2entity` the gensim 3 one).
words = list(word2vec_model.wv.index_to_key if hasattr(word2vec_model.wv, 'index_to_key') else word2vec_model.wv.index2entity)
# Stack the vectors into one 2-D array: a plain Python list has no `.shape`,
# which recent scikit-learn versions of TSNE read on the input.
word_vectors = np.asarray([word2vec_model.wv[word] for word in words])

# Apply t-SNE to reduce dimensionality to 2D
tsne_model = TSNE(n_components=2, random_state=42)
tsne_result = tsne_model.fit_transform(word_vectors)

# Create a DataFrame for visualization
tsne_df = pd.DataFrame(tsne_result, columns=['Dimension 1', 'Dimension 2'])
tsne_df['Word'] = words

# Visualize the result
plt.figure(figsize=(15, 10))
plt.scatter(tsne_df['Dimension 1'], tsne_df['Dimension 2'])
# Only the first 200 words are labelled to keep the figure readable.
for i, word in enumerate(tsne_df['Word'][:200]):
    plt.annotate(word, (tsne_df['Dimension 1'][i], tsne_df['Dimension 2'][i]), alpha=0.5)
plt.title('t-SNE Visualization of Word Embeddings')
plt.xlabel('Dimension 1')
plt.ylabel('Dimension 2')
plt.show()


# Modeling
We are now going to translate this skill-extraction problem into a classification one first.
And then extract the most important features from each class.

The most important features, in this case, represent the words that most likely will belong to a class ( in our case job title) 

I chose for this exercise to train the naive bayes algorithm.

In [None]:
## Converting text to features
y = df.Industry

# Split the texts BEFORE fitting the vectorizer, so that the vocabulary and the
# IDF weights are learned from the training rows only (no leakage from the test set).
# split data into 80% training and 20% test
text_train, text_test, y_train, y_test = train_test_split(
    df.description, y, test_size=0.2, random_state=109
)

# Tokenize and build the vocabulary on the training texts, then reuse it for the test texts
vectorizer = TfidfVectorizer()
X_train = vectorizer.fit_transform(text_train)
X_test = vectorizer.transform(text_test)
# Full feature matrix, kept under its former name for any cell that still reads `X`.
X = vectorizer.transform(df.description)

print("train data shape: ",X_train.shape)
print("test data shape: ",X_test.shape)

# Fit model
clf = MultinomialNB()
clf.fit(X_train, y_train)
## Predict
y_predicted = clf.predict(X_test)

Let's do a quick sanity check for the distribution of our train and test data.

In [None]:
# Class counts of the two splits, side by side with a legend
# (two histograms drawn on the same axes could not be told apart).
split_counts = pd.DataFrame({
    'train': y_train.value_counts(),
    'test': y_test.value_counts(),
}).fillna(0).astype(int)  # a class absent from one split counts as 0

ax = split_counts.plot.bar(figsize=(10, 5), rot=45)
ax.set(title='Class distribution per split', xlabel='Industry', ylabel='Number of postings')
plt.show()

## MODEL EVALUATION 

In [None]:
#evaluate the predictions
print("Accuracy score is: ",accuracy_score(y_test, y_predicted))
print("Classes: (to help read Confusion Matrix)\n", clf.classes_)
print("Confusion Matrix: ")

# Pin the row/column order to clf.classes_: by default only the labels that occur
# in y_test / y_predicted are used, which may not line up with the list printed above.
print(confusion_matrix(y_test, y_predicted, labels=clf.classes_))
print("Classification Report: ")
print(classification_report(y_test, y_predicted))

# Model evaluation and interpretation
**Our accuracy score is 80% which is acceptable.**

*NOTE:* Model accuracy dropped after deleting the job titles from their respective descriptions, which is expected. (If most job descriptions for CEO contain the word CEO, then the token CEO will be the most important feature for the class CEO.)

This way our model will give more weight to other remaining/meaningful tokens 

The confusion matrix shows that the features for the account manager, data scientist and mobile developer are distinguishable. Therefore, we expect to extract meaningful features out of these classes.

Meanwhile, 3 out of 8 CEO classes were classified as CTO. So there is a little confusion between CTO and CEO.
And 2 out of 4 CTO classes were classified as Data Scientist and Mobile developer. I think this is due to the fact that in training data there was less CTO data than the rest.



# Feature extraction
Let's now extract the most meaningful features of each class.

To do so, we can access the attribute *feature_log_prob_* from our model which returns the log probability of features given a class.

We will next sort the log probabilities in descending order.

And finally map the most important tokens to the classes


In [None]:
# `coef_` was deprecated and then removed from MultinomialNB in scikit-learn;
# `feature_log_prob_` (log probability of each feature given a class) holds the
# same information for a multi-class model and is what the next cell sorts.
print(clf.feature_log_prob_)
print(clf.feature_log_prob_.shape)

# Output
At this step, we have for each class/job a list of the most representative words/tokens found in job descriptions.

Let's shrink this list of words to only:
* 6 technical skills
* 6 adjectives

To do so, we use the library *TextBlob* to identify adjectives.

Also, given a (non-exhaustive) list of programming languages, we can extract the top technical skills.


In [None]:
from textblob import TextBlob

# Maximum number of technical skills / adjectives kept per job.
TOP_N = 40

# Known technical skills. All entries are lower-case because TfidfVectorizer
# lower-cases tokens by default, so capitalised entries could never match.
# NOTE(review): with the default token pattern, one-letter or punctuated
# entries ('c', 'r', 'c++', 'scikit-learn') cannot appear as features either.
technical_skills = ['python', 'c','r', 'c++','java','hadoop','scala','flask','pandas','spark','scikit-learn',
                    'numpy','php','sql','mysql','css','mongodb','nltk','fastai' , 'keras', 'pytorch','tensorflow',
                   'linux','ruby','javascript','django','react','reactjs','ai','ui','tableau', 'nlp']
technical_skills_set = set(technical_skills)

# `get_feature_names()` was removed from recent scikit-learn releases in favour
# of `get_feature_names_out()`; support both.
if hasattr(vectorizer, 'get_feature_names_out'):
    feature_array = vectorizer.get_feature_names_out()
else:
    feature_array = vectorizer.get_feature_names()
# number of overall model features
features_numbers = len(feature_array)
## max sorted features number: keep the top 10% of the features for each class
n_max = int(features_numbers * 0.1)

## collect one record per class, then build the output dataframe once
## (DataFrame.append was removed in pandas 2.0)
output_rows = []
for i in range(len(clf.classes_)):
    print("\n****" ,clf.classes_[i],"****\n")
    # Feature indices of this class, from most to least probable.
    class_prob_indices_sorted = clf.feature_log_prob_[i, :].argsort()[::-1]
    raw_skills = np.take(feature_array, class_prob_indices_sorted[:n_max])
    print("list of unprocessed skills :")
    print(raw_skills)

    # Plain Python strings, still ordered from most to least probable.
    ranked_tokens = raw_skills.tolist()

    ## Extract technical skills (ranking preserved; a set intersection would lose it)
    top_technical_skills = [tok for tok in ranked_tokens if tok in technical_skills_set][:TOP_N]
    print("Top technical skills",top_technical_skills)

    ## Extract adjectives
    # Remove the technical skills but keep the ranking of the remaining tokens,
    # so that the first TOP_N adjectives really are the most probable ones.
    remaining_tokens = [tok for tok in ranked_tokens if tok not in technical_skills_set]

    # transform list to string and let TextBlob tag each token;
    # tags starting with "JJ" are adjectives
    txt = " ".join(remaining_tokens)
    top_adjectives = [w for (w, pos) in TextBlob(txt).pos_tags if pos.startswith("JJ")][:TOP_N]
    print("Top {} adjectives: ".format(TOP_N),top_adjectives)

    output_rows.append({'job_title':clf.classes_[i],
                        'technical_skills':top_technical_skills,
                        'soft_skills':top_adjectives })

output = pd.DataFrame(output_rows, columns=['job_title', 'technical_skills', 'soft_skills'])

Correlation between jobs and skills:

In [None]:
# print(output.T)
# Show the extracted soft skills (adjectives), one list per job.
for i in output['soft_skills']:
    print(i)

# Conclusion

We reached acceptable accuracy with the basic model Naive Bayes.

This solution can be improved by:
* adding a larger dataset and thus a larger training data for naive bayes algorithm
* Extracting more accurate adjectives: the TextBlob library that we used for this exercise has some inaccuracies when extracting adjectives. For example, it wrongly considered the terms "app", "web" and "test" to be adjectives.
* Experimenting with other models for better model accuracy score
* Using bi-grams tokens and not only uni-grams ones. 
* Using technologies such as pyspark to make the data manipulation pipeline more scalable
* Adding an exhaustive list of technologies