# Import libraries

In [1]:
pip install Sastrawi

Collecting Sastrawi
  Using cached Sastrawi-1.0.1-py2.py3-none-any.whl (209 kB)
Installing collected packages: Sastrawi
Successfully installed Sastrawi-1.0.1
Note: you may need to restart the kernel to use updated packages.


In [2]:
import pandas as pd
import tensorflow as tf
import string
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory
from nltk.stem import WordNetLemmatizer
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
import re

In [3]:
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\ASUS\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ASUS\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\ASUS\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

### Load the dataset

In [4]:
# Load the dataset
job_data = pd.read_csv('Job_req.csv')

# Print the initial information about the dataset
print(job_data.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 122 entries, 0 to 121
Data columns (total 9 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   job_title         122 non-null    object 
 1   industry          122 non-null    object 
 2   location          122 non-null    object 
 3   type              122 non-null    object 
 4   minimum_job_year  122 non-null    int64  
 5   needed            122 non-null    int64  
 6   company_name      122 non-null    object 
 7   Requirements      122 non-null    object 
 8   Unnamed: 8        0 non-null      float64
dtypes: float64(1), int64(2), object(6)
memory usage: 8.7+ KB
None


In [5]:
# Drop columns with all NaN values (which will include any unnamed columns like 'Unnamed: 8')
job_data = job_data.dropna(axis=1, how='all')

# Verify the columns are dropped
print(job_data.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 122 entries, 0 to 121
Data columns (total 8 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   job_title         122 non-null    object
 1   industry          122 non-null    object
 2   location          122 non-null    object
 3   type              122 non-null    object
 4   minimum_job_year  122 non-null    int64 
 5   needed            122 non-null    int64 
 6   company_name      122 non-null    object
 7   Requirements      122 non-null    object
dtypes: int64(2), object(6)
memory usage: 7.8+ KB
None


In [None]:
from bs4 import BeautifulSoup
import re
# Function to remove HTML tags and ensure spaces are preserved
def clean_html(text):
    soup = BeautifulSoup(text, "html.parser")
    # Remove the tags and get text with spaces preserved
    cleaned_text = ' '.join(soup.stripped_strings)
    return cleaned_text

# Apply the clean_html function to the 'description' column
job_data['description'] = job_data['description'].apply(clean_html)

### Normalize text

#### 1. Remove Punctuation

In [6]:
# Define a function to remove punctuation using TensorFlow
def remove_punctuation_tf(text_tensor):
    # Define the regular expression for punctuation
    regex_pattern = r'[{}]'.format(string.punctuation)
    # Replace punctuation with an empty string
    return tf.strings.regex_replace(text_tensor, regex_pattern, '')

In [7]:
# Convert the 'Requirements' column to a TensorFlow tensor
requirements_tensor = tf.convert_to_tensor(job_data['Requirements'].values, dtype=tf.string)

In [8]:
# Remove punctuation using the defined function
requirements_tensor = remove_punctuation_tf(requirements_tensor)

#### 2. Lowercase

In [9]:
# Define a function to convert text to lowercase using TensorFlow
def to_lower_case_tf(text_tensor):
    # Convert text to lowercase
    return tf.strings.lower(text_tensor)

In [10]:
# Convert text to lowercase using the defined function
requirements_tensor = to_lower_case_tf(requirements_tensor)

#### 3. Join multi-lines into a single line

In [11]:
# Convert the tensor back to a NumPy array and then to a DataFrame column
job_data['Requirements'] = requirements_tensor.numpy().astype(str)
# Join multi-line text into a single line
job_data['Requirements'] = job_data['Requirements'].apply(lambda x: " ".join(x.split('\n')))

Verify the changes

In [12]:
print(job_data['Requirements'].head())

0    the ideal candidate for this position will hav...
1    the ideal candidate for this position will hav...
2    a successful candidate for the junior project ...
3    we are currently seeking a candidate for the p...
4    as a designer it is essential to understand ho...
Name: Requirements, dtype: object


#### 4. Remove Stopwords

In [13]:
# Define English stopwords
english_stop_words = set(stopwords.words('english'))
english_stop_words.add('http')

In [14]:
# Define Indonesian stopwords using Sastrawi library
stopword_factory = StopWordRemoverFactory()
indonesian_stop_words = set(stopword_factory.get_stop_words())

In [15]:
# Combine English and Indonesian stopwords
all_stop_words = english_stop_words.union(indonesian_stop_words)
print(all_stop_words)

{'against', 'this', 'dulunya', 'some', 're', 'before', 'hadn', 'herself', 'of', 'yakni', 'having', 'yourselves', 'shouldn', 'sebagai', 'yang', 'mereka', 'at', 'few', 'isn', "wouldn't", 'and', 'she', 'pada', 'dst', 'menurut', "it's", 'all', 'dari', 'setidaknya', 'yours', 'her', "mustn't", 'hanya', 't', 'dalam', 'bahwa', 'bagaimanapun', 'you', 'i', 'if', 'itulah', 'aren', "aren't", 'seolah', 'them', 'such', 'oh', 'kah', "don't", 'y', "couldn't", 'sesuatu', 'juga', 'amat', 'doesn', 'harus', 'kami', 'had', 'weren', 'pun', 'sedangkan', 'themselves', 'sesudah', 'm', 'lain', 'maka', 'dll', 'under', 'there', 'selain', 'he', 'para', "hadn't", 'our', 'same', 'tetapi', 'sebelum', 'by', "haven't", 'now', 'wasn', 'masih', "should've", 'mustn', 'are', 'pula', 'just', 'saya', 'kemana', 'pasti', 'himself', 'agak', 'lagi', 'll', "that'll", 'theirs', 'terhadap', 'namun', "you've", 'again', 'these', 'because', 'nanti', 'doing', 'most', 'here', 'tolong', 'so', 'my', 'whom', 'karena', 'other', 'seraya', 'a

In [16]:
# Function to remove stopwords
def remove_stopwords(text):
    # Tokenize the text
    words = word_tokenize(text)
    # Remove stopwords
    filtered_words = [word for word in words if word.lower() not in all_stop_words]
    # Join the filtered words back into a string
    filtered_text = ' '.join(filtered_words)
    return filtered_text

In [17]:
# Apply stopwords removal to the 'Requirements' column
job_data['Requirements'] = job_data['Requirements'].apply(remove_stopwords)

In [20]:
# Verify the changes
print("\nRemove stopwords actions:")
print(job_data['Requirements'].head())


Remove stopwords actions:
0    ideal candidate position minimum 5 years exper...
1    ideal candidate position strong technical back...
2    successful candidate junior project manager po...
3    currently seeking candidate position accountan...
4    designer essential understand plan conduct des...
Name: Requirements, dtype: object


#### 5. Remove extra spaces

In [21]:
# Define a function to remove extra spaces
def remove_extra_spaces(text):
    return re.sub(' +', ' ', text)

In [22]:
# Remove extra spaces
job_data['Requirements'] = job_data['Requirements'].apply(remove_extra_spaces)

#### 6. Apply Lemmatization for both English and Indonesian words

In [23]:
# Function to lemmatize English words
def lemmatize_english(text):
    lemmatizer = WordNetLemmatizer()
    return " ".join([lemmatizer.lemmatize(word) for word in text.split()])

In [24]:
# Function to stem Indonesian words
def stem_indonesian(text):
    factory = StemmerFactory()
    stemmer = factory.create_stemmer()
    return stemmer.stem(text)

In [26]:
# Apply lemmatization for English text
job_data['Requirements'] = job_data['Requirements'].apply(lemmatize_english)

In [27]:
# Apply stemming for Indonesian text
job_data['Requirements'] = job_data['Requirements'].apply(stem_indonesian)

In [28]:
# Verify the changes
print(job_data['Requirements'].head())

0    ideal candidate position minimum 5 year experi...
1    ideal candidate position strong technical back...
2    successful candidate junior project manager po...
3    currently seeking candidate position accountan...
4    designer essential understand plan conduct des...
Name: Requirements, dtype: object
