In [1]:
import unittest 
import pandas as pd 
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from keras.preprocessing.text import Tokenizer
import contractions
from keras.preprocessing.sequence import pad_sequences
import re


def process_location(location_column):
    """Return the first comma-separated field of a location string, trimmed.

    Despite the parameter name, the argument is a single location string
    (e.g. ``'US, NY, New York'`` gives ``'US'``). A string that contains no
    comma is returned whole, with surrounding whitespace removed.
    """
    first_field, _, _ = location_column.partition(',')
    return first_field.strip()

def preprocessing(job_ads_df, text_columns):
    """Return a cleaned deep copy of ``job_ads_df``; the input frame is not modified.

    Every column except the last one is rewritten cell by cell (the last
    column is skipped -- presumably it is the target/label; confirm with the
    caller):

    * ``'location'`` cells are reduced by ``process_location`` (case is kept);
      cells of every other column are lower-cased.
    * Contractions are expanded with ``contractions.fix`` and every character
      that is not an ASCII letter or whitespace is replaced by a space.
    * Cells of columns named in ``text_columns`` are additionally tokenized;
      stop words and one-character tokens are dropped, the remaining tokens
      are lemmatized and joined with single spaces.

    Parameters
    ----------
    job_ads_df : pandas.DataFrame
        Frame whose processed columns contain strings (a non-string cell
        raises ``AttributeError``). Any row index is supported.
    text_columns : collection of str
        Names of the columns that get the tokenize / stop-word / lemmatize
        treatment.

    Returns
    -------
    pandas.DataFrame
        The processed copy, with the same index and columns as the input.
    """
    job_df_copy = job_ads_df.copy(deep=True)
    stop_words = set(stopwords.words('english'))
    lemmatizer = WordNetLemmatizer()

    for col_pos, col in enumerate(job_df_copy.columns[:-1]):
        keep_case = col == 'location'
        full_pipeline = col in text_columns

        # Snapshot the column so we never iterate over data we are rewriting.
        cell_values = job_df_copy.iloc[:, col_pos].tolist()

        for row_pos, text in enumerate(cell_values):
            words = process_location(text) if keep_case else text.lower()
            words = contractions.fix(words)
            words = re.sub(r'[^A-Za-z\s]', ' ', words)

            if full_pipeline:
                words = ' '.join(
                    lemmatizer.lemmatize(tk)
                    for tk in word_tokenize(words)
                    if tk not in stop_words and len(tk) > 1
                )

            # Write by POSITION: enumerate() yields row positions, not index
            # labels, so a label-based .at[] would hit the wrong rows (or
            # append new ones) on a frame without a default RangeIndex.
            job_df_copy.iat[row_pos, col_pos] = words

    return job_df_copy

def preprocess_text(text):
    """Normalise one free-text string for downstream modelling.

    The text is lower-cased, contractions are expanded, every character that
    is not an ASCII letter or whitespace becomes a space, and the result is
    tokenized. English stop words and one-character tokens are discarded; the
    surviving tokens are lemmatized and returned joined by single spaces.
    """
    english_stops = set(stopwords.words('english'))
    wnl = WordNetLemmatizer()

    expanded = contractions.fix(text.lower())
    letters_only = re.sub(r'[^A-Za-z\s]', ' ', expanded)

    kept = [
        wnl.lemmatize(token)
        for token in word_tokenize(letters_only)
        if token not in english_stops and len(token) > 1
    ]
    return ' '.join(kept)






In [2]:
class TestProcessLocation(unittest.TestCase):
    def test_process_location(self): 
        location_column = 'US, NY, New York'
        processed_location = process_location(location_column)
        self.assertEqual(processed_location, 'US')

In [3]:
class TestPreprocessing(unittest.TestCase):
    def setUp(self):
        self.sample_df = pd.DataFrame({
            'title': ['Data Scientist', 'Software Engineer'],
            'location': ['US, New York,', 'US, IA'],
            'department': ['Research', 'Engineering'],
            'company_profile': ['Company A is a leading tech firm.', 'Company B specializes in software development.'],
            'description': ['Experience with machine learning', 'Experience with software development'],
            'requirements': ['Python proficiency required', 'Experience with Java'],
            'benefits': ['Flexible working hours', 'Health insurance provided'],
            'employment_type': ['Full-time', 'Part-time'],
            'required_experience': ['Entry level', 'Mid level'],
            'industry': ['Technology', 'Software'],
            'function': ['research', 'engineering']
        })
        self.columns_to_chg = ['title', 'location', 'department', 'company_profile', 'description',
                               'requirements', 'benefits', 'employment_type', 'required_experience',
                               'industry', 'function']

    def test_preprocessing(self):
        output_df = preprocessing(self.sample_df, self.columns_to_chg)  
        expected_df = pd.DataFrame({
            'title': ['data scientist', 'software engineer'],
            'location': ['US', 'US'],
            'department': ['research', 'engineering'],
            'company_profile': ['company leading tech firm', 'company specializes software development'],
            'description': ['experience machine learning', 'experience software development'],
            'requirements': ['python proficiency required', 'experience java'],
            'benefits': ['flexible working hour', 'health insurance provided'],
            'employment_type': ['full time', 'part time'],
            'required_experience': ['entry level', 'mid level'],
            'industry': ['technology', 'software'],
            'function': ['research', 'engineering']
        })
        pd.testing.assert_frame_equal(output_df, expected_df)
        

In [4]:
class TestPreprocessText(unittest.TestCase):
    def test_preprocess_text(self):
        input_text = "This is a sample text, this job will offer so much benefit to you. Apply for the opportunity now!"
        expected_output = "sample text job offer much benefit apply opportunity"
        processed_text = preprocess_text(input_text)
        self.assertEqual(processed_text, expected_output)

In [5]:
# Run every TestCase defined in the cells above from inside the notebook kernel.
# argv=[''] makes unittest use this list instead of the kernel's own sys.argv;
# exit=False keeps unittest from calling sys.exit() when the run is over.
unittest.main(
    argv=[''],
    exit=False,
    verbosity=2,
)

test_preprocess_text (__main__.TestPreprocessText) ... ok
test_preprocessing (__main__.TestPreprocessing) ... ok
test_process_location (__main__.TestProcessLocation) ... ok

----------------------------------------------------------------------
Ran 3 tests in 1.260s

OK


<unittest.main.TestProgram at 0x1efdecebbb0>