In [1]:
import pandas as pd
import numpy as np

import os
import unicodedata
import re
import json

import nltk
from nltk.tokenize.toktok import ToktokTokenizer
from nltk.corpus import stopwords

from prepare import prep_create_labels

Wrangle data

In [2]:
# Read the scraped postings, let the project helper attach the label column,
# and renumber the rows from 0 so positional-looking lookups such as
# df.job_description[0] work further down.
df = (
    pd.read_json('indeed-data-jobs-FINAL.json')
      .pipe(prep_create_labels)
      .reset_index(drop=True)
)
df.head()

Unnamed: 0,job_title,company,location,is_remote,salary,post_date,date_accessed,job_description,label
0,Data Scientist,ForMotiv,Remote,1,"$75,000 - $120,000 a year",30+ days ago,2021-03-05,Has it ever occurred to you that as the Intern...,DS
1,Data Scientist,Redzara.com,Remote,1,$35 - $80 an hour,10 days ago,2021-03-05,Only GC / EAD only. No C2CBackground screening...,DS
2,Data Scientist,Nova Collective,Remote,1,$35 - $48 an hour,24 days ago,2021-03-05,Are you a data scientist who is really excited...,DS
3,Early Career Data Scientist - Applied Math,Pacific Northwest National Laboratory,"Seattle, WA",0,,1 day ago,2021-03-05,Organization and Job ID Job ID: 311747 Directo...,DS
4,"AVP, Data Scientist",Synchrony,"Alpharetta, GA 30005",1,"$60,000 - $130,000 a year",7 days ago,2021-03-05,Job Description: Role Summary/Purpose: This ex...,DS


**Normalize description text**: normalize the text by removing any inconsistencies in Unicode character encoding and converting the resulting string to the ASCII character set. We ignore any errors in conversion, meaning we drop anything that isn't an ASCII character, and then turn the resulting bytes object back into a string. Lastly, we remove every character that is not a letter, digit, underscore, or whitespace, and lowercase all letters.

In [3]:
string = df.job_description[0]

# Decompose to NFKD, throw away anything that cannot be encoded as ASCII,
# and decode the bytes back into a str.
string = unicodedata.normalize('NFKD', string)
string = string.encode('ascii', 'ignore').decode('utf-8', 'ignore')

# Strip every character that is neither a word character nor whitespace,
# then lowercase what is left.
string = re.sub(r'[^\w\s]', '', string)
string = string.lower()

In [4]:
#string

**Tokenize description text:** break the text into discrete units (words and any punctuation left over).

In [5]:
string = df.job_description[0]

# Build the Toktok tokenizer, then ask it for a string (return_str=True)
# instead of a list of tokens.
tokenizer = nltk.tokenize.ToktokTokenizer()
string = tokenizer.tokenize(string, return_str=True)

In [6]:
#string

**Stemming text**: reduce each word to its root stem, which may not be a word found in the dictionary.

In [7]:
string = df.job_description[0]

# Porter stemmer instance used for every word below.
ps = nltk.porter.PorterStemmer()

# Split on whitespace and stem each piece.
stems = list(map(ps.stem, string.split()))

# Glue the stems back together with single spaces.
string = ' '.join(stems)

In [8]:
#string

**Lemmatizing text**: the base form in this case is known as the root word, not the root stem. The difference is that the root word is always a lexicographically correct word (present in the dictionary), while the root stem may not be. Thus the root word, also known as the lemma, will always be present in the dictionary.

In [9]:
string = df.job_description[0]

# WordNet lemmatizer instance used for every word below.
wnl = nltk.stem.WordNetLemmatizer()

# Split on whitespace and lemmatize each piece.
lemmas = list(map(wnl.lemmatize, string.split()))

# Glue the lemmas back together with single spaces.
string = ' '.join(lemmas)

In [10]:
#string

**Remove stopwords from text**: stopwords are words which have little or no significance, especially when constructing meaningful features from text, so we remove them.

In [11]:
string = df.job_description[0]
extra_words = []
exclude_words = []

# Start from NLTK's English stopwords, take out the words we want to keep in
# the text, then add our own extra words to remove.
stopword_list = (set(stopwords.words('english')) - set(exclude_words)) | set(extra_words)

# Break the text on whitespace.
words = string.split()

# Keep only the words that are not in the stopword set.
filtered_words = list(filter(lambda word: word not in stopword_list, words))

# Stitch the surviving words back into a single string.
string_without_stopwords = ' '.join(filtered_words)

In [12]:
#string_without_stopwords

### Putting it all together

In [13]:
def basic_clean(string):
    '''
    Normalize a piece of text.

    The text is NFKD-normalized, reduced to ASCII (characters that cannot be
    encoded are dropped), stripped of every character that is neither a word
    character nor whitespace, and finally lowercased.

    Takes a string and returns the normalized string.
    '''
    decomposed = unicodedata.normalize('NFKD', string)
    # Round-trip through bytes: 'ignore' silently drops non-ASCII characters.
    ascii_only = decomposed.encode('ascii', 'ignore').decode('utf-8', 'ignore')
    # Characters are deleted, not replaced, so "full-time" becomes "fulltime".
    without_punctuation = re.sub(r'[^\w\s]', '', ascii_only)
    return without_punctuation.lower()

In [14]:
def tokenize(string):
    '''
    Tokenize a string with NLTK's ToktokTokenizer.

    Takes a string and returns the tokenizer's output as a string
    (return_str=True) rather than as a list of tokens.
    '''
    return nltk.tokenize.ToktokTokenizer().tokenize(string, return_str=True)

In [15]:
def stem(string):
    '''
    Stem every whitespace-separated word of a string with NLTK's
    PorterStemmer.

    Takes a string and returns the stems joined by single spaces.
    '''
    stemmer = nltk.porter.PorterStemmer()
    return ' '.join([stemmer.stem(token) for token in string.split()])

In [16]:
def lemmatize(string):
    '''
    Lemmatize every whitespace-separated word of a string with NLTK's
    WordNetLemmatizer.

    Takes a string and returns the lemmas joined by single spaces.
    '''
    lemmatizer = nltk.stem.WordNetLemmatizer()
    return ' '.join([lemmatizer.lemmatize(token) for token in string.split()])

In [17]:
def remove_stopwords(string, extra_words=[], exclude_words=[]):
    '''
    Remove stopwords from a string.

    string        : text to filter; it is split on whitespace.
    extra_words   : additional words to remove (default: none).
    exclude_words : stopwords to leave in the text (default: none).

    Returns the remaining words joined by single spaces. Words are compared
    exactly as they appear in the text, so matching is case-sensitive.
    '''
    # NLTK's English stopwords, minus the words to keep, plus the extra words.
    blocked = (set(stopwords.words('english')) - set(exclude_words)) | set(extra_words)

    kept = [token for token in string.split() if token not in blocked]
    return ' '.join(kept)

In [18]:
def prep_job_data(df, column, extra_words=[], exclude_words=[]):
    '''
    Add cleaned, stemmed, and lemmatized versions of a text column.

    df            : DataFrame holding the text column. It is not modified;
                    a new DataFrame is returned.
    column        : name of the text column to process.
    extra_words   : additional words to remove as stopwords when building
                    the 'clean' column (default: none).
    exclude_words : stopwords to leave in the text when building the
                    'clean' column (default: none).

    Returns the DataFrame with fully duplicated rows dropped (the first
    occurrence is kept and the original index labels are preserved) and
    three columns added:
        clean      - normalized, tokenized, stopwords removed, lemmatized
        stemmed    - normalized, then stemmed
        lemmatized - normalized, then lemmatized
    '''
    # drop_duplicates can return a frame that pandas tracks as a copy of the
    # caller's frame; adding columns to it may raise SettingWithCopyWarning.
    # An explicit copy makes the result an independent frame.
    df = df.drop_duplicates(subset=None, keep='first').copy()

    # Normalize the text once and reuse it for all three derived columns
    # instead of re-running basic_clean for each of them.
    normalized = df[column].apply(basic_clean)

    df['clean'] = normalized.apply(tokenize)\
                            .apply(remove_stopwords,
                                   extra_words=extra_words,
                                   exclude_words=exclude_words)\
                            .apply(lemmatize)

    df['stemmed'] = normalized.apply(stem)

    df['lemmatized'] = normalized.apply(lemmatize)

    return df

In [21]:
# 'job' and 'description' are passed as extra stopwords, so they are removed
# from the 'clean' column along with the standard English stopwords.
df = prep_job_data(
    df,
    'job_description',
    extra_words=['job', 'description'],
)

In [22]:
# Spot-check five random rows; no seed is set, so the rows differ per run.
df.sample(n=5)

Unnamed: 0,job_title,company,location,is_remote,salary,post_date,date_accessed,job_description,label,clean,stemmed,lemmatized
207,Data Scientist,SmileDirectClub,"Nashville, TN 37219",0,,2 days ago,2021-03-05,Job Type: Full-Time Overview: Marketing Analy...,DS,type fulltime overview marketing analytics dat...,job type fulltim overview market analyt data s...,job type fulltime overview marketing analytics...
545,Data Science Engineer/Scientist,Baker Hughes,"Minden, NV 89423",0,,4 days ago,2021-03-05,Role Summary: The Data Science Engineer will s...,DE,role summary data science engineer support ana...,role summari the data scienc engin will suppor...,role summary the data science engineer will su...
466,Data Engineer,Ford Motor Company,"Dearborn, MI",0,,1 day ago,2021-03-05,Dearborn Ford Motor Company GDIA Job Descripti...,DE,dearborn ford motor company gdia qualification...,dearborn ford motor compani gdia job descript ...,dearborn ford motor company gdia job descripti...
429,Data Engineer,Piper Companies,"Raleigh, NC",0,"$135,000 - $145,000 a year",1 day ago,2021-03-05,"Located in Raleigh NC, Piper Companies is seek...",DE,located raleigh nc piper company seeking data ...,locat in raleigh nc piper compani is seek a da...,located in raleigh nc piper company is seeking...
114,Data Scientist,Inspire,Remote,1,,30+ days ago,2021-03-05,Description: WHAT YOU’LL BE DOING Do you have...,DS,youll intense curiosity interest data solving ...,descript what youll be do do you have an inten...,description what youll be doing do you have an...
