In [3]:
import pandas as pd
import numpy as np

import os
import unicodedata
import re
import json

import nltk
from nltk.tokenize.toktok import ToktokTokenizer
from nltk.corpus import stopwords

from prepare import prep_create_labels

Wrangle data

In [None]:
# Read the scraped postings and run them through prep_create_labels (from prepare),
# then renumber the rows from zero.
df = prep_create_labels(
    pd.read_json('indeed-data-jobs-FINAL.json')
).reset_index(drop=True)
df.head()

In [None]:
# Column dtypes and non-null counts.
df.info()

In [None]:
# Rows that exactly repeat an earlier row (the first occurrence is not flagged).
df[df.duplicated(keep='first')]

**Normalize description text**: normalize the text by removing inconsistencies in Unicode character encoding and converting the result to the ASCII character set. We ignore any errors in conversion, meaning we drop anything that isn't an ASCII character. We then turn the resulting bytes object back into a string, strip punctuation, and lowercase all letters.

In [None]:
# Normalize the first job description as a worked example.
string = (
    unicodedata.normalize('NFKD', df.job_description[0])
    .encode('ascii', 'ignore')
    .decode('utf-8', 'ignore')
)
# Drop everything that is not a word character or whitespace, then lowercase.
string = re.sub(r'[^\w\s]', '', string).lower()

In [None]:
#string

**Tokenize description text:** break words and any punctuation left over into discrete units

In [None]:
# Tokenize the first job description as a worked example.
tokenizer = nltk.tokenize.ToktokTokenizer()
string = tokenizer.tokenize(df.job_description[0], return_str=True)

In [None]:
#string

**Stemming text**: reduce each word to its stem (base form); the stem is not necessarily a dictionary word.

In [None]:
# Stem the first job description as a worked example.
ps = nltk.porter.PorterStemmer()

# Stem every whitespace-separated word, then rebuild a single string.
stems = [ps.stem(token) for token in df.job_description[0].split()]
string = ' '.join(stems)

In [None]:
#string

**Lemmatizing text**: The base form in this case is known as the root word, not the root stem. The difference is that the root word is always a lexicographically correct word (present in the dictionary), whereas the root stem may not be. Thus the root word, also known as the lemma, will always be present in the dictionary.

In [None]:
# Lemmatize the first job description as a worked example.
wnl = nltk.stem.WordNetLemmatizer()

# Lemmatize every whitespace-separated word, then rebuild a single string.
lemmas = list(map(wnl.lemmatize, df.job_description[0].split()))
string = ' '.join(lemmas)

In [None]:
#string

**Remove stopwords from text**: Stopwords are words that carry little or no significance, especially when constructing meaningful features from text, so we remove them.

In [None]:
# Remove stopwords from the first job description as a worked example.
string = df.job_description[0]
extra_words = []
exclude_words = []

# English stopwords, minus the words we want to keep, plus any extra ones.
stopword_list = (set(stopwords.words('english')) - set(exclude_words)).union(set(extra_words))

# Keep only the words that are not in the stopword set.
words = string.split()
filtered_words = [token for token in words if token not in stopword_list]

string_without_stopwords = ' '.join(filtered_words)

In [None]:
#string_without_stopwords

### Putting it all together

In [None]:
def basic_clean(string):
    '''
    Normalize a string: decompose it with NFKD, drop every
    non-ASCII character, remove anything that is not a word
    character or whitespace, and lowercase the result.
    '''
    decomposed = unicodedata.normalize('NFKD', string)
    ascii_only = decomposed.encode('ascii', 'ignore').decode('utf-8', 'ignore')
    without_punctuation = re.sub(r'[^\w\s]', '', ascii_only)
    return without_punctuation.lower()

In [None]:
def tokenize(string):
    '''
    Run the string through NLTK's ToktokTokenizer and
    return the tokenized text as a single string.
    '''
    return nltk.tokenize.ToktokTokenizer().tokenize(string, return_str=True)

In [None]:
def stem(string):
    '''
    Apply the Porter stemmer to every whitespace-separated word
    in the string and return the stems joined by single spaces.
    '''
    porter = nltk.porter.PorterStemmer()
    return ' '.join(porter.stem(token) for token in string.split())

In [None]:
def lemmatize(string):
    '''
    Apply the WordNet lemmatizer to every whitespace-separated word
    in the string and return the lemmas joined by single spaces.
    '''
    lemmatizer = nltk.stem.WordNetLemmatizer()
    return ' '.join(map(lemmatizer.lemmatize, string.split()))

In [None]:
def remove_stopwords(string, extra_words=[], exclude_words=[]):
    '''
    Return the string with stopwords removed.

    The stopword set is NLTK's English list minus exclude_words
    (words to keep in the text) plus extra_words (additional words
    to remove). Both parameters default to empty lists.
    '''
    # Build the effective stopword set in one expression.
    blocked = (set(stopwords.words('english')) - set(exclude_words)) | set(extra_words)

    # Keep the whitespace-separated words that are not blocked.
    kept = (token for token in string.split() if token not in blocked)

    return ' '.join(kept)

In [None]:
def prep_job_data(df, column, extra_words=[], exclude_words=[]):
    '''
    Prepare a text column of a DataFrame for analysis.

    Parameters:
        df: DataFrame holding the text data.
        column: name (string) of the text column to process.
        extra_words: optional list of additional words to treat as stopwords.
        exclude_words: optional list of words to remove from the stopword list.

    Returns a de-duplicated copy of df (first occurrence of each fully
    duplicated row is kept) with three added columns:
        clean      - cleaned, tokenized, stopwords removed, then lemmatized
        stemmed    - cleaned, then stemmed
        lemmatized - cleaned, then lemmatized
    The frame passed in is not modified.
    '''
    # Take an explicit copy after dropping duplicates so the column
    # assignments below do not trigger SettingWithCopyWarning.
    df = df.drop_duplicates(subset=None, keep='first').copy()

    # basic_clean feeds all three derived columns, so compute it once.
    cleaned = df[column].apply(basic_clean)

    df['clean'] = cleaned.apply(tokenize)\
                         .apply(remove_stopwords,
                                extra_words=extra_words,
                                exclude_words=exclude_words)\
                         .apply(lemmatize)

    df['stemmed'] = cleaned.apply(stem)

    df['lemmatized'] = cleaned.apply(lemmatize)

    return df

In [None]:
# NOTE(review): this rebinds prep_job_data to the version in the `preprocess` module,
# shadowing the function defined in the cell above.
from preprocess import prep_job_data, split_job_data, add_columns

In [None]:
# Prepare the job_description column, passing 'job' and 'description' as extra_words,
# then renumber the rows from zero.
df = (
    prep_job_data(df, 'job_description', extra_words=['job', 'description'])
    .reset_index(drop=True)
)

In [None]:
# Preview the first five rows of the prepared frame.
df.head(5)

In [None]:
# Column dtypes and non-null counts after preparation.
df.info()

Add the length of the job description (word count), and a list of words

In [None]:
# Build one list of word tokens per job description from the `clean` column.
# Strip every character that is not a lowercase letter, digit, or whitespace,
# split on whitespace, then drop single-character tokens. Filtering tokens after
# the split replaces the old r'\s.\s' deletion, which also removed the whitespace
# around the match and fused the neighbouring words together
# (e.g. 'minimum 4 yrsmust' became 'minimumyrsmust').
words = [
    [token for token in re.sub(r'[^a-z0-9\s]', '', doc).split() if len(token) > 1]
    for doc in df.clean
]

In [None]:
# Number of tokens in the first document's word list.
len(words[0])

In [None]:
# Add a `words` column holding the list of words for each document.
# The lists are attached by position using df's own index, so the result does not
# depend on df having a 0..n-1 index (concat on axis=1 aligns on index labels and
# would leave NaN rows otherwise), and re-running the cell overwrites the column
# instead of appending a duplicate.
df = df.assign(words=pd.Series(words, index=df.index))

In [None]:
# Sanity check: rows left without a word list (expected to be empty).
df[df.words.isnull()]

In [None]:
# Inspect the last rows of the frame.
df.tail()

In [None]:
# Add a `doc_length` column: the number of words in each document's word list.
# Deriving it directly from df.words keeps it aligned with df's index and makes
# the cell safe to re-run (the column is overwritten, not duplicated).
df = df.assign(doc_length=df.words.map(len))

---
### FINALIZE: Let's test the functions & split the data into train, validate, and test sets

In [4]:
# Start again from the raw file: read the postings, run them through
# prep_create_labels (from prepare), and renumber the rows from zero.
df = prep_create_labels(
    pd.read_json('indeed-data-jobs-FINAL.json')
).reset_index(drop=True)
df.head()

Unnamed: 0,job_title,company,location,is_remote,salary,post_date,date_accessed,job_description,label
0,Data Scientist,ForMotiv,Remote,1,"$75,000 - $120,000 a year",30+ days ago,2021-03-05,Has it ever occurred to you that as the Intern...,DS
1,Data Scientist,Redzara.com,Remote,1,$35 - $80 an hour,10 days ago,2021-03-05,Only GC / EAD only. No C2CBackground screening...,DS
2,Data Scientist,Nova Collective,Remote,1,$35 - $48 an hour,24 days ago,2021-03-05,Are you a data scientist who is really excited...,DS
3,Early Career Data Scientist - Applied Math,Pacific Northwest National Laboratory,"Seattle, WA",0,,1 day ago,2021-03-05,Organization and Job ID Job ID: 311747 Directo...,DS
4,"AVP, Data Scientist",Synchrony,"Alpharetta, GA 30005",1,"$60,000 - $130,000 a year",7 days ago,2021-03-05,Job Description: Role Summary/Purpose: This ex...,DS


In [5]:
# Pipeline helpers come from the `preprocess` module; this binding of prep_job_data
# replaces any version defined earlier in the notebook.
from preprocess import prep_job_data, split_job_data, add_columns

In [6]:
# Prepare the job_description column, passing 'job' and 'description' as extra_words,
# then renumber the rows from zero.
df = (
    prep_job_data(df, 'job_description', extra_words=['job', 'description'])
    .reset_index(drop=True)
)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  .apply(lemmatize)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['stemmed'] = df[column].apply(basic_clean).apply(stem)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['lemmatized'] = df[column].apply(basic_clean).apply(lemmatize)


In [7]:
# add_columns is imported from the `preprocess` module; the output below shows
# `words` and `doc_length` columns on the returned frame.
df = add_columns(df)
# Random five-row spot check; no random_state is set, so the rows differ between runs.
df.sample(5)

Unnamed: 0,job_title,company,location,is_remote,salary,post_date,date_accessed,job_description,label,clean,stemmed,lemmatized,words,doc_length
386,Data Engineer - Data Warehouse - Entry Level,Pearson,"Durham, NC",1,,30+ days ago,2021-03-05,Description We are the world’s learning compan...,DE,world learning company 24000 employee operatin...,descript we are the world learn compani with m...,description we are the world learning company ...,"[world, learning, company, 24000, employee, op...",303
124,Data Scientist,Bind,Minnesota,1,,2 days ago,2021-03-05,Bind was formed in 2016 by veteran health insu...,DS,bind formed 2016 veteran health insurance inno...,bind wa form in 2016 by veteran health insur i...,bind wa formed in 2016 by veteran health insur...,"[bind, formed, 2016, veteran, health, insuranc...",461
580,"AI/ML - Machine Learning Engineer, Siri Unders...",Apple,"Cambridge, MA",0,,27 days ago,2021-03-05,"Summary Posted: Feb 12, 2021 Weekly Hours: 40 ...",MLE,summary posted feb 12 2021 weekly hour 40 role...,summari post feb 12 2021 weekli hour 40 role n...,summary posted feb 12 2021 weekly hour 40 role...,"[summary, posted, feb, 12, 2021, weekly, hour,...",136
583,Machine Learning Engineer - Office of the CTO ...,VMware,"Austin, TX",1,,10 days ago,2021-03-05,Machine Learning Engineer - Office of the CTO ...,MLE,machine learning engineer office cto xlabs vmw...,machin learn engin offic of the cto xlab vmwar...,machine learning engineer office of the cto xl...,"[machine, learning, engineer, office, cto, xla...",406
362,Data Engineer,SHGT,Remote,1,"$87,244 - $199,682 a year",4 days ago,2021-03-05,"Strong in Python scripting, minimum 4+ yrs,Mus...",DE,strong python scripting minimum 4 yrsmust hand...,strong in python script minimum 4 yrsmust have...,strong in python scripting minimum 4 yrsmust h...,"[strong, python, scripting, minimumyrsmust, ha...",186


In [8]:
# Split with split_job_data (from preprocess) and report each subset's share of the rows.
train, validate, test = split_job_data(df)
print(
    f'train: {round(len(train) / len(df), 2)}',
    f'validate: {round(len(validate) / len(df), 2)}',
    f'test: {round(len(test) / len(df), 2)}',
    sep='\n',
)

train: 0.6
validate: 0.2
test: 0.2


In [9]:
# Preview the first rows of the training split.
train.head()

Unnamed: 0,label,job_title,company,location,is_remote,clean,words,doc_length
420,DE,Data Engineer,"Stefanini, Inc","Richmond, VA",0,stefanini looking data engineer richmond va re...,"[stefanini, looking, data, engineer, richmond,...",216
250,DA,Financial Data Analyst (vehicle retail Domain),SILVERLINK TECHNOLOGIES LLC,"Bowling Green, KY",0,hiplease go let know interestjob title financi...,"[hiplease, go, let, know, interestjob, title, ...",64
426,DE,Data Engineer I or II - Can Be Remote Based On...,Associated Bank,"Milwaukee, WI",1,associated bank equal opportunity employer com...,"[associated, bank, equal, opportunity, employe...",366
170,DS,Associate Data Scientist,Gap Inc.,United States,0,gap inc brand bridge gap see world old navy de...,"[gap, inc, brand, bridge, gap, see, world, old...",337
581,MLE,Automation/Artificial Intelligence Machine Lea...,LOCKHEED MARTIN CORPORATION,"Littleton, CO 80125",0,coolest job planet lockheed martin space lockh...,"[coolest, job, planet, lockheed, martin, space...",499
