# Data Job Resume Booster


### Environment Setup

In [1]:
import pandas as pd
import numpy as np

import os
import unicodedata
import re
import json

import nltk
from nltk.tokenize.toktok import ToktokTokenizer
from nltk.corpus import stopwords
import warnings
warnings.filterwarnings("ignore")

%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns

from prepare import prep_create_labels
from preprocess import prep_job_data, split_job_data, add_columns
from explore import words_by_label, freq_by_label

# WRANGLE
Acquire data from a local JSON file and prepare it using local scripts

In [2]:
# Load the job postings from the local JSON file and run them through the
# prep pipeline in a single chain:
#   1. add the labels
#   2. prepare the 'job_description' text (dropping the extra stop words)
#   3. add columns with extra variables [doc_length and words list]
# The index is reset after each of the first two steps, as those helpers
# may hand back a frame with a non-contiguous index.
df = (
    pd.read_json('indeed-data-jobs-FINAL.json')
    .pipe(prep_create_labels)
    .reset_index(drop=True)
    .pipe(prep_job_data, 'job_description', extra_words=['job', 'description'])
    .reset_index(drop=True)
    .pipe(add_columns)
)
# preview three randomly chosen rows of the prepared frame
df.sample(3)

Unnamed: 0,job_title,company,location,is_remote,salary,post_date,date_accessed,job_description,label,clean,stemmed,lemmatized,words,doc_length
368,Data Engineer,Sift Healthcare,"Milwaukee, WI 53202",1,"$80,000 - $120,000 a year",24 days ago,2021-03-05,"We are looking for a savvy, detail-oriented Da...",DE,looking savvy detailoriented data engineer joi...,we are look for a savvi detailori data engin t...,we are looking for a savvy detailoriented data...,"[looking, savvy, detailoriented, data, enginee...",378
386,Data Engineer - Data Warehouse - Entry Level,Pearson,"Durham, NC",1,,30+ days ago,2021-03-05,Description We are the world’s learning compan...,DE,world learning company 24000 employee operatin...,descript we are the world learn compani with m...,description we are the world learning company ...,"[world, learning, company, 24000, employee, op...",303
576,Machine Learning Engineer,Pictor Labs,"Los Angeles, CA 90025",1,,13 days ago,2021-03-05,Spun off from UCLA Engineering and School of M...,MLE,spun ucla engineering school medicine pictor l...,spun off from ucla engin and school of medicin...,spun off from ucla engineering and school of m...,"[spun, ucla, engineering, school, medicine, pi...",374


### Split the data
Train, validate, and test sets

In [3]:
# Split the prepared frame into the three modelling subsets.
train, validate, test = split_job_data(df)
# Report each subset's share of the full dataset, one line per subset
# (generator expression keeps helper names out of the notebook namespace).
print(
    *(
        f'{split_name}: {round(len(split_frame) / len(df), 2)}'
        for split_name, split_frame in (
            ('train', train),
            ('validate', validate),
            ('test', test),
        )
    ),
    sep='\n',
)

train: 0.6
validate: 0.2
test: 0.2


### Train Data Summary

In [4]:
# Print a structural summary of the training split: index range, column
# names, non-null counts, dtypes, and memory usage.
train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 426 entries, 420 to 531
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   label       426 non-null    object
 1   job_title   426 non-null    object
 2   clean       426 non-null    object
 3   words       426 non-null    object
 4   doc_length  426 non-null    int64 
dtypes: int64(1), object(4)
memory usage: 20.0+ KB


In [5]:
# Eyeball three randomly drawn training rows.
# NOTE: no random_state is passed, so the rows shown change on every run.
train.sample(3)

Unnamed: 0,label,job_title,clean,words,doc_length
421,DE,Data Engineer,u launched 2011 twitch global community come t...,"[u, launched, 2011, twitch, global, community,...",381
657,MLE,Machine Learning Engineer,reporting director engineering responsible pla...,"[reporting, director, engineering, responsible...",303
222,DA,Data analyst level 2,manage develop best practice project achieve r...,"[manage, develop, best, practice, project, ach...",147


# EXPLORE

### Label Count & %

In [6]:
# Class balance of the training split: absolute count ('n') and share of
# rows ('percent') for each label, side by side in one table.
labels = pd.concat(
    [
        train['label'].value_counts(),
        train['label'].value_counts(normalize=True),
    ],
    axis=1,
    keys=['n', 'percent'],  # keys become the column names when joining Series on axis=1
)
labels

Unnamed: 0,n,percent
DA,112,0.262911
MLE,108,0.253521
DE,103,0.241784
DS,103,0.241784


### Word Lists by Label

In [7]:
# Collect the words of the training split into one list per label
# (DA, DS, DE, MLE) plus a combined list for all labels.
# NOTE(review): words_by_label is defined in explore.py (not shown here);
# the per-label meaning is taken from the variable names — confirm there.
da_words, ds_words, de_words, mle_words, all_words = words_by_label(train)

In [8]:
# these are all the words that appear in DS-labeled job descriptions;
# show the first five entries of the list
ds_words[:5]

['gap', 'inc', 'brand', 'bridge', 'gap']

In [9]:
# Turn each word list into a word-frequency object, one per label plus one
# for the combined list.
# NOTE(review): freq_by_label is defined in explore.py (not shown here);
# presumably it returns pandas Series of counts indexed by word — confirm.
da_freq, ds_freq, de_freq, mle_freq, all_freq = freq_by_label(da_words, ds_words, de_words, mle_words, all_words)

In [10]:
# frequency of each unique word for the DS label; show the first five entries
# NOTE(review): any ordering of the entries comes from freq_by_label — confirm there
ds_freq[:5]

data          993
experience    392
team          318
work          298
business      241
dtype: int64