# Data Job Resume Booster


### Environment Setup

In [1]:
import pandas as pd
import numpy as np

import os
import unicodedata
import re
import json

import nltk
from nltk.tokenize.toktok import ToktokTokenizer
from nltk.corpus import stopwords
import warnings
warnings.filterwarnings("ignore")

%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns

from prepare import prep_create_labels
from preprocess import prep_job_data, split_job_data, add_columns
from explore import words_by_label, freq_by_label, word_count_label

# WRANGLE
Acquire data from a local JSON file and prepare it using local scripts

In [2]:
# Build the working frame in one pipeline: load -> label -> clean -> enrich.
df = (
    # read the job postings from the local JSON file
    pd.read_json('indeed-data-jobs-FINAL.json')
    # add the labels
    .pipe(prep_create_labels)
    .reset_index(drop=True)
    # prepare the 'job_description' text; 'job' and 'description' are passed as
    # extra_words (presumably additional stopwords -- confirm in preprocess.py)
    .pipe(prep_job_data, 'job_description', extra_words=['job', 'description'])
    .reset_index(drop=True)
    # add columns with extra variables [doc_length and words list]
    .pipe(add_columns)
)
# show three random rows as a sanity check
df.sample(3)

Unnamed: 0,job_title,company,location,is_remote,salary,post_date,date_accessed,job_description,label,clean,stemmed,lemmatized,words,doc_length
371,Data Engineer,iknowvate technologies,"Basking Ridge, NJ",1,,30+ days ago,2021-03-05,"Basking ridge, New Jersey - United States | Po...",DE,basking ridge new jersey united state posted 0...,bask ridg new jersey unit state post 090220 ov...,basking ridge new jersey united state posted 0...,"[basking, ridge, new, jersey, united, state, p...",146
181,Data Analyst,Interapt,"San Antonio, TX 78249",1,"From $40,000 a year",30+ days ago,2021-03-05,Data AnalystAt Interapt we transform clients a...,DA,data analystat interapt transform client empow...,data analystat interapt we transform client an...,data analystat interapt we transform client an...,"[data, analystat, interapt, transform, client,...",232
589,Machine Learning Engineer,Immunai,"New York, NY",0,,11 days ago,2021-03-05,Mission Statement Through single-cell biology...,MLE,mission statement singlecell biology computati...,mission statement through singlecel biolog com...,mission statement through singlecell biology c...,"[mission, statement, singlecell, biology, comp...",334


### Split the data
Train, validate, and test sets

In [3]:
train, validate, test = split_job_data(df)
# Report each split's share of the full dataset, rounded to two decimals.
n_total = len(df)
print(f'train: {round(len(train) / n_total, 2)}')
print(f'validate: {round(len(validate) / n_total, 2)}')
print(f'test: {round(len(test) / n_total, 2)}')

train: 0.6
validate: 0.2
test: 0.2


### Train Data Summary

In [4]:
# summarize the training split: column names, non-null counts, dtypes and memory usage
train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 426 entries, 420 to 531
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   label       426 non-null    object
 1   job_title   426 non-null    object
 2   clean       426 non-null    object
 3   words       426 non-null    object
 4   doc_length  426 non-null    int64 
dtypes: int64(1), object(4)
memory usage: 20.0+ KB


In [5]:
# peek at three randomly chosen rows of the training split
train.sample(3)

Unnamed: 0,label,job_title,clean,words,doc_length
657,MLE,Machine Learning Engineer,reporting director engineering responsible pla...,"[reporting, director, engineering, responsible...",303
531,MLE,Machine Learning Engineer,language io delivers marketleading multilingua...,"[language, io, delivers, marketleading, multil...",312
583,MLE,Machine Learning Engineer - Office of the CTO ...,machine learning engineer office cto xlabs vmw...,"[machine, learning, engineer, office, cto, xla...",406


# EXPLORE

### Label Count & %

In [6]:
# Count of postings per label, and the same counts as a fraction of the training rows.
label_n = train['label'].value_counts().rename('n')
label_share = train['label'].value_counts(normalize=True).rename('percent')
# Place the two Series side by side; their names become the column headers.
labels = pd.concat([label_n, label_share], axis=1)
labels

Unnamed: 0,n,percent
DA,112,0.262911
MLE,108,0.253521
DE,103,0.241784
DS,103,0.241784


### Word Lists by Label

In [7]:
# build one word list per label (DA, DS, DE, MLE) from the training split, plus all_words
# (presumably the words of every label combined -- confirm in explore.py)
da_words, ds_words, de_words, mle_words, all_words = words_by_label(train)

In [8]:
# these are all the words that appear in DS-labeled job descriptions (first five shown)
ds_words[:5]

['gap', 'inc', 'brand', 'bridge', 'gap']

In [9]:
# turn each word list into per-word frequency counts, one result per label plus all_freq
da_freq, ds_freq, de_freq, mle_freq, all_freq = freq_by_label(da_words, ds_words, de_words, mle_words, all_words)

In [10]:
# frequency of each unique word for the DS label (first five entries shown)
ds_freq[:5]

data          993
experience    392
team          318
work          298
business      241
dtype: int64

In [11]:
# combine the per-label and overall frequencies into a single table with one row per word
word_counts = word_count_label(da_freq, ds_freq, de_freq, mle_freq, all_freq)

In [12]:
# for every word (row), the number of times it appears is shown overall and per label,
# along with each label's proportion of that word's total occurrences (label count / all)
word_counts.head()

Unnamed: 0,all,DS,DA,DE,MLE,prop_ds,prop_da,prop_de,prop_mle
000,4,0,3,1,0,0.0,0.75,0.25,0.0
01,2,0,1,1,0,0.0,0.5,0.5,0.0
01mar2021pimco,1,0,0,0,1,0.0,0.0,0.0,1.0
02,5,1,2,1,1,0.2,0.4,0.2,0.2
02038541400,2,1,0,1,0,0.5,0.0,0.5,0.0


---
# What are the most frequently occurring words?

In [13]:
# the five words with the highest total count across all labels
word_counts.sort_values(by='all', ascending=False).head()

Unnamed: 0,all,DS,DA,DE,MLE,prop_ds,prop_da,prop_de,prop_mle
data,3763,993,896,1409,465,0.263885,0.238108,0.374435,0.123572
experience,1622,392,291,496,443,0.241677,0.179408,0.305795,0.27312
team,1109,318,227,248,316,0.286745,0.204689,0.223625,0.284941
work,1047,298,222,257,270,0.284623,0.212034,0.245463,0.25788
learning,884,207,21,52,604,0.234163,0.023756,0.058824,0.683258
